-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate.py
143 lines (122 loc) · 4.44 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
## Imports
import re
import numpy as np
import torch
from constants import *
from utils import set_seed, device, collate_wrapper
## Setup
# Seed all RNGs at import time (project utility from utils.py; uses its default
# seed) so module-level generation is reproducible unless `deterministic`
# re-seeds later inside generate_inference.
set_seed()
def generate_inference(
    model,
    tokenizer,
    device,
    method=None,
    k=None,
    p_nucleus=None,
    temp=None,
    max_new_tokens=None,
    cond="",
    deterministic=None,
):
    """
    Wrapper for generating text using the specified model. Generates
    unconditionally if cond is the empty string.

    Inputs:
      -model: Decoder model to be used for text generation (must expose a
         `.generate(input_ids, method=..., k=..., p_nucleus=..., temp=...,
         max_new_tokens=...)` method returning token ids)
      -tokenizer: Compatible tokenizer (HF-style: `__call__`, `encode`,
         `decode`, `batch_decode`, `bos_token`, `bos_token_id`)
      -device: Device of model (CPU/CUDA)
      -method (str): Decoding method for text generation ('multinomial',
         'temperature', 'greedy', 'nucleus', or 'top-k')
      -k (int): Positive integer for top-k logits to sample if top-k decoding
      -p_nucleus (float/int): Cumulative probability cutoff if nucleus/top-p
         decoding
      -temp (float/int): Temperature if temperature decoding
      -max_new_tokens (int): Maximum number of tokens to generate
         (defaults to 250 if None)
      -cond (str=''): If provided, will serve as conditional prompt for text
         generation; the output marks the prompt/continuation boundary with
         ' || '
      -deterministic (int): If provided, re-seeds RNGs with this seed before
         generation

    Returns:
      -res (str): Generated text string, always suffixed with ' <|endoftext|>'
    """
    # --- Argument validation ---------------------------------------------
    # NOTE(review): `assert` is stripped under `python -O`; these checks
    # silently vanish with optimizations enabled. Raising ValueError would be
    # more robust but would change the exception type callers may catch.
    assert method in [
        "multinomial",
        "temperature",
        "greedy",
        "nucleus",
        "top-k",
    ], "method must be 'multinomial', 'temperature', 'greedy', 'nucleus', or 'top-k'"
    if method == "temperature":
        # Temperature must lie in (0, 1]; bool is technically accepted since
        # bool is a subclass of int.
        assert (
            (temp is not None)
            and isinstance(temp, (int, float))
            and (0 < temp)
            and (temp <= 1)
        ), "temp must be defined as a number between (0, 1]"
    if method == "nucleus":
        # Cumulative-probability cutoff in (0, 1].
        assert (
            (p_nucleus is not None)
            and isinstance(p_nucleus, (int, float))
            and (0 < p_nucleus)
            and (p_nucleus <= 1)
        ), "p_nucleus must be defined as a number between (0, 1]"
    # if method == 'num_beams':
    #    assert isinstance(num_beams, int) and (num_beams) > 0 and (num_beams) < 100
    if method == "top-k":
        # SEQ_LENGTH comes from `constants` (star-import above); k is bounded
        # by the model's sequence length because you cannot keep more logits
        # than the vocabulary slice considered per step — TODO confirm intent.
        assert (
            (k is not None) and isinstance(k, int) and (k > 0) and (k < SEQ_LENGTH)
        ), "k must be defined as an integer greater than 0 and less than the model sequence length"

    # Fall back to a default generation budget when none is given.
    if max_new_tokens is None:
        print("No max_new_tokens provided, using a default value of 250\n")
        max_new_tokens = 250
    assert (
        (max_new_tokens is not None)
        and isinstance(max_new_tokens, int)
        and (max_new_tokens) > 0
        and (max_new_tokens) <= 1000
    ), "max_new_tokens must be an integer between (0, 1000]"

    # Optional re-seed for reproducible sampling.
    if deterministic is not None:
        set_seed(deterministic)

    if cond != "":
        # --- Conditional generation: prompt provided -----------------------
        # Tokenize the prompt; model.generate is assumed to return a batch
        # whose row 0 contains prompt tokens followed by the continuation —
        # TODO confirm against the model's generate() implementation.
        cond_tokens = tokenizer(cond).input_ids
        gen_tokens = model.generate(
            torch.tensor(cond_tokens).unsqueeze(0).long().to(device),
            method=method,
            k=k,
            p_nucleus=p_nucleus,
            temp=temp,
            max_new_tokens=max_new_tokens,
        )[0]
        # Insert delimiter to indicate where prompt ends
        gen_prep = torch.zeros(
            len(gen_tokens) + 2
        ).long()  # make space for two more tokens for delimiter
        # Use -1 as a sentinel for the two delimiter slots: real token ids are
        # non-negative, so -1 can never collide with generated tokens.
        gen_prep -= 1
        # Copy prompt tokens to the front and continuation tokens to the back,
        # leaving exactly the two sentinel slots between them.
        # NOTE(review): if generation produced zero new tokens, the negative
        # slice below degenerates ( -(0): selects everything ) — assumed not
        # to happen since max_new_tokens > 0; verify.
        gen_prep[: len(cond_tokens)] = gen_tokens[: len(cond_tokens)]
        gen_prep[-(len(gen_tokens) - len(cond_tokens)) :] = gen_tokens[
            -(len(gen_tokens) - len(cond_tokens)) :
        ]
        # Boolean-mask assignment fills the two sentinel slots.
        # NOTE(review): this assumes tokenizer.encode(" || ") yields exactly
        # two token ids; a different tokenizer/vocab would raise a shape
        # mismatch here — confirm for the tokenizer in use.
        gen_prep[gen_prep == -1] = torch.tensor(
            tokenizer.encode(" || ")
        )  # insert tokens for || in between
        res = tokenizer.decode(gen_prep)
    else:
        # --- Unconditional generation: seed with a lone BOS token ----------
        empty_tokens = torch.full((1, 1), tokenizer.bos_token_id, dtype=torch.long).to(
            device
        )
        res = tokenizer.batch_decode(
            model.generate(
                empty_tokens,
                method=method,
                k=k,
                p_nucleus=p_nucleus,
                temp=temp,
                max_new_tokens=max_new_tokens,
            )
        )[0]
        # Strip at most two occurrences of the BOS string (presumably the
        # leading BOS and a generated EOS that shares the same literal, e.g.
        # GPT-2's <|endoftext|> — TODO confirm tokenizer specials).
        res = re.sub(
            re.escape(tokenizer.bos_token), "", res, count=2
        )  # Remove start and end tokens

    # Clean up Unicode character issues (UTF-8 bytes mis-decoded as latin-1):
    # '“' then 'â€' = opening and closing double quotes
    # '’' = apostrophe
    # NOTE(review): order matters — 'â€' is a prefix of both three-byte
    # sequences above, so the longer patterns must be replaced first.
    res = re.sub(r"“", '"', res)
    res = re.sub(r"’", "'", res)
    res = re.sub(r"â€", '"', res)
    res = res + " <|endoftext|>"  ## better end token with space
    return res