inference-cerebras-111m.py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)


# Step 1: Load the model and the tokenizer
def load_model_and_tokenizer():
    # Load the tokenizer and the model for Cerebras-GPT-111M
    tokenizer = AutoTokenizer.from_pretrained(
        "claysauruswrecks/cerebras-gpt-111m-pretrain-stack-smol-0-15k-chkp"
    )
    model = AutoModelForCausalLM.from_pretrained(
        "claysauruswrecks/cerebras-gpt-111m-pretrain-stack-smol-0-15k-chkp"
    )
    # GPT-style tokenizers ship without a pad token; fall back to the EOS token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer, model


def main():
    # Step 2: Configure CPU threading for inference
    torch.set_num_threads(24)
    torch.set_num_interop_threads(24)
    # Uncomment for (default) A100 GPU usage
    # torch.set_num_threads(6)
    # torch.set_num_interop_threads(6)
    print(
        f"threads: (num_threads, num_interop_threads) ({torch.get_num_threads()}, {torch.get_num_interop_threads()})"
    )

    tokenizer, model = load_model_and_tokenizer()
    text = "Generative AI is "

    # Step 3: Greedy decoding through the text-generation pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    generated_text = pipe(text, max_length=50, do_sample=False, no_repeat_ngram_size=2)[
        0
    ]
    print(generated_text["generated_text"])

    # Step 4: Beam-search decoding through model.generate()
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        num_beams=5,
        max_new_tokens=50,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    text_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(text_output[0])
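    # Optional: sampled decoding for comparison (a minimal sketch; do_sample,
    # top_k, and top_p are standard generate() arguments, and the values here
    # are illustrative, not taken from the original script).
    sampled = model.generate(
        **inputs,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        max_new_tokens=50,
        no_repeat_ngram_size=2,
    )
    print(tokenizer.batch_decode(sampled, skip_special_tokens=True)[0])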
if __name__ == "__main__":
    main()