Commit da6ee36

Add example of adding streaming support to run an LLM with transformers (#68)
* Add example of adding streaming support to run an LLM with transformers
* lint
1 parent 2f3443e commit da6ee36

File tree

advanced/earning-sage/main.py
advanced/hf-stream-llm/photon.py
advanced/segment-anything/sam.py

3 files changed: +73 -7 lines changed


advanced/earning-sage/main.py

Lines changed: 4 additions & 2 deletions
@@ -61,9 +61,11 @@ def ui(self):
 
         with blocks:
             gr.Markdown("# 🧙🏼 Earning Report Assistant")
-            gr.Markdown("""
+            gr.Markdown(
+                """
 This is an earning report assistant built for investors who can't make the earning call on time. This sample uses the Apple 2023 Q2 report. Feel free to reach out to [email protected] for more advanced features.
-""")
+"""
+            )
             with gr.Row():
                 chatbot = gr.Chatbot(label="Model")
             with gr.Row():

advanced/hf-stream-llm/photon.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import os
+from threading import Thread
+from queue import Queue
+
+from loguru import logger
+from leptonai.photon import Photon, StreamingResponse
+
+
+class HfStreamLLM(Photon):
+
+    deployment_template = {
+        "resource_shape": "gpu.a10.6xlarge",
+        "env": {
+            "MODEL_PATH": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        },
+        "secret": [
+            "HUGGING_FACE_HUB_TOKEN",
+        ],
+    }
+
+    requirement_dependency = [
+        "transformers",
+    ]
+
+    handler_max_concurrency = 4
+
+    def init(self):
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        model_path = os.environ["MODEL_PATH"]
+
+        self._tok = AutoTokenizer.from_pretrained(model_path)
+        self._model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
+
+        self._generation_queue = Queue()
+
+        for _ in range(self.handler_max_concurrency):
+            Thread(target=self._generate, daemon=True).start()
+
+    def _generate(self):
+        while True:
+            streamer, args, kwargs = self._generation_queue.get()
+            try:
+                self._model.generate(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Error in generation: {e}")
+                streamer.text_queue.put(streamer.stop_signal)
+
+    @Photon.handler
+    def run(self, text: str, max_new_tokens: int = 100) -> StreamingResponse:
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(self._tok, skip_prompt=True, timeout=60)
+        inputs = self._tok(text, return_tensors="pt").to("cuda")
+        self._generation_queue.put_nowait(
+            (
+                streamer,
+                (),
+                dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens),
+            )
+        )
+        return streamer
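
The core of this file is the hand-off between the HTTP handler and the GPU worker: run() enqueues a request and immediately returns the TextIteratorStreamer, while a background thread drives the blocking model.generate() call and the streamer yields decoded text chunk by chunk. Stripped of the Photon wrapper, the underlying transformers idiom looks roughly like the sketch below (a minimal illustration following the transformers documentation pattern; the prompt is a placeholder and CPU generation is assumed for brevity):

# Minimal sketch of streaming generation with TextIteratorStreamer,
# independent of the Photon wrapper above. The prompt is a placeholder;
# CPU generation is assumed for simplicity.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tok = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

streamer = TextIteratorStreamer(tok, skip_prompt=True, timeout=60)
inputs = tok("What does streaming generation mean?", return_tensors="pt")

# generate() blocks until all tokens are produced, so it runs in a
# background thread; the streamer yields decoded text as tokens arrive.
Thread(
    target=model.generate,
    kwargs=dict(inputs, streamer=streamer, max_new_tokens=100),
    daemon=True,
).start()

for chunk in streamer:
    print(chunk, end="", flush=True)

Returning the streamer from a handler typed as StreamingResponse sends these chunks to the client as a chunked HTTP response, so any client that can iterate over response chunks can consume the stream.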

advanced/segment-anything/sam.py

Lines changed: 7 additions & 5 deletions
@@ -229,11 +229,13 @@ def generate_mask(self, url: str) -> PNGResponse:
         # The below rendering code is copied from the segment-anything repo to draw the mask
         # on top of the original image.
         sorted_anns = sorted(masks, key=(lambda x: x["area"]), reverse=True)
-        mask_img = np.ones((
-            sorted_anns[0]["segmentation"].shape[0],
-            sorted_anns[0]["segmentation"].shape[1],
-            3,
-        ))
+        mask_img = np.ones(
+            (
+                sorted_anns[0]["segmentation"].shape[0],
+                sorted_anns[0]["segmentation"].shape[1],
+                3,
+            )
+        )
         for ann in sorted_anns:
             mask_img[ann["segmentation"]] = np.random.random(3)
         alpha = 0.35
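
For context, the loop above assigns each mask a random RGB color in mask_img, and the alpha = 0.35 that follows is used to blend that color layer onto the original image. A minimal numpy sketch of such a blend (variable names and shapes are illustrative assumptions, not code from this repo):

import numpy as np

# Illustrative stand-ins for the photon's actual arrays: a uint8 RGB image
# and a float mask-color layer in [0, 1] with the same height and width.
h, w = 480, 640
image = np.zeros((h, w, 3), dtype=np.uint8)
mask_img = np.random.random((h, w, 3))

alpha = 0.35
# Keep (1 - alpha) of the original pixels and add alpha of the mask colors.
blended = ((1 - alpha) * image + alpha * mask_img * 255).astype(np.uint8)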
