Skip to content

Commit 66626d8

Browse files
authored
Merge pull request #96 from homebrewltd/dev/bach
add code for latency testing for qwen2 and cascaded system
2 parents 84443c0 + 06f1838 commit 66626d8

3 files changed

Lines changed: 189 additions & 0 deletions

File tree

latency_testing/cascaded_system.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import logging
2+
import numpy as np
3+
import torch
4+
import os
5+
import time
6+
import librosa
7+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoTokenizer, AutoModelForCausalLM
8+
# Restrict this process to the second physical GPU; inside the process it is
# then addressed simply as "cuda".
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Benchmark inputs: every directory entry under ./testing_audio/, in the order
# os.listdir returns them.
AUDIO_DIR = "./testing_audio/"
wav_files = os.listdir(AUDIO_DIR)
audio_paths = [AUDIO_DIR + name for name in wav_files]

whisper_model_path = "openai/whisper-large-v3"
llama_model_path = "meta-llama/Llama-3.1-8B-Instruct"

# Stage 1 of the cascade: Whisper speech recognition, loaded in float16.
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    device_map="cuda",
)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_path)
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
whisper_model.eval()

# Stage 2 of the cascade: the Llama instruct model, loaded in bfloat16.
# The EOS token doubles as the padding token.
llm_tokenizer = AutoTokenizer.from_pretrained(llama_model_path, padding_side="left")
llm_tokenizer.pad_token = llm_tokenizer.eos_token
llm_model = AutoModelForCausalLM.from_pretrained(
    llama_model_path,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
llm_model.eval()
33+
# Time-to-first-token benchmark for the cascaded system: each measurement
# covers Whisper transcription of one audio file plus generation of a single
# LLM token from that transcript. Values are stored in milliseconds.
latencies = []
for audio_path in audio_paths:
    start_time = time.perf_counter()
    whisper_output = whisper_pipe(audio_path, generate_kwargs={"language": "en"})["text"].strip()
    # print(whisper_output)
    # Use the transcript itself as the user message. The previous plain
    # (non-f) string literal sent the text "{whisper_output}" to the LLM
    # verbatim, so the transcript never reached the model.
    querry = whisper_output
    messages = [
        {"role": "user", "content": querry},
    ]
    # add_generation_prompt=True appends the assistant header so the single
    # generated token is the first token of the reply.
    sample_templated = llm_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already carries its special tokens; do not let
    # the tokenizer prepend a second BOS.
    encoded_batch = llm_tokenizer(
        sample_templated, return_tensors="pt", add_special_tokens=False
    ).to("cuda")
    generated_ids = llm_model.generate(
        **encoded_batch, max_new_tokens=1, pad_token_id=llm_tokenizer.eos_token_id
    )
    end_time = time.perf_counter()
    latencies.append((end_time - start_time) * 1000)
print(f"Latencies: {latencies}")
# The first measurement is discarded before averaging, so at least two
# measurements are required.
if len(latencies) < 2:
    raise ValueError(
        "Need at least two audio files in ./testing_audio/: "
        "the first measurement is discarded before averaging."
    )
latencies.pop(0)
avg_latency = sum(latencies) / len(latencies)
print(f"Mean latency: {np.mean(latencies)} ms")

latency_testing/ichigo.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import time
2+
import torch
3+
import torchaudio
4+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
5+
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
6+
from whisperspeech.vq_stoks import RQBottleneckTransformer
7+
import os
8+
def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
    """Build a ``text-generation`` pipeline for the causal LM at ``model_path``.

    The model is loaded with ``device_map="cuda"``.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``
            for both the tokenizer and the model.
        use_4bit: Load with 4-bit NF4 bitsandbytes quantization (bfloat16
            compute, double quantization). Takes precedence over
            ``use_8bit`` when both are set.
        use_8bit: Load with 8-bit bitsandbytes quantization.

    Returns:
        The pipeline object wrapping the loaded model and tokenizer.
        When neither flag is set the model is loaded in bfloat16.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model_kwargs = {"device_map": "cuda"}

    if use_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    elif use_8bit:
        # BitsAndBytesConfig only defines bnb_4bit_* tuning options; the
        # former bnb_8bit_compute_dtype / bnb_8bit_use_double_quant arguments
        # were not recognised, so they are dropped rather than left in as
        # misleading no-ops.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
        )
    else:
        model_kwargs["torch_dtype"] = torch.bfloat16

    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

    return pipeline("text-generation", model=model, tokenizer=tokenizer)
32+
device = "cuda"
llm_path = "homebrewltd/Ichigo-llama3.1-s-instruct-v0.3-phase-3"

# Audio quantizer that turns waveforms into discrete sound-token ids.
vqmodel = RQBottleneckTransformer.load_model("whisper-vq-stoks-v3-7lang-fixed.model")
vqmodel = vqmodel.to(device)
vqmodel.ensure_whisper(device)

# Module-level tokenizer for the same checkpoint; setup_pipeline loads its
# own tokenizer for the pipeline it returns.
tokenizer = AutoTokenizer.from_pretrained(llm_path)

# Unquantized pipeline (neither the 4-bit nor the 8-bit flag is set).
pipe = setup_pipeline(llm_path, use_8bit=False)
40+
def audio_to_sound_tokens(audio_path):
    """Encode an audio file as a string of ``<|sound_NNNN|>`` tokens.

    The file is loaded with torchaudio, resampled to 16 kHz when its sample
    rate differs, and encoded by the module-level ``vqmodel`` on the GPU.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        ``<|sound_start|>`` followed by one zero-padded ``<|sound_NNNN|>``
        token per code and a closing ``<|sound_end|>``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    with torch.no_grad():
        encoded = vqmodel.encode_audio(waveform.to("cuda"))

    # Only the first entry of the encoder output is used
    # (presumably the single batch item - confirm against encode_audio).
    token_ids = encoded[0].cpu().tolist()

    pieces = [f"<|sound_{token_id:04d}|>" for token_id in token_ids]
    return "<|sound_start|>" + "".join(pieces) + "<|sound_end|>"
50+
# Benchmark inputs: every directory entry under ./testing_audio/, in the order
# os.listdir returns them.
wav_files = os.listdir("./testing_audio/")
audio_paths = ["./testing_audio/" + name for name in wav_files]
52+
def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
    """Call ``pipe`` on ``messages`` and return the first generated text.

    Args:
        pipe: Callable invoked as ``pipe(messages, **generation_args)``.
        messages: Passed through to ``pipe`` unchanged.
        max_new_tokens: Forwarded as the ``max_new_tokens`` argument.
        temperature: Forwarded as the ``temperature`` argument.
        do_sample: Forwarded as the ``do_sample`` argument.

    Returns:
        The ``generated_text`` field of the first item returned by ``pipe``.
        ``return_full_text=False`` is always passed along.
    """
    results = pipe(
        messages,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        temperature=temperature,
        do_sample=do_sample,
    )
    first_result = results[0]
    return first_result["generated_text"]
62+
63+
# Usage
64+
def measure_latency(pipe, audio_paths):
    """Measure average audio-encoding and first-token generation latency.

    For every path the audio is converted to sound tokens (timed
    separately), then a single token is generated from those tokens (timed
    separately). The first measurement of each series is discarded before
    averaging.

    Args:
        pipe: Text-generation pipeline handed to ``generate_text``.
        audio_paths: Audio file paths to benchmark; at least two are
            required because the first measurement is dropped.

    Returns:
        ``(avg_latency, avg_latency_enc)``: mean generation latency and mean
        encoding latency, both in milliseconds.

    Raises:
        ValueError: If fewer than two audio paths are supplied.
    """
    audio_paths = list(audio_paths)
    # Validate before doing any expensive work: with 0 or 1 inputs nothing
    # would remain to average once the first run is discarded.
    if len(audio_paths) < 2:
        raise ValueError(
            "measure_latency needs at least two audio paths: "
            "the first measurement is discarded before averaging."
        )

    latencies = []
    latencies_enc = []
    for audio_path in audio_paths:
        # Audio -> sound tokens.
        start_time_2 = time.perf_counter()
        sound_tokens = audio_to_sound_tokens(audio_path)
        end_time_2 = time.perf_counter()
        latency_enc = (end_time_2 - start_time_2) * 1000
        latencies_enc.append(latency_enc)

        # Sound tokens -> first generated token.
        start_time = time.perf_counter()
        messages = [
            {"role": "user", "content": sound_tokens},
        ]
        with torch.no_grad():
            generate_text(pipe, messages, max_new_tokens=1)
        end_time = time.perf_counter()

        latency = (end_time - start_time) * 1000  # Convert to milliseconds
        latencies.append(latency)

    print(f"Latencies: {latencies}")
    # Discard the first measurement of each series before averaging.
    latencies.pop(0)
    latencies_enc.pop(0)
    avg_latency_enc = sum(latencies_enc) / len(latencies_enc)
    avg_latency = sum(latencies) / len(latencies)
    return avg_latency, avg_latency_enc
90+
91+
# Run the benchmark and report. Time to first token is the sum of the average
# encoding latency and the average generation latency.
avg_latency, avg_latency_enc = measure_latency(pipe, audio_paths)
time_to_first_token = avg_latency + avg_latency_enc
print(f"Average time to first token: {time_to_first_token:.2f} ms")
print(f"Average time to encode audio: {avg_latency_enc:.2f} ms")

latency_testing/qwen2audio.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from io import BytesIO
2+
from urllib.request import urlopen
3+
import librosa
4+
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
5+
import os
6+
import time
7+
import torch
8+
# Expose only the first physical GPU to this process.
# NOTE(review): this is set after `import torch`; presumably fine as long as
# CUDA has not been initialised yet - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Benchmark inputs: every directory entry under ./testing_audio/, in the order
# os.listdir returns them.
AUDIO_DIR = "./testing_audio/"
wav_files = os.listdir(AUDIO_DIR)
audio_paths = [AUDIO_DIR + name for name in wav_files]

QWEN_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(QWEN_MODEL_ID)
# NOTE(review): no torch_dtype is passed here, while the other latency scripts
# load their models in fp16/bf16 - confirm the dtypes are meant to differ.
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    QWEN_MODEL_ID,
    device_map="cuda:0",
)
14+
# Time-to-first-token benchmark for Qwen2-Audio: each measurement covers input
# preprocessing, transfer to the GPU, generation of one token and decoding.
# Values are stored in milliseconds.
latencies = []
for audio_path in audio_paths:
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": f"{audio_path}"},
        ]},
    ]
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    # print(text)
    # Load every audio clip referenced by the conversation at the sampling
    # rate the feature extractor expects.
    # NOTE(review): audio loading happens before the timer starts, unlike the
    # other latency scripts, which time it - confirm this is intended.
    audios = []
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(librosa.load(
                        ele["audio_url"],
                        sr=processor.feature_extractor.sampling_rate)[0]
                    )
    start_time = time.perf_counter()
    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    # A single .to() moves every tensor in the batch. The former separate
    # `inputs.input_ids = ...` assignment only set an attribute and added an
    # extra copy inside the timed region.
    inputs = inputs.to("cuda:0")

    generate_ids = model.generate(**inputs, max_new_tokens=1)
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    end_time = time.perf_counter()
    latencies.append((end_time - start_time) * 1000)
# print(f"Latencies: {latencies}")
# The first run only warms the model up and is discarded, so at least two
# measurements are required for a meaningful average.
if len(latencies) < 2:
    raise ValueError(
        "Need at least two audio files in ./testing_audio/: "
        "the first (warm-up) measurement is discarded before averaging."
    )
latencies.pop(0)
latency = sum(latencies) / len(latencies)
# The reported value is the mean of the remaining runs, in milliseconds.
print(f"Mean latency: {latency} ms")
46+

0 commit comments

Comments
 (0)