1+ import time
2+ import torch
3+ import torchaudio
4+ from transformers import AutoModelForCausalLM , AutoTokenizer , BitsAndBytesConfig , pipeline
5+ from transformers import Qwen2AudioForConditionalGeneration , AutoProcessor
6+ from whisperspeech .vq_stoks import RQBottleneckTransformer
7+ import os
def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
    """Build a Hugging Face text-generation pipeline for ``model_path``.

    The tokenizer and the causal LM are both loaded from ``model_path`` and
    the model is placed on the GPU (``device_map="cuda"``).

    Args:
        model_path: Model identifier or local path given to ``from_pretrained``.
        use_4bit: Load the weights 4-bit quantized (NF4, double quantization,
            bfloat16 compute). Takes precedence over ``use_8bit``.
        use_8bit: Load the weights 8-bit quantized. Ignored when ``use_4bit``
            is also set.

    Returns:
        A ``text-generation`` pipeline wrapping the loaded model and tokenizer.
        Without quantization the model is loaded in bfloat16.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model_kwargs = {"device_map": "cuda"}

    if use_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    elif use_8bit:
        # Compute dtype and double quantization are 4-bit-only settings;
        # BitsAndBytesConfig has no ``bnb_8bit_*`` options, so only the
        # 8-bit switch itself is passed.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
        )
    else:
        model_kwargs["torch_dtype"] = torch.bfloat16

    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

    return pipeline("text-generation", model=model, tokenizer=tokenizer)
# Everything below runs on the GPU.
device = "cuda"

# Audio quantizer; audio_to_sound_tokens() calls its encode_audio().
vqmodel = RQBottleneckTransformer.load_model("whisper-vq-stoks-v3-7lang-fixed.model")
vqmodel = vqmodel.to(device)
vqmodel.ensure_whisper(device)

# One model id shared by the standalone tokenizer and the generation pipeline.
llm_path = "homebrewltd/Ichigo-llama3.1-s-instruct-v0.3-phase-3"
tokenizer = AutoTokenizer.from_pretrained(llm_path)
pipe = setup_pipeline(llm_path, use_8bit=False)
def audio_to_sound_tokens(audio_path):
    """Encode an audio file into the sound-token prompt string for the LLM.

    The audio is loaded with torchaudio, resampled to 16 kHz when needed and
    quantized with the module-level ``vqmodel``. Each resulting code becomes a
    zero-padded ``<|sound_NNNN|>`` token.

    Args:
        audio_path: Path of the audio file to encode.

    Returns:
        The concatenated sound tokens wrapped in ``<|sound_start|>`` and
        ``<|sound_end|>``.
    """
    target_sr = 16000
    wav, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)

    with torch.no_grad():
        # Use the module-level ``device`` (the one ``vqmodel`` was moved to)
        # instead of a second hard-coded "cuda".
        codes = vqmodel.encode_audio(wav.to(device))
        # Only the first row of codes is used for the prompt.
        codes = codes[0].cpu().tolist()

    result = "".join(f"<|sound_{num:04d}|>" for num in codes)
    return f"<|sound_start|>{result}<|sound_end|>"
# Benchmark inputs: every entry of ./testing_audio/. os.listdir() returns
# entries in arbitrary order, so sort them to keep the run order (and which
# file ends up as the discarded warm-up sample) reproducible.
wav_files = sorted(os.listdir("./testing_audio/"))
audio_paths = [f"./testing_audio/{name}" for name in wav_files]
def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
    """Run ``pipe`` on ``messages`` and return only the newly generated text.

    Args:
        pipe: Callable text-generation pipeline; invoked as
            ``pipe(messages, **generation_args)``.
        messages: Chat messages (list of ``{"role", "content"}`` dicts).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; forwarded only when ``do_sample``
            is true.
        do_sample: Whether to sample instead of decoding greedily.

    Returns:
        The ``generated_text`` field of the first result returned by ``pipe``.
    """
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "do_sample": do_sample,
    }
    if do_sample:
        # Temperature only matters when sampling; sending it alongside
        # do_sample=False has no effect on the output and makes the
        # generation config complain about an unused flag.
        generation_args["temperature"] = temperature

    output = pipe(messages, **generation_args)
    return output[0]["generated_text"]
62+
# Usage: benchmark audio-encoding and first-token latency over the test clips.
def measure_latency(pipe, audio_paths):
    """Benchmark audio-encoding and first-token generation latency.

    For every path, times ``audio_to_sound_tokens`` and then a one-token
    generation (``max_new_tokens=1``) on the resulting prompt. The first
    sample is treated as warm-up and left out of both averages.

    Args:
        pipe: Text-generation pipeline forwarded to ``generate_text``.
        audio_paths: Iterable of audio file paths. At least two are needed
            because the first measurement is discarded.

    Returns:
        ``(avg_latency, avg_latency_enc)`` in milliseconds: the mean
        generation latency and the mean audio-encoding latency.

    Raises:
        ValueError: If fewer than two audio paths are supplied.
    """
    audio_paths = list(audio_paths)
    # Fail before doing any expensive work: with fewer than two samples
    # nothing is left to average once the warm-up run is dropped.
    if len(audio_paths) < 2:
        raise ValueError(
            "measure_latency needs at least 2 audio paths "
            f"(the first run is discarded as warm-up), got {len(audio_paths)}"
        )

    latencies = []
    latencies_enc = []
    for audio_path in audio_paths:
        enc_start = time.perf_counter()
        sound_tokens = audio_to_sound_tokens(audio_path)
        latencies_enc.append((time.perf_counter() - enc_start) * 1000)  # ms

        gen_start = time.perf_counter()
        messages = [
            {"role": "user", "content": sound_tokens},
        ]
        with torch.no_grad():
            # Only the elapsed time matters; the generated token is discarded.
            generate_text(pipe, messages, max_new_tokens=1)
        latencies.append((time.perf_counter() - gen_start) * 1000)  # ms

    print(f"Latencies: {latencies}")

    # Drop the warm-up sample from both series before averaging.
    measured = latencies[1:]
    measured_enc = latencies_enc[1:]
    avg_latency_enc = sum(measured_enc) / len(measured_enc)
    avg_latency = sum(measured) / len(measured)
    return avg_latency, avg_latency_enc
90+
# Run the benchmark and report the averages. Time to first token is the
# encoding time plus the one-token generation time.
avg_latency, avg_latency_enc = measure_latency(pipe, audio_paths)
ttft_ms = avg_latency + avg_latency_enc
print(f"Average time to first token: {ttft_ms:.2f} ms")
print(f"Average time to encode audio: {avg_latency_enc:.2f} ms")
0 commit comments