Merge pull request #96 from homebrewltd/dev/bach

tikikun · web-flow · commit 66626d8339cb · 2024-10-22T10:56:02.000+07:00
add code for latency testing for qwen2 and cascaded system
diff --git a/latency_testing/cascaded_system.py b/latency_testing/cascaded_system.py
@@ -0,0 +1,73 @@
+"""Benchmark the latency of a cascaded speech pipeline (Whisper ASR -> Llama).
+
+Every file in ./testing_audio/ is transcribed with Whisper and the transcript is
+sent to the LLM as a single user turn. The timer spans transcription, prompt
+construction and the generation of one new token.
+"""
+import logging
+import numpy as np
+import torch
+import os
+import time
+import librosa
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoTokenizer, AutoModelForCausalLM
+
+# Expose only physical GPU 1 to this process; set before any model is placed on "cuda".
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+
+AUDIO_DIR = "./testing_audio/"
+# Sorted so that the discarded warm-up run is the same file on every invocation.
+audio_paths = [f"{AUDIO_DIR}{name}" for name in sorted(os.listdir(AUDIO_DIR))]
+
+whisper_model_path = "openai/whisper-large-v3"
+llama_model_path = "meta-llama/Llama-3.1-8B-Instruct"
+
+whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    whisper_model_path,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
+    use_safetensors=True,
+    device_map="cuda",
+)
+whisper_processor = AutoProcessor.from_pretrained(whisper_model_path)
+whisper_pipe = pipeline(
+    "automatic-speech-recognition",
+    model=whisper_model,
+    tokenizer=whisper_processor.tokenizer,
+    feature_extractor=whisper_processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=30,
+    batch_size=1,
+    return_timestamps=True,
+    torch_dtype=torch.float16,
+    device_map="cuda",
+)
+whisper_model.eval()
+
+llm_tokenizer = AutoTokenizer.from_pretrained(llama_model_path, padding_side="left")
+llm_tokenizer.pad_token = llm_tokenizer.eos_token
+llm_model = AutoModelForCausalLM.from_pretrained(llama_model_path, device_map="cuda", torch_dtype=torch.bfloat16)
+llm_model.eval()
+
+latencies = []
+for audio_path in audio_paths:
+    start_time = time.perf_counter()
+    whisper_output = whisper_pipe(audio_path, generate_kwargs={"language": "en"})["text"].strip()
+    # The transcript itself is the user turn, so the prompt tracks the audio content.
+    messages = [{"role": "user", "content": whisper_output}]
+    # add_generation_prompt=True appends the assistant header, so the single
+    # generated token is the first token of the reply.
+    prompt = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # The rendered template already carries its special tokens; do not add them twice.
+    encoded_batch = llm_tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
+    llm_model.generate(**encoded_batch, max_new_tokens=1, pad_token_id=llm_tokenizer.eos_token_id)
+    # CUDA work is asynchronous; wait for it so the timer covers the real GPU time.
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    latencies.append((end_time - start_time) * 1000)  # seconds -> milliseconds
+
+print(f"Latencies: {latencies}")
+# The first run is treated as warm-up and excluded from the mean.
+if len(latencies) < 2:
+    raise SystemExit(f"Need at least two audio files in {AUDIO_DIR}: the first run is discarded as warm-up.")
+print(f"Mean latency: {np.mean(latencies[1:])} ms")
diff --git a/latency_testing/ichigo.py b/latency_testing/ichigo.py
@@ -0,0 +1,145 @@
+"""Benchmark Ichigo latency: audio -> sound tokens -> first generated token."""
+import time
+import torch
+import torchaudio
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+from whisperspeech.vq_stoks import RQBottleneckTransformer
+import os
+
+
+def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
+    """Build a text-generation pipeline for ``model_path`` on the GPU.
+
+    Args:
+        model_path: Model identifier or path passed to ``from_pretrained``.
+        use_4bit: Load with 4-bit NF4 quantization; takes precedence over ``use_8bit``.
+        use_8bit: Load with 8-bit quantization.
+
+    Returns:
+        A ``transformers`` "text-generation" pipeline. Without quantization the
+        model is loaded in bfloat16.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model_kwargs = {"device_map": "cuda"}
+
+    if use_4bit:
+        model_kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+    elif use_8bit:
+        # BitsAndBytesConfig has no bnb_8bit_* options (compute dtype and double
+        # quantization are 4-bit-only settings), so only the mode flag is passed.
+        model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+    else:
+        model_kwargs["torch_dtype"] = torch.bfloat16
+
+    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
+    return pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+
+def audio_to_sound_tokens(audio_path):
+    """Encode an audio file into the sound-token prompt string.
+
+    Args:
+        audio_path: Path of the audio file; resampled to 16 kHz if it differs.
+
+    Returns:
+        The first entry of the encoder output, each code rendered as
+        ``<|sound_NNNN|>``, wrapped in ``<|sound_start|>`` / ``<|sound_end|>``.
+    """
+    wav, sr = torchaudio.load(audio_path)
+    if sr != 16000:
+        wav = torchaudio.functional.resample(wav, sr, 16000)
+    with torch.no_grad():
+        codes = vqmodel.encode_audio(wav.to(device))
+        codes = codes[0].cpu().tolist()
+
+    result = "".join(f"<|sound_{num:04d}|>" for num in codes)
+    return f"<|sound_start|>{result}<|sound_end|>"
+
+
+def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
+    """Run ``pipe`` on chat ``messages`` and return only the newly generated text.
+
+    Args:
+        pipe: Text-generation pipeline, e.g. the one built by ``setup_pipeline``.
+        messages: Chat messages as a list of ``{"role": ..., "content": ...}`` dicts.
+        max_new_tokens: Maximum number of tokens to generate.
+        temperature: Temperature forwarded to the pipeline.
+        do_sample: Whether the pipeline samples instead of decoding greedily.
+
+    Returns:
+        The ``generated_text`` field of the first pipeline result.
+    """
+    generation_args = {
+        "max_new_tokens": max_new_tokens,
+        "return_full_text": False,
+        "temperature": temperature,
+        "do_sample": do_sample,
+    }
+    output = pipe(messages, **generation_args)
+    return output[0]["generated_text"]
+
+
+def measure_latency(pipe, audio_paths):
+    """Measure the mean encoding and one-token generation latency.
+
+    The first file is treated as a warm-up run and excluded from both means.
+
+    Args:
+        pipe: Text-generation pipeline used to produce one new token per file.
+        audio_paths: Audio files to process; at least two are required.
+
+    Returns:
+        ``(avg_latency, avg_latency_enc)`` in milliseconds: mean time to generate
+        one token, and mean time to turn the audio into sound tokens.
+
+    Raises:
+        ValueError: If fewer than two audio paths are given.
+    """
+    audio_paths = list(audio_paths)
+    if len(audio_paths) < 2:
+        raise ValueError("measure_latency needs at least two audio files: the first run is discarded as warm-up")
+
+    latencies = []
+    latencies_enc = []
+    for audio_path in audio_paths:
+        enc_start = time.perf_counter()
+        sound_tokens = audio_to_sound_tokens(audio_path)
+        latencies_enc.append((time.perf_counter() - enc_start) * 1000)
+
+        gen_start = time.perf_counter()
+        messages = [{"role": "user", "content": sound_tokens}]
+        with torch.no_grad():
+            generate_text(pipe, messages, max_new_tokens=1)
+        latencies.append((time.perf_counter() - gen_start) * 1000)
+
+    print(f"Latencies: {latencies}")
+    steady = latencies[1:]
+    steady_enc = latencies_enc[1:]
+    avg_latency = sum(steady) / len(steady)
+    avg_latency_enc = sum(steady_enc) / len(steady_enc)
+    return avg_latency, avg_latency_enc
+
+
+device = "cuda"
+vqmodel = RQBottleneckTransformer.load_model(
+    "whisper-vq-stoks-v3-7lang-fixed.model"
+).to(device)
+vqmodel.ensure_whisper(device)
+
+llm_path = "homebrewltd/Ichigo-llama3.1-s-instruct-v0.3-phase-3"
+pipe = setup_pipeline(llm_path, use_8bit=False)
+
+AUDIO_DIR = "./testing_audio/"
+# Sorted so that the discarded warm-up run is the same file on every invocation.
+audio_paths = [f"{AUDIO_DIR}{name}" for name in sorted(os.listdir(AUDIO_DIR))]
+
+avg_latency, avg_latency_enc = measure_latency(pipe, audio_paths)
+# Time to first token = audio encoding + generation of one token.
+print(f"Average time to first token: {avg_latency+avg_latency_enc:.2f} ms")
+print(f"Average time to encode audio: {avg_latency_enc:.2f} ms")
diff --git a/latency_testing/qwen2audio.py b/latency_testing/qwen2audio.py
@@ -0,0 +1,49 @@
+"""Benchmark the latency of Qwen2-Audio-7B-Instruct on ./testing_audio/.
+
+For every file the timer spans audio loading, preprocessing, the generation of
+one new token and its decoding.
+"""
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
+from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+import os
+import time
+import torch
+
+# allow only cuda:0
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+AUDIO_DIR = "./testing_audio/"
+# Sorted so that the discarded warm-up run is the same file on every invocation.
+audio_paths = [f"{AUDIO_DIR}{name}" for name in sorted(os.listdir(AUDIO_DIR))]
+
+MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+# NOTE(review): no torch_dtype is passed, so the checkpoint is loaded with the
+# library default - confirm this is the precision intended for the benchmark.
+model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_ID, device_map="cuda:0")
+
+latencies = []
+for audio_path in audio_paths:
+    conversation = [
+        {"role": "user", "content": [{"type": "audio", "audio_url": audio_path}]},
+    ]
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+    start_time = time.perf_counter()
+    # Load at the sampling rate the feature extractor expects.
+    waveform, _ = librosa.load(audio_path, sr=processor.feature_extractor.sampling_rate)
+    inputs = processor(text=text, audios=[waveform], return_tensors="pt", padding=True)
+    # BatchFeature.to moves every tensor it holds, input_ids included.
+    inputs = inputs.to("cuda:0")
+    generate_ids = model.generate(**inputs, max_new_tokens=1)
+    processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+    end_time = time.perf_counter()
+    latencies.append((end_time - start_time) * 1000)  # seconds -> milliseconds
+
+# The first run is dropped because the model needs one pass to warm up.
+if len(latencies) < 2:
+    raise SystemExit(f"Need at least two audio files in {AUDIO_DIR}: the first run is discarded as warm-up.")
+steady_state = latencies[1:]
+print(f"Mean latency: {sum(steady_state) / len(steady_state)} ms")
+