Skip to content

Commit 839676b

Browse files
committed
update gradio
1 parent 27397d9 commit 839676b

File tree

1 file changed

+43
-15
lines changed

1 file changed

+43
-15
lines changed

app.py

Lines changed: 43 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,7 @@
5454
def run_inference(
5555
text_input: str,
5656
audio_prompt_input: Optional[Tuple[int, np.ndarray]],
57+
transcription_input: Optional[str],
5758
max_new_tokens: int,
5859
cfg_scale: float,
5960
temperature: float,
@@ -78,6 +79,10 @@ def run_inference(
7879
prompt_path_for_generate = None
7980
if audio_prompt_input is not None:
8081
sr, audio_data = audio_prompt_input
82+
# Enforce maximum duration of 10 seconds for the audio prompt
83+
duration_sec = len(audio_data) / float(sr) if sr else 0
84+
if duration_sec > 10.0:
85+
raise gr.Error("Audio prompt must be 10 seconds or shorter.")
8186
# Check if audio_data is valid
8287
if audio_data is None or audio_data.size == 0 or audio_data.max() == 0: # Check for silence/empty
8388
gr.Warning("Audio prompt seems empty or silent, ignoring prompt.")
@@ -131,15 +136,23 @@ def run_inference(
131136

132137
# Use torch.inference_mode() context manager for the generation call
133138
with torch.inference_mode():
139+
# Concatenate transcription (if provided) to the main text
140+
combined_text = (
141+
text_input.strip() + "\n" + transcription_input.strip()
142+
if transcription_input and not transcription_input.isspace()
143+
else text_input
144+
)
145+
134146
output_audio_np = model.generate(
135-
text_input,
147+
combined_text,
136148
max_tokens=max_new_tokens,
137149
cfg_scale=cfg_scale,
138150
temperature=temperature,
139151
top_p=top_p,
140152
cfg_filter_top_k=cfg_filter_top_k, # Pass the value here
141153
use_torch_compile=False, # Keep False for Gradio stability
142154
audio_prompt=prompt_path_for_generate,
155+
verbose=True,
143156
)
144157

145158
end_time = time.time()
@@ -241,11 +254,16 @@ def run_inference(
241254
lines=5, # Increased lines
242255
)
243256
audio_prompt_input = gr.Audio(
244-
label="Audio Prompt (Optional)",
257+
label="Audio Prompt (≤ 10 s, Optional)",
245258
show_label=True,
246259
sources=["upload", "microphone"],
247260
type="numpy",
248261
)
262+
transcription_input = gr.Textbox(
263+
label="Audio Prompt Transcription (Optional)",
264+
placeholder="Enter transcription of your audio prompt here...",
265+
lines=3,
266+
)
249267
with gr.Accordion("Generation Parameters", open=False):
250268
max_new_tokens = gr.Slider(
251269
label="Max New Tokens (Audio Length)",
@@ -266,14 +284,14 @@ def run_inference(
266284
temperature = gr.Slider(
267285
label="Temperature (Randomness)",
268286
minimum=1.0,
269-
maximum=1.5,
270-
value=1.3, # Default from inference.py
287+
maximum=2.5,
288+
value=1.8, # Default from inference.py
271289
step=0.05,
272290
info="Lower values make the output more deterministic, higher values increase randomness.",
273291
)
274292
top_p = gr.Slider(
275293
label="Top P (Nucleus Sampling)",
276-
minimum=0.80,
294+
minimum=0.70,
277295
maximum=1.0,
278296
value=0.95, # Default from inference.py
279297
step=0.01,
@@ -282,16 +300,16 @@ def run_inference(
282300
cfg_filter_top_k = gr.Slider(
283301
label="CFG Filter Top K",
284302
minimum=15,
285-
maximum=50,
286-
value=30,
303+
maximum=100,
304+
value=45,
287305
step=1,
288306
info="Top k filter for CFG guidance.",
289307
)
290308
speed_factor_slider = gr.Slider(
291309
label="Speed Factor",
292310
minimum=0.8,
293311
maximum=1.0,
294-
value=0.94,
312+
value=1.0,
295313
step=0.02,
296314
info="Adjusts the speed of the generated audio (1.0 = original speed).",
297315
)
@@ -311,6 +329,7 @@ def run_inference(
311329
inputs=[
312330
text_input,
313331
audio_prompt_input,
332+
transcription_input,
314333
max_new_tokens,
315334
cfg_scale,
316335
temperature,
@@ -330,29 +349,38 @@ def run_inference(
330349
None,
331350
3072,
332351
3.0,
333-
1.3,
352+
1.8,
334353
0.95,
335-
35,
336-
0.94,
354+
45,
355+
1.0,
337356
],
338357
[
339358
"[S1] Open weights text to dialogue model. \n[S2] You get full control over scripts and voices. \n[S1] I'm biased, but I think we clearly won. \n[S2] Hard to disagree. (laughs) \n[S1] Thanks for listening to this demo. \n[S2] Try it now on Git hub and Hugging Face. \n[S1] If you liked our model, please give us a star and share to your friends. \n[S2] This was Nari Labs.",
340359
example_prompt_path if Path(example_prompt_path).exists() else None,
341360
3072,
342361
3.0,
343-
1.3,
362+
1.8,
344363
0.95,
345-
35,
346-
0.94,
364+
45,
365+
1.0,
347366
],
348367
]
349368

350369
if examples_list:
351370
gr.Examples(
352-
examples=examples_list,
371+
examples=[
372+
[
373+
ex[0], # text
374+
ex[1], # audio prompt path
375+
"", # transcription placeholder
376+
*ex[2:],
377+
]
378+
for ex in examples_list
379+
],
353380
inputs=[
354381
text_input,
355382
audio_prompt_input,
383+
transcription_input,
356384
max_new_tokens,
357385
cfg_scale,
358386
temperature,

0 commit comments

Comments
 (0)