Skip to content

Commit 19cdf89

Browse files
committed
Updates Round 3
1 parent 398335e commit 19cdf89

13 files changed

Lines changed: 127 additions & 44 deletions
Binary file not shown.
Binary file not shown.

_user_history/Surn/history.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"image_path": "G:\\Projects\\audiocraft\\_user_history\\Surn\\images\\4bdae45ae7ca48449ad5b6647eed48ea_tmpk20ym4na.png", "video_path": "G:\\Projects\\audiocraft\\_user_history\\Surn\\videos\\edm_my_vampires_74dff59ee9da4b30babfc049e1fdbbc3.mp4", "audio_path": "G:\\Projects\\audiocraft\\_user_history\\Surn\\audios\\edm_my_vampiresyp05x2i9_d1435bab236e4695a1e26d6bc1b6739f.wav", "document_path": "None", "label": "4/4 120bpm 320kbps 48khz, a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions", "metadata": {"prompt": "4/4 120bpm 320kbps 48khz, a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions", "negative_prompt": "", "Seed": 131541594365878, "steps": 1, "width": "768px", "height": "512px", "Dimension": 2, "Top-k": 280, "Top-p": 1150, "Randomness": 0.7, "cfg": 8.5, "overlap": 1, "Melody Condition": "vampire-mid", "Sample Segment": -1, "Duration": 5, "Audio": "C:\\Users\\CHARLE~1\\AppData\\Local\\Temp\\edm_my_vampiresyp05x2i9.wav", "font": "./assets/arial.ttf", "font_color": "rgba(200, 5.000000000000011, 5.000000000000011, 1)", "harmony_only": "Yes", "background": "C:\\Users\\CHARLE~1\\AppData\\Local\\Temp\\tmpk20ym4na.png", "include_title": true, "include_settings": false, "profile": "<gradio.components.state.State object at 0x00000242E1327400>", "commit": "398335e4525555d286b390e6618b76302bd7c85b", "tag": "v0.0.1-98-g398335e", "version": "https://huggingface.co/spaces/Surn/UnlimitedMusicGen/commit/398335e4525555d286b390e6618b76302bd7c85b", "model_version": "1.2.Surn", "model_name": "facebook/musicgen-stereo-melody-large", "model_description": "2 channels, 32000 Hz", "melody_name": "vampire-mid", "melody_extension": ".mp3", "hostname": "https://huggingface.co/spaces/Surn/UnlimitedMusicGen", "python": "3.10.11 (tags/v3.10.11:7d4cc5a, Apr 5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]", "torch": "2.6.0+cu124", "xformers": "0.0.29.post3", "gradio": "5.23.3", "huggingface_space": "", "CUDA": "CUDA is 
available. device: NVIDIA GeForce RTX 4090 version: 12.4", "datetime": "2025-04-05 02:37:32.476231"}}
976 KB
Loading
976 KB
Loading
Binary file not shown.
Binary file not shown.

app.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import time
1818
import typing as tp
1919
import warnings
20+
from tqdm import tqdm
2021
from audiocraft.models import MusicGen
2122
from audiocraft.data.audio import audio_write
2223
from audiocraft.data.audio_utils import apply_fade, apply_tafade, apply_splice_effect
@@ -48,6 +49,7 @@
4849
os.environ['USE_FLASH_ATTENTION'] = '1'
4950
os.environ['XFORMERS_FORCE_DISABLE_TRITON']= '1'
5051

52+
5153
def interrupt_callback():
5254
return INTERRUPTED
5355

@@ -162,7 +164,7 @@ def load_melody_filepath(melody_filepath, title, assigned_model):
162164

163165
return gr.update(value=melody_name), gr.update(maximum=MAX_PROMPT_INDEX, value=0), gr.update(value=assigned_model, interactive=True)
164166

165-
def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True, harmony_only = False):
167+
def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True, harmony_only = False, profile = gr.OAuthProfile, progress=gr.Progress(track_tqdm=True)):
166168
global MODEL, INTERRUPTED, INTERRUPTING, MOVE_TO_CPU
167169
output_segments = None
168170
melody_name = "Not Used"
@@ -228,14 +230,16 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
228230
cfg_coef=cfg_coef,
229231
duration=segment_duration,
230232
two_step_cfg=False,
233+
extend_stride=10,
231234
rep_penalty=0.5
232235
)
236+
MODEL.set_custom_progress_callback(gr.Progress(track_tqdm=True))
233237

234238
try:
235239
if melody:
236240
# return excess duration, load next model and continue in loop structure building up output_segments
237241
if duration > MODEL.lm.cfg.dataset.segment_duration:
238-
output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration, prompt_index, harmony_only=False)
242+
output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration, prompt_index, harmony_only=False, progress=gr.Progress(track_tqdm=True))
239243
else:
240244
# pure original code
241245
sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
@@ -247,20 +251,20 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
247251
descriptions=[text],
248252
melody_wavs=melody,
249253
melody_sample_rate=sr,
250-
progress=True
254+
progress=True, progress_callback=gr.Progress(track_tqdm=True)
251255
)
252256
# All output_segments are populated, so we can break the loop or set duration to 0
253257
break
254258
else:
255259
#output = MODEL.generate(descriptions=[text], progress=False)
256260
if not output_segments:
257-
next_segment = MODEL.generate(descriptions=[text], progress=True)
261+
next_segment = MODEL.generate(descriptions=[text], progress=True, progress_callback=gr.Progress(track_tqdm=True))
258262
duration -= segment_duration
259263
else:
260264
last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
261-
next_segment = MODEL.generate_continuation(last_chunk, MODEL.sample_rate, descriptions=[text], progress=True)
265+
next_segment = MODEL.generate_continuation(last_chunk, MODEL.sample_rate, descriptions=[text], progress=True, progress_callback=gr.Progress(track_tqdm=True))
262266
duration -= segment_duration - overlap
263-
if next_segment != None:
267+
if next_segment != None:
264268
output_segments.append(next_segment)
265269
except Exception as e:
266270
print(f"Error generating audio: {e}")
@@ -312,7 +316,7 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
312316
return None, None, seed
313317
else:
314318
output = output.detach().cpu().float()[0]
315-
profile: gr.OAuthProfile | None = None
319+
316320
title_file_name = convert_title_to_filename(title)
317321
with NamedTemporaryFile("wb", suffix=".wav", delete=False, prefix = title_file_name) as file:
318322
video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody Condition:{melody_name}\n Sample Segment: {prompt_index}"
@@ -357,7 +361,7 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
357361
"background": background,
358362
"include_title": include_title,
359363
"include_settings": include_settings,
360-
"profile": profile,
364+
"profile": "Satoshi Nakamoto" if profile.value is None else profile.value.username,
361365
"commit": commit_hash(),
362366
"tag": git_tag(),
363367
"version": gr.__version__,
@@ -396,11 +400,11 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
396400

397401
if waveform_video_path:
398402
modules.user_history.save_file(
399-
profile=profile,
403+
profile=profile.value,
400404
image=background,
401-
audio=file,
405+
audio=file.name,
402406
video=waveform_video_path,
403-
label=text,
407+
label=title,
404408
metadata=metadata,
405409
)
406410

@@ -413,9 +417,9 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, tempe
413417
torch.cuda.ipc_collect()
414418
return waveform_video_path, file.name, seed
415419

416-
gr.set_static_paths(paths=["fonts/","assets/"])
420+
gr.set_static_paths(paths=["fonts/","assets/", "images/"])
417421
def ui(**kwargs):
418-
with gr.Blocks(title="UnlimitedMusicGen",css_paths="style_20250331.css", theme='Surn/beeuty') as interface:
422+
with gr.Blocks(title="UnlimitedMusicGen",css_paths="style_20250331.css", theme='Surn/beeuty') as demo:
419423
with gr.Tab("UnlimitedMusicGen"):
420424
gr.Markdown(
421425
"""
@@ -482,12 +486,12 @@ def ui(**kwargs):
482486
with gr.Column() as c:
483487
output = gr.Video(label="Generated Music")
484488
wave_file = gr.File(label=".wav file", elem_id="output_wavefile", interactive=True)
485-
seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
489+
seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
486490

487491
radio.change(toggle_audio_src, radio, [melody_filepath], queue=False, show_progress=False)
488492
melody_filepath.change(load_melody_filepath, inputs=[melody_filepath, title, model], outputs=[title, prompt_index , model], api_name="melody_filepath_change", queue=False)
489493
reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False, api_name="reuse_seed")
490-
submit.click(predict, inputs=[model, text,melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap, prompt_index, include_title, include_settings, harmony_only], outputs=[output, wave_file, seed_used], api_name="submit")
494+
491495
gr.Examples(
492496
examples=[
493497
[
@@ -524,9 +528,24 @@ def ui(**kwargs):
524528
inputs=[text, melody_filepath, model, title],
525529
outputs=[output]
526530
)
527-
gr.HTML(value=versions_html(), visible=True, elem_id="versions")
531+
528532
with gr.Tab("User History") as history_tab:
529533
modules.user_history.render()
534+
user_profile = gr.State(None)
535+
536+
with gr.Row("Versions") as versions_row:
537+
gr.HTML(value=versions_html(), visible=True, elem_id="versions")
538+
539+
submit.click(
540+
modules.user_history.get_profile,
541+
inputs=[],
542+
outputs=[user_profile],
543+
queue=True,
544+
api_name="submit"
545+
).then(
546+
predict,
547+
inputs=[model, text,melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap, prompt_index, include_title, include_settings, harmony_only, user_profile],
548+
outputs=[output, wave_file, seed_used])
530549

531550
# Show the interface
532551
launch_kwargs = {}

audiocraft/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
# flake8: noqa
88
from . import data, modules, models
99

10-
__version__ = '1.4.Surn'
10+
__version__ = '1.2.Surn'

audiocraft/models/musicgen.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import omegaconf
1717
import torch
18+
import gradio as gr
1819

1920
from .encodec import CompressionModel
2021
from .lm import LMModel
@@ -67,7 +68,7 @@ def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
6768
self.device = next(iter(lm.parameters())).device
6869
self.generation_params: dict = {}
6970
self.set_generation_params(duration=self.duration) # 15 seconds by default
70-
self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
71+
self._progress_callback: tp.Union[tp.Callable[[int, int], None], gr.Progress] = None
7172
if self.device.type == 'cpu':
7273
self.autocast = TorchAutocast(enabled=False)
7374
else:
@@ -142,7 +143,7 @@ def get_pretrained(name: str = 'melody-large', device=None):
142143
def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
143144
top_p: float = 0.0, temperature: float = 1.0,
144145
duration: float = 30.0, cfg_coef: float = 3.0,
145-
two_step_cfg: bool = False, extend_stride: float = 18, rep_penalty: float = None):
146+
two_step_cfg: bool = False, extend_stride: float = 10, rep_penalty: float = None):
146147
"""Set the generation parameters for MusicGen.
147148
148149
Args:
@@ -173,12 +174,12 @@ def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
173174
'two_step_cfg': two_step_cfg,
174175
}
175176

176-
def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
177+
def set_custom_progress_callback(self, progress_callback: tp.Union[tp.Callable[[int, int], None],gr.Progress] = None):
177178
"""Override the default progress callback."""
178179
self._progress_callback = progress_callback
179180

180181
def generate_unconditional(self, num_samples: int, progress: bool = False,
181-
return_tokens: bool = False) -> tp.Union[torch.Tensor,
182+
return_tokens: bool = False, progress_callback: gr.Progress = None) -> tp.Union[torch.Tensor,
182183
tp.Tuple[torch.Tensor, torch.Tensor]]:
183184
"""Generate samples in an unconditional manner.
184185
@@ -194,7 +195,7 @@ def generate_unconditional(self, num_samples: int, progress: bool = False,
194195
return self.generate_audio(tokens), tokens
195196
return self.generate_audio(tokens)
196197

197-
def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False) \
198+
def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False, progress_callback: gr.Progress = None) \
198199
-> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
199200
"""Generate samples conditioned on text.
200201
@@ -212,7 +213,7 @@ def generate(self, descriptions: tp.List[str], progress: bool = False, return_to
212213

213214
def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
214215
melody_sample_rate: int, progress: bool = False,
215-
return_tokens: bool = False) -> tp.Union[torch.Tensor,
216+
return_tokens: bool = False, progress_callback=gr.Progress(track_tqdm=True)) -> tp.Union[torch.Tensor,
216217
tp.Tuple[torch.Tensor, torch.Tensor]]:
217218
"""Generate samples conditioned on text and melody.
218219
@@ -250,7 +251,7 @@ def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyTy
250251
return self.generate_audio(tokens)
251252

252253
def generate_with_all(self, descriptions: tp.List[str], melody_wavs: MelodyType,
253-
sample_rate: int, progress: bool = False, prompt: tp.Optional[torch.Tensor] = None, return_tokens: bool = False) \
254+
sample_rate: int, progress: bool = False, prompt: tp.Optional[torch.Tensor] = None, return_tokens: bool = False, progress_callback: gr.Progress = None) \
254255
-> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
255256
"""Generate samples conditioned on text and melody and audio prompts.
256257
Args:
@@ -307,7 +308,7 @@ def generate_with_all(self, descriptions: tp.List[str], melody_wavs: MelodyType,
307308

308309
def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
309310
descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
310-
progress: bool = False, return_tokens: bool = False) \
311+
progress: bool = False, return_tokens: bool = False, progress_callback: gr.Progress = None) \
311312
-> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
312313
"""Generate samples conditioned on audio prompts.
313314
@@ -317,7 +318,8 @@ def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
317318
prompt_sample_rate (int): Sampling rate of the given audio waveforms.
318319
descriptions (list of str, optional): A list of strings used as text conditioning. Defaults to None.
319320
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
320-
return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
321+
return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
322+
NOTE: this is a hack; it does not follow the progression of the conditioning melody or of previously generated audio.
321323
"""
322324
if prompt.dim() == 2:
323325
prompt = prompt[None]
@@ -338,7 +340,8 @@ def _prepare_tokens_and_attributes(
338340
self,
339341
descriptions: tp.Sequence[tp.Optional[str]],
340342
prompt: tp.Optional[torch.Tensor],
341-
melody_wavs: tp.Optional[MelodyList] = None,
343+
melody_wavs: tp.Optional[MelodyList] = None,
344+
progress_callback: tp.Optional[gr.Progress] = None
342345
) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
343346
"""Prepare model inputs.
344347
@@ -392,7 +395,7 @@ def _prepare_tokens_and_attributes(
392395
return attributes, prompt_tokens
393396

394397
def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
395-
prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
398+
prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False, progress_callback: gr.Progress = None) -> torch.Tensor:
396399
"""Generate discrete audio tokens given audio prompt and/or conditions.
397400
398401
Args:
@@ -411,17 +414,19 @@ def _progress_callback(generated_tokens: int, tokens_to_generate: int):
411414
if self._progress_callback is not None:
412415
# Note that total_gen_len might be quite wrong depending on the
413416
# codebook pattern used, but with delay it is almost accurate.
414-
self._progress_callback(generated_tokens, total_gen_len)
415-
else:
417+
self._progress_callback((generated_tokens / total_gen_len), f"Generated {generated_tokens}/{total_gen_len} tokens")
418+
if progress_callback is not None:
419+
# Update Gradio progress bar
420+
progress_callback((generated_tokens / total_gen_len), f"Generated {generated_tokens}/{total_gen_len} tokens")
421+
if progress:
416422
print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
417423

418424
if prompt_tokens is not None:
419425
assert max_prompt_len >= prompt_tokens.shape[-1], \
420426
"Prompt is longer than audio to generate"
421427

422-
callback = None
423-
if progress:
424-
callback = _progress_callback
428+
# callback = None
429+
callback = _progress_callback
425430

426431
if self.duration <= self.max_duration:
427432
# generate by sampling from LM, simple case.
@@ -481,7 +486,7 @@ def _progress_callback(generated_tokens: int, tokens_to_generate: int):
481486

482487
# generate audio
483488

484-
def generate_audio(self, gen_tokens: torch.Tensor):
489+
def generate_audio(self, gen_tokens: torch.Tensor):
485490
try:
486491
"""Generate Audio from tokens"""
487492
assert gen_tokens.dim() == 3

0 commit comments

Comments
 (0)