Removed the confusing and inconsistent parameters primary_stem_output_path / secondary_stem_output_path, and improved the logic that clears stem output paths between separation runs, fixing an issue where output files were overwritten

beveradb · beveradb · commit 3c23878fafff · 2024-03-14T22:13:21.000-05:00
diff --git a/README.md b/README.md
@@ -212,10 +212,9 @@ separator = Separator()
 separator.load_model()
 
 # Perform the separation on specific audio files without reloading the model
-primary_stem_output_path, secondary_stem_output_path = separator.separate('audio1.wav')
+output_files = separator.separate('audio1.wav')
 
-print(f'Primary stem saved at {primary_stem_output_path}')
-print(f'Secondary stem saved at {secondary_stem_output_path}')
+print(f"Separation complete! Output file(s): {' '.join(output_files)}")
 ```
 
 #### Batch processing, or processing with multiple models
@@ -253,8 +252,6 @@ output_file_paths_6 = separator.separate('audio3.wav')
 - log_formatter: (Optional) The log format. Default: None, which falls back to '%(asctime)s - %(levelname)s - %(module)s - %(message)s'
 - model_file_dir: (Optional) Directory to cache model files in. Default: /tmp/audio-separator-models/
 - output_dir: (Optional) Directory where the separated files will be saved. If not specified, uses the current directory.
-- primary_stem_output_path: (Optional) The path for saving the primary stem. Default: None
-- secondary_stem_output_path: (Optional) The path for saving the secondary stem. Default: None
 - output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV
 - normalization_threshold: (Optional) The threshold for audio normalization. Default: 0.9
 - output_single_stem: (Optional) Output only a single stem, either 'instrumental' or 'vocals'. Default: None
diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py
@@ -102,8 +102,6 @@ def __init__(self, common_config, arch_config):
         self.secondary_source = None
         self.audio_file_path = None
         self.audio_file_base = None
-        self.secondary_source_map = None
-        self.primary_source_map = None
 
     def load_model(self):
         """
@@ -138,14 +136,11 @@ def separate(self, audio_file_path):
         Returns:
             list: A list of paths to the output files generated by the separation process.
         """
-        self.primary_source = None
-        self.secondary_source = None
-
         self.audio_file_path = audio_file_path
         self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
         # Prepare the mix for processing
-        self.logger.debug("Preparing mix...")
+        self.logger.debug(f"Preparing mix for input audio file {self.audio_file_path}...")
         mix = self.prepare_mix(self.audio_file_path)
 
         self.logger.debug("Normalizing mix before demixing...")
@@ -180,20 +175,20 @@ def separate(self, audio_file_path):
 
         # Save and process the secondary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
-            self.logger.info(f"Saving {self.secondary_stem_name} stem...")
-            if not self.secondary_stem_output_path:
-                self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
-            self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
+            self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+
+            self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
+            self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
             output_files.append(self.secondary_stem_output_path)
 
         # Save and process the primary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
-            self.logger.info(f"Saving {self.primary_stem_name} stem...")
-            if not self.primary_stem_output_path:
-                self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+            self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
             if not isinstance(self.primary_source, np.ndarray):
                 self.primary_source = source.T
-            self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+
+            self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
+            self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
             output_files.append(self.primary_stem_output_path)
 
         # Not yet implemented from UVR features:
diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py
@@ -53,8 +53,6 @@ def __init__(self, common_config, arch_config):
         self.secondary_source = None
         self.audio_file_path = None
         self.audio_file_base = None
-        self.primary_source_map = None
-        self.secondary_source_map = None
 
         self.logger.info("MDXC Separator initialisation complete")
 
@@ -94,7 +92,7 @@ def separate(self, audio_file_path):
         self.audio_file_path = audio_file_path
         self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
-        self.logger.debug("Preparing mix...")
+        self.logger.debug(f"Preparing mix for input audio file {self.audio_file_path}...")
         mix = self.prepare_mix(self.audio_file_path)
 
         self.logger.debug("Normalizing mix before demixing...")
@@ -115,19 +113,20 @@ def separate(self, audio_file_path):
             self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold).T
 
         if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
-            self.logger.info(f"Saving {self.secondary_stem_name} stem...")
-            if not self.secondary_stem_output_path:
-                self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
-            self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
+            self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+
+            self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
+            self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
             output_files.append(self.secondary_stem_output_path)
 
         if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
-            self.logger.info(f"Saving {self.primary_stem_name} stem...")
-            if not self.primary_stem_output_path:
-                self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+            self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+            
             if not isinstance(self.primary_source, np.ndarray):
                 self.primary_source = source.T
-            self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+
+            self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
+            self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
             output_files.append(self.primary_stem_output_path)
         return output_files
 
diff --git a/audio_separator/separator/architectures/vr_separator.py b/audio_separator/separator/architectures/vr_separator.py
@@ -127,7 +127,7 @@ def separate(self, audio_file_path):
         self.audio_file_path = audio_file_path
         self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
-        self.logger.debug("Starting inference...")
+        self.logger.debug(f"Starting separation for input audio file {self.audio_file_path}...")
 
         nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
         vr_5_1_models = [56817, 218409]
@@ -167,27 +167,22 @@ def separate(self, audio_file_path):
 
         # Save and process the primary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
-            self.logger.info(f"Saving {self.primary_stem_name} stem...")
-            if not self.primary_stem_output_path:
-                self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
-
+            self.logger.debug(f"Processing primary stem: {self.primary_stem_name}")
             if not isinstance(self.primary_source, np.ndarray):
                 self.primary_source = self.spec_to_wav(y_spec).T
                 self.logger.debug("Converting primary source spectrogram to waveform.")
                 if not self.model_samplerate == 44100:
                     self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                     self.logger.debug("Resampling primary source to 44100Hz.")
 
-            self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
-            self.logger.debug("Primary stem processed.")
+            self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+
+            self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
+            self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
             output_files.append(self.primary_stem_output_path)
 
         # Save and process the secondary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
-            self.logger.info(f"Saving {self.secondary_stem_name} stem...")
-            if not self.secondary_stem_output_path:
-                self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
-
             self.logger.debug(f"Processing secondary stem: {self.secondary_stem_name}")
             if not isinstance(self.secondary_source, np.ndarray):
                 self.secondary_source = self.spec_to_wav(v_spec).T
@@ -196,8 +191,10 @@ def separate(self, audio_file_path):
                     self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                     self.logger.debug("Resampling secondary source to 44100Hz.")
 
-            self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
-            self.logger.debug("Secondary stem processed.")
+            self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+
+            self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
+            self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
             output_files.append(self.secondary_stem_output_path)
 
         # Not yet implemented from UVR features:
diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py
@@ -65,12 +65,6 @@ def __init__(self, config):
         self.model_path = config.get("model_path")
         self.model_data = config.get("model_data")
 
-        # Optional custom output paths for the primary and secondary stems
-        # If left as None, the arch-specific class decides the output filename, e.g. something like:
-        # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}"
-        self.primary_stem_output_path = config.get("primary_stem_output_path")
-        self.secondary_stem_output_path = config.get("secondary_stem_output_path")
-
         # Output directory and format
         self.output_dir = config.get("output_dir")
         self.output_format = config.get("output_format")
@@ -90,7 +84,6 @@ def __init__(self, config):
         self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0)
 
         self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}")
-        self.logger.debug(f"Common params: primary_stem_output_path={self.primary_stem_output_path}, secondary_stem_output_path={self.secondary_stem_output_path}")
         self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}")
         self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}")
         self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}")
@@ -99,6 +92,16 @@ def __init__(self, config):
         self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}")
         self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}")
 
+        # File-specific variables which need to be cleared between processing different audio inputs
+        self.audio_file_path = None
+        self.audio_file_base = None
+
+        self.primary_source = None
+        self.secondary_source = None
+
+        self.primary_stem_output_path = None
+        self.secondary_stem_output_path = None
+
         self.cached_sources_map = {}
 
     def separate(self, audio_file_path):
@@ -266,3 +269,18 @@ def clear_gpu_cache(self):
         if self.torch_device == torch.device("cuda"):
             self.logger.debug("Clearing CUDA cache...")
             torch.cuda.empty_cache()
+
+    def clear_file_specific_paths(self):
+        """
+        Clears the file-specific variables which need to be cleared between processing different audio inputs.
+        """
+        self.logger.info("Clearing input audio file paths, sources and stems...")
+
+        self.audio_file_path = None
+        self.audio_file_base = None
+
+        self.primary_source = None
+        self.secondary_source = None
+
+        self.primary_stem_output_path = None
+        self.secondary_stem_output_path = None
diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
@@ -33,8 +33,6 @@ class Separator:
         log_formatter (logging.Formatter): The logging formatter.
         model_file_dir (str): The directory where model files are stored.
         output_dir (str): The directory where output files will be saved.
-        primary_stem_output_path (str): The path for saving the primary stem.
-        secondary_stem_output_path (str): The path for saving the secondary stem.
         output_format (str): The format of the output audio file.
         normalization_threshold (float): The threshold for audio normalization.
         output_single_stem (str): Option to output a single stem.
@@ -63,12 +61,10 @@ class Separator:
 
     def __init__(
         self,
-        log_level=logging.DEBUG,
+        log_level=logging.INFO,
         log_formatter=None,
         model_file_dir="/tmp/audio-separator-models/",
         output_dir=None,
-        primary_stem_output_path=None,
-        secondary_stem_output_path=None,
         output_format="WAV",
         normalization_threshold=0.9,
         output_single_stem=None,
@@ -105,12 +101,6 @@ def __init__(
         self.model_file_dir = model_file_dir
         self.output_dir = output_dir
 
-        # Allow the user to specify the output paths for the primary and secondary stems
-        # If left as None, the arch-specific class decides the output filename, typically e.g. something like:
-        # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}"
-        self.primary_stem_output_path = primary_stem_output_path
-        self.secondary_stem_output_path = secondary_stem_output_path
-
         # Create the model directory if it does not exist
         os.makedirs(self.model_file_dir, exist_ok=True)
 
@@ -141,10 +131,6 @@ def __init__(
 
         self.onnx_execution_provider = None
         self.model_instance = None
-        self.audio_file_path = None
-        self.audio_file_base = None
-        self.primary_source = None
-        self.secondary_source = None
 
         self.setup_accelerated_inferencing_device()
 
@@ -598,8 +584,6 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"):
             "model_name": model_name,
             "model_path": model_path,
             "model_data": model_data,
-            "primary_stem_output_path": self.primary_stem_output_path,
-            "secondary_stem_output_path": self.secondary_stem_output_path,
             "output_format": self.output_format,
             "output_dir": self.output_dir,
             "normalization_threshold": self.normalization_threshold,
@@ -652,17 +636,8 @@ def separate(self, audio_file_path):
         # Clear GPU cache to free up memory
         self.model_instance.clear_gpu_cache()
 
-        # Unset the audio file to prevent accidental re-separation of the same file
-        self.logger.debug("Clearing audio file...")
-        self.audio_file_path = None
-        self.audio_file_base = None
-
         # Unset more separation params to prevent accidentally re-using the wrong source files or output paths
-        self.logger.debug("Clearing sources and stems...")
-        self.primary_source = None
-        self.secondary_source = None
-        self.primary_stem_output_path = None
-        self.secondary_stem_output_path = None
+        self.model_instance.clear_file_specific_paths()
 
         # Log the completion of the separation process
         self.logger.debug("Separation process completed.")
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "audio-separator"
-version = "0.16.0"
+version = "0.16.1"
 description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
diff --git a/tests/TODO.txt b/tests/TODO.txt
@@ -6,4 +6,5 @@
 - Test processing file with multiple different models outputs separate expected files
 - Test each of the architecure specific parameters works as expected in both CLI and class mode
 - Generate oscillogram and spectrogram of model output for a short test file for each major supported model and compare to expected output to ensure separation is actually separating stems
-- Add a few different test files with different properties, e.g. background noise, stems present, or genre of music and ensure separation works as expected for each
+- Add a few different test files with different properties, e.g. background noise, stems present, or genre of music and ensure separation works as expected for each
+- Test that processing more than one distinct input file in sequence outputs separate files (not overwriting the first output with the second)