0.3 release

My Name · My Name · commit 9d34f8a514d1 · 2025-03-28T12:56:25.000+01:00
- Enhanced Privacy: Audio data never touches the disk, protecting sensitive information
- Better Performance: Direct 16kHz recording reduces processing overhead
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Syllablaze v0.2 for KDE Plasma
+# Syllablaze v0.3 for KDE Plasma
 
 Real-time audio transcription app using OpenAI's Whisper.
 
@@ -12,6 +12,15 @@ Originally created by Guilherme da Silveira as "Telly Spelly".
 - Microphone selection
 - Auto clipboard copy
 - Native KDE integration
+- In-memory audio processing (no temporary files)
+- Direct 16kHz recording for reduced processing time and memory usage
+
+## What's New in v0.3
+
+- **Enhanced Privacy**: Audio is now processed entirely in memory without writing to disk at any point
+- **Improved Performance**: Direct 16kHz recording reduces processing time and memory usage
+- **Better Security**: No temporary files means no risk of sensitive audio data being left on disk
+- **Reduced Resource Usage**: Streamlined audio processing pipeline for more efficient operation
 
 ## Project Structure
 
diff --git a/blaze/constants.py b/blaze/constants.py
@@ -6,7 +6,7 @@
 APP_NAME = "Syllablaze"
 
 # Application version
-APP_VERSION = "0.2"
+APP_VERSION = "0.3"
 
 # Organization name
 ORG_NAME = "KDE"
diff --git a/blaze/main.py b/blaze/main.py
@@ -315,17 +315,17 @@ def update_volume_meter(self, value):
         if self.progress_window and self.recording:
             self.progress_window.update_volume(value)
     
-    def handle_recording_finished(self, audio_file):
-        """Called when recording is saved to file"""
-        logger.info("TrayRecorder: Recording finished, starting transcription")
+    def handle_recording_finished(self, audio_data):
+        """Called when recording is processed in memory"""
+        logger.info("TrayRecorder: Recording processed, starting transcription")
         
         # Ensure progress window is in processing mode
         if self.progress_window:
             self.progress_window.set_processing_mode()
             self.progress_window.set_status("Starting transcription...")
         
         if self.transcriber:
-            self.transcriber.transcribe_file(audio_file)
+            self.transcriber.transcribe_file(audio_data)
         else:
             logger.error("Transcriber not initialized")
             if self.progress_window:
diff --git a/blaze/recorder.py b/blaze/recorder.py
@@ -38,7 +38,6 @@ def flush(self):
 import pyaudio
 import wave
 from PyQt6.QtCore import QObject, pyqtSignal
-import tempfile
 import logging
 import numpy as np
 from blaze.settings import Settings
@@ -53,7 +52,7 @@ def flush(self):
 logger = logging.getLogger(__name__)
 
 class AudioRecorder(QObject):
-    recording_finished = pyqtSignal(str)  # Emits path to recorded file
+    recording_finished = pyqtSignal(object)  # Emits audio data as numpy array
     recording_error = pyqtSignal(str)
     volume_updated = pyqtSignal(float)
     
@@ -266,13 +265,40 @@ def stop_recording(self):
             self.recording_error.emit(f"Error stopping recording: {e}")
 
     def _process_recording(self):
-        """Process and save the recording"""
+        """Process the recording and keep it in memory"""
         try:
-            temp_file = tempfile.mktemp(suffix='.wav')
-            logger.info("Processing recording...")
-            self.save_audio(temp_file)
-            logger.info(f"Recording processed and saved to: {os.path.abspath(temp_file)}")
-            self.recording_finished.emit(temp_file)
+            logger.info("Processing recording in memory...")
+            # Convert frames to numpy array
+            audio_data = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+            
+            if not hasattr(self, 'current_sample_rate') or self.current_sample_rate is None:
+                logger.warning("No sample rate information available, assuming device default")
+                if self.current_device_info is not None:
+                    original_rate = int(self.current_device_info['defaultSampleRate'])
+                else:
+                    # If no device info is available, we have to use a reasonable default
+                    # Get the default input device's sample rate
+                    original_rate = int(self.audio.get_default_input_device_info()['defaultSampleRate'])
+            else:
+                original_rate = self.current_sample_rate
+                
+            # Resample to 16000Hz if needed
+            if original_rate != WHISPER_SAMPLE_RATE:
+                logger.info(f"Resampling audio from {original_rate}Hz to {WHISPER_SAMPLE_RATE}Hz")
+                # Calculate resampling ratio
+                ratio = WHISPER_SAMPLE_RATE / original_rate
+                output_length = int(len(audio_data) * ratio)
+                
+                # Resample audio
+                audio_data = signal.resample(audio_data, output_length)
+            else:
+                logger.info(f"No resampling needed, audio already at {WHISPER_SAMPLE_RATE}Hz")
+            
+            # Normalize the audio data to float32 in the range [-1.0, 1.0] as expected by Whisper
+            audio_data = audio_data.astype(np.float32) / 32768.0
+            
+            logger.info("Recording processed in memory")
+            self.recording_finished.emit(audio_data)
         except Exception as e:
             logger.error(f"Failed to process recording: {e}")
             self.recording_error.emit(f"Failed to process recording: {e}")
diff --git a/blaze/transcriber.py b/blaze/transcriber.py
@@ -15,22 +15,19 @@ class TranscriptionWorker(QThread):
     progress_percent = pyqtSignal(int)
     error = pyqtSignal(str)
     
-    def __init__(self, model, audio_file):
+    def __init__(self, model, audio_data):
         super().__init__()
         self.model = model
-        self.audio_file = audio_file
+        self.audio_data = audio_data
         self.settings = Settings()
         self.language = self.settings.get('language', 'auto')
         
     def run(self):
         try:
-            if not os.path.exists(self.audio_file):
-                raise FileNotFoundError(f"Audio file not found: {self.audio_file}")
-                
-            self.progress.emit("Loading audio file...")
+            self.progress.emit("Processing audio...")
             self.progress_percent.emit(10)
             
-            # Load and transcribe
+            # Transcribe directly from memory
             self.progress.emit("Processing audio with Whisper...")
             self.progress_percent.emit(30)
             
@@ -46,7 +43,7 @@ def progress_callback(progress):
             logger.info(f"Transcribing with language: {lang_str}")
             
             result = self.model.transcribe(
-                self.audio_file,
+                self.audio_data,
                 fp16=False,
                 language=None if self.language == 'auto' else self.language
             )
@@ -64,13 +61,6 @@ def progress_callback(progress):
             logger.error(f"Transcription error: {e}")
             self.error.emit(f"Transcription failed: {str(e)}")
             self.finished.emit("")
-        finally:
-            # Clean up the temporary file
-            try:
-                if os.path.exists(self.audio_file):
-                    os.remove(self.audio_file)
-            except Exception as e:
-                logger.error(f"Failed to remove temporary file: {e}")
 
 class WhisperTranscriber(QObject):
     transcription_progress = pyqtSignal(str)
@@ -200,8 +190,8 @@ def _cleanup_worker(self):
                 self.worker.deleteLater()
                 self.worker = None
                 
-    def transcribe(self, audio_file):
-        """Transcribe audio file using Whisper"""
+    def transcribe(self, audio_data):
+        """Transcribe audio data directly from memory"""
         try:
             # Check if model needs to be reloaded due to settings changes
             self.reload_model_if_needed()
@@ -222,7 +212,7 @@ def transcribe(self, audio_file):
             
             # Run transcription with language setting
             result = self.model.transcribe(
-                audio_file,
+                audio_data,
                 fp16=False,
                 language=None if self.current_language == 'auto' else self.current_language
             )
@@ -235,18 +225,19 @@ def transcribe(self, audio_file):
             logger.info(f"Transcribed text: [{text}]")
             self.transcription_finished.emit(text)
             
-            # Clean up the temporary file
-            try:
-                if os.path.exists(audio_file):
-                    os.remove(audio_file)
-            except Exception as e:
-                logger.error(f"Failed to remove temporary file: {e}")
-            
         except Exception as e:
             logger.error(f"Transcription failed: {e}")
             self.transcription_error.emit(str(e))
 
-    def transcribe_file(self, audio_file):
+    def transcribe_file(self, audio_data):
+        """
+        Transcribe audio data directly from memory
+        
+        Parameters:
+        -----------
+        audio_data: np.ndarray
+            Audio data as a NumPy array, expected to be float32 in range [-1.0, 1.0]
+        """
         if self.worker and self.worker.isRunning():
             logger.warning("Transcription already in progress")
             return
@@ -272,9 +263,9 @@ def transcribe_file(self, audio_file):
         lang_str = "auto-detect" if self.current_language == 'auto' else self.current_language
         logger.info(f"Transcription worker using language: {lang_str}")
         logger.info(f"Transcription worker using model: {self.current_model_name}")
-        print(f"Transcribing file with model: {self.current_model_name}, language: {lang_str}")
+        print(f"Transcribing audio with model: {self.current_model_name}, language: {lang_str}")
         
-        self.worker = TranscriptionWorker(self.model, audio_file)
+        self.worker = TranscriptionWorker(self.model, audio_data)
         # Make sure the worker uses the current language setting
         self.worker.language = self.current_language
         self.worker.finished.connect(self.transcription_finished)
diff --git a/docs/activeContext.md b/docs/activeContext.md
@@ -13,6 +13,11 @@ The current focus of the Syllablaze project is to optimize the application for U
 
 ## Recent Changes
 
+1. **In-Memory Audio Processing**:
+   - Implemented direct memory-to-memory audio processing without writing to disk
+   - Enhanced privacy by eliminating temporary files
+   - Improved performance with direct 16kHz recording
+   - Updated to version 0.3
 
 ## Next Steps
 
@@ -36,8 +41,13 @@ The current focus of the Syllablaze project is to optimize the application for U
    - Implemented table-based UI for model management
    - Added download, delete, and activation functionality
    - Integrated with settings window
-10. **Test Installation**: Verify the installation process works correctly on Ubuntu KDE
-11. **Future Exploration**: Begin research on creating a Flatpak version
+10. ✅ **Implement In-Memory Audio Processing**: Enhanced privacy and performance
+    - Eliminated temporary files for better privacy and security
+    - Implemented direct memory-to-memory audio processing
+    - Optimized for 16kHz recording to reduce processing overhead
+    - Updated to version 0.3
+11. **Test Installation**: Verify the installation process works correctly on Ubuntu KDE
+12. **Future Exploration**: Begin research on creating a Flatpak version
 
 ## Active Decisions and Considerations
 
@@ -82,6 +92,11 @@ The current focus of the Syllablaze project is to optimize the application for U
     - Consideration: Need to handle download progress simulation since Whisper API doesn't provide direct progress tracking
 
 9. **Single Instance Enforcement**:
-    - Decision: Implement a robust file locking mechanism to ensure only one instance of Syllablaze can run at a time
-    - Rationale: Prevents resource conflicts and confusion from multiple instances running simultaneously
-    - Consideration: Uses a file lock in ~/.cache/syllablaze/ with proper cleanup on application exit and signal handling
+     - Decision: Implement a robust file locking mechanism to ensure only one instance of Syllablaze can run at a time
+     - Rationale: Prevents resource conflicts and confusion from multiple instances running simultaneously
+     - Consideration: Uses a file lock in ~/.cache/syllablaze/ with proper cleanup on application exit and signal handling
+
+10. **In-Memory Audio Processing**:
+     - Decision: Process audio entirely in memory without writing to temporary files
+     - Rationale: Enhances privacy, security, and performance by avoiding disk operations
+     - Consideration: Directly passes audio data as NumPy arrays between components, leveraging Whisper's ability to process in-memory data
diff --git a/docs/progress.md b/docs/progress.md
@@ -5,9 +5,10 @@
 1. **Core Functionality**:
    - Audio recording from system microphones
    - Real-time transcription using OpenAI's Whisper
+   - In-memory audio processing (no temporary files)
+   - Direct 16kHz recording for improved performance
    - Automatic clipboard integration for transcribed text
    - System tray integration with KDE Plasma
-   - Global keyboard shortcuts for quick recording
    - Settings management and persistence
 
 2. **User Interface**:
@@ -18,7 +19,6 @@
    - Comprehensive Whisper model management interface
 
 3. **Installation**:
-   - Enhanced setup.sh script for user-level installation using pipx
    - Desktop file integration with KDE
    - Icon integration
    - Improved system dependency checks
@@ -41,7 +41,7 @@
 
 ## Current Status
 
-The core functionality works well, with significant improvements in the Whisper model management interface. There are still opportunities for enhancement in error handling and system integration.
+The core functionality works well, with significant improvements in the Whisper model management interface and enhanced privacy through in-memory audio processing. Version 0.3 introduces direct memory-to-memory audio processing without writing to disk, improving both privacy and performance. There are still opportunities for enhancement in error handling and system integration.
 
 ### Installation Status
 
@@ -52,7 +52,8 @@ The core functionality works well, with significant improvements in the Whisper
 
 ### Functionality Status
 
-- Audio recording works reliably
+- Audio recording works reliably with in-memory processing
+- No temporary files are created during the recording and transcription process
 - Transcription accuracy depends on the Whisper model selected
 - KDE integration works well on standard KDE Plasma
 - Clipboard integration functions as expected
@@ -111,4 +112,12 @@ The core functionality works well, with significant improvements in the Whisper
    - Created a comprehensive model management interface
    - Implemented table-based UI for model management
    - Added download, delete, and activation functionality
-   - Integrated with settings window
+   - Integrated with settings window
+
+8. **Temporary Files**: ✅ FIXED
+   - ~~Audio was temporarily written to disk during processing~~
+   - ~~Potential privacy concern with sensitive audio data~~
+   - Solution: Implemented in-memory audio processing without writing to disk
+   - Audio data now flows directly from recorder to transcriber as NumPy arrays
+   - Enhanced privacy and security by eliminating temporary files
+   - Improved performance with direct 16kHz recording
diff --git a/test_lock.py b/test_lock.py
diff --git a/test_single_instance.py b/test_single_instance.py