Skip to content

Commit 9d34f8a

Browse files
author
My Name
committed
0.3 release
- Enhanced Privacy: Audio data never touches the disk, protecting sensitive information
- Better Performance: Direct 16kHz recording reduces processing overhead
1 parent 545abc7 commit 9d34f8a

9 files changed

+102
-280
lines changed

Diff for: README.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Syllablaze v0.2 for KDE Plasma
1+
# Syllablaze v0.3 for KDE Plasma
22

33
Real-time audio transcription app using OpenAI's Whisper.
44

@@ -12,6 +12,15 @@ Originally created by Guilherme da Silveira as "Telly Spelly".
1212
- Microphone selection
1313
- Auto clipboard copy
1414
- Native KDE integration
15+
- In-memory audio processing (no temporary files)
16+
- Direct 16kHz recording for improved performance and reduced memory usage
17+
18+
## What's New in v0.3
19+
20+
- **Enhanced Privacy**: Audio is now processed entirely in memory without writing to disk at any point
21+
- **Improved Performance**: Direct 16kHz recording reduces processing time and memory usage
22+
- **Better Security**: No temporary files means no risk of sensitive audio data being left on disk
23+
- **Reduced Resource Usage**: Streamlined audio processing pipeline for more efficient operation
1524

1625
## Project Structure
1726

Diff for: blaze/constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
APP_NAME = "Syllablaze"
77

88
# Application version
9-
APP_VERSION = "0.2"
9+
APP_VERSION = "0.3"
1010

1111
# Organization name
1212
ORG_NAME = "KDE"

Diff for: blaze/main.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -315,17 +315,17 @@ def update_volume_meter(self, value):
315315
if self.progress_window and self.recording:
316316
self.progress_window.update_volume(value)
317317

318-
def handle_recording_finished(self, audio_file):
319-
"""Called when recording is saved to file"""
320-
logger.info("TrayRecorder: Recording finished, starting transcription")
318+
def handle_recording_finished(self, audio_data):
319+
"""Called when recording is processed in memory"""
320+
logger.info("TrayRecorder: Recording processed, starting transcription")
321321

322322
# Ensure progress window is in processing mode
323323
if self.progress_window:
324324
self.progress_window.set_processing_mode()
325325
self.progress_window.set_status("Starting transcription...")
326326

327327
if self.transcriber:
328-
self.transcriber.transcribe_file(audio_file)
328+
self.transcriber.transcribe_file(audio_data)
329329
else:
330330
logger.error("Transcriber not initialized")
331331
if self.progress_window:

Diff for: blaze/recorder.py

+34-8
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ def flush(self):
3838
import pyaudio
3939
import wave
4040
from PyQt6.QtCore import QObject, pyqtSignal
41-
import tempfile
4241
import logging
4342
import numpy as np
4443
from blaze.settings import Settings
@@ -53,7 +52,7 @@ def flush(self):
5352
logger = logging.getLogger(__name__)
5453

5554
class AudioRecorder(QObject):
56-
recording_finished = pyqtSignal(str) # Emits path to recorded file
55+
recording_finished = pyqtSignal(object) # Emits audio data as numpy array
5756
recording_error = pyqtSignal(str)
5857
volume_updated = pyqtSignal(float)
5958

@@ -266,13 +265,40 @@ def stop_recording(self):
266265
self.recording_error.emit(f"Error stopping recording: {e}")
267266

268267
def _process_recording(self):
269-
"""Process and save the recording"""
268+
"""Process the recording and keep it in memory"""
270269
try:
271-
temp_file = tempfile.mktemp(suffix='.wav')
272-
logger.info("Processing recording...")
273-
self.save_audio(temp_file)
274-
logger.info(f"Recording processed and saved to: {os.path.abspath(temp_file)}")
275-
self.recording_finished.emit(temp_file)
270+
logger.info("Processing recording in memory...")
271+
# Convert frames to numpy array
272+
audio_data = np.frombuffer(b''.join(self.frames), dtype=np.int16)
273+
274+
if not hasattr(self, 'current_sample_rate') or self.current_sample_rate is None:
275+
logger.warning("No sample rate information available, assuming device default")
276+
if self.current_device_info is not None:
277+
original_rate = int(self.current_device_info['defaultSampleRate'])
278+
else:
279+
# If no device info is available, we have to use a reasonable default
280+
# Get the default input device's sample rate
281+
original_rate = int(self.audio.get_default_input_device_info()['defaultSampleRate'])
282+
else:
283+
original_rate = self.current_sample_rate
284+
285+
# Resample to 16000Hz if needed
286+
if original_rate != WHISPER_SAMPLE_RATE:
287+
logger.info(f"Resampling audio from {original_rate}Hz to {WHISPER_SAMPLE_RATE}Hz")
288+
# Calculate resampling ratio
289+
ratio = WHISPER_SAMPLE_RATE / original_rate
290+
output_length = int(len(audio_data) * ratio)
291+
292+
# Resample audio
293+
audio_data = signal.resample(audio_data, output_length)
294+
else:
295+
logger.info(f"No resampling needed, audio already at {WHISPER_SAMPLE_RATE}Hz")
296+
297+
# Normalize the audio data to float32 in the range [-1.0, 1.0] as expected by Whisper
298+
audio_data = audio_data.astype(np.float32) / 32768.0
299+
300+
logger.info("Recording processed in memory")
301+
self.recording_finished.emit(audio_data)
276302
except Exception as e:
277303
logger.error(f"Failed to process recording: {e}")
278304
self.recording_error.emit(f"Failed to process recording: {e}")

Diff for: blaze/transcriber.py

+19-28
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,19 @@ class TranscriptionWorker(QThread):
1515
progress_percent = pyqtSignal(int)
1616
error = pyqtSignal(str)
1717

18-
def __init__(self, model, audio_file):
18+
def __init__(self, model, audio_data):
1919
super().__init__()
2020
self.model = model
21-
self.audio_file = audio_file
21+
self.audio_data = audio_data
2222
self.settings = Settings()
2323
self.language = self.settings.get('language', 'auto')
2424

2525
def run(self):
2626
try:
27-
if not os.path.exists(self.audio_file):
28-
raise FileNotFoundError(f"Audio file not found: {self.audio_file}")
29-
30-
self.progress.emit("Loading audio file...")
27+
self.progress.emit("Processing audio...")
3128
self.progress_percent.emit(10)
3229

33-
# Load and transcribe
30+
# Transcribe directly from memory
3431
self.progress.emit("Processing audio with Whisper...")
3532
self.progress_percent.emit(30)
3633

@@ -46,7 +43,7 @@ def progress_callback(progress):
4643
logger.info(f"Transcribing with language: {lang_str}")
4744

4845
result = self.model.transcribe(
49-
self.audio_file,
46+
self.audio_data,
5047
fp16=False,
5148
language=None if self.language == 'auto' else self.language
5249
)
@@ -64,13 +61,6 @@ def progress_callback(progress):
6461
logger.error(f"Transcription error: {e}")
6562
self.error.emit(f"Transcription failed: {str(e)}")
6663
self.finished.emit("")
67-
finally:
68-
# Clean up the temporary file
69-
try:
70-
if os.path.exists(self.audio_file):
71-
os.remove(self.audio_file)
72-
except Exception as e:
73-
logger.error(f"Failed to remove temporary file: {e}")
7464

7565
class WhisperTranscriber(QObject):
7666
transcription_progress = pyqtSignal(str)
@@ -200,8 +190,8 @@ def _cleanup_worker(self):
200190
self.worker.deleteLater()
201191
self.worker = None
202192

203-
def transcribe(self, audio_file):
204-
"""Transcribe audio file using Whisper"""
193+
def transcribe(self, audio_data):
194+
"""Transcribe audio data directly from memory"""
205195
try:
206196
# Check if model needs to be reloaded due to settings changes
207197
self.reload_model_if_needed()
@@ -222,7 +212,7 @@ def transcribe(self, audio_file):
222212

223213
# Run transcription with language setting
224214
result = self.model.transcribe(
225-
audio_file,
215+
audio_data,
226216
fp16=False,
227217
language=None if self.current_language == 'auto' else self.current_language
228218
)
@@ -235,18 +225,19 @@ def transcribe(self, audio_file):
235225
logger.info(f"Transcribed text: [{text}]")
236226
self.transcription_finished.emit(text)
237227

238-
# Clean up the temporary file
239-
try:
240-
if os.path.exists(audio_file):
241-
os.remove(audio_file)
242-
except Exception as e:
243-
logger.error(f"Failed to remove temporary file: {e}")
244-
245228
except Exception as e:
246229
logger.error(f"Transcription failed: {e}")
247230
self.transcription_error.emit(str(e))
248231

249-
def transcribe_file(self, audio_file):
232+
def transcribe_file(self, audio_data):
233+
"""
234+
Transcribe audio data directly from memory
235+
236+
Parameters:
237+
-----------
238+
audio_data: np.ndarray
239+
Audio data as a NumPy array, expected to be float32 in range [-1.0, 1.0]
240+
"""
250241
if self.worker and self.worker.isRunning():
251242
logger.warning("Transcription already in progress")
252243
return
@@ -272,9 +263,9 @@ def transcribe_file(self, audio_file):
272263
lang_str = "auto-detect" if self.current_language == 'auto' else self.current_language
273264
logger.info(f"Transcription worker using language: {lang_str}")
274265
logger.info(f"Transcription worker using model: {self.current_model_name}")
275-
print(f"Transcribing file with model: {self.current_model_name}, language: {lang_str}")
266+
print(f"Transcribing audio with model: {self.current_model_name}, language: {lang_str}")
276267

277-
self.worker = TranscriptionWorker(self.model, audio_file)
268+
self.worker = TranscriptionWorker(self.model, audio_data)
278269
# Make sure the worker uses the current language setting
279270
self.worker.language = self.current_language
280271
self.worker.finished.connect(self.transcription_finished)

Diff for: docs/activeContext.md

+20-5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ The current focus of the Syllablaze project is to optimize the application for U
1313

1414
## Recent Changes
1515

16+
1. **In-Memory Audio Processing**:
17+
- Implemented direct memory-to-memory audio processing without writing to disk
18+
- Enhanced privacy by eliminating temporary files
19+
- Improved performance with direct 16kHz recording
20+
- Updated to version 0.3
1621

1722
## Next Steps
1823

@@ -36,8 +41,13 @@ The current focus of the Syllablaze project is to optimize the application for U
3641
- Implemented table-based UI for model management
3742
- Added download, delete, and activation functionality
3843
- Integrated with settings window
39-
10. **Test Installation**: Verify the installation process works correctly on Ubuntu KDE
40-
11. **Future Exploration**: Begin research on creating a Flatpak version
44+
10. **Implement In-Memory Audio Processing**: Enhanced privacy and performance
45+
- Eliminated temporary files for better privacy and security
46+
- Implemented direct memory-to-memory audio processing
47+
- Optimized for 16kHz recording to reduce processing overhead
48+
- Updated to version 0.3
49+
11. **Test Installation**: Verify the installation process works correctly on Ubuntu KDE
50+
12. **Future Exploration**: Begin research on creating a Flatpak version
4151

4252
## Active Decisions and Considerations
4353

@@ -82,6 +92,11 @@ The current focus of the Syllablaze project is to optimize the application for U
8292
- Consideration: Need to handle download progress simulation since Whisper API doesn't provide direct progress tracking
8393

8494
9. **Single Instance Enforcement**:
85-
- Decision: Implement a robust file locking mechanism to ensure only one instance of Syllablaze can run at a time
86-
- Rationale: Prevents resource conflicts and confusion from multiple instances running simultaneously
87-
- Consideration: Uses a file lock in ~/.cache/syllablaze/ with proper cleanup on application exit and signal handling
95+
- Decision: Implement a robust file locking mechanism to ensure only one instance of Syllablaze can run at a time
96+
- Rationale: Prevents resource conflicts and confusion from multiple instances running simultaneously
97+
- Consideration: Uses a file lock in ~/.cache/syllablaze/ with proper cleanup on application exit and signal handling
98+
99+
10. **In-Memory Audio Processing**:
100+
- Decision: Process audio entirely in memory without writing to temporary files
101+
- Rationale: Enhances privacy, security, and performance by avoiding disk operations
102+
- Consideration: Directly passes audio data as NumPy arrays between components, leveraging Whisper's ability to process in-memory data

Diff for: docs/progress.md

+14-5
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
1. **Core Functionality**:
66
- Audio recording from system microphones
77
- Real-time transcription using OpenAI's Whisper
8+
- In-memory audio processing (no temporary files)
9+
- Direct 16kHz recording for improved performance
810
- Automatic clipboard integration for transcribed text
911
- System tray integration with KDE Plasma
10-
- Global keyboard shortcuts for quick recording
1112
- Settings management and persistence
1213

1314
2. **User Interface**:
@@ -18,7 +19,6 @@
1819
- Comprehensive Whisper model management interface
1920

2021
3. **Installation**:
21-
- Enhanced setup.sh script for user-level installation using pipx
2222
- Desktop file integration with KDE
2323
- Icon integration
2424
- Improved system dependency checks
@@ -41,7 +41,7 @@
4141

4242
## Current Status
4343

44-
The core functionality works well, with significant improvements in the Whisper model management interface. There are still opportunities for enhancement in error handling and system integration.
44+
The core functionality works well, with significant improvements in the Whisper model management interface and enhanced privacy through in-memory audio processing. Version 0.3 introduces direct memory-to-memory audio processing without writing to disk, improving both privacy and performance. There are still opportunities for enhancement in error handling and system integration.
4545

4646
### Installation Status
4747

@@ -52,7 +52,8 @@ The core functionality works well, with significant improvements in the Whisper
5252

5353
### Functionality Status
5454

55-
- Audio recording works reliably
55+
- Audio recording works reliably with in-memory processing
56+
- No temporary files are created during the recording and transcription process
5657
- Transcription accuracy depends on the Whisper model selected
5758
- KDE integration works well on standard KDE Plasma
5859
- Clipboard integration functions as expected
@@ -111,4 +112,12 @@ The core functionality works well, with significant improvements in the Whisper
111112
- Created a comprehensive model management interface
112113
- Implemented table-based UI for model management
113114
- Added download, delete, and activation functionality
114-
- Integrated with settings window
115+
- Integrated with settings window
116+
117+
8. **Temporary Files**: ✅ FIXED
118+
- ~~Audio was temporarily written to disk during processing~~
119+
- ~~Potential privacy concern with sensitive audio data~~
120+
- Solution: Implemented in-memory audio processing without writing to disk
121+
- Audio data now flows directly from recorder to transcriber as NumPy arrays
122+
- Enhanced privacy and security by eliminating temporary files
123+
- Improved performance with direct 16kHz recording

0 commit comments

Comments
 (0)