A Python module for transcribing audio files using MLX-optimized Whisper models for Apple Silicon. Features multilingual support, batch processing, and progress tracking.
- Python 3.13 or higher
- Apple Silicon Mac (for MLX optimization)
- UV package manager (recommended) or pip
Using UV (recommended):
uv add lightning-whisper-mlx

Using pip:
pip install lightning-whisper-mlx

python -m anyfile_to_ai.audio_processor --help

# Transcribe single audio file
python -m anyfile_to_ai.audio_processor audio.mp3
# Transcribe with JSON output
python -m anyfile_to_ai.audio_processor audio.mp3 --format json
# Transcribe with progress tracking
python -m anyfile_to_ai.audio_processor audio.mp3 --verbose
# Transcribe multiple files
python -m anyfile_to_ai.audio_processor audio1.mp3 audio2.wav audio3.m4a

# Use tiny model for speed
python -m anyfile_to_ai.audio_processor audio.mp3 --model tiny
# Use large model for accuracy
python -m anyfile_to_ai.audio_processor audio.mp3 --model large-v3
# List of available models:
# tiny, small, distil-small.en, base, medium, distil-medium.en,
# large, large-v2, distil-large-v2, large-v3, distil-large-v3

# Auto-detect language (default)
python -m anyfile_to_ai.audio_processor audio.mp3
# Specify language hint for better accuracy
python -m anyfile_to_ai.audio_processor audio.mp3 --language es # Spanish
python -m anyfile_to_ai.audio_processor audio.mp3 --language fr # French
python -m anyfile_to_ai.audio_processor audio.mp3 --language en # English

# JSON output (structured)
python -m anyfile_to_ai.audio_processor audio.mp3 --format json
# Plain text output (default)
python -m anyfile_to_ai.audio_processor audio.mp3 --format plain
# Save output to file
python -m anyfile_to_ai.audio_processor audio.mp3 --format json --output transcript.json

# Batch processing with progress
python -m anyfile_to_ai.audio_processor *.mp3 --verbose
# Custom batch size (decoder parameter)
python -m anyfile_to_ai.audio_processor audio.mp3 --batch-size 24
# With timeout
python -m anyfile_to_ai.audio_processor audio.mp3 --timeout 300
# Quiet mode (results only)
python -m anyfile_to_ai.audio_processor audio.mp3 --quiet

Options:
- --format, -f: Output format (plain, json)
- --model, -m: Whisper model selection (default: medium)
- --quantization, -q: Quantization level (none, 4bit, 8bit) - default: none
- --language, -l: Language hint (ISO 639-1 code, e.g., en, es)
- --batch-size, -b: Whisper decoder batch size (default: 12)
- --output, -o: Save results to file
- --timeout, -t: Processing timeout per file in seconds (default: 600)
- --verbose, -v: Enable progress output
- --quiet: Suppress all output except results
from anyfile_to_ai.audio_processor import process_audio, create_config
# Simple single audio transcription
result = process_audio('audio.mp3')
if result.success:
print(f"Text: {result.text}")
print(f"Language: {result.detected_language}")
print(f"Time: {result.processing_time:.2f}s")

from anyfile_to_ai.audio_processor import process_audio_batch
# Process multiple audio files
audio_files = ['audio1.mp3', 'audio2.wav', 'audio3.m4a']
results = process_audio_batch(audio_files)
print(f"Processed {results.successful_count}/{results.total_files} files")
for result in results.results:
if result.success:
print(f"{result.audio_path}: {result.text[:100]}...")

from anyfile_to_ai.audio_processor import process_audio_batch, create_config
# Custom configuration
config = create_config(
model="large-v3",
quantization="none",
language="en",
output_format="json",
verbose=True
)
results = process_audio_batch(audio_files, config)

The audio_processor now supports the unified progress tracking system:
from anyfile_to_ai.audio_processor import process_audio_batch
from anyfile_to_ai.progress_tracker import ProgressEmitter, CLIProgressConsumer
# Create progress emitter
emitter = ProgressEmitter(total=len(audio_files), label="Transcribing audio")
emitter.add_consumer(CLIProgressConsumer())
# Process with progress tracking
results = process_audio_batch(audio_files, progress_emitter=emitter)
# The progress bar will show:
# Transcribing audio |████████████████| 100% (5/5 files)

from anyfile_to_ai.audio_processor import process_audio_streaming
from anyfile_to_ai.progress_tracker import ProgressEmitter, CLIProgressConsumer
# Create progress emitter for streaming
emitter = ProgressEmitter(total=len(audio_files), label="Processing audio")
emitter.add_consumer(CLIProgressConsumer())
# Stream processing with progress updates
for result in process_audio_streaming(audio_files):
emitter.update(1)
if result.success:
print(f"Transcribed: {result.audio_path}")
emitter.complete()

The old callback-based progress is still supported but deprecated:
from anyfile_to_ai.audio_processor import process_audio_batch, create_config
def progress_handler(current, total):
print(f"Processing {current}/{total} audio files...")
# This works but emits a deprecation warning
config = create_config(
model="medium",
progress_callback=progress_handler,
verbose=True
)
results = process_audio_batch(audio_files, config)

Migration: Use the progress_emitter parameter instead of progress_callback.
from anyfile_to_ai.audio_processor import validate_audio, get_audio_info, get_supported_formats
# Validate audio before processing
try:
audio_doc = validate_audio('audio.mp3')
print(f"Valid: {audio_doc.format}, {audio_doc.duration}s")
except Exception as e:
print(f"Invalid audio: {e}")
# Get audio metadata without processing
info = get_audio_info('audio.mp3')
print(f"Duration: {info['duration']}s, Format: {info['format']}")
# Check supported formats
formats = get_supported_formats()
print(f"Supported formats: {formats}") # ['m4a', 'mp3', 'wav']

Configuration parameters:
- model (str): Whisper model selection (default: "medium")
  - Speed: tiny > small > base > medium > large
  - Accuracy: tiny < small < base < medium < large
  - Distilled models (e.g., distil-medium.en) are faster
- quantization (str): Model quantization (default: "none")
  - "none": No quantization (best accuracy, slower)
  - "4bit": 4-bit quantization (faster, less memory) - may have compatibility issues
  - "8bit": 8-bit quantization (middle ground) - may have compatibility issues
- batch_size (int): Decoder batch size (default: 12, range: 1-128)
- language (str | None): Language hint or None for auto-detect (default: None)
- output_format (str): Output format ("plain" or "json")
- timeout_seconds (int): Processing timeout per file (default: 600)
- progress_callback (Callable[[int, int], None]): Progress tracking function
- verbose (bool): Enable verbose output (default: False)
- max_duration_seconds (int): Maximum audio duration (default: 7200 = 2 hours)
For Speed:
config = create_config(model="tiny", quantization="none")

For Accuracy:
config = create_config(model="large-v3", quantization="none")

Balanced:
config = create_config(model="medium", quantization="none") # Default

# MLX is automatically optimized for Apple Silicon
# No additional environment setup needed
# For memory-constrained environments, use smaller models
python -m anyfile_to_ai.audio_processor audio.mp3 --model tiny

- Models are automatically downloaded on first use
- Cached in ~/.cache/huggingface/
- First run will download ~1-5GB depending on model
from anyfile_to_ai.audio_processor import (
AudioProcessingError,
AudioNotFoundError,
UnsupportedFormatError,
CorruptedAudioError,
NoSpeechDetectedError,
DurationExceededError,
ValidationError,
ModelLoadError,
ProcessingTimeoutError
)
try:
result = process_audio('audio.mp3')
except AudioNotFoundError:
print("Audio file not found")
except UnsupportedFormatError:
print("Unsupported audio format (use mp3, wav, or m4a)")
except NoSpeechDetectedError:
print("No speech detected in audio")
except DurationExceededError:
print("Audio exceeds 2-hour limit")
except ModelLoadError as e:
print(f"Failed to load Whisper model: {e}")
except ProcessingTimeoutError:
print("Transcription timed out")

No Speech Detected:
# Whisper may hallucinate text on silence/noise
# Use a different model or check audio quality
config = create_config(model="large-v3")

Out of Memory:
# Use smaller model or reduce batch size
config = create_config(model="tiny", batch_size=6)

Slow Processing:
# Use distilled models for faster processing
config = create_config(model="distil-medium.en")

Model Download Issues:
# Check internet connection
# Models are downloaded from HuggingFace
# Manually pre-download if needed

# Audio metadata
AudioDocument(file_path, format, duration, sample_rate, file_size, channels)
# Transcription result for single audio
TranscriptionResult(
audio_path, text, confidence_score, processing_time,
model_used, quantization, detected_language, success, error_message
)
# Batch processing result
ProcessingResult(
success, results, total_files, successful_count,
failed_count, total_processing_time, average_processing_time, error_summary
)

Audio markdown rendering supports an opt-in shared formatter path via anyfile_to_ai.output_formatter.
- Enable with ANYFILE_OUTPUT_FORMATTER_AUDIO_SHARED=1.
- Timestamp formatting is aligned to HH:MM:SS.CC with shared boundary validation.
# Example with project sample audio
sample_audio = "sample-data/audio/podcast.mp3"
# Quick transcription
result = process_audio(sample_audio)
print(f"Transcription: {result.text}")
# Batch process all samples
import glob
sample_audios = glob.glob("sample-data/audio/*.mp3")
results = process_audio_batch(sample_audios)
# With language hint
config = create_config(language="en")
result = process_audio(sample_audio, config)
print(f"English transcription: {result.text}")

Supported formats:
- MP3 (.mp3)
- WAV (.wav)
- M4A (.m4a)
Check programmatically:
from anyfile_to_ai.audio_processor import get_supported_formats
print(get_supported_formats()) # ['m4a', 'mp3', 'wav']

The audio processor supports cooperative cancellation for batch processing:
from anyfile_to_ai.audio_processor import process_audio_batch, create_config
from anyfile_to_ai.progress_tracker import CancellationToken, OperationCancelledError
# Create cancellation token
token = CancellationToken()
config = create_config(model="medium")
# Process audio files with cancellation support
try:
results = process_audio_batch(
["audio1.mp3", "audio2.wav", "audio3.m4a"],
config=config,
cancel_token=token
)
print(f"Processed {results.successful_count} files")
except OperationCancelledError as e:
print(f"Processing cancelled: {e.message}")

- Check at iteration boundaries: Cancellation is checked before processing each file
- Handle partial results: Completed files are included in results before cancellation
- Clean up resources: Whisper model resources are cleaned up before raising
from anyfile_to_ai.audio_processor import process_audio_batch, create_config
from anyfile_to_ai.progress_tracker import CancellationToken, OperationCancelledError
def process_with_limit(audio_paths, max_files=None):
"""Process audio files with optional limit via cancellation."""
token = CancellationToken()
config = create_config(model="medium")
try:
results = process_audio_batch(audio_paths, config=config, cancel_token=token)
# Cancel if limit reached (checked before next file)
if max_files and results.successful_count >= max_files:
token.cancel()
return results
except OperationCancelledError:
print(f"Cancelled during processing")
return None

from anyfile_to_ai.audio_processor import process_audio_batch
from anyfile_to_ai.progress_tracker import CancellationToken, OperationCancelledError
# Process with timeout via cancellation
import threading
token = CancellationToken()
def cancel_after_timeout(seconds):
import time
time.sleep(seconds)
token.cancel()
# Start timeout thread
timer = threading.Thread(target=cancel_after_timeout, args=(300,))
timer.daemon = True
timer.start()
try:
results = process_audio_batch(audio_files, cancel_token=token)
except OperationCancelledError:
print("Processing timed out after 5 minutes")

- MLX Optimization: 10x faster than CPU Whisper on Apple Silicon
- Model Size: Larger models are more accurate but slower
- Audio Length: ~1 minute audio takes ~5-15 seconds with medium model
- Memory Usage: ~2-4GB RAM for medium model
- Batch Processing: Sequential processing (not parallel) for stability
- Maximum Duration: 2 hours per audio file
- Language Detection: Auto-detection works best with clear speech
- Background Noise: May affect transcription quality
- Quantization: 4bit/8bit quantization has compatibility issues with current MLX version (use none)
- No Timestamps: Text-only output (no word-level timestamps)
from anyfile_to_ai.pdf_extractor import extract_text
from anyfile_to_ai.audio_processor import process_audio
# Extract PDF and transcribe related audio
pdf_text = extract_text('document.pdf')
audio_text = process_audio('presentation.mp3')
combined = f"Document: {pdf_text.pages[0].text}\n\nAudio: {audio_text.text}"

from anyfile_to_ai.image_processor import process_image
from anyfile_to_ai.audio_processor import process_audio
# Describe image and transcribe audio
img_result = process_image('slide.jpg')
audio_result = process_audio('narration.mp3')
print(f"Image: {img_result.description}")
print(f"Narration: {audio_result.text}")

# Run contract tests
PYTHONPATH=. uv run pytest tests/contract/test_audio*.py -v
# Test with sample audio
python -m anyfile_to_ai.audio_processor sample-data/audio/silence.mp3 --verbose

Current version: 0.1.0
[Your License Here]