v0.8.2: configurable OpenAI STT model for voice messages

FlyingFathead · FlyingFathead · commit c77df7dd8610 · 2026-05-14T19:35:15.000+03:00
diff --git a/README.md b/README.md
@@ -3,8 +3,8 @@
 ## 🤖 _Powered by ChatKeke_ 🚀
 
 - **A simple-to-use, quick-to-deploy Python-based Telegram bot for OpenAI API & Perplexity API**
-- **🎙 Transcribed voice messages over Whisper API**
-  - (auto-transcriptions, translations, and other messages to the bot over TG's voice messages)
+- **🎙 Transcribed Telegram voice messages via OpenAI speech-to-text models (Whisper and others)**
+  - Supports configurable STT models such as `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and legacy `whisper-1`
 - **☁️ Real-time weather info, weather alerts, and geolocation data via [OpenWeatherMap](https://openweathermap.org/), [WeatherAPI](https://www.weatherapi.com/) and U.S. NWS ([weather.gov](https://weather.gov))**
 - **🗺 Geolocation and map lookups via MapTiler API**
   - (with weather forecasts around the world in all OpenAI API supported languages)
@@ -240,6 +240,44 @@ If you run into any issues, consult the logs or reach out on the repository's [I
 ---
 
 # Changelog
+
+- v0.8.2 - Configurable OpenAI speech-to-text model support for Telegram voice messages
+  - Updated Telegram voice message transcription handling for the current OpenAI audio transcription model lineup.
+  - Added configurable speech-to-text model selection through `config.ini`:
+    - `STTModel = gpt-4o-transcribe`
+    - `STTModel = gpt-4o-mini-transcribe`
+    - `STTModel = whisper-1`
+  - Kept backward compatibility with the old `EnableWhisper` config flag so existing configs do not break.
+  - Added `OPENAI_STT_MODEL` environment-variable fallback for Docker/systemd/shell deployments; it is used only when `STTModel` is empty or missing in `config.ini` (the `config.ini` value takes precedence).
+  - Changed the default voice transcription model from legacy `whisper-1` to modern `gpt-4o-transcribe`.
+  - Refactored `src/voice_message_handler.py` so the STT model is read from the main `TelegramBot` config object instead of being hardcoded in the voice module.
+  - Added lazy OpenAI async client initialization for voice transcription so the client is created only after API key loading.
+  - Preserved support for legacy `whisper-1` as a fallback model.
+  - Preserved Telegram-visible voice transcription marker:
+    - `🎤📝`
+  - Improved Telegram HTML safety by escaping transcribed text before wrapping it in HTML formatting.
+  - Improved model-facing voice transcription formatting through `context.user_data["transcribed_text"]`.
+  - Fixed voice message duration checking so `MaxDurationMinutes` is correctly treated as minutes while Telegram/audio duration values are handled as seconds.
+  - Improved voice message logging with detailed metadata:
+    - Telegram user ID
+    - username
+    - first/last name
+    - chat ID
+    - chat type/title
+    - message ID
+    - Telegram voice file ID / unique ID
+    - voice duration
+    - MIME type
+    - file size
+    - local downloaded file path
+    - selected STT model
+    - final transcription text
+  - Logs detailed voice transcription events into `bot.log`.
+  - Logs transcription events into `chat.log` when `ChatLoggingEnabled = True`.
+  - Cleaned up `main.py` voice handler registration:
+    - removed the old extra-argument voice handler path
+    - now routes voice messages through `self.voice_message_handler`
+  - Fixed `load_config()` directory creation logging by replacing undefined `logger` references with a local logger obtained via `logging.getLogger('TelegramBotLogger')`.
 - v0.8.1 - Startup status banner cleanup
   - Added Perplexity API enabled/disabled status to the startup banner.
   - Reads Perplexity status from `[Perplexity] -> Enabled` in `config.ini`.
diff --git a/config/config.ini b/config/config.ini
@@ -82,6 +82,14 @@ MaxHistoryDays = 30
 # ~~~~~~~~~~~
 # Allow speech-to-text transcriptions via Whisper API
 EnableWhisper = True
+
+# Speech-to-text model.
+# Good values:
+#   gpt-4o-transcribe       = modern default, same per-minute price as whisper-1
+#   gpt-4o-mini-transcribe  = cheaper
+#   whisper-1               = legacy fallback
+STTModel = gpt-4o-transcribe
+
 # Maximum duration of a voice message (in minutes)
 MaxDurationMinutes = 5
 
diff --git a/src/main.py b/src/main.py
@@ -6,7 +6,7 @@
 # https://github.com/FlyingFathead/TelegramBot-OpenAI-API
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # version of this program
-version_number = "0.8.1"
+version_number = "0.8.2"
 
 # Add the project root directory to Python's path
 import sys
@@ -155,7 +155,10 @@ def __init__(self):
         # Because we do that in main() before constructing TelegramBot.
 
         self.logger = logging.getLogger('TelegramBotLogger')
-        self.logger.info("Initializing TelegramBot...")
+        self.logger.info("Initializing Telegram Bot...")
+
+        self.logger.info(f"Voice STT enabled: {self.enable_whisper}")
+        self.logger.info(f"Voice STT model: {self.stt_model}")
 
         # The rest is mostly unchanged:
         self.reminders_enabled = self._parser.getboolean('Reminders', 'EnableReminders', fallback=False)
@@ -229,6 +232,14 @@ def load_config(self):
         self.bot_disabled_msg = self.config.get('BotDisabledMsg', 'The bot is currently disabled.')
 
         self.enable_whisper = self.config.getboolean('EnableWhisper', True)
+
+        # Speech-to-text model for voice messages.
+        # Primary source: config.ini
+        # Fallback source: OPENAI_STT_MODEL env var
+        # Final fallback: gpt-4o-transcribe
+        configured_stt_model = self.config.get('STTModel', fallback='').strip()
+        env_stt_model = os.getenv('OPENAI_STT_MODEL', '').strip()
+        self.stt_model = configured_stt_model or env_stt_model or 'gpt-4o-transcribe'
         self.max_voice_message_length = self.config.getint('MaxDurationMinutes', 5)
 
         self.data_directory = self.config.get('DataDirectory', 'data')
@@ -241,25 +252,29 @@ def load_config(self):
         # Build paths
         project_root = Path(__file__).resolve().parents[1]
         self.data_directory = str(project_root / self.config.get('DataDirectory', 'data'))
+        self.logs_directory = str(project_root / self.config.get('LogsDirectory', 'logs'))
+
+        # self.logger is not assigned yet during load_config(), so use a local logger.
+        config_logger = logging.getLogger('TelegramBotLogger')
 
         # Create data directory if needed
         try:
             if not os.path.exists(self.data_directory):
                 os.makedirs(self.data_directory, exist_ok=True)
-                logger.info(f"Created data directory at {self.data_directory}")
+                config_logger.info(f"Created data directory at {self.data_directory}")
         except OSError as e:
-            logger.error(
+            config_logger.error(
                 f"Failed to create data directory {self.data_directory}: {e} "
                 "-- Some commands might be disabled due to this."
             )
 
-        self.logs_directory = str(project_root / self.config.get('LogsDirectory', 'logs'))
+        # Create logs directory if needed
         try:
             if not os.path.exists(self.logs_directory):
                 os.makedirs(self.logs_directory, exist_ok=True)
-                logger.info(f"Created logs directory at {self.logs_directory}")
+                config_logger.info(f"Created logs directory at {self.logs_directory}")
         except OSError as e:
-            logger.error(
+            config_logger.error(
                 f"Failed to create logs directory {self.logs_directory}: {e} "
                 "-- Some commands might be disabled due to this."
             )
@@ -405,15 +420,7 @@ def split_large_messages(self, message, max_length=4096):
     # voice message handler - see: voice_message_handler.py
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     async def voice_message_handler(self, update: Update, context: CallbackContext) -> None:
-        await handle_voice_message(
-            self,
-            update,
-            context,
-            self.data_directory,
-            self.enable_whisper,
-            self.max_voice_message_length,
-            logger
-        )
+        await handle_voice_message(self, update, context)
 
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # text message handler - see: text_message_handler.py
@@ -438,7 +445,7 @@ def run(self):
         application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, self.handle_message))
 
         # Voice handler
-        application.add_handler(MessageHandler(filters.VOICE, partial(handle_voice_message, self)))
+        application.add_handler(MessageHandler(filters.VOICE, self.voice_message_handler))
 
         # Register command handlers from bot_commands module
         application.add_handler(
diff --git a/src/voice_message_handler.py b/src/voice_message_handler.py