zhayujie · thzjy · May 29, 2025 · MonkeyCode-AI · Feb 5, 2026 · MonkeyCode-AI
diff --git a/voice/google/config.json b/voice/google/config.json
@@ -0,0 +1,3 @@
+{
+  "gcs_bucket_name": "your-google-storage-bucket-name"
+}
diff --git a/voice/google/google-credentials.json b/voice/google/google-credentials.json
@@ -0,0 +1 @@
+请用你的 Google 密钥文件替换本文件。
diff --git a/voice/google/google_voice2.py b/voice/google/google_voice2.py
@@ -0,0 +1,284 @@
+import os
+import time
+import uuid
+import json
+from google.cloud import speech
+from google.cloud import texttospeech_v1 as texttospeech
+from google.cloud import storage
+from google.api_core.exceptions import GoogleAPIError
+from pydub import AudioSegment
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from common.tmp_dir import TmpDir
+from voice.voice import Voice
+from common.utils import remove_markdown_symbol
+
+# Paths to the Google Cloud credentials file and the config file, both located next to this module.
+cred_path = os.path.join(os.path.dirname(__file__), "google-credentials.json")
+config_path = os.path.join(os.path.dirname(__file__), "config.json")
+# NOTE(review): this assignment is unconditional, so it replaces any value of
+# GOOGLE_APPLICATION_CREDENTIALS that is already present in the environment.
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
+
-# 设置 Google Cloud 凭据和配置文件路径
-cred_path = os.path.join(os.path.dirname(__file__), "google-credentials.json")
-config_path = os.path.join(os.path.dirname(__file__), "config.json")
-os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
-
+import os
+import time
+import uuid
+import json
+from google.cloud import speech
+from google.cloud import texttospeech_v1 as texttospeech
+from google.cloud import storage
+from google.api_core.exceptions import GoogleAPIError
+from pydub import AudioSegment
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from common.tmp_dir import TmpDir
+from voice.voice import Voice
+from common.utils import remove_markdown_symbol
+
+# Paths to the Google Cloud credentials file and the config file, both located next to this module.
+cred_path = os.path.join(os.path.dirname(__file__), "google-credentials.json")
+config_path = os.path.join(os.path.dirname(__file__), "config.json")
+
+# Do not override credentials configured externally; fall back to the local
+# file only when the variable is unset and the file exists.
+if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ and os.path.exists(cred_path):
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
-# 设置 Google Cloud 凭据和配置文件路径
-cred_path = os.path.join(os.path.dirname(__file__), "google-credentials.json")
-config_path = os.path.join(os.path.dirname(__file__), "config.json")
-os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
-
+import os
+import time
+import uuid
+import json
+from google.cloud import speech
+from google.cloud import texttospeech_v1 as texttospeech
+from google.cloud import storage
+from google.api_core.exceptions import GoogleAPIError
+from pydub import AudioSegment
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from common.tmp_dir import TmpDir
+from voice.voice import Voice
+from common.utils import remove_markdown_symbol
+
+# Paths to the Google Cloud credentials file and the config file, both located next to this module.
+cred_path = os.path.join(os.path.dirname(__file__), "google-credentials.json")
+config_path = os.path.join(os.path.dirname(__file__), "config.json")
+
+# Do not override credentials configured externally; fall back to the local
+# file only when the variable is unset and the file exists.
+if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ and os.path.exists(cred_path):
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
+class GoogleVoice(Voice):
+    def __init__(self):
+        super().__init__()
+        self.speech_client = speech.SpeechClient()
+        self.tts_client = texttospeech.TextToSpeechClient()
+        self.tts_long_client = texttospeech.TextToSpeechLongAudioSynthesizeClient()
+        self.storage_client = storage.Client()
+        # 从 google-credentials.json 获取 project_id
+        try:
+            with open(cred_path, 'r') as f:
+                credentials = json.load(f)
+                self.project_id = credentials.get('project_id')
+                if not self.project_id:
+                    raise ValueError("project_id 未在 google-credentials.json 中找到")
+                logger.debug(f"从 JSON 获取 project_id: {self.project_id}")
+        except Exception as e:
+            logger.error(f"无法读取 project_id: {e}")
+            raise
+        # 从 config.json 获取 bucket_name
+        try:
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+                self.bucket_name = config.get('gcs_bucket_name')
+                if not self.bucket_name:
+                    raise ValueError("gcs_bucket_name 未在 config.json 中找到")
+                logger.debug(f"从 config.json 获取 bucket_name: {self.bucket_name}")
+        except Exception as e:
+            logger.error(f"无法读取 config.json: {e}")
+            raise
+
+    def convert_audio_to_wav(self, input_file_path, output_file_path="temp_audio.wav"):
+        """
+        将 AMR 或 MP3 文件转换为 WAV 格式
+        参数:
+            input_file_path: 输入音频文件路径（AMR 或 MP3）
+            output_file_path: 输出 WAV 文件路径
+        返回:
+            转换后的 WAV 文件路径及其采样率
+        """
+        try:
+            audio = AudioSegment.from_file(input_file_path)
+            sample_rate = audio.frame_rate
+            duration_ms = len(audio)
+            logger.debug(f"输入音频: {input_file_path}, 采样率: {sample_rate}Hz, 时长: {duration_ms/1000}s")
+            if duration_ms < 100:
+                logger.error("音频文件过短，无法处理")
+                return None, None
+            audio = audio.set_channels(1).set_sample_width(2)
+            audio.export(output_file_path, format="wav", codec="pcm_s16le")
+            return output_file_path, sample_rate
+        except Exception as e:
+            logger.error(f"音频转换失败: {e}")
+            return None, None
+
+    def voiceToText(self, voice_file):
+        """
+        将中文音频文件（AMR 或 MP3）转换为文本
+        参数:
+            voice_file: 输入音频文件路径
+        返回:
+            Reply 对象，包含转录文本或错误信息
+        """
+        try:
+            if not os.path.exists(voice_file) or os.path.getsize(voice_file) == 0:
+                logger.error(f"音频文件无效或为空: {voice_file}")
+                return Reply(ReplyType.ERROR, "音频文件无效或为空")
+
+            file_ext = os.path.splitext(voice_file)[1].lower()
+            if file_ext in [".amr", ".mp3"]:
+                temp_wav_file = f"temp_audio_{uuid.uuid4().hex}.wav"
+                voice_file, sample_rate = self.convert_audio_to_wav(voice_file, temp_wav_file)
+                if not voice_file:
+                    logger.error("音频转换失败")
+                    return Reply(ReplyType.ERROR, "音频转换失败")
+            elif file_ext == ".wav":
+                audio = AudioSegment.from_wav(voice_file)
+                sample_rate = audio.frame_rate
+                duration_ms = len(audio)
+                logger.debug(f"WAV 音频: {voice_file}, 采样率: {sample_rate}Hz, 时长: {duration_ms/1000}s")
+                if duration_ms < 100:
+                    logger.error("音频文件过短，无法处理")
+                    return Reply(ReplyType.ERROR, "音频文件过短，无法处理")
+            else:
+                logger.error("不支持的音频格式，仅支持 AMR、MP3 和 WAV")
+                return Reply(ReplyType.ERROR, "不支持的音频格式，仅支持 AMR、MP3 和 WAV")
+
+            with open(voice_file, "rb") as audio_file:
+                audio_content = audio_file.read()
+
+            audio = speech.RecognitionAudio(content=audio_content)
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=sample_rate,
+                language_code="cmn-CN",
+            )
+
+            response = self.speech_client.recognize(config=config, audio=audio)
+
+            transcript = ""
+            for result in response.results:
+                transcript += result.alternatives[0].transcript + " "
+
+            transcript = transcript.strip()
+            if not transcript:
+                logger.error("语音识别失败：无法理解音频内容")
+                return Reply(ReplyType.ERROR, "抱歉，我听不懂")
+
+            logger.info(f"[Google] voiceToText text={transcript} voice file name={voice_file}")
+            reply = Reply(ReplyType.TEXT, transcript)
+
+            if file_ext in [".amr", ".mp3"] and os.path.exists(voice_file):
+                os.remove(voice_file)
+
+            return reply
+
+        except GoogleAPIError as e:
+            logger.error(f"语音识别失败：无法连接到 Google 语音识别服务；{e}")
+            return Reply(ReplyType.ERROR, f"抱歉，无法连接到 Google 语音识别服务；{e}")
+        except Exception as e:
+            logger.error(f"发生错误: {e}")
+            return Reply(ReplyType.ERROR, f"抱歉，我听不懂或发生错误：{e}")
+
+    def textToVoice(self, text):
+        """
+        将中文文本转换为语音并保存为音频文件
+        参数:
+            text: 要转换的中文文本（可能包含 Markdown 标记）
+        返回:
+            Reply 对象，包含音频文件路径或错误信息
+        """
+        try:
+            # 清理 Markdown 标记
+            cleaned_text = remove_markdown_symbol(text)
+            if not cleaned_text:
+                logger.error("清理后的文本为空")
+                return Reply(ReplyType.ERROR, "文本内容为空，无法转换")
+
+            # 检查文本字节长度
+            text_bytes = cleaned_text.encode('utf-8')
+            byte_length = len(text_bytes)
+            logger.debug(f"文本字节长度: {byte_length} 字节")
+
+            # 生成唯一的输出文件名
+            unique_id = uuid.uuid4().hex
+            mp3_file = f"{TmpDir().path()}reply-{int(time.time())}-{unique_id}.mp3"
+            gcs_output_path = f"output-{unique_id}.wav"  # Long Audio 使用 WAV
+
+            # 配置语音参数（中文普通话）
+            voice = texttospeech.VoiceSelectionParams(
+                language_code="cmn-CN",
+                name="cmn-CN-Wavenet-A",
+            )
+
+            if byte_length <= 5000:
+                # 使用标准 Text-to-Speech API（短文本，输出 MP3）
+                audio_config = texttospeech.AudioConfig(
+                    audio_encoding=texttospeech.AudioEncoding.MP3
+                )
+                synthesis_input = texttospeech.SynthesisInput(text=cleaned_text)
+                response = self.tts_client.synthesize_speech(
+                    input=synthesis_input, voice=voice, audio_config=audio_config
+                )
+                with open(mp3_file, "wb") as out:
+                    out.write(response.audio_content)
+                    logger.info(f"[Google] textToVoice (standard) text={cleaned_text[:50]}... voice file name={mp3_file}")
+                return Reply(ReplyType.VOICE, mp3_file)
+            else:
+                # 使用 Long Audio API（长文本，输出 LINEAR16/WAV）
+                audio_config = texttospeech.AudioConfig(
+                    audio_encoding=texttospeech.AudioEncoding.LINEAR16
+                )
+                parent = f"projects/{self.project_id}/locations/global"
+                synthesis_input = texttospeech.SynthesisInput(text=cleaned_text)
+                output_gcs_uri = f"gs://{self.bucket_name}/{gcs_output_path}"
+                request = texttospeech.SynthesizeLongAudioRequest(
+                    parent=parent,
+                    input=synthesis_input,
+                    audio_config=audio_config,
+                    voice=voice,
+                    output_gcs_uri=output_gcs_uri,
+                )
+                operation = self.tts_long_client.synthesize_long_audio(request=request)
+                result = operation.result(timeout=600)  # 等待长音频合成完成（最大 10 分钟）
+
+                # 从 GCS 下载 WAV 文件
+                temp_wav_file = f"{TmpDir().path()}temp_wav_{unique_id}.wav"
+                bucket = self.storage_client.bucket(self.bucket_name)
+                blob = bucket.blob(gcs_output_path)
+                blob.download_to_filename(temp_wav_file)
+                logger.debug(f"从 GCS 下载 WAV 文件: {temp_wav_file}")
+
+                # 转换为 MP3
+                audio = AudioSegment.from_wav(temp_wav_file)
+                audio.export(mp3_file, format="mp3")
+                logger.info(f"[Google] textToVoice (long audio) text={cleaned_text[:50]}... voice file name={mp3_file}")
+
+                # 清理临时文件
+                os.remove(temp_wav_file)
+                blob.delete()
+
+                return Reply(ReplyType.VOICE, mp3_file)
+
+        except GoogleAPIError as e:
+            logger.error(f"文字转语音失败: {e}")
+            return Reply(ReplyType.ERROR, f"抱歉，无法连接到 Google 文字转语音服务；{e}")
+        except Exception as e:
+            logger.error(f"发生错误: {e}")
+            return Reply(ReplyType.ERROR, f"发生错误：{e}")
+
+"""
+语言代码: yue-HK
+  名称: yue-HK-Standard-A, 性别: FEMALE, 采样率: 24000Hz
+  名称: yue-HK-Standard-B, 性别: MALE, 采样率: 24000Hz
+  名称: yue-HK-Standard-C, 性别: FEMALE, 采样率: 24000Hz
+  名称: yue-HK-Standard-D, 性别: MALE, 采样率: 24000Hz
+
+语言代码: cmn-CN
+  名称: cmn-CN-Chirp3-HD-Achernar, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Achird, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Algenib, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Algieba, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Alnilam, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Aoede, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Autonoe, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Callirrhoe, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Charon, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Despina, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Enceladus, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Erinome, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Fenrir, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Gacrux, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Iapetus, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Kore, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Laomedeia, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Leda, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Orus, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Puck, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Pulcherrima, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Rasalgethi, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Sadachbia, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Sadaltager, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Schedar, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Sulafat, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Umbriel, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Vindemiatrix, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Zephyr, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Chirp3-HD-Zubenelgenubi, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Standard-A, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Standard-B, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Standard-C, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Standard-D, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Wavenet-A, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-CN-Wavenet-B, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Wavenet-C, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-CN-Wavenet-D, 性别: FEMALE, 采样率: 24000Hz
+
+语言代码: cmn-TW
+  名称: cmn-TW-Standard-A, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-TW-Standard-B, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-TW-Standard-C, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-TW-Wavenet-A, 性别: FEMALE, 采样率: 24000Hz
+  名称: cmn-TW-Wavenet-B, 性别: MALE, 采样率: 24000Hz
+  名称: cmn-TW-Wavenet-C, 性别: MALE, 采样率: 24000Hz
+"""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		请用你的 Google 密钥文件替换本文件。
Copy link Copy Markdown MonkeyCode-AI Feb 5, 2026 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Caution 🚨 将凭据文件纳入仓库的风险（即便是占位说明也容易导致误提交真实密钥） PR 新增了 google-credentials.json 文件（当前内容为提示语），会强烈暗示用户在该路径放置真实服务账号密钥文件，后续极易被误提交到仓库；同时代码在运行时强绑定该相对路径，进一步提高把真实密钥放进项目目录的概率，属于严重安全/合规风险。建议: 删除仓库中的 credentials 占位文件；改为读取环境变量（GOOGLE_APPLICATION_CREDENTIALS）或由部署环境通过挂载/secret 注入；并在 .gitignore 中忽略 voice/google/google-credentials.json。