From 93f525e3fea6e4b0cafee85fe76332cd6cfb2025 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Sat, 29 Mar 2025 17:48:22 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=88=20perf:=20edge=20tts=20=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=90=86=EF=BC=9B=E7=A7=BB?= =?UTF-8?q?=E9=99=A4=E4=BA=86=E4=B8=80=E4=BA=9B=E4=B8=8D=E9=9C=80=E8=A6=81?= =?UTF-8?q?=E7=9A=84=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/provider/sources/edge_tts_source.py | 70 +++++++++++-------- .../sources/sensevoice_selfhosted_source.py | 8 --- .../provider/sources/whisper_api_source.py | 8 --- .../sources/whisper_selfhosted_source.py | 8 --- 4 files changed, 42 insertions(+), 52 deletions(-) diff --git a/astrbot/core/provider/sources/edge_tts_source.py b/astrbot/core/provider/sources/edge_tts_source.py index c7887d3ea..b6b758e29 100644 --- a/astrbot/core/provider/sources/edge_tts_source.py +++ b/astrbot/core/provider/sources/edge_tts_source.py @@ -35,6 +35,8 @@ class ProviderEdgeTTS(TTSProvider): self.pitch = provider_config.get("pitch", None) self.timeout = provider_config.get("timeout", 30) + self.proxy = os.getenv("https_proxy", None) + self.set_model("edge_tts") async def get_audio(self, text: str) -> str: @@ -42,7 +44,7 @@ class ProviderEdgeTTS(TTSProvider): mp3_path = f"data/temp/edge_tts_temp_{uuid.uuid4()}.mp3" wav_path = f"data/temp/edge_tts_{uuid.uuid4()}.wav" - # 构建Edge TTS参数 + # 构建 Edge TTS 参数 kwargs = {"text": text, "voice": self.voice} if self.rate: kwargs["rate"] = self.rate @@ -52,35 +54,47 @@ class ProviderEdgeTTS(TTSProvider): kwargs["pitch"] = self.pitch try: - communicate = edge_tts.Communicate(**kwargs) + communicate = edge_tts.Communicate(proxy=self.proxy, **kwargs) await communicate.save(mp3_path) - # 使用ffmpeg将MP3转换为标准WAV格式 - _ = await asyncio.create_subprocess_exec( - "ffmpeg", - "-y", # 覆盖输出文件 - "-i", - mp3_path, # 输入文件 - "-acodec", - "pcm_s16le", # 16位PCM编码 - "-ar", - "24000", # 采样率24kHz (适合微信语音) - "-ac", - "1", # 单声道 - "-af", - "apad=pad_dur=2", # 确保输出时长准确 - "-fflags", - "+genpts", # 强制生成时间戳 - "-hide_banner", # 隐藏版本信息 - wav_path, # 输出文件 - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - # 等待进程完成并获取输出 - stdout, stderr = await _.communicate() - logger.info(f"[EdgeTTS] FFmpeg 标准输出: {stdout.decode().strip()}") - logger.debug(f"FFmpeg错误输出: {stderr.decode().strip()}") - logger.info(f"[EdgeTTS] 返回值(0代表成功): {_.returncode}") + try: + from pyffmpeg import FFmpeg + + ff = FFmpeg() + ff.convert(input=mp3_path, output=wav_path) + except Exception as e: + logger.debug( + f"pyffmpeg 转换失败: {e}, 尝试使用 ffmpeg 命令行进行转换" + ) + # use ffmpeg command line + + # 使用ffmpeg将MP3转换为标准WAV格式 + p = await asyncio.create_subprocess_exec( + "ffmpeg", + "-y", # 覆盖输出文件 + "-i", + mp3_path, # 输入文件 + "-acodec", + "pcm_s16le", # 16位PCM编码 + "-ar", + "24000", # 采样率24kHz (适合微信语音) + "-ac", + "1", # 单声道 + "-af", + "apad=pad_dur=2", # 确保输出时长准确 + "-fflags", + "+genpts", # 强制生成时间戳 + "-hide_banner", # 隐藏版本信息 + wav_path, # 输出文件 + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + # 等待进程完成并获取输出 + stdout, stderr = await p.communicate() + logger.info(f"[EdgeTTS] FFmpeg 标准输出: {stdout.decode().strip()}") + logger.debug(f"FFmpeg错误输出: {stderr.decode().strip()}") + logger.info(f"[EdgeTTS] 返回值(0代表成功): {p.returncode}") + os.remove(mp3_path) if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0: return wav_path diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py index 4842b0e04..84087ecf6 100644 --- a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py +++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py @@ -48,14 +48,6 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") return os.path.join("data", "temp", f"{timestamp}") - async def _convert_audio(self, path: str) -> str: - from pyffmpeg import FFmpeg - - filename = await self.get_timestamped_path() + ".mp3" - ff = FFmpeg() - output_path = ff.convert(path, os.path.join('data","temp', filename)) - return output_path - async def _is_silk_file(self, file_path): silk_header = b"SILK" with open(file_path, "rb") as f: diff --git a/astrbot/core/provider/sources/whisper_api_source.py b/astrbot/core/provider/sources/whisper_api_source.py index ce474f4ef..e38a81de9 100644 --- a/astrbot/core/provider/sources/whisper_api_source.py +++ b/astrbot/core/provider/sources/whisper_api_source.py @@ -31,14 +31,6 @@ class ProviderOpenAIWhisperAPI(STTProvider): self.set_model(provider_config.get("model", None)) - async def _convert_audio(self, path: str) -> str: - from pyffmpeg import FFmpeg - - filename = str(uuid.uuid4()) + ".mp3" - ff = FFmpeg() - output_path = ff.convert(path, os.path.join("data/temp", filename)) - return output_path - async def _is_silk_file(self, file_path): silk_header = b"SILK" with open(file_path, "rb") as f: diff --git a/astrbot/core/provider/sources/whisper_selfhosted_source.py b/astrbot/core/provider/sources/whisper_selfhosted_source.py index 1bbc2a1dc..cfd1267d0 100644 --- a/astrbot/core/provider/sources/whisper_selfhosted_source.py +++ b/astrbot/core/provider/sources/whisper_selfhosted_source.py @@ -33,14 +33,6 @@ class ProviderOpenAIWhisperSelfHost(STTProvider): ) logger.info("Whisper 模型加载完成。") - async def _convert_audio(self, path: str) -> str: - from pyffmpeg import FFmpeg - - filename = str(uuid.uuid4()) + ".mp3" - ff = FFmpeg() - output_path = ff.convert(path, os.path.join("data/temp", filename)) - return output_path - async def _is_silk_file(self, file_path): silk_header = b"SILK" with open(file_path, "rb") as f: