From 93f525e3fea6e4b0cafee85fe76332cd6cfb2025 Mon Sep 17 00:00:00 2001
From: Soulter <905617992@qq.com>
Date: Sat, 29 Mar 2025 17:48:22 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=88=20perf:=20edge=20tts=20=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=90=86=EF=BC=9B=E7=A7=BB?=
 =?UTF-8?q?=E9=99=A4=E4=BA=86=E4=B8=80=E4=BA=9B=E4=B8=8D=E9=9C=80=E8=A6=81?=
 =?UTF-8?q?=E7=9A=84=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../core/provider/sources/edge_tts_source.py  | 70 +++++++++++--------
 .../sources/sensevoice_selfhosted_source.py   |  8 ---
 .../provider/sources/whisper_api_source.py    |  8 ---
 .../sources/whisper_selfhosted_source.py      |  8 ---
 4 files changed, 42 insertions(+), 52 deletions(-)

diff --git a/astrbot/core/provider/sources/edge_tts_source.py b/astrbot/core/provider/sources/edge_tts_source.py
index c7887d3ea..b6b758e29 100644
--- a/astrbot/core/provider/sources/edge_tts_source.py
+++ b/astrbot/core/provider/sources/edge_tts_source.py
@@ -35,6 +35,8 @@ class ProviderEdgeTTS(TTSProvider):
         self.pitch = provider_config.get("pitch", None)
         self.timeout = provider_config.get("timeout", 30)
 
+        self.proxy = os.getenv("https_proxy", None)
+
         self.set_model("edge_tts")
 
     async def get_audio(self, text: str) -> str:
@@ -42,7 +44,7 @@ class ProviderEdgeTTS(TTSProvider):
         mp3_path = f"data/temp/edge_tts_temp_{uuid.uuid4()}.mp3"
         wav_path = f"data/temp/edge_tts_{uuid.uuid4()}.wav"
 
-        # 构建Edge TTS参数
+        # 构建 Edge TTS 参数
         kwargs = {"text": text, "voice": self.voice}
         if self.rate:
             kwargs["rate"] = self.rate
@@ -52,35 +54,47 @@ class ProviderEdgeTTS(TTSProvider):
             kwargs["pitch"] = self.pitch
 
         try:
-            communicate = edge_tts.Communicate(**kwargs)
+            communicate = edge_tts.Communicate(proxy=self.proxy, **kwargs)
             await communicate.save(mp3_path)
 
-            # 使用ffmpeg将MP3转换为标准WAV格式
-            _ = await asyncio.create_subprocess_exec(
-                "ffmpeg",
-                "-y",  # 覆盖输出文件
-                "-i",
-                mp3_path,  # 输入文件
-                "-acodec",
-                "pcm_s16le",  # 16位PCM编码
-                "-ar",
-                "24000",  # 采样率24kHz (适合微信语音)
-                "-ac",
-                "1",  # 单声道
-                "-af",
-                "apad=pad_dur=2",  # 确保输出时长准确
-                "-fflags",
-                "+genpts",  # 强制生成时间戳
-                "-hide_banner",  # 隐藏版本信息
-                wav_path,  # 输出文件
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-            )
-            # 等待进程完成并获取输出
-            stdout, stderr = await _.communicate()
-            logger.info(f"[EdgeTTS] FFmpeg 标准输出: {stdout.decode().strip()}")
-            logger.debug(f"FFmpeg错误输出: {stderr.decode().strip()}")
-            logger.info(f"[EdgeTTS] 返回值(0代表成功): {_.returncode}")
+            try:
+                from pyffmpeg import FFmpeg
+
+                ff = FFmpeg()
+                ff.convert(input=mp3_path, output=wav_path)
+            except Exception as e:
+                logger.debug(
+                    f"pyffmpeg 转换失败: {e}, 尝试使用 ffmpeg 命令行进行转换"
+                )
+                # fall back to the ffmpeg command line when pyffmpeg is unavailable or its conversion fails
+
+                # 使用ffmpeg将MP3转换为标准WAV格式
+                p = await asyncio.create_subprocess_exec(
+                    "ffmpeg",
+                    "-y",  # 覆盖输出文件
+                    "-i",
+                    mp3_path,  # 输入文件
+                    "-acodec",
+                    "pcm_s16le",  # 16位PCM编码
+                    "-ar",
+                    "24000",  # 采样率24kHz (适合微信语音)
+                    "-ac",
+                    "1",  # 单声道
+                    "-af",
+                    "apad=pad_dur=2",  # 确保输出时长准确
+                    "-fflags",
+                    "+genpts",  # 强制生成时间戳
+                    "-hide_banner",  # 隐藏版本信息
+                    wav_path,  # 输出文件
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                # 等待进程完成并获取输出
+                stdout, stderr = await p.communicate()
+                logger.info(f"[EdgeTTS] FFmpeg 标准输出: {stdout.decode().strip()}")
+                logger.debug(f"FFmpeg错误输出: {stderr.decode().strip()}")
+                logger.info(f"[EdgeTTS] 返回值(0代表成功): {p.returncode}")
+
             os.remove(mp3_path)
             if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
                 return wav_path
diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
index 4842b0e04..84087ecf6 100644
--- a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
+++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
@@ -48,14 +48,6 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider):
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         return os.path.join("data", "temp", f"{timestamp}")
 
-    async def _convert_audio(self, path: str) -> str:
-        from pyffmpeg import FFmpeg
-
-        filename = await self.get_timestamped_path() + ".mp3"
-        ff = FFmpeg()
-        output_path = ff.convert(path, os.path.join('data","temp', filename))
-        return output_path
-
     async def _is_silk_file(self, file_path):
         silk_header = b"SILK"
         with open(file_path, "rb") as f:
diff --git a/astrbot/core/provider/sources/whisper_api_source.py b/astrbot/core/provider/sources/whisper_api_source.py
index ce474f4ef..e38a81de9 100644
--- a/astrbot/core/provider/sources/whisper_api_source.py
+++ b/astrbot/core/provider/sources/whisper_api_source.py
@@ -31,14 +31,6 @@ class ProviderOpenAIWhisperAPI(STTProvider):
 
         self.set_model(provider_config.get("model", None))
 
-    async def _convert_audio(self, path: str) -> str:
-        from pyffmpeg import FFmpeg
-
-        filename = str(uuid.uuid4()) + ".mp3"
-        ff = FFmpeg()
-        output_path = ff.convert(path, os.path.join("data/temp", filename))
-        return output_path
-
     async def _is_silk_file(self, file_path):
         silk_header = b"SILK"
         with open(file_path, "rb") as f:
diff --git a/astrbot/core/provider/sources/whisper_selfhosted_source.py b/astrbot/core/provider/sources/whisper_selfhosted_source.py
index 1bbc2a1dc..cfd1267d0 100644
--- a/astrbot/core/provider/sources/whisper_selfhosted_source.py
+++ b/astrbot/core/provider/sources/whisper_selfhosted_source.py
@@ -33,14 +33,6 @@ class ProviderOpenAIWhisperSelfHost(STTProvider):
         )
         logger.info("Whisper 模型加载完成。")
 
-    async def _convert_audio(self, path: str) -> str:
-        from pyffmpeg import FFmpeg
-
-        filename = str(uuid.uuid4()) + ".mp3"
-        ff = FFmpeg()
-        output_path = ff.convert(path, os.path.join("data/temp", filename))
-        return output_path
-
     async def _is_silk_file(self, file_path):
         silk_header = b"SILK"
         with open(file_path, "rb") as f: