feat: wechatpadpro 发送tts时添加对mp3格式音频支持

2025-06-16 10:05:21 +08:00
parent bbc039366e
commit ccb95f803c
2 changed files with 73 additions and 13 deletions
@@ -17,7 +17,7 @@ from astrbot.core.message.message_event_result import MessageChain
 from astrbot.core.platform.astr_message_event import AstrMessageEvent
 from astrbot.core.platform.astrbot_message import AstrBotMessage, MessageType
 from astrbot.core.platform.platform_metadata import PlatformMetadata
-from astrbot.core.utils.tencent_record_helper import wav_to_tencent_silk_base64
+from astrbot.core.utils.tencent_record_helper import audio_to_tencent_silk_base64

 if TYPE_CHECKING:
    from .wechatpadpro_adapter import WeChatPadProAdapter
@@ -109,7 +109,7 @@ class WeChatPadProMessageEvent(AstrMessageEvent):
    async def _send_voice(self, session: aiohttp.ClientSession, comp: Record):
        record_path = await comp.convert_to_file_path()
        # 默认已经存在 data/temp 中
-        b64, duration = await wav_to_tencent_silk_base64(record_path)
+        b64, duration = await audio_to_tencent_silk_base64(record_path)
        payload = {
            "ToUserName": self.session_id,
            "VoiceData": b64,
@@ -1,9 +1,11 @@
 import base64
 import wave
 import os
+import subprocess
 from io import BytesIO
 import asyncio
 import tempfile
+from astrbot.core import logger
 from astrbot.core.utils.astrbot_path import get_astrbot_data_path


@@ -57,33 +59,89 @@ async def wav_to_tencent_silk(wav_path: str, output_path: str) -> int:
        return duration


-async def wav_to_tencent_silk_base64(wav_path: str) -> str:
+async def convert_to_pcm_wav(input_path: str, output_path: str) -> str:
    """
-    将 WAV 文件转为 Silk，并返回 Base64 字符串。
-    默认采样率为 24000，输出临时文件为 temp/output.silk。
+    将 MP3 或其他音频格式转换为 PCM 16bit WAV，采样率24000Hz，单声道。
+    若转换失败则抛出异常。
+    """
+    try:
+        from pyffmpeg import FFmpeg
+
+        ff = FFmpeg()
+        ff.convert(input=input_path, output=output_path)
+    except Exception as e:
+        logger.debug(f"pyffmpeg 转换失败: {e}, 尝试使用 ffmpeg 命令行进行转换")
+
+        p = await asyncio.create_subprocess_exec(
+            "ffmpeg",
+            "-y",
+            "-i",
+            input_path,
+            "-acodec",
+            "pcm_s16le",
+            "-ar",
+            "24000",
+            "-ac",
+            "1",
+            "-af",
+            "apad=pad_dur=2",
+            "-fflags",
+            "+genpts",
+            "-hide_banner",
+            output_path,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = await p.communicate()
+        logger.info(f"[FFmpeg] stdout: {stdout.decode().strip()}")
+        logger.debug(f"[FFmpeg] stderr: {stderr.decode().strip()}")
+        logger.info(f"[FFmpeg] return code: {p.returncode}")
+
+    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+        return output_path
+    else:
+        raise RuntimeError("生成的WAV文件不存在或为空")
+
+
+async def audio_to_tencent_silk_base64(audio_path: str) -> tuple[str, float]:
+    """
+    将 MP3/WAV 文件转为 Tencent Silk 并返回 base64 编码与时长（秒）。

    参数:
-    - wav_path: 输入 .wav 文件路径（需为 PCM 16bit）
+    - audio_path: 输入音频文件路径（.mp3 或 .wav）

    返回:
-    - Base64 编码的 Silk 字符串
+    - silk_b64: Base64 编码的 Silk 字符串
    - duration: 音频时长（秒）
    """
    try:
        import pilk
    except ImportError as e:
-        raise Exception("pysilk 模块未安装，请安装 pysilk") from e
+        raise Exception("未安装 pysilk，请执行: pip install pysilk") from e

    temp_dir = os.path.join(get_astrbot_data_path(), "temp")
    os.makedirs(temp_dir, exist_ok=True)

-    with wave.open(wav_path, "rb") as wav:
-        rate = wav.getframerate()
+    # 是否需要转换为 WAV
+    ext = os.path.splitext(audio_path)[1].lower()
+    temp_wav = tempfile.NamedTemporaryFile(
+        suffix=".wav", delete=False, dir=temp_dir
+    ).name

-    with tempfile.NamedTemporaryFile(
+    if ext != ".wav":
+        await convert_to_pcm_wav(audio_path, temp_wav)
+        # 删除原文件
+        os.remove(audio_path)
+        wav_path = temp_wav
+    else:
+        wav_path = audio_path
+
+    with wave.open(wav_path, "rb") as wav_file:
+        rate = wav_file.getframerate()
+
+    silk_path = tempfile.NamedTemporaryFile(
        suffix=".silk", delete=False, dir=temp_dir
-    ) as tmp_file:
-        silk_path = tmp_file.name
+    ).name

    try:
        duration = await asyncio.to_thread(
@@ -96,5 +154,7 @@ async def wav_to_tencent_silk_base64(wav_path: str) -> str:

        return silk_b64, duration  # 已是秒
    finally:
+        if os.path.exists(wav_path) and wav_path != audio_path:
+            os.remove(wav_path)
        if os.path.exists(silk_path):
            os.remove(silk_path)