fix: 修复阿里云百炼平台 TTS 下接入 CosyVoice V2, Qwen TTS 生成报错的问题 (#2964)

* fix: 修复了 CosyVoice V2、Qwen TTS 生成报错的问题。Fixed compatibility problems with CosyVoice V2 and Qwen TTS. * fix: 将 urlopen 的同步请求替换为 aiohttp 的异步请求以下载音频 * fix: CosyVoice 报错显示 * fix: 添加阿里云百炼 TTS API Key 获取提示信息 --------- Co-authored-by: Soulter <905617992@qq.com>
2025-10-11 13:03:06 -04:00
parent 12fc6f9d38
commit 8c120b020e
2 changed files with 124 additions and 16 deletions
@@ -1056,6 +1056,7 @@ CONFIG_METADATA_2 = {
                        "timeout": "20",
                    },
                    "阿里云百炼 TTS(API)": {
+                        "hint": "API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。模型和音色的选择文档请参考: 阿里云百炼语音合成音色名称。具体可参考 https://help.aliyun.com/zh/model-studio/speech-synthesis-and-speech-recognition",
                        "id": "dashscope_tts",
                        "provider": "dashscope",
                        "type": "dashscope_tts",
@@ -1436,9 +1437,8 @@ CONFIG_METADATA_2 = {
                        "hint": "Azure_TTS 服务的订阅密钥（注意不是令牌）",
                    },
                    "dashscope_tts_voice": {
-                        "description": "语音合成模型",
-                        "type": "string",
-                        "hint": "阿里云百炼语音合成模型名称。具体可参考 https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-python-api 等内容",
+                        "description": "音色",
+                        "type": "string"
                    },
                    "gm_resp_image_modal": {
                        "description": "启用图片模态",
@@ -1,10 +1,22 @@
-import os
-import dashscope
-import uuid
 import asyncio
-from dashscope.audio.tts_v2 import *
-from ..provider import TTSProvider
+import base64
+import logging
+import os
+import uuid
+from typing import Optional, Tuple
+import aiohttp
+import dashscope
+from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
+
+try:
+    from dashscope.aigc.multimodal_conversation import MultiModalConversation
+except (
+    ImportError
+):  # pragma: no cover - older dashscope versions without Qwen TTS support
+    MultiModalConversation = None
+
 from ..entities import ProviderType
+from ..provider import TTSProvider
 from ..register import register_provider_adapter
 from astrbot.core.utils.astrbot_path import get_astrbot_data_path

@@ -26,16 +38,112 @@ class ProviderDashscopeTTSAPI(TTSProvider):
        dashscope.api_key = self.chosen_api_key

    async def get_audio(self, text: str) -> str:
+        model = self.get_model()
+        if not model:
+            raise RuntimeError("Dashscope TTS model is not configured.")
+
        temp_dir = os.path.join(get_astrbot_data_path(), "temp")
-        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}.wav")
-        self.synthesizer = SpeechSynthesizer(
-            model=self.get_model(),
+        os.makedirs(temp_dir, exist_ok=True)
+
+        if self._is_qwen_tts_model(model):
+            audio_bytes, ext = await self._synthesize_with_qwen_tts(model, text)
+        else:
+            audio_bytes, ext = await self._synthesize_with_cosyvoice(model, text)
+
+        if not audio_bytes:
+            raise RuntimeError(
+                "Audio synthesis failed, returned empty content. The model may not be supported or the service is unavailable."
+            )
+
+        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}{ext}")
+        with open(path, "wb") as f:
+            f.write(audio_bytes)
+        return path
+
+    def _call_qwen_tts(self, model: str, text: str):
+        if MultiModalConversation is None:
+            raise RuntimeError(
+                "dashscope SDK missing MultiModalConversation. Please upgrade the dashscope package to use Qwen TTS models."
+            )
+
+        kwargs = {
+            "model": model,
+            "text": text,
+            "api_key": self.chosen_api_key,
+            "voice": self.voice or "Cherry",
+        }
+        if not self.voice:
+            logging.warning(
+                "No voice specified for Qwen TTS model, using default 'Cherry'."
+            )
+        return MultiModalConversation.call(**kwargs)
+
+    async def _synthesize_with_qwen_tts(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
+        loop = asyncio.get_event_loop()
+        response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
+        audio_bytes = await self._extract_audio_from_response(response)
+        if not audio_bytes:
+            raise RuntimeError(
+                f"Audio synthesis failed for model '{model}'. {response}"
+            )
+        ext = ".wav"
+        return audio_bytes, ext
+
+    async def _extract_audio_from_response(self, response) -> Optional[bytes]:
+        output = getattr(response, "output", None)
+        audio_obj = getattr(output, "audio", None) if output is not None else None
+        if not audio_obj:
+            return None
+
+        data_b64 = getattr(audio_obj, "data", None)
+        if data_b64:
+            try:
+                return base64.b64decode(data_b64)
+            except (ValueError, TypeError):
+                logging.error("Failed to decode base64 audio data.")
+                return None
+
+        url = getattr(audio_obj, "url", None)
+        if url:
+            return await self._download_audio_from_url(url)
+        return None
+
+    async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
+        if not url:
+            return None
+        timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    url, timeout=aiohttp.ClientTimeout(total=timeout)
+                ) as response:
+                    return await response.read()
+        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
+            logging.error(f"Failed to download audio from URL {url}: {e}")
+            return None
+
+    async def _synthesize_with_cosyvoice(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
+        synthesizer = SpeechSynthesizer(
+            model=model,
            voice=self.voice,
            format=AudioFormat.WAV_24000HZ_MONO_16BIT,
        )
-        audio = await asyncio.get_event_loop().run_in_executor(
-            None, self.synthesizer.call, text, self.timeout_ms
+        loop = asyncio.get_event_loop()
+        audio_bytes = await loop.run_in_executor(
+            None, synthesizer.call, text, self.timeout_ms
        )
-        with open(path, "wb") as f:
-            f.write(audio)
-        return path
+        if not audio_bytes:
+            resp = synthesizer.get_response()
+            if resp and isinstance(resp, dict):
+                raise RuntimeError(
+                    f"Audio synthesis failed for model '{model}'. {resp}".strip()
+                )
+        return audio_bytes, ".wav"
+
+    def _is_qwen_tts_model(self, model: str) -> bool:
+        model_lower = model.lower()
+        return "tts" in model_lower and model_lower.startswith("qwen")