fix: 修复阿里云百炼平台 TTS 下接入 CosyVoice V2, Qwen TTS 生成报错的问题 (#2964)
* fix: 修复了CosyVoice V2,Qwen TTS生成报错的问题。Fixed compatibility problems with CosyVoice V2, Qwen TTS. * fix: 将urlopen的同步请求替换为aiohttp的异步请求以下载音频 * fix: CosyVoice 报错显示 * fix: 添加阿里云百炼 TTS API Key 获取提示信息 --------- Co-authored-by: Soulter <905617992@qq.com>
This commit is contained in:
@@ -1056,6 +1056,7 @@ CONFIG_METADATA_2 = {
|
||||
"timeout": "20",
|
||||
},
|
||||
"阿里云百炼 TTS(API)": {
|
||||
"hint": "API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。模型和音色的选择文档请参考: 阿里云百炼语音合成音色名称。具体可参考 https://help.aliyun.com/zh/model-studio/speech-synthesis-and-speech-recognition",
|
||||
"id": "dashscope_tts",
|
||||
"provider": "dashscope",
|
||||
"type": "dashscope_tts",
|
||||
@@ -1436,9 +1437,8 @@ CONFIG_METADATA_2 = {
|
||||
"hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)",
|
||||
},
|
||||
"dashscope_tts_voice": {
|
||||
"description": "语音合成模型",
|
||||
"type": "string",
|
||||
"hint": "阿里云百炼语音合成模型名称。具体可参考 https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-python-api 等内容",
|
||||
"description": "音色",
|
||||
"type": "string"
|
||||
},
|
||||
"gm_resp_image_modal": {
|
||||
"description": "启用图片模态",
|
||||
|
||||
@@ -1,10 +1,22 @@
|
||||
import os
|
||||
import dashscope
|
||||
import uuid
|
||||
import asyncio
|
||||
from dashscope.audio.tts_v2 import *
|
||||
from ..provider import TTSProvider
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from typing import Optional, Tuple
|
||||
import aiohttp
|
||||
import dashscope
|
||||
from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
|
||||
|
||||
try:
|
||||
from dashscope.aigc.multimodal_conversation import MultiModalConversation
|
||||
except (
|
||||
ImportError
|
||||
): # pragma: no cover - older dashscope versions without Qwen TTS support
|
||||
MultiModalConversation = None
|
||||
|
||||
from ..entities import ProviderType
|
||||
from ..provider import TTSProvider
|
||||
from ..register import register_provider_adapter
|
||||
from astrbot.core.utils.astrbot_path import get_astrbot_data_path
|
||||
|
||||
@@ -26,16 +38,112 @@ class ProviderDashscopeTTSAPI(TTSProvider):
|
||||
dashscope.api_key = self.chosen_api_key
|
||||
|
||||
async def get_audio(self, text: str) -> str:
    """Synthesize *text* to speech and return the path of the audio file.

    The configured model decides the backend: models recognised by
    ``_is_qwen_tts_model`` go through ``_synthesize_with_qwen_tts``, every
    other model goes through ``_synthesize_with_cosyvoice``.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the written audio file inside ``<astrbot data>/temp``.

    Raises:
        RuntimeError: If no model is configured, or if the selected backend
            returned no audio content.
    """
    model = self.get_model()
    if not model:
        raise RuntimeError("Dashscope TTS model is not configured.")

    temp_dir = os.path.join(get_astrbot_data_path(), "temp")
    os.makedirs(temp_dir, exist_ok=True)

    if self._is_qwen_tts_model(model):
        audio_bytes, ext = await self._synthesize_with_qwen_tts(model, text)
    else:
        audio_bytes, ext = await self._synthesize_with_cosyvoice(model, text)

    if not audio_bytes:
        raise RuntimeError(
            "Audio synthesis failed, returned empty content. The model may not be supported or the service is unavailable."
        )

    # The extension is supplied by the backend; the uuid keeps concurrent
    # requests from overwriting each other's files.
    path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}{ext}")
    with open(path, "wb") as f:
        f.write(audio_bytes)
    return path
|
||||
|
||||
def _call_qwen_tts(self, model: str, text: str):
    """Invoke the Qwen TTS endpoint through ``MultiModalConversation.call``.

    Args:
        model: Name of the Qwen TTS model to call.
        text: The text to synthesize.

    Returns:
        Whatever ``MultiModalConversation.call`` returns.

    Raises:
        RuntimeError: If the installed dashscope SDK does not provide
            ``MultiModalConversation``.
    """
    if MultiModalConversation is None:
        raise RuntimeError(
            "dashscope SDK missing MultiModalConversation. Please upgrade the dashscope package to use Qwen TTS models."
        )

    api_key = self.chosen_api_key
    voice = self.voice
    if not voice:
        # Fall back to the default voice instead of sending an empty one.
        voice = "Cherry"
        logging.warning(
            "No voice specified for Qwen TTS model, using default 'Cherry'."
        )

    return MultiModalConversation.call(
        model=model,
        text=text,
        api_key=api_key,
        voice=voice,
    )
|
||||
|
||||
async def _synthesize_with_qwen_tts(
|
||||
self, model: str, text: str
|
||||
) -> Tuple[Optional[bytes], str]:
|
||||
loop = asyncio.get_event_loop()
|
||||
response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
|
||||
audio_bytes = await self._extract_audio_from_response(response)
|
||||
if not audio_bytes:
|
||||
raise RuntimeError(
|
||||
f"Audio synthesis failed for model '{model}'. {response}"
|
||||
)
|
||||
ext = ".wav"
|
||||
return audio_bytes, ext
|
||||
|
||||
async def _extract_audio_from_response(self, response) -> Optional[bytes]:
|
||||
output = getattr(response, "output", None)
|
||||
audio_obj = getattr(output, "audio", None) if output is not None else None
|
||||
if not audio_obj:
|
||||
return None
|
||||
|
||||
data_b64 = getattr(audio_obj, "data", None)
|
||||
if data_b64:
|
||||
try:
|
||||
return base64.b64decode(data_b64)
|
||||
except (ValueError, TypeError):
|
||||
logging.error("Failed to decode base64 audio data.")
|
||||
return None
|
||||
|
||||
url = getattr(audio_obj, "url", None)
|
||||
if url:
|
||||
return await self._download_audio_from_url(url)
|
||||
return None
|
||||
|
||||
async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
|
||||
if not url:
|
||||
return None
|
||||
timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(
|
||||
url, timeout=aiohttp.ClientTimeout(total=timeout)
|
||||
) as response:
|
||||
return await response.read()
|
||||
except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
|
||||
logging.error(f"Failed to download audio from URL {url}: {e}")
|
||||
return None
|
||||
|
||||
async def _synthesize_with_cosyvoice(
    self, model: str, text: str
) -> Tuple[Optional[bytes], str]:
    """Synthesize *text* with a ``SpeechSynthesizer`` (CosyVoice) model.

    A fresh synthesizer is created per request, and its ``call`` is run in
    the default executor so it does not run on the event loop thread.

    Args:
        model: Name of the model passed to ``SpeechSynthesizer``.
        text: The text to synthesize.

    Returns:
        A ``(audio_bytes, ".wav")`` tuple. ``audio_bytes`` may be empty or
        ``None`` when the synthesizer produced nothing and offered no
        dict response to report.

    Raises:
        RuntimeError: If no audio was produced and the synthesizer exposes a
            non-empty dict response describing the failure.
    """
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=self.voice,
        format=AudioFormat.WAV_24000HZ_MONO_16BIT,
    )
    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine; get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    audio_bytes = await loop.run_in_executor(
        None, synthesizer.call, text, self.timeout_ms
    )
    if not audio_bytes:
        # Surface the service's own error payload when one is available.
        resp = synthesizer.get_response()
        if resp and isinstance(resp, dict):
            raise RuntimeError(
                f"Audio synthesis failed for model '{model}'. {resp}".strip()
            )
    return audio_bytes, ".wav"
|
||||
|
||||
def _is_qwen_tts_model(self, model: str) -> bool:
|
||||
model_lower = model.lower()
|
||||
return "tts" in model_lower and model_lower.startswith("qwen")
|
||||
|
||||
Reference in New Issue
Block a user