Add Support for Azure TTS

2025-05-11 01:20:17 +08:00
parent c97cda6b84
commit da4cd7fb65
3 changed files with 245 additions and 0 deletions
@@ -751,8 +751,57 @@ CONFIG_METADATA_2 = {
                        "dashscope_tts_voice": "loongstella",
                        "timeout": "20",
                    },
+                    "Azure_TTS": {
+                        "id": "azure_tts",
+                        "type": "azure_tts",
+                        "enable": True,
+                        "azure_tts_voice": "zh-CN-YunxiaNeural",
+                        "azure_tts_style": "cheerful",
+                        "azure_tts_role": "Boy",
+                        "azure_tts_rate": "1",
+                        "azure_tts_volume": "100",
+                        "azure_tts_subscription_key": "",
+                        "azure_tts_region": "eastus"
+                    },
                },
                "items": {
+                    "azure_tts_voice": {
+                        "type": "string",
+                        "description": "音色设置",
+                        "hint": "API 音色"
+                    },
+                    "azure_tts_style": {
+                        "type": "string",
+                        "description": "风格设置",
+                        "hint": "声音特定的讲话风格。 可以表达快乐、同情和平静等情绪。"
+                    },
+                    "azure_tts_role": {
+                        "type": "string",
+                        "description": "模仿设置（可选）",
+                        "hint": "讲话角色扮演。 声音可以模仿不同的年龄和性别，但声音名称不会更改。 例如，男性语音可以提高音调和改变语调来模拟女性语音，但语音名称不会更改。 如果角色缺失或不受声音的支持，则会忽略此属性。",
+                        "options": ["Boy","Girl","YoungAdultFemale","YoungAdultMale","OlderAdultFemale","OlderAdultMale","SeniorFemale","SeniorMale","禁用"]
+                    },
+                    "azure_tts_rate": {
+                        "type": "string",
+                        "description": "语速设置",
+                        "hint": "指示文本的讲出速率。可在字词或句子层面应用语速。 速率变化应为原始音频的 0.5 到 2 倍。"
+                    },
+                    "azure_tts_volume": {
+                        "type": "string",
+                        "description": "语音音量设置",
+                        "hint": "指示语音的音量级别。 可在句子层面应用音量的变化。以从 0.0 到 100.0（从最安静到最大声，例如 75）的数字表示。 默认值为 100.0。"
+                    },
+                    "azure_tts_region": {
+                        "type": "string",
+                        "description": "API 地区",
+                        "hint": "Azure_TTS 处理数据所在区域，具体参考 https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/regions",
+                        "options": ["southafricanorth", "eastasia", "southeastasia", "australiaeast", "centralindia", "japaneast", "japanwest", "koreacentral", "canadacentral", "northeurope", "westeurope", "francecentral", "germanywestcentral", "norwayeast", "swedencentral", "switzerlandnorth", "switzerlandwest", "uksouth", "uaenorth", "brazilsouth", "qatarcentral", "centralus", "eastus", "eastus2", "northcentralus", "southcentralus", "westcentralus", "westus", "westus2", "westus3"]
+                    },
+                    "azure_tts_subscription_key": {
+                        "type": "string",
+                        "description": "服务订阅密钥",
+                        "hint": "Azure_TTS 服务的订阅密钥（注意不是令牌）"
+                    },
                    "dashscope_tts_voice": {
                        "description": "语音合成模型",
                        "type": "string",
@@ -202,6 +202,10 @@ class ProviderManager:
                    from .sources.dashscope_tts import (
                        ProviderDashscopeTTSAPI as ProviderDashscopeTTSAPI,
                    )
+                case "azure_tts":
+                    from .sources.azure_tts_source import (
+                        AzureTTSProvider as AzureTTSProvider,
+                    )
        except (ImportError, ModuleNotFoundError) as e:
            logger.critical(
                f"加载 {provider_config['type']}({provider_config['id']}) 提供商适配器失败：{e}。可能是因为有未安装的依赖。"
@@ -0,0 +1,192 @@
+import uuid
+import time
+import json
+import re
+import hashlib
+import random
+import asyncio
+from pathlib import Path
+from typing import Dict
+from xml.sax.saxutils import escape
+
+from httpx import AsyncClient, Timeout
+from astrbot.core.config.default import VERSION
+
+from ..entities import ProviderType
+from ..provider import TTSProvider
+from ..register import register_provider_adapter
+
+TEMP_DIR = Path("data/temp/azure_tts")  # relative directory where the providers below write synthesized .wav files
+TEMP_DIR.mkdir(parents=True, exist_ok=True)  # runs at import time; creates missing parents, no error if it already exists
+
+class OTTSProvider:
+    def __init__(self, config: Dict):
+        self.skey = config["OTTS_SKEY"]
+        self.api_url = config["OTTS_URL"]
+        self.auth_time_url = config["OTTS_AUTH_TIME"]
+        self.time_offset = 0
+        self.last_sync_time = 0
+        self.timeout = Timeout(10.0)
+        self.retry_count = 3
+        self.client = AsyncClient(timeout=self.timeout)
+
+    async def _sync_time(self):
+        try:
+            response = await self.client.get(self.auth_time_url)
+            response.raise_for_status()
+            server_time = int(response.json()["timestamp"])
+            local_time = int(time.time())
+            self.time_offset = server_time - local_time
+            self.last_sync_time = local_time
+        except Exception as e:
+            if time.time() - self.last_sync_time > 3600:
+                raise RuntimeError("时间同步失败") from e
+
+    async def _generate_signature(self) -> str:
+        await self._sync_time()
+        timestamp = int(time.time()) + self.time_offset
+        nonce = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=10))
+        path = re.sub(r'^https?://[^/]+', '', self.api_url) or '/'
+        return f"{timestamp}-{nonce}-0-{hashlib.md5(f'{path}-{timestamp}-{nonce}-0-{self.skey}'.encode()).hexdigest()}"
+
+    async def get_audio(self, text: str, voice_params: Dict) -> str:
+        file_path = TEMP_DIR / f"otts-{uuid.uuid4()}.wav"
+        signature = await self._generate_signature()
+        for attempt in range(self.retry_count):
+            try:
+                response = await self.client.post(
+                    f"{self.api_url}?sign={signature}",
+                    data={
+                        "text": text,
+                        "voice": voice_params["voice"],
+                        "style": voice_params["style"],
+                        "role": voice_params["role"],
+                        "rate": voice_params["rate"],
+                        "volume": voice_params["volume"]
+                    },headers={
+                        "User-Agent": f"AstrBot/{VERSION}",
+                        "UAK": f"AstrBot/AzureTTS"
+                    }
+                )
+                response.raise_for_status()
+                file_path.parent.mkdir(parents=True, exist_ok=True)
+                with file_path.open("wb") as f:
+                    for chunk in response.iter_bytes(4096):
+                        f.write(chunk)
+                return str(file_path.resolve())
+            except Exception as e:
+                if attempt == self.retry_count - 1:
+                    raise RuntimeError(f"OTTS请求失败: {str(e)}") from e
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+class AzureNativeProvider(TTSProvider):
+    def __init__(self, provider_config: dict, provider_settings: dict):
+        super().__init__(provider_config, provider_settings)
+        self.subscription_key = provider_config["azure_tts_subscription_key"].strip()
+        if not re.fullmatch(r'^[a-zA-Z0-9]{32}$', self.subscription_key):
+            raise ValueError("无效的Azure订阅密钥")
+
+        self.region = provider_config.get("azure_tts_region", "eastus").strip()
+        self.endpoint = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1"
+        self.client = AsyncClient(headers={
+            "User-Agent": f"AstrBot/{VERSION}",
+            "Content-Type": "application/ssml+xml",
+            "X-Microsoft-OutputFormat": "riff-48khz-16bit-mono-pcm"
+        })
+        self.token = None
+        self.token_expire = 0
+
+        self.voice_params = {
+            "voice": provider_config.get("azure_tts_voice", "zh-CN-YunxiaNeural"),
+            "style": provider_config.get("azure_tts_style", "cheerful"),
+            "role": provider_config.get("azure_tts_role", "Boy"),
+            "rate": provider_config.get("azure_tts_rate", "1"),
+            "volume": provider_config.get("azure_tts_volume", "100")
+        }
+
+    async def _refresh_token(self):
+        token_url = f"https://{self.region}.api.cognitive.microsoft.com/sts/v1.0/issuetoken"
+        response = await self.client.post(
+            token_url,
+            headers={"Ocp-Apim-Subscription-Key": self.subscription_key}
+        )
+        response.raise_for_status()
+        self.token = response.text
+        self.token_expire = time.time() + 540
+
+    async def get_audio(self, text: str) -> str:
+        if not self.token or time.time() > self.token_expire:
+            await self._refresh_token()
+        file_path = TEMP_DIR / f"azure-{uuid.uuid4()}.wav"
+        ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis'
+            xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='zh-CN'>
+            <voice name='{escape(self.voice_params["voice"])}'>
+                <mstts:express-as style='{escape(self.voice_params["style"])}'
+                    role='{escape(self.voice_params["role"])}'>
+                    <prosody rate='{escape(self.voice_params["rate"])}'
+                        volume='{escape(self.voice_params["volume"])}'>
+                        {escape(text)}
+                    </prosody>
+                </mstts:express-as>
+            </voice>
+        </speak>"""
+        response = await self.client.post(
+            self.endpoint,
+            content=ssml,
+            headers={
+                "Authorization": f"Bearer {self.token}",
+                "User-Agent": f"AstrBot/{VERSION}"
+                }
+        )
+        response.raise_for_status()
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        with file_path.open("wb") as f:
+            for chunk in response.iter_bytes(4096):
+                f.write(chunk)
+        return str(file_path.resolve())
+
+@register_provider_adapter("azure_tts", "Azure TTS", ProviderType.TEXT_TO_SPEECH)
+class AzureTTSProvider(TTSProvider):
+    def __init__(self, provider_config: dict, provider_settings: dict):
+        super().__init__(provider_config, provider_settings)
+        key_value = provider_config.get("azure_tts_subscription_key", "")
+        self.provider = self._parse_provider(key_value, provider_config)
+
+    def _parse_provider(self, key_value: str, config: dict) -> TTSProvider:
+        if key_value.lower().startswith("other["):
+            try:
+                match = re.match(r"other\[(.*)\]", key_value, re.DOTALL)
+                if not match:
+                    raise ValueError("无效的other[...]格式，应形如 other[{...}]")
+                json_str = match.group(1).strip()
+                otts_config = json.loads(json_str)
+                required = {"OTTS_SKEY", "OTTS_URL", "OTTS_AUTH_TIME"}
+                if missing := required - otts_config.keys():
+                    raise ValueError(f"缺少OTTS参数: {', '.join(missing)}")
+
+                return OTTSProvider(otts_config)
+            except json.JSONDecodeError as e:
+                error_msg = (
+                    f"JSON解析失败，请检查格式（错误位置：行 {e.lineno} 列 {e.colno}）\n"
+                    f"错误详情: {e.msg}\n"
+                    f"错误上下文: {json_str[max(0, e.pos-30):e.pos+30]}"
+                )
+                raise ValueError(error_msg) from e
+            except KeyError as e:
+                raise ValueError(f"配置错误: 缺少必要参数 {e}") from e
+        if re.fullmatch(r'^[a-zA-Z0-9]{32}$', key_value):
+            return AzureNativeProvider(config, self.provider_settings)
+        raise ValueError("订阅密钥格式无效，应为32位字母数字或other[...]格式")
+    async def get_audio(self, text: str) -> str:
+        if isinstance(self.provider, OTTSProvider):
+            return await self.provider.get_audio(
+                text,
+                {
+                    "voice": self.provider_config.get("azure_tts_voice"),
+                    "style": self.provider_config.get("azure_tts_style"),
+                    "role": self.provider_config.get("azure_tts_role"),
+                    "rate": self.provider_config.get("azure_tts_rate"),
+                    "volume": self.provider_config.get("azure_tts_volume")
+                }
+            )
+        return await self.provider.get_audio(text)