diff --git a/.gitignore b/.gitignore index 15ab085d4..a863e36ec 100644 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,4 @@ package.json venv/* packages/python_interpreter/workplace .venv/* - -.conda/ \ No newline at end of file +.conda/ diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index cf0f0f252..ca42825d3 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -583,6 +583,14 @@ CONFIG_METADATA_2 = { "type": "openai_whisper_selfhost", "model": "tiny", }, + "sensevoice(本地加载)": { + "sensevoice_hint": "(不用修改我)", + "enable": False, + "id": "sensevoice", + "type": "sensevoice_stt_selfhost", + "stt_model": "icc/SenseVoiceSmall", + "is_emotion": False, + }, "OpenAI_TTS(API)": { "id": "openai_tts", "type": "openai_tts_api", @@ -604,6 +612,22 @@ CONFIG_METADATA_2 = { }, }, "items": { + "sensevoice_hint": { + "description": "部署SenseVoice", + "type": "string", + "hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", + "obvious_hint": True, + }, + "is_emotion": { + "description": "情绪识别", + "type": "bool", + "hint": "是否开启情绪识别。happy|sad|angry|neutral|fearful|disgusted|surprised|unknown", + }, + "stt_model": { + "description": "模型名称", + "type": "string", + "hint": "modelscope 上的模型名称。默认:iic/SenseVoiceSmall。", + }, # "variables": { # "description": "工作流固定输入变量", # "type": "object", diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index 871fd79a0..3d97cdd13 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -104,7 +104,8 @@ class ProviderManager(): kdb_cfg = config.get("knowledge_db", {}) if kdb_cfg and len(kdb_cfg): self.curr_kdb_name = list(kdb_cfg.keys())[0] - + + async def initialize(self): for provider_config in self.providers_config: await self.load_provider(provider_config) @@ -143,6 +144,8 @@ class ProviderManager(): from .sources.dashscope_source import ProviderDashscope as ProviderDashscope case "googlegenai_chat_completion": from .sources.gemini_source import ProviderGoogleGenAI as ProviderGoogleGenAI + case "sensevoice_stt_selfhost": + from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost case "openai_whisper_api": from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI case "openai_whisper_selfhost": diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py new file mode 100644 index 000000000..e08c1bd0a --- /dev/null +++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py @@ -0,0 +1,105 @@ +''' +Author: diudiu62 +Date: 2025-02-24 18:04:18 +LastEditTime: 2025-02-25 14:06:30 +''' +import asyncio +from datetime import datetime +import os +import re +from funasr_onnx import SenseVoiceSmall +from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess +from ..provider import STTProvider +from ..entites import ProviderType +from astrbot.core.utils.io import download_file +from ..register import register_provider_adapter +from astrbot.core import logger +from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav + +@register_provider_adapter("sensevoice_stt_selfhost", "SenseVoice 自托管语音识别 模型部署", provider_type=ProviderType.SPEECH_TO_TEXT) +class ProviderSenseVoiceSTTSelfHost(STTProvider): + def __init__( + self, + provider_config: dict, + provider_settings: dict, + ) -> None: + super().__init__(provider_config, provider_settings) + self.set_model(provider_config.get("stt_model", None)) + self.model = None + self.is_emotion = provider_config.get("is_emotion", False) + + async def initialize(self): + logger.info("下载或者加载 SenseVoice 模型中,这可能需要一些时间 ...") + + + # 将模型加载放到线程池中执行 + self.model = await asyncio.get_event_loop().run_in_executor( + None, + lambda: SenseVoiceSmall(self.model_name, quantize=True, batch_size=16) + ) + + logger.info("SenseVoice 模型加载完成。") + + async def get_timestamped_path(self) -> str: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return os.path.join("data", "temp", f"{timestamp}") + + async def _convert_audio(self, path: str) -> str: + from pyffmpeg import FFmpeg + filename = await self.get_timestamped_path() + '.mp3' + ff = FFmpeg() + output_path = ff.convert(path, os.path.join('data","temp', filename)) + return output_path + + async def _is_silk_file(self, file_path): + silk_header = b"SILK" + with open(file_path, "rb") as f: + file_header = f.read(8) + + if silk_header in file_header: + return True + else: + return False + + async def get_text(self, audio_url: str) -> str: + try: + is_tencent = audio_url.startswith("http") and "multimedia.nt.qq.com.cn" in audio_url + + if is_tencent: + path = await self.get_timestamped_path() + await download_file(audio_url, path) + audio_url = path + + if not os.path.isfile(audio_url): + raise FileNotFoundError(f"文件不存在: {audio_url}") + + if audio_url.endswith((".amr", ".silk")) or is_tencent: + is_silk = await self._is_silk_file(audio_url) + if is_silk: + logger.info("Converting silk file to wav ...") + output_path = await self.get_timestamped_path()+'.wav' + await tencent_silk_to_wav(audio_url, output_path) + audio_url = output_path + + # 使用 run_in_executor 来调用模型进行识别 + loop = asyncio.get_event_loop() + res = await loop.run_in_executor( + None, # 使用默认的线程池 + lambda: self.model(audio_url, language="auto", use_itn=True) + ) + + # res = self.model(audio_url, language="auto", use_itn=True) + logger.debug(f"SenseVoice识别到的文案:{res}") + text = rich_transcription_postprocess(res[0]) + if self.is_emotion: + # 提取第二个匹配的值 + matches = re.findall(r'<\|([^|]+)\|>', res[0]) + if len(matches) >= 2: + emotion = matches[1] + text = f"(当前的情绪:{emotion}) {text}" + else: + logger.warning("未能提取到情绪信息") + return text + except Exception as e: + logger.error(f"处理音频文件时出错: {e}") + raise \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9384b3bc7..cd1175f65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ ormsgpack cryptography dashscope python-telegram-bot -wechatpy \ No newline at end of file +wechatpy