From 0ec382c86b96dfedad38827efd249e66de75e2ad Mon Sep 17 00:00:00 2001 From: diudiu62 <115522593@qq.com> Date: Tue, 25 Feb 2025 09:05:24 +0800 Subject: [PATCH 1/6] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E9=9B=86=E6=88=90sensevo?= =?UTF-8?q?ice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + astrbot/core/config/default.py | 7 ++ astrbot/core/provider/manager.py | 2 + .../sources/sensevoice_selfhosted_source.py | 83 +++++++++++++++++++ requirements.txt | 5 +- 5 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 astrbot/core/provider/sources/sensevoice_selfhosted_source.py diff --git a/.gitignore b/.gitignore index 7745d18ba..cb28c1d12 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ package.json venv/* packages/python_interpreter/workplace .venv/* + +Dockerfile_diudiu62 \ No newline at end of file diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 7a6b68b3e..f4ff839cb 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -532,6 +532,13 @@ CONFIG_METADATA_2 = { "type": "openai_whisper_selfhost", "model": "tiny", }, + "sensevoice(本地加载)": { + "whisper_hint": "(不用修改我)", + "enable": False, + "id": "sensevoice", + "type": "sensevoice_stt_selfhost", + "model": "tiny", + }, "openai_tts(API)": { "id": "openai_tts", "type": "openai_tts_api", diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index 2ba108e29..7187cc100 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -128,6 +128,8 @@ class ProviderManager(): from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI case "openai_whisper_selfhost": from .sources.whisper_selfhosted_source import ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost + case "sensevoice_stt_selfhost": + from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost case "openai_tts_api": from .sources.openai_tts_api_source import ProviderOpenAITTSAPI as ProviderOpenAITTSAPI case "fishaudio_tts_api": diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py new file mode 100644 index 000000000..0bcb5729e --- /dev/null +++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py @@ -0,0 +1,83 @@ +''' +Author: diudiu62 +Date: 2025-02-24 18:04:18 +LastEditTime: 2025-02-24 18:33:48 +''' +from datetime import datetime +import os +import asyncio +from funasr import AutoModel +from ..provider import STTProvider +from ..entites import ProviderType +from astrbot.core.utils.io import download_file +from ..register import register_provider_adapter +from astrbot.core import logger +from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav + +@register_provider_adapter("sensevoice_stt_selfhost", "SenseVoice 自托管语音识别 模型部署", provider_type=ProviderType.SPEECH_TO_TEXT) +class ProviderSenseVoiceSTTSelfHost(STTProvider): + def __init__( + self, + provider_config: dict, + provider_settings: dict, + ) -> None: + super().__init__(provider_config, provider_settings) + + async def initialize(self): + model_dir = "data/model/iic/SenseVoiceSmall" + loop = asyncio.get_event_loop() + logger.info("下载或者加载 SenseVoice 模型中,这可能需要一些时间 ...") + self.model = await loop.run_in_executor(None, AutoModel, + model=model_dir, + trust_remote_code=False, + # remote_code="./model.py", + vad_model="fsmn-vad", + vad_kwargs={"max_single_segment_time": 30000}, + ) + logger.info("SenseVoice 模型加载完成。") + + async def _convert_audio(self, path: str) -> str: + from pyffmpeg import FFmpeg + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # 获取当前时间戳 + filename = timestamp + '.mp3' + ff = FFmpeg() + output_path = ff.convert(path, os.path.join('data/temp', filename)) + return output_path + + async def _is_silk_file(self, file_path): + silk_header = b"SILK" + with open(file_path, "rb") as f: + file_header = f.read(8) + + if silk_header in file_header: + return True + else: + return False + + async def get_text(self, audio_url: str) -> str: + loop = asyncio.get_event_loop() + + is_tencent = False + + if audio_url.startswith("http"): + if "multimedia.nt.qq.com.cn" in audio_url: + is_tencent = True + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # 获取当前时间戳 + path = os.path.join("data/temp", timestamp) + await download_file(audio_url, path) + audio_url = path + + if not os.path.exists(audio_url): + raise FileNotFoundError(f"文件不存在: {audio_url}") + + if audio_url.endswith(".amr") or audio_url.endswith(".silk") or is_tencent: + is_silk = await self._is_silk_file(audio_url) + if is_silk: + logger.info("Converting silk file to wav ...") + output_path = os.path.join('data/temp', str(uuid.uuid4()) + '.wav') + await tencent_silk_to_wav(audio_url, output_path) + audio_url = output_path + + result = await loop.run_in_executor(None, self.model.transcribe, audio_url) + return result['text'] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c077496e2..aaf1562cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,7 @@ silk-python lark-oapi ormsgpack -cryptography \ No newline at end of file +cryptography + +funasr +torch~=2.6.0 \ No newline at end of file From 5aa842cf66059c28dcfa31d80dca1549a2a2913a Mon Sep 17 00:00:00 2001 From: diudiu62 <115522593@qq.com> Date: Tue, 25 Feb 2025 14:15:22 +0800 Subject: [PATCH 2/6] =?UTF-8?q?=E5=A2=9E=E5=8A=A0sensevoice=E9=85=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/core/config/default.py | 21 +++- .../sources/sensevoice_selfhosted_source.py | 104 +++++++++++------- requirements.txt | 5 +- 3 files changed, 85 insertions(+), 45 deletions(-) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index f4ff839cb..e83e381ad 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -533,11 +533,12 @@ CONFIG_METADATA_2 = { "model": "tiny", }, "sensevoice(本地加载)": { - "whisper_hint": "(不用修改我)", + "sensevoice_hint": "(不用修改我)", "enable": False, "id": "sensevoice", "type": "sensevoice_stt_selfhost", - "model": "tiny", + "stt_model": "icc/SenseVoiceSmall", + "is_emotion": False, }, "openai_tts(API)": { "id": "openai_tts", @@ -560,6 +561,22 @@ CONFIG_METADATA_2 = { }, }, "items": { + "sensevoice_hint": { + "description": "部署SenseVoice", + "type": "string", + "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", + "obvious_hint": True, + }, + "is_emotion": { + "description": "情绪识别", + "type": "bool", + "hint": "是否开启情绪识别。happy|sad|angry|neutral|fearful|disgusted|surprised|unknown", + }, + "stt_model": { + "description": "模型名称", + "type": "string", + "hint": "modelscope 上的模型名称。默认:iic/SenseVoiceSmall。", + }, "timeout": { "description": "超时时间", "type": "int", diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py index 0bcb5729e..e08c1bd0a 100644 --- a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py +++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py @@ -1,12 +1,14 @@ ''' Author: diudiu62 Date: 2025-02-24 18:04:18 -LastEditTime: 2025-02-24 18:33:48 +LastEditTime: 2025-02-25 14:06:30 ''' +import asyncio from datetime import datetime import os -import asyncio -from funasr import AutoModel +import re +from funasr_onnx import SenseVoiceSmall +from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess from ..provider import STTProvider from ..entites import ProviderType from astrbot.core.utils.io import download_file @@ -22,26 +24,31 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider): provider_settings: dict, ) -> None: super().__init__(provider_config, provider_settings) + self.set_model(provider_config.get("stt_model", None)) + self.model = None + self.is_emotion = provider_config.get("is_emotion", False) async def initialize(self): - model_dir = "data/model/iic/SenseVoiceSmall" - loop = asyncio.get_event_loop() logger.info("下载或者加载 SenseVoice 模型中,这可能需要一些时间 ...") - self.model = await loop.run_in_executor(None, AutoModel, - model=model_dir, - trust_remote_code=False, - # remote_code="./model.py", - vad_model="fsmn-vad", - vad_kwargs={"max_single_segment_time": 30000}, - ) + + + # 将模型加载放到线程池中执行 + self.model = await asyncio.get_event_loop().run_in_executor( + None, + lambda: SenseVoiceSmall(self.model_name, quantize=True, batch_size=16) + ) + logger.info("SenseVoice 模型加载完成。") - + + async def get_timestamped_path(self) -> str: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return os.path.join("data", "temp", f"{timestamp}") + async def _convert_audio(self, path: str) -> str: from pyffmpeg import FFmpeg - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # 获取当前时间戳 - filename = timestamp + '.mp3' + filename = await self.get_timestamped_path() + '.mp3' ff = FFmpeg() - output_path = ff.convert(path, os.path.join('data/temp', filename)) + output_path = ff.convert(path, os.path.join('data","temp', filename)) return output_path async def _is_silk_file(self, file_path): @@ -55,29 +62,44 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider): return False async def get_text(self, audio_url: str) -> str: - loop = asyncio.get_event_loop() - - is_tencent = False - - if audio_url.startswith("http"): - if "multimedia.nt.qq.com.cn" in audio_url: - is_tencent = True + try: + is_tencent = audio_url.startswith("http") and "multimedia.nt.qq.com.cn" in audio_url + + if is_tencent: + path = await self.get_timestamped_path() + await download_file(audio_url, path) + audio_url = path - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # 获取当前时间戳 - path = os.path.join("data/temp", timestamp) - await download_file(audio_url, path) - audio_url = path - - if not os.path.exists(audio_url): - raise FileNotFoundError(f"文件不存在: {audio_url}") - - if audio_url.endswith(".amr") or audio_url.endswith(".silk") or is_tencent: - is_silk = await self._is_silk_file(audio_url) - if is_silk: - logger.info("Converting silk file to wav ...") - output_path = os.path.join('data/temp', str(uuid.uuid4()) + '.wav') - await tencent_silk_to_wav(audio_url, output_path) - audio_url = output_path - - result = await loop.run_in_executor(None, self.model.transcribe, audio_url) - return result['text'] \ No newline at end of file + if not os.path.isfile(audio_url): + raise FileNotFoundError(f"文件不存在: {audio_url}") + + if audio_url.endswith((".amr", ".silk")) or is_tencent: + is_silk = await self._is_silk_file(audio_url) + if is_silk: + logger.info("Converting silk file to wav ...") + output_path = await self.get_timestamped_path()+'.wav' + await tencent_silk_to_wav(audio_url, output_path) + audio_url = output_path + + # 使用 run_in_executor 来调用模型进行识别 + loop = asyncio.get_event_loop() + res = await loop.run_in_executor( + None, # 使用默认的线程池 + lambda: self.model(audio_url, language="auto", use_itn=True) + ) + + # res = self.model(audio_url, language="auto", use_itn=True) + logger.debug(f"SenseVoice识别到的文案:{res}") + text = rich_transcription_postprocess(res[0]) + if self.is_emotion: + # 提取第二个匹配的值 + matches = re.findall(r'<\|([^|]+)\|>', res[0]) + if len(matches) >= 2: + emotion = matches[1] + text = f"(当前的情绪:{emotion}) {text}" + else: + logger.warning("未能提取到情绪信息") + return text + except Exception as e: + logger.error(f"处理音频文件时出错: {e}") + raise \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index aaf1562cb..b9fa04592 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,5 +22,6 @@ lark-oapi ormsgpack cryptography -funasr -torch~=2.6.0 \ No newline at end of file +funasr_onnx +torchaudio +torch \ No newline at end of file From d01d1a8520df5fcb30b43cd2bc85bcdc879909d0 Mon Sep 17 00:00:00 2001 From: diudiu62 <115522593@qq.com> Date: Tue, 25 Feb 2025 18:03:29 +0800 Subject: [PATCH 3/6] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/core/config/default.py | 2 +- requirements.txt | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index e83e381ad..24dea759c 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -564,7 +564,7 @@ CONFIG_METADATA_2 = { "sensevoice_hint": { "description": "部署SenseVoice", "type": "string", - "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", + "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", "obvious_hint": True, }, "is_emotion": { diff --git a/requirements.txt b/requirements.txt index b9fa04592..727be2aad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,6 @@ cryptography funasr_onnx torchaudio -torch \ No newline at end of file +torch +modelscope +jieba \ No newline at end of file From 82673e8ddd6700f2c0a2e45deaae67f44699d6a2 Mon Sep 17 00:00:00 2001 From: diudiu62 <115522593@qq.com> Date: Wed, 26 Feb 2025 09:46:30 +0800 Subject: [PATCH 4/6] =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=94=BE=E5=88=B0?= =?UTF-8?q?=E4=BA=86=E5=8F=82=E6=95=B0=E9=85=8D=E7=BD=AE=E5=9C=B0=E6=96=B9?= =?UTF-8?q?=E6=8F=90=E9=86=92=EF=BC=8Cdocker=E6=8F=90=E5=89=8D=E8=87=AA?= =?UTF-8?q?=E8=A1=8C=E6=89=93=E5=8C=85=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/core/config/default.py | 2 +- requirements.txt | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 24dea759c..d2e5dcee5 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -564,7 +564,7 @@ CONFIG_METADATA_2 = { "sensevoice_hint": { "description": "部署SenseVoice", "type": "string", - "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", + "hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。", "obvious_hint": True, }, "is_emotion": { diff --git a/requirements.txt b/requirements.txt index 727be2aad..f9d4817af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,9 +21,3 @@ silk-python lark-oapi ormsgpack cryptography - -funasr_onnx -torchaudio -torch -modelscope -jieba \ No newline at end of file From 68c1957267a126ac3ff88cae88c399660a00ad16 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Wed, 26 Feb 2025 23:21:28 +0800 Subject: [PATCH 5/6] chore: update gitignore --- .gitignore | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index cb28c1d12..52b57f486 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,4 @@ package-lock.json package.json venv/* packages/python_interpreter/workplace -.venv/* - -Dockerfile_diudiu62 \ No newline at end of file +.venv/* \ No newline at end of file From 8677d70baff036b8643ef0a901d0a3aa91481f2c Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Wed, 26 Feb 2025 23:55:00 +0800 Subject: [PATCH 6/6] feat: add sensevoice adapter --- astrbot/core/provider/manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index 1e64a1d9d..3d97cdd13 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -144,6 +144,8 @@ class ProviderManager(): from .sources.dashscope_source import ProviderDashscope as ProviderDashscope case "googlegenai_chat_completion": from .sources.gemini_source import ProviderGoogleGenAI as ProviderGoogleGenAI + case "sensevoice_stt_selfhost": + from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost case "openai_whisper_api": from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI case "openai_whisper_selfhost":