Merge pull request #612 from diudiu62/feat-sensevoice

新增sensevoice语言识别能力
This commit is contained in:
Soulter
2025-02-26 23:56:03 +08:00
committed by GitHub
5 changed files with 135 additions and 4 deletions
+1 -2
View File
@@ -25,5 +25,4 @@ package.json
venv/*
packages/python_interpreter/workplace
.venv/*
.conda/
.conda/
+24
View File
@@ -583,6 +583,14 @@ CONFIG_METADATA_2 = {
"type": "openai_whisper_selfhost",
"model": "tiny",
},
"sensevoice(本地加载)": {
"sensevoice_hint": "(不用修改我)",
"enable": False,
"id": "sensevoice",
"type": "sensevoice_stt_selfhost",
"stt_model": "iic/SenseVoiceSmall",
"is_emotion": False,
},
"OpenAI_TTS(API)": {
"id": "openai_tts",
"type": "openai_tts_api",
@@ -604,6 +612,22 @@ CONFIG_METADATA_2 = {
},
},
"items": {
"sensevoice_hint": {
"description": "部署SenseVoice",
"type": "string",
"hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。",
"obvious_hint": True,
},
"is_emotion": {
"description": "情绪识别",
"type": "bool",
"hint": "是否开启情绪识别。happy｜sad｜angry｜neutral｜fearful｜disgusted｜surprised｜unknown",
},
"stt_model": {
"description": "模型名称",
"type": "string",
"hint": "modelscope 上的模型名称。默认:iic/SenseVoiceSmall。",
},
# "variables": {
# "description": "工作流固定输入变量",
# "type": "object",
+4 -1
View File
@@ -104,7 +104,8 @@ class ProviderManager():
kdb_cfg = config.get("knowledge_db", {})
if kdb_cfg and len(kdb_cfg):
self.curr_kdb_name = list(kdb_cfg.keys())[0]
async def initialize(self):
for provider_config in self.providers_config:
await self.load_provider(provider_config)
@@ -143,6 +144,8 @@ class ProviderManager():
from .sources.dashscope_source import ProviderDashscope as ProviderDashscope
case "googlegenai_chat_completion":
from .sources.gemini_source import ProviderGoogleGenAI as ProviderGoogleGenAI
case "sensevoice_stt_selfhost":
from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost
case "openai_whisper_api":
from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI
case "openai_whisper_selfhost":
@@ -0,0 +1,105 @@
'''
Author: diudiu62
Date: 2025-02-24 18:04:18
LastEditTime: 2025-02-25 14:06:30
'''
import asyncio
from datetime import datetime
import os
import re
from funasr_onnx import SenseVoiceSmall
from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess
from ..provider import STTProvider
from ..entites import ProviderType
from astrbot.core.utils.io import download_file
from ..register import register_provider_adapter
from astrbot.core import logger
from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav
@register_provider_adapter(
    "sensevoice_stt_selfhost",
    "SenseVoice 自托管语音识别 模型部署",
    provider_type=ProviderType.SPEECH_TO_TEXT,
)
class ProviderSenseVoiceSTTSelfHost(STTProvider):
    """Speech-to-text provider that runs a SenseVoice model in-process.

    The model is loaded by :meth:`initialize` (in the default thread-pool
    executor, so the event loop is not blocked) and then used by
    :meth:`get_text` to transcribe local or Tencent-hosted audio.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read the model name and the emotion flag from ``provider_config``.

        Args:
            provider_config: Provider entry; ``stt_model`` and ``is_emotion``
                are the keys read here.
            provider_settings: Passed through to the base class unchanged.
        """
        super().__init__(provider_config, provider_settings)
        # Fall back to the default model id advertised in the config hint
        # instead of handing ``None`` to the model loader later on.
        self.set_model(provider_config.get("stt_model") or "iic/SenseVoiceSmall")
        # Populated by initialize(); None until the model has been loaded.
        self.model = None
        self.is_emotion = provider_config.get("is_emotion", False)

    async def initialize(self):
        """Download/load the SenseVoice model without blocking the event loop."""
        logger.info("下载或者加载 SenseVoice 模型中,这可能需要一些时间 ...")
        # Model construction is blocking, so run it in the default executor.
        loop = asyncio.get_running_loop()
        self.model = await loop.run_in_executor(
            None,
            lambda: SenseVoiceSmall(self.model_name, quantize=True, batch_size=16),
        )
        logger.info("SenseVoice 模型加载完成。")

    async def get_timestamped_path(self) -> str:
        """Return an extension-less temp path under ``data/temp``.

        Microseconds are part of the name so that requests handled within
        the same second do not overwrite each other's files.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        return os.path.join("data", "temp", timestamp)

    async def _convert_audio(self, path: str) -> str:
        """Convert ``path`` to an mp3 under ``data/temp`` via pyffmpeg.

        Returns:
            Whatever ``FFmpeg.convert`` returns (presumably the output
            path -- confirm against the pyffmpeg documentation).
        """
        from pyffmpeg import FFmpeg

        # get_timestamped_path() already includes the data/temp prefix, so
        # the result is used as-is rather than being joined onto it again.
        target = await self.get_timestamped_path() + ".mp3"
        ff = FFmpeg()
        return ff.convert(path, target)

    async def _is_silk_file(self, file_path):
        """Return True when the first 8 bytes of the file contain ``b"SILK"``."""
        with open(file_path, "rb") as f:
            file_header = f.read(8)
        return b"SILK" in file_header

    async def get_text(self, audio_url: str) -> str:
        """Transcribe an audio file and return the recognized text.

        Args:
            audio_url: A local file path, or an http(s) URL on
                ``multimedia.nt.qq.com.cn`` which is downloaded first.

        Returns:
            The post-processed transcription. When ``is_emotion`` is set and
            at least two ``<|...|>`` tags are found in the raw result, the
            text is prefixed with the second tag as the emotion.

        Raises:
            FileNotFoundError: The (downloaded) audio file does not exist.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            is_tencent = (
                audio_url.startswith("http")
                and "multimedia.nt.qq.com.cn" in audio_url
            )
            if is_tencent:
                path = await self.get_timestamped_path()
                await download_file(audio_url, path)
                audio_url = path

            if not os.path.isfile(audio_url):
                raise FileNotFoundError(f"文件不存在: {audio_url}")

            # Tencent downloads have no extension, so sniff the header for
            # them as well as for files named .amr / .silk.
            if audio_url.endswith((".amr", ".silk")) or is_tencent:
                if await self._is_silk_file(audio_url):
                    logger.info("Converting silk file to wav ...")
                    output_path = await self.get_timestamped_path() + ".wav"
                    await tencent_silk_to_wav(audio_url, output_path)
                    audio_url = output_path

            # Inference is blocking; run it in the default thread pool.
            loop = asyncio.get_running_loop()
            res = await loop.run_in_executor(
                None,
                lambda: self.model(audio_url, language="auto", use_itn=True),
            )
            logger.debug(f"SenseVoice识别到的文案:{res}")
            text = rich_transcription_postprocess(res[0])

            if self.is_emotion:
                # The second <|...|> tag is taken as the emotion label
                # (presumably the first is the language -- confirm against
                # the SenseVoice output format).
                matches = re.findall(r"<\|([^|]+)\|>", res[0])
                if len(matches) >= 2:
                    emotion = matches[1]
                    text = f"(当前的情绪:{emotion}) {text}"
                else:
                    logger.warning("未能提取到情绪信息")
            return text
        except Exception as e:
            logger.error(f"处理音频文件时出错: {e}")
            raise
+1 -1
View File
@@ -23,4 +23,4 @@ ormsgpack
cryptography
dashscope
python-telegram-bot
wechatpy
wechatpy