尝试集成sensevoice

2025-02-25 09:05:24 +08:00
parent dba1ed1e19
commit 0ec382c86b
5 changed files with 98 additions and 1 deletions
@@ -24,3 +24,5 @@ package.json
 venv/*
 packages/python_interpreter/workplace
 .venv/*
+
+Dockerfile_diudiu62
@@ -532,6 +532,13 @@ CONFIG_METADATA_2 = {
                        "type": "openai_whisper_selfhost",
                        "model": "tiny",
                    },
+                    "sensevoice(本地加载)": {
+                        "whisper_hint": "(不用修改我)",
+                        "enable": False,
+                        "id": "sensevoice",
+                        "type": "sensevoice_stt_selfhost",
+                        "model": "tiny",
+                    },
                    "openai_tts(API)": {
                        "id": "openai_tts",
                        "type": "openai_tts_api",
@@ -128,6 +128,8 @@ class ProviderManager():
                        from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI
                    case "openai_whisper_selfhost":
                        from .sources.whisper_selfhosted_source import ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost
+                    case "sensevoice_stt_selfhost":
+                        from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost
                    case "openai_tts_api":
                        from .sources.openai_tts_api_source import ProviderOpenAITTSAPI as ProviderOpenAITTSAPI
                    case "fishaudio_tts_api":
@@ -0,0 +1,83 @@
+'''
+Author: diudiu62
+Date: 2025-02-24 18:04:18
+LastEditTime: 2025-02-24 18:33:48
+'''
+from datetime import datetime
+import os
+import asyncio
+from funasr import AutoModel
+from ..provider import STTProvider
+from ..entites import ProviderType
+from astrbot.core.utils.io import download_file
+from ..register import register_provider_adapter
+from astrbot.core import logger
+from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav
+
+@register_provider_adapter("sensevoice_stt_selfhost", "SenseVoice 自托管语音识别 模型部署", provider_type=ProviderType.SPEECH_TO_TEXT)
+class ProviderSenseVoiceSTTSelfHost(STTProvider):
+    def __init__(
+        self, 
+        provider_config: dict, 
+        provider_settings: dict,
+    ) -> None:
+        super().__init__(provider_config, provider_settings)
+    
+    async def initialize(self):
+        model_dir = "data/model/iic/SenseVoiceSmall"
+        loop = asyncio.get_event_loop()
+        logger.info("下载或者加载 SenseVoice 模型中，这可能需要一些时间 ...")
+        self.model = await loop.run_in_executor(None, AutoModel, 
+                                                model=model_dir,
+                                                trust_remote_code=False,
+                                                # remote_code="./model.py",  
+                                                vad_model="fsmn-vad",
+                                                vad_kwargs={"max_single_segment_time": 30000},
+                                                )
+        logger.info("SenseVoice 模型加载完成。")
+        
+    async def _convert_audio(self, path: str) -> str:
+        from pyffmpeg import FFmpeg
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
+        filename = timestamp + '.mp3'
+        ff = FFmpeg()
+        output_path = ff.convert(path, os.path.join('data/temp', filename))
+        return output_path
+    
+    async def _is_silk_file(self, file_path):
+        silk_header = b"SILK"
+        with open(file_path, "rb") as f:
+            file_header = f.read(8)
+
+        if silk_header in file_header:
+            return True
+        else:
+            return False
+
+    async def get_text(self, audio_url: str) -> str:
+        loop = asyncio.get_event_loop()
+        
+        is_tencent = False
+        
+        if audio_url.startswith("http"):
+            if "multimedia.nt.qq.com.cn" in audio_url:
+                is_tencent = True
+                
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
+            path = os.path.join("data/temp", timestamp)
+            await download_file(audio_url, path)
+            audio_url = path
+        
+        if not os.path.exists(audio_url):
+            raise FileNotFoundError(f"文件不存在: {audio_url}")
+        
+        if audio_url.endswith(".amr") or audio_url.endswith(".silk") or is_tencent:
+            is_silk = await self._is_silk_file(audio_url)
+            if is_silk:
+                logger.info("Converting silk file to wav ...")
+                output_path = os.path.join('data/temp', str(uuid.uuid4()) + '.wav')
+                await tencent_silk_to_wav(audio_url, output_path)
+                audio_url = output_path
+                
+        result = await loop.run_in_executor(None, self.model.transcribe, audio_url)
+        return result['text']
@@ -20,4 +20,7 @@ silk-python

 lark-oapi
 ormsgpack
-cryptography
+cryptography
+
+funasr
+torch~=2.6.0