From 0ec382c86b96dfedad38827efd249e66de75e2ad Mon Sep 17 00:00:00 2001
From: diudiu62 <115522593@qq.com>
Date: Tue, 25 Feb 2025 09:05:24 +0800
Subject: [PATCH 1/6] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E9=9B=86=E6=88=90sensevo?=
 =?UTF-8?q?ice?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |  2 +
 astrbot/core/config/default.py                |  7 ++
 astrbot/core/provider/manager.py              |  2 +
 .../sources/sensevoice_selfhosted_source.py   | 83 +++++++++++++++++++
 requirements.txt                              |  5 +-
 5 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 astrbot/core/provider/sources/sensevoice_selfhosted_source.py

diff --git a/.gitignore b/.gitignore
index 7745d18ba..cb28c1d12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ package.json
 venv/*
 packages/python_interpreter/workplace
 .venv/*
+
+Dockerfile_diudiu62
\ No newline at end of file
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 7a6b68b3e..f4ff839cb 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -532,6 +532,13 @@ CONFIG_METADATA_2 = {
                         "type": "openai_whisper_selfhost",
                         "model": "tiny",
                     },
+                    "sensevoice(本地加载)": {
+                        "whisper_hint": "(不用修改我)",
+                        "enable": False,
+                        "id": "sensevoice",
+                        "type": "sensevoice_stt_selfhost",
+                        "model": "tiny",
+                    },
                     "openai_tts(API)": {
                         "id": "openai_tts",
                         "type": "openai_tts_api",
diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py
index 2ba108e29..7187cc100 100644
--- a/astrbot/core/provider/manager.py
+++ b/astrbot/core/provider/manager.py
@@ -128,6 +128,8 @@ class ProviderManager():
                         from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI
                     case "openai_whisper_selfhost":
                         from .sources.whisper_selfhosted_source import ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost
+                    case "sensevoice_stt_selfhost":
+                        from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost
                     case "openai_tts_api":
                         from .sources.openai_tts_api_source import ProviderOpenAITTSAPI as ProviderOpenAITTSAPI
                     case "fishaudio_tts_api":
diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
new file mode 100644
index 000000000..0bcb5729e
--- /dev/null
+++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
@@ -0,0 +1,83 @@
+'''
+Author: diudiu62
+Date: 2025-02-24 18:04:18
+LastEditTime: 2025-02-24 18:33:48
+'''
+from datetime import datetime
+import os
+import asyncio
+from funasr import AutoModel
+from ..provider import STTProvider
+from ..entites import ProviderType
+from astrbot.core.utils.io import download_file
+from ..register import register_provider_adapter
+from astrbot.core import logger
+from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav
+
+@register_provider_adapter("sensevoice_stt_selfhost", "SenseVoice 自托管语音识别 模型部署", provider_type=ProviderType.SPEECH_TO_TEXT)
+class ProviderSenseVoiceSTTSelfHost(STTProvider):
+    def __init__(
+        self, 
+        provider_config: dict, 
+        provider_settings: dict,
+    ) -> None:
+        super().__init__(provider_config, provider_settings)
+    
+    async def initialize(self):
+        model_dir = "data/model/iic/SenseVoiceSmall"
+        loop = asyncio.get_event_loop()
+        logger.info("下载或者加载 SenseVoice 模型中，这可能需要一些时间 ...")
+        self.model = await loop.run_in_executor(None, AutoModel, 
+                                                model=model_dir,
+                                                trust_remote_code=False,
+                                                # remote_code="./model.py",  
+                                                vad_model="fsmn-vad",
+                                                vad_kwargs={"max_single_segment_time": 30000},
+                                                )
+        logger.info("SenseVoice 模型加载完成。")
+        
+    async def _convert_audio(self, path: str) -> str:
+        from pyffmpeg import FFmpeg
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
+        filename = timestamp + '.mp3'
+        ff = FFmpeg()
+        output_path = ff.convert(path, os.path.join('data/temp', filename))
+        return output_path
+    
+    async def _is_silk_file(self, file_path):
+        silk_header = b"SILK"
+        with open(file_path, "rb") as f:
+            file_header = f.read(8)
+
+        if silk_header in file_header:
+            return True
+        else:
+            return False
+
+    async def get_text(self, audio_url: str) -> str:
+        loop = asyncio.get_event_loop()
+        
+        is_tencent = False
+        
+        if audio_url.startswith("http"):
+            if "multimedia.nt.qq.com.cn" in audio_url:
+                is_tencent = True
+                
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
+            path = os.path.join("data/temp", timestamp)
+            await download_file(audio_url, path)
+            audio_url = path
+        
+        if not os.path.exists(audio_url):
+            raise FileNotFoundError(f"文件不存在: {audio_url}")
+        
+        if audio_url.endswith(".amr") or audio_url.endswith(".silk") or is_tencent:
+            is_silk = await self._is_silk_file(audio_url)
+            if is_silk:
+                logger.info("Converting silk file to wav ...")
+                output_path = os.path.join('data/temp', str(uuid.uuid4()) + '.wav')
+                await tencent_silk_to_wav(audio_url, output_path)
+                audio_url = output_path
+                
+        result = await loop.run_in_executor(None, self.model.transcribe, audio_url)
+        return result['text']
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c077496e2..aaf1562cb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,4 +20,7 @@ silk-python
 
 lark-oapi
 ormsgpack
-cryptography
\ No newline at end of file
+cryptography
+
+funasr
+torch~=2.6.0
\ No newline at end of file

From 5aa842cf66059c28dcfa31d80dca1549a2a2913a Mon Sep 17 00:00:00 2001
From: diudiu62 <115522593@qq.com>
Date: Tue, 25 Feb 2025 14:15:22 +0800
Subject: [PATCH 2/6] =?UTF-8?q?=E5=A2=9E=E5=8A=A0sensevoice=E9=85=8D?=
 =?UTF-8?q?=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 astrbot/core/config/default.py                |  21 +++-
 .../sources/sensevoice_selfhosted_source.py   | 104 +++++++++++-------
 requirements.txt                              |   5 +-
 3 files changed, 85 insertions(+), 45 deletions(-)

diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index f4ff839cb..e83e381ad 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -533,11 +533,12 @@ CONFIG_METADATA_2 = {
                         "model": "tiny",
                     },
                     "sensevoice(本地加载)": {
-                        "whisper_hint": "(不用修改我)",
+                        "sensevoice_hint": "(不用修改我)",
                         "enable": False,
                         "id": "sensevoice",
                         "type": "sensevoice_stt_selfhost",
-                        "model": "tiny",
+                        "stt_model": "iic/SenseVoiceSmall",
+                        "is_emotion": False,
                     },
                     "openai_tts(API)": {
                         "id": "openai_tts",
@@ -560,6 +561,22 @@ CONFIG_METADATA_2 = {
                     },
                 },
                 "items": {
+                    "sensevoice_hint": {
+                        "description": "部署SenseVoice",
+                        "type": "string",
+                        "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
+                        "obvious_hint": True,
+                    },
+                    "is_emotion": {
+                        "description": "情绪识别",
+                        "type": "bool",
+                        "hint": "是否开启情绪识别。happy｜sad｜angry｜neutral｜fearful｜disgusted｜surprised｜unknown",
+                    },
+                    "stt_model": {
+                        "description": "模型名称",
+                        "type": "string",
+                        "hint": "modelscope 上的模型名称。默认：iic/SenseVoiceSmall。",
+                    },
                     "timeout": {
                         "description": "超时时间",
                         "type": "int",
diff --git a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
index 0bcb5729e..e08c1bd0a 100644
--- a/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
+++ b/astrbot/core/provider/sources/sensevoice_selfhosted_source.py
@@ -1,12 +1,14 @@
 '''
 Author: diudiu62
 Date: 2025-02-24 18:04:18
-LastEditTime: 2025-02-24 18:33:48
+LastEditTime: 2025-02-25 14:06:30
 '''
+import asyncio
 from datetime import datetime
 import os
-import asyncio
-from funasr import AutoModel
+import re
+from funasr_onnx import SenseVoiceSmall
+from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess
 from ..provider import STTProvider
 from ..entites import ProviderType
 from astrbot.core.utils.io import download_file
@@ -22,26 +24,31 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider):
         provider_settings: dict,
     ) -> None:
         super().__init__(provider_config, provider_settings)
+        self.set_model(provider_config.get("stt_model", None))
+        self.model = None
+        self.is_emotion = provider_config.get("is_emotion", False)
     
     async def initialize(self):
-        model_dir = "data/model/iic/SenseVoiceSmall"
-        loop = asyncio.get_event_loop()
         logger.info("下载或者加载 SenseVoice 模型中，这可能需要一些时间 ...")
-        self.model = await loop.run_in_executor(None, AutoModel, 
-                                                model=model_dir,
-                                                trust_remote_code=False,
-                                                # remote_code="./model.py",  
-                                                vad_model="fsmn-vad",
-                                                vad_kwargs={"max_single_segment_time": 30000},
-                                                )
+      
+
+        # 将模型加载放到线程池中执行
+        self.model = await asyncio.get_event_loop().run_in_executor(
+            None,
+            lambda: SenseVoiceSmall(self.model_name, quantize=True, batch_size=16)
+        )
+
         logger.info("SenseVoice 模型加载完成。")
-        
+    
+    async def get_timestamped_path(self) -> str:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return os.path.join("data", "temp", f"{timestamp}")
+ 
     async def _convert_audio(self, path: str) -> str:
         from pyffmpeg import FFmpeg
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
-        filename = timestamp + '.mp3'
+        filename = await self.get_timestamped_path() + '.mp3'  # already a full data/temp/<timestamp>.mp3 path
         ff = FFmpeg()
-        output_path = ff.convert(path, os.path.join('data/temp', filename))
+        output_path = ff.convert(path, filename)  # do not re-join with data/temp: filename carries the directory
         return output_path
     
     async def _is_silk_file(self, file_path):
@@ -55,29 +62,44 @@ class ProviderSenseVoiceSTTSelfHost(STTProvider):
             return False
 
     async def get_text(self, audio_url: str) -> str:
-        loop = asyncio.get_event_loop()
-        
-        is_tencent = False
-        
-        if audio_url.startswith("http"):
-            if "multimedia.nt.qq.com.cn" in audio_url:
-                is_tencent = True
+        try:
+            is_tencent = audio_url.startswith("http") and "multimedia.nt.qq.com.cn" in audio_url
+            
+            if is_tencent:
+                path = await self.get_timestamped_path()
+                await download_file(audio_url, path)
+                audio_url = path
                 
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 获取当前时间戳
-            path = os.path.join("data/temp", timestamp)
-            await download_file(audio_url, path)
-            audio_url = path
-        
-        if not os.path.exists(audio_url):
-            raise FileNotFoundError(f"文件不存在: {audio_url}")
-        
-        if audio_url.endswith(".amr") or audio_url.endswith(".silk") or is_tencent:
-            is_silk = await self._is_silk_file(audio_url)
-            if is_silk:
-                logger.info("Converting silk file to wav ...")
-                output_path = os.path.join('data/temp', str(uuid.uuid4()) + '.wav')
-                await tencent_silk_to_wav(audio_url, output_path)
-                audio_url = output_path
-                
-        result = await loop.run_in_executor(None, self.model.transcribe, audio_url)
-        return result['text']
\ No newline at end of file
+            if not os.path.isfile(audio_url):
+                raise FileNotFoundError(f"文件不存在: {audio_url}")
+
+            if audio_url.endswith((".amr", ".silk")) or is_tencent:
+                is_silk = await self._is_silk_file(audio_url)
+                if is_silk:
+                    logger.info("Converting silk file to wav ...")
+                    output_path = await self.get_timestamped_path()+'.wav'
+                    await tencent_silk_to_wav(audio_url, output_path)
+                    audio_url = output_path
+
+            # 使用 run_in_executor 来调用模型进行识别
+            loop = asyncio.get_event_loop()
+            res = await loop.run_in_executor(
+                None,  # 使用默认的线程池
+                lambda: self.model(audio_url, language="auto", use_itn=True)
+            )
+
+            # res = self.model(audio_url, language="auto", use_itn=True)
+            logger.debug(f"SenseVoice识别到的文案：{res}")
+            text = rich_transcription_postprocess(res[0])
+            if self.is_emotion:
+                # 提取第二个匹配的值
+                matches = re.findall(r'<\|([^|]+)\|>', res[0])
+                if len(matches) >= 2:
+                    emotion = matches[1]
+                    text = f"(当前的情绪：{emotion}) {text}"
+                else:
+                    logger.warning("未能提取到情绪信息")
+            return text
+        except Exception as e:
+            logger.error(f"处理音频文件时出错: {e}")
+            raise
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index aaf1562cb..b9fa04592 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,5 +22,6 @@ lark-oapi
 ormsgpack
 cryptography
 
-funasr
-torch~=2.6.0
\ No newline at end of file
+funasr_onnx
+torchaudio
+torch
\ No newline at end of file

From d01d1a8520df5fcb30b43cd2bc85bcdc879909d0 Mon Sep 17 00:00:00 2001
From: diudiu62 <115522593@qq.com>
Date: Tue, 25 Feb 2025 18:03:29 +0800
Subject: [PATCH 3/6] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BE=9D=E8=B5=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 astrbot/core/config/default.py | 2 +-
 requirements.txt               | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index e83e381ad..24dea759c 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -564,7 +564,7 @@ CONFIG_METADATA_2 = {
                     "sensevoice_hint": {
                         "description": "部署SenseVoice",
                         "type": "string",
-                        "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
+                        "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch、modelscope、jieba 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
                         "obvious_hint": True,
                     },
                     "is_emotion": {
diff --git a/requirements.txt b/requirements.txt
index b9fa04592..727be2aad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,4 +24,6 @@ cryptography
 
 funasr_onnx
 torchaudio
-torch
\ No newline at end of file
+torch
+modelscope
+jieba
\ No newline at end of file

From 82673e8ddd6700f2c0a2e45deaae67f44699d6a2 Mon Sep 17 00:00:00 2001
From: diudiu62 <115522593@qq.com>
Date: Wed, 26 Feb 2025 09:46:30 +0800
Subject: [PATCH 4/6] =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=94=BE=E5=88=B0?=
 =?UTF-8?q?=E4=BA=86=E5=8F=82=E6=95=B0=E9=85=8D=E7=BD=AE=E5=9C=B0=E6=96=B9?=
 =?UTF-8?q?=E6=8F=90=E9=86=92=EF=BC=8Cdocker=E6=8F=90=E5=89=8D=E8=87=AA?=
 =?UTF-8?q?=E8=A1=8C=E6=89=93=E5=8C=85=E4=BE=9D=E8=B5=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 astrbot/core/config/default.py | 2 +-
 requirements.txt               | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 24dea759c..d2e5dcee5 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -564,7 +564,7 @@ CONFIG_METADATA_2 = {
                     "sensevoice_hint": {
                         "description": "部署SenseVoice",
                         "type": "string",
-                        "hint": "启用前请 pip 安装 funasr_onnx、torchaudio、torch、modelscope、jieba 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
+                        "hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
                         "obvious_hint": True,
                     },
                     "is_emotion": {
diff --git a/requirements.txt b/requirements.txt
index 727be2aad..f9d4817af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,9 +21,3 @@ silk-python
 lark-oapi
 ormsgpack
 cryptography
-
-funasr_onnx
-torchaudio
-torch
-modelscope
-jieba
\ No newline at end of file

From 68c1957267a126ac3ff88cae88c399660a00ad16 Mon Sep 17 00:00:00 2001
From: Soulter <905617992@qq.com>
Date: Wed, 26 Feb 2025 23:21:28 +0800
Subject: [PATCH 5/6] chore: update gitignore

---
 .gitignore | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index cb28c1d12..52b57f486 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,4 @@ package-lock.json
 package.json
 venv/*
 packages/python_interpreter/workplace
-.venv/*
-
-Dockerfile_diudiu62
\ No newline at end of file
+.venv/*
\ No newline at end of file

From 8677d70baff036b8643ef0a901d0a3aa91481f2c Mon Sep 17 00:00:00 2001
From: Soulter <905617992@qq.com>
Date: Wed, 26 Feb 2025 23:55:00 +0800
Subject: [PATCH 6/6] feat: add sensevoice adapter

---
 astrbot/core/provider/manager.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py
index 1e64a1d9d..3d97cdd13 100644
--- a/astrbot/core/provider/manager.py
+++ b/astrbot/core/provider/manager.py
@@ -144,6 +144,8 @@ class ProviderManager():
                     from .sources.dashscope_source import ProviderDashscope as ProviderDashscope
                 case "googlegenai_chat_completion":
                     from .sources.gemini_source import ProviderGoogleGenAI as ProviderGoogleGenAI
+                case "sensevoice_stt_selfhost":
+                    from .sources.sensevoice_selfhosted_source import ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost
                 case "openai_whisper_api":
                     from .sources.whisper_api_source import ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI
                 case "openai_whisper_selfhost":