From 565c371e5c1982dde3cc6085db848c4bc0e503c4 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 16 Mar 2026 22:36:29 +0800 Subject: [PATCH] feat: enhance Live Mode with text input functionality and UI improvements - Added a text input panel to allow users to send plain text messages while in Live Mode. - Updated the LiveMode.vue component to handle text input and integrate it with WebSocket communication. - Improved the layout and styling of the Live Mode interface for better user experience. - Documented the new `text_input` message type in the Live API README. --- astrbot/dashboard/routes/live_chat.py | 161 ++- dashboard/src/components/chat/LiveMode.vue | 1087 +++++++++++--------- docs/live-api/README.md | 11 + 3 files changed, 730 insertions(+), 529 deletions(-) diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 839ac6555..f089b659f 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -723,6 +723,16 @@ class LiveChatRoute(Route): elif msg_type == "end_speaking": # 结束说话 + if session.is_processing: + await websocket.send_json( + { + "t": "error", + "data": "Session is busy", + "code": "PROCESSING_ERROR", + } + ) + return + stamp = message.get("stamp") if not stamp: logger.warning("[Live Chat] end_speaking 缺少 stamp") @@ -736,45 +746,59 @@ class LiveChatRoute(Route): # 处理音频:STT -> LLM -> TTS await self._process_audio(session, audio_path, assemble_duration) + elif msg_type == "text_input": + if session.is_processing: + await websocket.send_json( + { + "t": "error", + "data": "Session is busy", + "code": "PROCESSING_ERROR", + } + ) + return + + user_text = message.get("text") + if not isinstance(user_text, str): + user_text = message.get("message") + + if not isinstance(user_text, str) or not user_text.strip(): + await websocket.send_json( + { + "t": "error", + "data": "message must be non-empty text", + "code": "INVALID_MESSAGE_FORMAT", + } + ) + return + + await self._process_live_user_text( + session, + user_text=user_text.strip(), + initial_metrics={"input_type": "text"}, + processing_start_time=time.time(), + ) + elif msg_type == "interrupt": # 用户打断 session.should_interrupt = True logger.info(f"[Live Chat] 用户打断: {session.username}") - async def _process_audio( - self, session: LiveChatSession, audio_path: str, assemble_duration: float + async def _process_live_user_text( + self, + session: LiveChatSession, + user_text: str, + initial_metrics: dict[str, Any] | None = None, + processing_start_time: float | None = None, ) -> None: - """处理音频:STT -> LLM -> 流式 TTS""" + """处理 Live 用户文本:走 run_live_agent pipeline 并回传流式 TTS.""" try: - # 发送 WAV 组装耗时 - await websocket.send_json( - {"t": "metrics", "data": {"wav_assemble_time": assemble_duration}} - ) - wav_assembly_finish_time = time.time() + if initial_metrics: + await websocket.send_json({"t": "metrics", "data": initial_metrics}) + processing_start = processing_start_time or time.time() session.is_processing = True session.should_interrupt = False - # 1. STT - 语音转文字 - ctx = self.plugin_manager.context - stt_provider = ctx.provider_manager.stt_provider_insts[0] - - if not stt_provider: - logger.error("[Live Chat] STT Provider 未配置") - await websocket.send_json({"t": "error", "data": "语音识别服务未配置"}) - return - - await websocket.send_json( - {"t": "metrics", "data": {"stt": stt_provider.meta().type}} - ) - - user_text = await stt_provider.get_text(audio_path) - if not user_text: - logger.warning("[Live Chat] STT 识别结果为空") - return - - logger.info(f"[Live Chat] STT 结果: {user_text}") - await websocket.send_json( { "t": "user_msg", @@ -794,7 +818,6 @@ class LiveChatRoute(Route): "action_type": "live", # 标记为 live mode } - # 将消息放入队列 await queue.put((session.username, cid, payload)) # 3. 等待响应并流式发送 TTS 音频 @@ -809,11 +832,9 @@ class LiveChatRoute(Route): # 用户打断,停止处理 logger.info("[Live Chat] 检测到用户打断") await websocket.send_json({"t": "stop_play"}) - # 保存消息并标记为被打断 await self._save_interrupted_message( session, user_text, bot_text ) - # 清空队列中未处理的消息 while not back_queue.empty(): try: back_queue.get_nowait() @@ -861,12 +882,7 @@ class LiveChatRoute(Route): if result_chain_type == "tts_stats": try: stats = json.loads(data) - await websocket.send_json( - { - "t": "metrics", - "data": stats, - } - ) + await websocket.send_json({"t": "metrics", "data": stats}) except Exception as e: logger.error(f"[Live Chat] 解析 TTSStats 失败: {e}") continue @@ -893,18 +909,14 @@ class LiveChatRoute(Route): "data": {"text": data}, } ) - # 普通文本消息 bot_text += data elif result_type == "audio_chunk": - # 流式音频数据 if not audio_playing: audio_playing = True logger.debug("[Live Chat] 开始播放音频流") - - # Calculate latency from wav assembly finish to first audio chunk speak_to_first_frame_latency = ( - time.time() - wav_assembly_finish_time + time.time() - processing_start ) await websocket.send_json( { @@ -924,19 +936,15 @@ class LiveChatRoute(Route): } ) - # 发送音频数据给前端 await websocket.send_json( { "t": "response", - "data": data, # base64 编码的音频数据 + "data": data, } ) elif result_type in ["complete", "end"]: - # 处理完成 logger.info(f"[Live Chat] Bot 回复完成: {bot_text}") - - # 如果没有音频流,发送 bot 消息文本 if not audio_playing: await websocket.send_json( { @@ -948,11 +956,8 @@ class LiveChatRoute(Route): } ) - # 发送结束标记 await websocket.send_json({"t": "end"}) - - # 发送总耗时 - wav_to_tts_duration = time.time() - wav_assembly_finish_time + wav_to_tts_duration = time.time() - processing_start await websocket.send_json( { "t": "metrics", @@ -964,13 +969,65 @@ class LiveChatRoute(Route): webchat_queue_mgr.remove_back_queue(message_id) except Exception as e: - logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True) + logger.error(f"[Live Chat] 处理文本失败: {e}", exc_info=True) await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"}) finally: session.is_processing = False session.should_interrupt = False + async def _process_audio( + self, session: LiveChatSession, audio_path: str, assemble_duration: float + ) -> None: + """处理音频:STT -> LLM -> 流式 TTS""" + try: + await websocket.send_json( + { + "t": "metrics", + "data": { + "wav_assemble_time": assemble_duration, + "input_type": "audio", + }, + } + ) + wav_assembly_finish_time = time.time() + + # 1. STT - 语音转文字 + ctx = self.plugin_manager.context + stt_provider = ctx.provider_manager.stt_provider_insts[0] + + if not stt_provider: + logger.error("[Live Chat] STT Provider 未配置") + await websocket.send_json({"t": "error", "data": "语音识别服务未配置"}) + return + + await websocket.send_json( + { + "t": "metrics", + "data": { + "stt": stt_provider.meta().type, + }, + } + ) + + user_text = await stt_provider.get_text(audio_path) + if not user_text: + logger.warning("[Live Chat] STT 识别结果为空") + return + + logger.info(f"[Live Chat] STT 结果: {user_text}") + + await self._process_live_user_text( + session, + user_text=user_text, + initial_metrics=None, + processing_start_time=wav_assembly_finish_time, + ) + + except Exception as e: + logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True) + await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"}) + async def _save_interrupted_message( self, session: LiveChatSession, user_text: str, bot_text: str ) -> None: diff --git a/dashboard/src/components/chat/LiveMode.vue b/dashboard/src/components/chat/LiveMode.vue index 2740459d9..60773f802 100644 --- a/dashboard/src/components/chat/LiveMode.vue +++ b/dashboard/src/components/chat/LiveMode.vue @@ -1,65 +1,127 @@ diff --git a/docs/live-api/README.md b/docs/live-api/README.md index 7bfff8154..7dc897284 100644 --- a/docs/live-api/README.md +++ b/docs/live-api/README.md @@ -128,6 +128,17 @@ Finish the current voice capture segment. } ``` +#### `text_input` + +Send a plain text input directly while using `ct=live`. The server will still route through Live mode with TTS and interrupt handling. + +```json +{ + "t": "text_input", + "text": "Hello, what is the weather today?" +} +``` + #### `interrupt` Interrupt the current model or TTS response.