feat: enhance Live Mode with text input functionality and UI improvements

- Added a text input panel that lets users send plain text messages while in Live Mode.
- Updated the LiveMode.vue component to handle text input and send it over the existing WebSocket connection.
- Improved the layout and styling of the Live Mode interface for a better user experience.
- Documented the new `text_input` message type in the Live API README.
2026-03-16 22:36:29 +08:00
parent a1c9dc5d01
commit 565c371e5c
3 changed files with 730 additions and 529 deletions
@@ -723,6 +723,16 @@ class LiveChatRoute(Route):

        elif msg_type == "end_speaking":
            # 结束说话
+            if session.is_processing:
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "Session is busy",
+                        "code": "PROCESSING_ERROR",
+                    }
+                )
+                return
+
            stamp = message.get("stamp")
            if not stamp:
                logger.warning("[Live Chat] end_speaking 缺少 stamp")
@@ -736,45 +746,59 @@ class LiveChatRoute(Route):
            # 处理音频：STT -> LLM -> TTS
            await self._process_audio(session, audio_path, assemble_duration)

+        elif msg_type == "text_input":
+            if session.is_processing:
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "Session is busy",
+                        "code": "PROCESSING_ERROR",
+                    }
+                )
+                return
+
+            user_text = message.get("text")
+            if not isinstance(user_text, str):
+                user_text = message.get("message")
+
+            if not isinstance(user_text, str) or not user_text.strip():
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "message must be non-empty text",
+                        "code": "INVALID_MESSAGE_FORMAT",
+                    }
+                )
+                return
+
+            await self._process_live_user_text(
+                session,
+                user_text=user_text.strip(),
+                initial_metrics={"input_type": "text"},
+                processing_start_time=time.time(),
+            )
+
        elif msg_type == "interrupt":
            # 用户打断
            session.should_interrupt = True
            logger.info(f"[Live Chat] 用户打断: {session.username}")

-    async def _process_audio(
-        self, session: LiveChatSession, audio_path: str, assemble_duration: float
+    async def _process_live_user_text(
+        self,
+        session: LiveChatSession,
+        user_text: str,
+        initial_metrics: dict[str, Any] | None = None,
+        processing_start_time: float | None = None,
    ) -> None:
-        """处理音频：STT -> LLM -> 流式 TTS"""
+        """处理 Live 用户文本：走 run_live_agent pipeline 并回传流式 TTS."""
        try:
-            # 发送 WAV 组装耗时
-            await websocket.send_json(
-                {"t": "metrics", "data": {"wav_assemble_time": assemble_duration}}
-            )
-            wav_assembly_finish_time = time.time()
+            if initial_metrics:
+                await websocket.send_json({"t": "metrics", "data": initial_metrics})

+            processing_start = processing_start_time or time.time()
            session.is_processing = True
            session.should_interrupt = False

-            # 1. STT - 语音转文字
-            ctx = self.plugin_manager.context
-            stt_provider = ctx.provider_manager.stt_provider_insts[0]
-
-            if not stt_provider:
-                logger.error("[Live Chat] STT Provider 未配置")
-                await websocket.send_json({"t": "error", "data": "语音识别服务未配置"})
-                return
-
-            await websocket.send_json(
-                {"t": "metrics", "data": {"stt": stt_provider.meta().type}}
-            )
-
-            user_text = await stt_provider.get_text(audio_path)
-            if not user_text:
-                logger.warning("[Live Chat] STT 识别结果为空")
-                return
-
-            logger.info(f"[Live Chat] STT 结果: {user_text}")
-
            await websocket.send_json(
                {
                    "t": "user_msg",
@@ -794,7 +818,6 @@ class LiveChatRoute(Route):
                "action_type": "live",  # 标记为 live mode
            }

-            # 将消息放入队列
            await queue.put((session.username, cid, payload))

            # 3. 等待响应并流式发送 TTS 音频
@@ -809,11 +832,9 @@ class LiveChatRoute(Route):
                        # 用户打断，停止处理
                        logger.info("[Live Chat] 检测到用户打断")
                        await websocket.send_json({"t": "stop_play"})
-                        # 保存消息并标记为被打断
                        await self._save_interrupted_message(
                            session, user_text, bot_text
                        )
-                        # 清空队列中未处理的消息
                        while not back_queue.empty():
                            try:
                                back_queue.get_nowait()
@@ -861,12 +882,7 @@ class LiveChatRoute(Route):
                    if result_chain_type == "tts_stats":
                        try:
                            stats = json.loads(data)
-                            await websocket.send_json(
-                                {
-                                    "t": "metrics",
-                                    "data": stats,
-                                }
-                            )
+                            await websocket.send_json({"t": "metrics", "data": stats})
                        except Exception as e:
                            logger.error(f"[Live Chat] 解析 TTSStats 失败: {e}")
                        continue
@@ -893,18 +909,14 @@ class LiveChatRoute(Route):
                                    "data": {"text": data},
                                }
                            )
-                        # 普通文本消息
                        bot_text += data

                    elif result_type == "audio_chunk":
-                        # 流式音频数据
                        if not audio_playing:
                            audio_playing = True
                            logger.debug("[Live Chat] 开始播放音频流")
-
-                            # Calculate latency from wav assembly finish to first audio chunk
                            speak_to_first_frame_latency = (
-                                time.time() - wav_assembly_finish_time
+                                time.time() - processing_start
                            )
                            await websocket.send_json(
                                {
@@ -924,19 +936,15 @@ class LiveChatRoute(Route):
                                }
                            )

-                        # 发送音频数据给前端
                        await websocket.send_json(
                            {
                                "t": "response",
-                                "data": data,  # base64 编码的音频数据
+                                "data": data,
                            }
                        )

                    elif result_type in ["complete", "end"]:
-                        # 处理完成
                        logger.info(f"[Live Chat] Bot 回复完成: {bot_text}")
-
-                        # 如果没有音频流，发送 bot 消息文本
                        if not audio_playing:
                            await websocket.send_json(
                                {
@@ -948,11 +956,8 @@ class LiveChatRoute(Route):
                                }
                            )

-                        # 发送结束标记
                        await websocket.send_json({"t": "end"})
-
-                        # 发送总耗时
-                        wav_to_tts_duration = time.time() - wav_assembly_finish_time
+                        wav_to_tts_duration = time.time() - processing_start
                        await websocket.send_json(
                            {
                                "t": "metrics",
@@ -964,13 +969,65 @@ class LiveChatRoute(Route):
                webchat_queue_mgr.remove_back_queue(message_id)

        except Exception as e:
-            logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True)
+            logger.error(f"[Live Chat] 处理文本失败: {e}", exc_info=True)
            await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"})

        finally:
            session.is_processing = False
            session.should_interrupt = False

+    async def _process_audio(
+        self, session: LiveChatSession, audio_path: str, assemble_duration: float
+    ) -> None:
+        """处理音频：STT -> LLM -> 流式 TTS"""
+        try:
+            await websocket.send_json(
+                {
+                    "t": "metrics",
+                    "data": {
+                        "wav_assemble_time": assemble_duration,
+                        "input_type": "audio",
+                    },
+                }
+            )
+            wav_assembly_finish_time = time.time()
+
+            # 1. STT - 语音转文字
+            ctx = self.plugin_manager.context
+            stt_provider = ctx.provider_manager.stt_provider_insts[0]
+
+            if not stt_provider:
+                logger.error("[Live Chat] STT Provider 未配置")
+                await websocket.send_json({"t": "error", "data": "语音识别服务未配置"})
+                return
+
+            await websocket.send_json(
+                {
+                    "t": "metrics",
+                    "data": {
+                        "stt": stt_provider.meta().type,
+                    },
+                }
+            )
+
+            user_text = await stt_provider.get_text(audio_path)
+            if not user_text:
+                logger.warning("[Live Chat] STT 识别结果为空")
+                return
+
+            logger.info(f"[Live Chat] STT 结果: {user_text}")
+
+            await self._process_live_user_text(
+                session,
+                user_text=user_text,
+                initial_metrics=None,
+                processing_start_time=wav_assembly_finish_time,
+            )
+
+        except Exception as e:
+            logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True)
+            await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"})
+
    async def _save_interrupted_message(
        self, session: LiveChatSession, user_text: str, bot_text: str
    ) -> None:
@@ -128,6 +128,17 @@ Finish the current voice capture segment.
 }
 ```

+#### `text_input`
+
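+Send a plain-text message directly while connected with `ct=live`. The server still routes it through the Live mode pipeline, so TTS output and interrupt handling keep working. If the session is still processing a previous input, the server replies with an `error` message (`code: PROCESSING_ERROR`); if `text` is missing or blank, it replies with `code: INVALID_MESSAGE_FORMAT`.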
+
+```json
+{
+  "t": "text_input",
+  "text": "Hello, what is the weather today?"
+}
+```
+
 #### `interrupt`

 Interrupt the current model or TTS response.