feat: enhance Live Mode with text input functionality and UI improvements

- Added a text input panel to allow users to send plain text messages while in Live Mode.
- Updated the LiveMode.vue component to handle text input and integrate it with WebSocket communication.
- Improved the layout and styling of the Live Mode interface for better user experience.
- Documented the new `text_input` message type in the Live API README.
feat: add live API WebSocket endpoint with authentication and session management
2026-03-16 22:36:29 +08:00 · 2026-03-16 22:11:15 +08:00
11 changed files with 1445 additions and 564 deletions
@@ -326,6 +326,7 @@ async def run_live_agent(

    # 创建队列
    text_queue: asyncio.Queue[str | None] = asyncio.Queue()
+    delta_queue: asyncio.Queue[str | None] = asyncio.Queue()
    # audio_queue stored bytes or (text, bytes)
    audio_queue: asyncio.Queue[bytes | tuple[str, bytes] | None] = asyncio.Queue()

@@ -334,6 +335,7 @@ async def run_live_agent(
        _run_agent_feeder(
            agent_runner,
            text_queue,
+            delta_queue,
            max_step,
            show_tool_use,
            show_tool_call_result,
@@ -353,11 +355,42 @@ async def run_live_agent(

    # 3. 主循环：从 audio_queue 读取音频并 yield
    try:
-        while True:
-            queue_item = await audio_queue.get()
+        delta_done = False
+        audio_done = False
+        while not (delta_done and audio_done):
+            task_sources: dict[asyncio.Task, str] = {}
+            if not delta_done:
+                task = asyncio.create_task(delta_queue.get())
+                task_sources[task] = "delta"
+            if not audio_done:
+                task = asyncio.create_task(audio_queue.get())
+                task_sources[task] = "audio"
+
+            done, pending = await asyncio.wait(
+                list(task_sources),
+                return_when=asyncio.FIRST_COMPLETED,
+            )
+
+            for task in pending:
+                task.cancel()
+            if pending:
+                await asyncio.gather(*pending, return_exceptions=True)
+
+            for task in done:
+                source = task_sources[task]
+                queue_item = task.result()
+                if source == "delta":
+                    if queue_item is None:
+                        delta_done = True
+                        continue
+                    yield MessageChain(
+                        chain=[Plain(queue_item)], type="live_text_delta"
+                    )
+                    continue

                if queue_item is None:
-                break
+                    audio_done = True
+                    continue

                text = None
                if isinstance(queue_item, tuple):
@@ -421,6 +454,7 @@ async def run_live_agent(
 async def _run_agent_feeder(
    agent_runner: AgentRunner,
    text_queue: asyncio.Queue,
+    delta_queue: asyncio.Queue,
    max_step: int,
    show_tool_use: bool,
    show_tool_call_result: bool,
@@ -440,9 +474,13 @@ async def _run_agent_feeder(
            if chain is None:
                continue

+            if chain.type == "reasoning":
+                continue
+
            # 提取文本
            text = chain.get_plain_text()
            if text:
+                await delta_queue.put(text)
                buffer += text

                # 分句逻辑：匹配标点符号
@@ -477,6 +515,7 @@ async def _run_agent_feeder(
    finally:
        # 发送结束信号
        await text_queue.put(None)
+        await delta_queue.put(None)


 async def _safe_tts_stream_wrapper(
@@ -130,16 +130,6 @@ class LiveChatRoute(Route):

    async def live_chat_ws(self) -> None:
        """Legacy Live Chat WebSocket 处理器（默认 ct=live）"""
-        await self._unified_ws_loop(force_ct="live")
-
-    async def unified_chat_ws(self) -> None:
-        """Unified Chat WebSocket 处理器（支持 ct=live/chat）"""
-        await self._unified_ws_loop(force_ct=None)
-
-    async def _unified_ws_loop(self, force_ct: str | None = None) -> None:
-        """统一 WebSocket 循环"""
-        # WebSocket 不能通过 header 传递 token，需要从 query 参数获取
-        # 注意：WebSocket 上下文使用 websocket.args 而不是 request.args
        token = websocket.args.get("token")
        if not token:
            await websocket.close(1008, "Missing authentication token")
@@ -156,6 +146,49 @@ class LiveChatRoute(Route):
            await websocket.close(1008, "Invalid token")
            return

+        await self.run_ws_session(username=username, force_ct="live")
+
+    async def unified_chat_ws(self) -> None:
+        """Unified Chat WebSocket 处理器（支持 ct=live/chat）"""
+        token = websocket.args.get("token")
+        if not token:
+            await websocket.close(1008, "Missing authentication token")
+            return
+
+        try:
+            jwt_secret = self.config["dashboard"].get("jwt_secret")
+            payload = jwt.decode(token, jwt_secret, algorithms=["HS256"])
+            username = payload["username"]
+        except jwt.ExpiredSignatureError:
+            await websocket.close(1008, "Token expired")
+            return
+        except jwt.InvalidTokenError:
+            await websocket.close(1008, "Invalid token")
+            return
+
+        await self.run_ws_session(username=username, force_ct=None)
+
+    async def _unified_ws_loop(self, force_ct: str | None = None) -> None:
+        """统一 WebSocket 循环"""
+        # Keep the legacy entry point for internal call sites.
+        token = websocket.args.get("token")
+        if not token:
+            await websocket.close(1008, "Missing authentication token")
+            return
+        try:
+            jwt_secret = self.config["dashboard"].get("jwt_secret")
+            payload = jwt.decode(token, jwt_secret, algorithms=["HS256"])
+            username = payload["username"]
+        except jwt.ExpiredSignatureError:
+            await websocket.close(1008, "Token expired")
+            return
+        except jwt.InvalidTokenError:
+            await websocket.close(1008, "Invalid token")
+            return
+        await self.run_ws_session(username=username, force_ct=force_ct)
+
+    async def run_ws_session(self, username: str, force_ct: str | None = None) -> None:
+        """Run a live/unified websocket session for an authenticated username."""
        session_id = f"webchat_live!{username}!{uuid.uuid4()}"
        live_session = LiveChatSession(session_id, username)
        self.sessions[session_id] = live_session
@@ -690,6 +723,16 @@ class LiveChatRoute(Route):

        elif msg_type == "end_speaking":
            # 结束说话
+            if session.is_processing:
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "Session is busy",
+                        "code": "PROCESSING_ERROR",
+                    }
+                )
+                return
+
            stamp = message.get("stamp")
            if not stamp:
                logger.warning("[Live Chat] end_speaking 缺少 stamp")
@@ -703,45 +746,59 @@ class LiveChatRoute(Route):
            # 处理音频：STT -> LLM -> TTS
            await self._process_audio(session, audio_path, assemble_duration)

+        elif msg_type == "text_input":
+            if session.is_processing:
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "Session is busy",
+                        "code": "PROCESSING_ERROR",
+                    }
+                )
+                return
+
+            user_text = message.get("text")
+            if not isinstance(user_text, str):
+                user_text = message.get("message")
+
+            if not isinstance(user_text, str) or not user_text.strip():
+                await websocket.send_json(
+                    {
+                        "t": "error",
+                        "data": "message must be non-empty text",
+                        "code": "INVALID_MESSAGE_FORMAT",
+                    }
+                )
+                return
+
+            await self._process_live_user_text(
+                session,
+                user_text=user_text.strip(),
+                initial_metrics={"input_type": "text"},
+                processing_start_time=time.time(),
+            )
+
        elif msg_type == "interrupt":
            # 用户打断
            session.should_interrupt = True
            logger.info(f"[Live Chat] 用户打断: {session.username}")

-    async def _process_audio(
-        self, session: LiveChatSession, audio_path: str, assemble_duration: float
+    async def _process_live_user_text(
+        self,
+        session: LiveChatSession,
+        user_text: str,
+        initial_metrics: dict[str, Any] | None = None,
+        processing_start_time: float | None = None,
    ) -> None:
-        """处理音频：STT -> LLM -> 流式 TTS"""
+        """处理 Live 用户文本：走 run_live_agent pipeline 并回传流式 TTS."""
        try:
-            # 发送 WAV 组装耗时
-            await websocket.send_json(
-                {"t": "metrics", "data": {"wav_assemble_time": assemble_duration}}
-            )
-            wav_assembly_finish_time = time.time()
+            if initial_metrics:
+                await websocket.send_json({"t": "metrics", "data": initial_metrics})

+            processing_start = processing_start_time or time.time()
            session.is_processing = True
            session.should_interrupt = False

-            # 1. STT - 语音转文字
-            ctx = self.plugin_manager.context
-            stt_provider = ctx.provider_manager.stt_provider_insts[0]
-
-            if not stt_provider:
-                logger.error("[Live Chat] STT Provider 未配置")
-                await websocket.send_json({"t": "error", "data": "语音识别服务未配置"})
-                return
-
-            await websocket.send_json(
-                {"t": "metrics", "data": {"stt": stt_provider.meta().type}}
-            )
-
-            user_text = await stt_provider.get_text(audio_path)
-            if not user_text:
-                logger.warning("[Live Chat] STT 识别结果为空")
-                return
-
-            logger.info(f"[Live Chat] STT 结果: {user_text}")
-
            await websocket.send_json(
                {
                    "t": "user_msg",
@@ -761,7 +818,6 @@ class LiveChatRoute(Route):
                "action_type": "live",  # 标记为 live mode
            }

-            # 将消息放入队列
            await queue.put((session.username, cid, payload))

            # 3. 等待响应并流式发送 TTS 音频
@@ -776,11 +832,9 @@ class LiveChatRoute(Route):
                        # 用户打断，停止处理
                        logger.info("[Live Chat] 检测到用户打断")
                        await websocket.send_json({"t": "stop_play"})
-                        # 保存消息并标记为被打断
                        await self._save_interrupted_message(
                            session, user_text, bot_text
                        )
-                        # 清空队列中未处理的消息
                        while not back_queue.empty():
                            try:
                                back_queue.get_nowait()
@@ -805,6 +859,7 @@ class LiveChatRoute(Route):

                    result_type = result.get("type")
                    result_chain_type = result.get("chain_type")
+                    result_streaming = bool(result.get("streaming", False))
                    data = result.get("data", "")

                    if result_chain_type == "agent_stats":
@@ -827,29 +882,41 @@ class LiveChatRoute(Route):
                    if result_chain_type == "tts_stats":
                        try:
                            stats = json.loads(data)
-                            await websocket.send_json(
-                                {
-                                    "t": "metrics",
-                                    "data": stats,
-                                }
-                            )
+                            await websocket.send_json({"t": "metrics", "data": stats})
                        except Exception as e:
                            logger.error(f"[Live Chat] 解析 TTSStats 失败: {e}")
                        continue

+                    if result_chain_type == "live_text_delta":
+                        if data:
+                            await websocket.send_json(
+                                {
+                                    "t": "bot_delta_chunk",
+                                    "data": {"text": data},
+                                }
+                            )
+                        continue
+
                    if result_type == "plain":
-                        # 普通文本消息
+                        if (
+                            result_streaming
+                            and data
+                            and result_chain_type != "reasoning"
+                        ):
+                            await websocket.send_json(
+                                {
+                                    "t": "bot_delta_chunk",
+                                    "data": {"text": data},
+                                }
+                            )
                        bot_text += data

                    elif result_type == "audio_chunk":
-                        # 流式音频数据
                        if not audio_playing:
                            audio_playing = True
                            logger.debug("[Live Chat] 开始播放音频流")
-
-                            # Calculate latency from wav assembly finish to first audio chunk
                            speak_to_first_frame_latency = (
-                                time.time() - wav_assembly_finish_time
+                                time.time() - processing_start
                            )
                            await websocket.send_json(
                                {
@@ -869,19 +936,15 @@ class LiveChatRoute(Route):
                                }
                            )

-                        # 发送音频数据给前端
                        await websocket.send_json(
                            {
                                "t": "response",
-                                "data": data,  # base64 编码的音频数据
+                                "data": data,
                            }
                        )

                    elif result_type in ["complete", "end"]:
-                        # 处理完成
                        logger.info(f"[Live Chat] Bot 回复完成: {bot_text}")
-
-                        # 如果没有音频流，发送 bot 消息文本
                        if not audio_playing:
                            await websocket.send_json(
                                {
@@ -893,11 +956,8 @@ class LiveChatRoute(Route):
                                }
                            )

-                        # 发送结束标记
                        await websocket.send_json({"t": "end"})
-
-                        # 发送总耗时
-                        wav_to_tts_duration = time.time() - wav_assembly_finish_time
+                        wav_to_tts_duration = time.time() - processing_start
                        await websocket.send_json(
                            {
                                "t": "metrics",
@@ -909,13 +969,65 @@ class LiveChatRoute(Route):
                webchat_queue_mgr.remove_back_queue(message_id)

        except Exception as e:
-            logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True)
+            logger.error(f"[Live Chat] 处理文本失败: {e}", exc_info=True)
            await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"})

        finally:
            session.is_processing = False
            session.should_interrupt = False

+    async def _process_audio(
+        self, session: LiveChatSession, audio_path: str, assemble_duration: float
+    ) -> None:
+        """处理音频：STT -> LLM -> 流式 TTS"""
+        try:
+            await websocket.send_json(
+                {
+                    "t": "metrics",
+                    "data": {
+                        "wav_assemble_time": assemble_duration,
+                        "input_type": "audio",
+                    },
+                }
+            )
+            wav_assembly_finish_time = time.time()
+
+            # 1. STT - 语音转文字
+            ctx = self.plugin_manager.context
+            stt_provider = ctx.provider_manager.stt_provider_insts[0]
+
+            if not stt_provider:
+                logger.error("[Live Chat] STT Provider 未配置")
+                await websocket.send_json({"t": "error", "data": "语音识别服务未配置"})
+                return
+
+            await websocket.send_json(
+                {
+                    "t": "metrics",
+                    "data": {
+                        "stt": stt_provider.meta().type,
+                    },
+                }
+            )
+
+            user_text = await stt_provider.get_text(audio_path)
+            if not user_text:
+                logger.warning("[Live Chat] STT 识别结果为空")
+                return
+
+            logger.info(f"[Live Chat] STT 结果: {user_text}")
+
+            await self._process_live_user_text(
+                session,
+                user_text=user_text,
+                initial_metrics=None,
+                processing_start_time=wav_assembly_finish_time,
+            )
+
+        except Exception as e:
+            logger.error(f"[Live Chat] 处理音频失败: {e}", exc_info=True)
+            await websocket.send_json({"t": "error", "data": f"处理失败: {str(e)}"})
+
    async def _save_interrupted_message(
        self, session: LiveChatSession, user_text: str, bot_text: str
    ) -> None:
@@ -19,6 +19,7 @@ from astrbot.core.utils.datetime_utils import to_utc_isoformat

 from .api_key import ALL_OPEN_API_SCOPES
 from .chat import ChatRoute
+from .live_chat import LiveChatRoute
 from .route import Response, Route, RouteContext


@@ -29,12 +30,14 @@ class OpenApiRoute(Route):
        db: BaseDatabase,
        core_lifecycle: AstrBotCoreLifecycle,
        chat_route: ChatRoute,
+        live_chat_route: LiveChatRoute,
    ) -> None:
        super().__init__(context)
        self.db = db
        self.core_lifecycle = core_lifecycle
        self.platform_manager = core_lifecycle.platform_manager
        self.chat_route = chat_route
+        self.live_chat_route = live_chat_route

        self.routes = {
            "/v1/chat": ("POST", self.chat_send),
@@ -46,6 +49,7 @@ class OpenApiRoute(Route):
        }
        self.register_routes()
        self.app.websocket("/api/v1/chat/ws")(self.chat_ws)
+        self.app.websocket("/api/v1/live/ws")(self.live_ws)

    @staticmethod
    def _resolve_open_username(
@@ -534,6 +538,39 @@ class OpenApiRoute(Route):
        except Exception as e:
            logger.debug("Open API WS connection closed: %s", e)

+    async def live_ws(self) -> None:
+        authed, auth_err = await self._authenticate_chat_ws_api_key()
+        if not authed:
+            await self._send_chat_ws_error(auth_err or "Unauthorized", "UNAUTHORIZED")
+            await websocket.close(1008, auth_err or "Unauthorized")
+            return
+
+        username, username_err = self._resolve_open_username(
+            websocket.args.get("username")
+        )
+        if username_err or not username:
+            await self._send_chat_ws_error(
+                username_err or "Invalid username",
+                "BAD_USER",
+            )
+            await websocket.close(1008, username_err or "Invalid username")
+            return
+
+        ct = websocket.args.get("ct")
+        force_ct = ct.strip() if isinstance(ct, str) and ct.strip() else "live"
+        if force_ct not in {"live", "chat"}:
+            await self._send_chat_ws_error(
+                "ct must be 'live' or 'chat'",
+                "INVALID_MESSAGE",
+            )
+            await websocket.close(1008, "Invalid ct")
+            return
+
+        await self.live_chat_route.run_ws_session(
+            username=username,
+            force_ct=force_ct,
+        )
+
    async def upload_file(self):
        return await self.chat_route.post_file()

@@ -115,11 +115,13 @@ class AstrBotDashboard:
        self.ar = AuthRoute(self.context)
        self.api_key_route = ApiKeyRoute(self.context, db)
        self.chat_route = ChatRoute(self.context, db, core_lifecycle)
+        self.live_chat_route = LiveChatRoute(self.context, db, core_lifecycle)
        self.open_api_route = OpenApiRoute(
            self.context,
            db,
            core_lifecycle,
            self.chat_route,
+            self.live_chat_route,
        )
        self.chatui_project_route = ChatUIProjectRoute(self.context, db)
        self.tools_root = ToolsRoute(self.context, core_lifecycle)
@@ -138,7 +140,6 @@ class AstrBotDashboard:
        self.kb_route = KnowledgeBaseRoute(self.context, core_lifecycle)
        self.platform_route = PlatformRoute(self.context, core_lifecycle)
        self.backup_route = BackupRoute(self.context, db, core_lifecycle)
-        self.live_chat_route = LiveChatRoute(self.context, db, core_lifecycle)

        self.app.add_url_rule(
            "/api/plug/<path:subpath>",
@@ -244,6 +245,7 @@ class AstrBotDashboard:
        scope_map = {
            "/api/v1/chat": "chat",
            "/api/v1/chat/ws": "chat",
+            "/api/v1/live/ws": "chat",
            "/api/v1/chat/sessions": "chat",
            "/api/v1/configs": "config",
            "/api/v1/file": "file",
@@ -2,27 +2,73 @@
  <div class="live-mode-container">
    <div class="header-controls">
      <v-btn icon="mdi-close" @click="handleClose" flat variant="text" />
-            <v-btn :icon="isCodeMode ? 'mdi-code-tags-check' : 'mdi-code-tags'" @click="toggleCodeMode" flat
-                variant="text" :color="isCodeMode ? 'primary' : ''" />
-            <v-btn :icon="isNervousMode ? 'mdi-emoticon-confused' : 'mdi-emoticon-confused-outline'"
-                @click="toggleNervousMode" flat variant="text" :color="isNervousMode ? 'primary' : ''" />
+      <v-btn
+        :icon="isCodeMode ? 'mdi-code-tags-check' : 'mdi-code-tags'"
+        @click="toggleCodeMode"
+        flat
+        variant="text"
+        :color="isCodeMode ? 'primary' : ''"
+      />
+      <v-btn
+        :icon="
+          isNervousMode
+            ? 'mdi-emoticon-confused'
+            : 'mdi-emoticon-confused-outline'
+        "
+        @click="toggleNervousMode"
+        flat
+        variant="text"
+        :color="isNervousMode ? 'primary' : ''"
+      />
    </div>

-        <span style="color: gray; padding-left: 16px;">We're developing Astr Live Mode on ChatUI & Desktop right now. Stay tuned!</span>
+    <span style="color: gray; padding-left: 16px"
+      >We're developing Astr Live Mode on ChatUI & Desktop right now. Stay
+      tuned!</span
+    >

    <div class="live-mode-content">
+      <div class="text-input-panel">
+        <v-text-field
+          v-model="textInput"
+          label="给 Live 发文字"
+          variant="outlined"
+          density="comfortable"
+          hide-details
+          placeholder="在这里输入要发给 Live 的文字"
+          :disabled="!isActive || !isConnected || isProcessing"
+          @keydown.enter.exact.prevent="sendTextInput"
+        />
+        <v-btn
+          :disabled="!canSendText"
+          color="primary"
+          icon="mdi-send"
+          @click="sendTextInput"
+        />
+      </div>
      <div class="center-circle-container" @click="handleCircleClick">
        <!-- 爆炸效果层 -->
        <div v-if="isExploding" class="explosion-wave"></div>

-                <SiriOrb :energy="orbEnergy" :mode="isActive ? orbMode : 'idle'" :is-dark="isDark"
-                    :code-mode="isCodeMode" :nervous-mode="isNervousMode" class="siri-orb" />
+        <SiriOrb
+          :energy="orbEnergy"
+          :mode="isActive ? orbMode : 'idle'"
+          :is-dark="isDark"
+          :code-mode="isCodeMode"
+          :nervous-mode="isNervousMode"
+          class="siri-orb"
+        />
      </div>
      <div class="status-text">
        {{ statusText }}
      </div>
      <div class="messages-container" v-if="messages.length > 0">
-                <div v-for="(msg, index) in messages" :key="index" class="message-item" :class="msg.type">
+        <div
+          v-for="(msg, index) in messages"
+          :key="index"
+          class="message-item"
+          :class="msg.type"
+        >
          <div class="message-content">
            {{ msg.text }}
          </div>
@@ -30,36 +76,52 @@
      </div>

      <div class="metrics-container" v-if="Object.keys(metrics).length > 0">
-                <span v-if="metrics.wav_assemble_time">WAV Assemble: {{ (metrics.wav_assemble_time * 1000).toFixed(0)
-                    }}ms</span>
-                <span v-if="metrics.llm_ttft">LLM First Token Latency: {{ (metrics.llm_ttft * 1000).toFixed(0)
-                    }}ms</span>
-                <span v-if="metrics.llm_total_time">LLM Total Latency: {{ (metrics.llm_total_time * 1000).toFixed(0)
-                    }}ms</span>
-                <span v-if="metrics.tts_first_frame_time">TTS First Frame Latency: {{ (metrics.tts_first_frame_time *
-                    1000).toFixed(0) }}ms</span>
-                <span v-if="metrics.tts_total_time">TTS Total Larency: {{ (metrics.tts_total_time * 1000).toFixed(0)
-                    }}ms</span>
-                <span v-if="metrics.speak_to_first_frame">Speak -> First TTS Frame: {{ (metrics.speak_to_first_frame *
-                    1000).toFixed(0) }}ms</span>
-                <span v-if="metrics.wav_to_tts_total_time">Speak -> End: {{ (metrics.wav_to_tts_total_time *
-                    1000).toFixed(0) }}ms</span>
+        <span v-if="metrics.wav_assemble_time"
+          >WAV Assemble:
+          {{ (metrics.wav_assemble_time * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.llm_ttft"
+          >LLM First Token Latency:
+          {{ (metrics.llm_ttft * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.llm_total_time"
+          >LLM Total Latency:
+          {{ (metrics.llm_total_time * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.tts_first_frame_time"
+          >TTS First Frame Latency:
+          {{ (metrics.tts_first_frame_time * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.tts_total_time"
+          >TTS Total Larency:
+          {{ (metrics.tts_total_time * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.speak_to_first_frame"
+          >Speak -> First TTS Frame:
+          {{ (metrics.speak_to_first_frame * 1000).toFixed(0) }}ms</span
+        >
+        <span v-if="metrics.wav_to_tts_total_time"
+          >Speak -> End:
+          {{ (metrics.wav_to_tts_total_time * 1000).toFixed(0) }}ms</span
+        >
        <span v-if="metrics.stt">STT Provider: {{ metrics.stt }}</span>
        <span v-if="metrics.tts">TTS Provider: {{ metrics.tts }}</span>
-                <span v-if="metrics.chat_model">Chat Model: {{ metrics.chat_model }}</span>
+        <span v-if="metrics.chat_model"
+          >Chat Model: {{ metrics.chat_model }}</span
+        >
      </div>
    </div>
  </div>
 </template>

 <script setup lang="ts">
-import { ref, computed, onBeforeUnmount, watch } from 'vue';
-import { useTheme } from 'vuetify';
-import { useVADRecording } from '@/composables/useVADRecording';
-import SiriOrb from './LiveOrb.vue';
+import { ref, computed, onBeforeUnmount, watch } from "vue";
+import { useTheme } from "vuetify";
+import { useVADRecording } from "@/composables/useVADRecording";
+import SiriOrb from "./LiveOrb.vue";

 const emit = defineEmits<{
-    'close': [];
+  close: [];
 }>();

 const theme = useTheme();
@@ -95,9 +157,10 @@ let isDecoding = false;
 let isPlayingAudio = false; // 内部状态：是否正在播放音频
 let currentSource: AudioBufferSourceNode | null = null;

-
 // 消息历史
-const messages = ref<Array<{ type: 'user' | 'bot', text: string }>>([]);
+const messages = ref<Array<{ type: "user" | "bot"; text: string }>>([]);
+const textInput = ref("");
+const isConnected = ref(false);

 interface LiveMetrics {
  wav_assemble_time?: number;
@@ -114,41 +177,51 @@ interface LiveMetrics {
 const metrics = ref<LiveMetrics>({});

 // 当前语音片段标记
-let currentStamp = '';
+let currentStamp = "";

 const statusText = computed(() => {
-    if (!isActive.value) return 'Astr Live';
-    if (isProcessing.value) return '正在处理...';
-    if (isSpeaking.value) return '正在说话...';
-    if (isListening.value) return '正在听...';
-    return '准备就绪';
+  if (!isActive.value) return "Astr Live";
+  if (isProcessing.value) return "正在处理...";
+  if (isSpeaking.value) return "正在说话...";
+  if (isListening.value) return "正在听...";
+  return "准备就绪";
 });

 const getIcon = computed(() => {
-    if (!isActive.value) return 'mdi-microphone';
-    if (isSpeaking.value) return 'mdi-account-voice';
-    if (isProcessing.value) return 'mdi-loading';
-    return 'mdi-check';
+  if (!isActive.value) return "mdi-microphone";
+  if (isSpeaking.value) return "mdi-account-voice";
+  if (isProcessing.value) return "mdi-loading";
+  return "mdi-check";
 });

 const getIconColor = computed(() => {
-    if (!isActive.value) return isDark.value ? 'white' : 'black';
-    if (isSpeaking.value) return 'success';
-    if (isProcessing.value) return 'warning';
-    return 'primary';
+  if (!isActive.value) return isDark.value ? "white" : "black";
+  if (isSpeaking.value) return "success";
+  if (isProcessing.value) return "warning";
+  return "primary";
 });

 const orbEnergy = computed(() => {
  if (isPlaying.value) return botEnergy.value;
-    if (isSpeaking.value || isListening.value) return vadRecording.audioEnergy.value;
+  if (isSpeaking.value || isListening.value)
+    return vadRecording.audioEnergy.value;
  return 0;
 });

 const orbMode = computed(() => {
-    if (isProcessing.value) return 'processing';
-    if (isPlaying.value) return 'speaking';
-    if (isSpeaking.value || isListening.value) return 'listening';
-    return 'idle';
+  if (isProcessing.value) return "processing";
+  if (isPlaying.value) return "speaking";
+  if (isSpeaking.value || isListening.value) return "listening";
+  return "idle";
+});
+
+const canSendText = computed(() => {
+  return (
+    isConnected.value &&
+    isActive.value &&
+    Boolean(textInput.value.trim()) &&
+    !isProcessing.value
+  );
 });

 async function handleCircleClick() {
@@ -183,64 +256,72 @@ async function startLiveMode() {
    await vadRecording.startRecording(
      // onSpeechStart 回调
      () => {
-                console.log('[Live Mode] VAD 检测到开始说话');
+        console.log("[Live Mode] VAD 检测到开始说话");
        isListening.value = false;
        currentStamp = generateStamp();

        // 发送开始说话消息
        if (ws && ws.readyState === WebSocket.OPEN) {
          metrics.value = {}; // Reset metrics
-                    ws.send(JSON.stringify({
-                        t: 'start_speaking',
-                        stamp: currentStamp
-                    }));
+          ws.send(
+            JSON.stringify({
+              t: "start_speaking",
+              stamp: currentStamp,
+            }),
+          );
        }
      },
      // onSpeechEnd 回调
      (audio: Float32Array) => {
-                console.log('[Live Mode] VAD 检测到语音结束，音频长度:', audio.length);
+        console.log("[Live Mode] VAD 检测到语音结束，音频长度:", audio.length);

        // 将完整音频转换为 PCM16 并发送
        if (ws && ws.readyState === WebSocket.OPEN) {
          const pcm16 = new Int16Array(audio.length);
          for (let i = 0; i < audio.length; i++) {
            const s = Math.max(-1, Math.min(1, audio[i]));
-                        pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+            pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
          }

          // Base64 编码（分块处理以避免堆栈溢出）
          const uint8 = new Uint8Array(pcm16.buffer);
-                    let base64 = '';
+          let base64 = "";
          const chunkSize = 0x8000; // 32KB chunks
          for (let i = 0; i < uint8.length; i += chunkSize) {
-                        const chunk = uint8.subarray(i, Math.min(i + chunkSize, uint8.length));
+            const chunk = uint8.subarray(
+              i,
+              Math.min(i + chunkSize, uint8.length),
+            );
            base64 += String.fromCharCode.apply(null, Array.from(chunk));
          }
          base64 = btoa(base64);

          // 发送完整音频
-                    ws.send(JSON.stringify({
-                        t: 'speaking_part',
-                        data: base64
-                    }));
+          ws.send(
+            JSON.stringify({
+              t: "speaking_part",
+              data: base64,
+            }),
+          );

          // 发送结束说话消息
-                    ws.send(JSON.stringify({
-                        t: 'end_speaking',
-                        stamp: currentStamp
-                    }));
+          ws.send(
+            JSON.stringify({
+              t: "end_speaking",
+              stamp: currentStamp,
+            }),
+          );

          isProcessing.value = true;
        }
-            }
+      },
    );

    isActive.value = true;
    isListening.value = true;
-
  } catch (error) {
-        console.error('启动 Live Mode 失败:', error);
-        alert('启动失败，请检查麦克风权限或网络连接');
+    console.error("启动 Live Mode 失败:", error);
+    alert("启动失败，请检查麦克风权限或网络连接");
    await stopLiveMode();
  }
 }
@@ -260,6 +341,9 @@ async function stopLiveMode() {
    audioContext = null;
  }

+  isConnected.value = false;
+  textInput.value = "";
+
  // 关闭 WebSocket
  if (ws) {
    ws.close();
@@ -274,37 +358,41 @@ async function stopLiveMode() {
 function connectWebSocket(): Promise<void> {
  return new Promise((resolve, reject) => {
    // 获取存储的 token
-        const token = localStorage.getItem('token');
+    const token = localStorage.getItem("token");
    if (!token) {
-            reject(new Error('未登录，请先登录'));
+      reject(new Error("未登录，请先登录"));
      return;
    }

-        const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
-        const wsUrl = `${protocol}//localhost:6185/api/live_chat/ws?token=${encodeURIComponent(token)}`;
+    const protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
+    const wsUrl = `${protocol}//localhost:6185/api/live_chat/ws?token=${encodeURIComponent(
+      token,
+    )}`;

    ws = new WebSocket(wsUrl);

    ws.onopen = () => {
-            console.log('[Live Mode] WebSocket 连接成功');
+      console.log("[Live Mode] WebSocket 连接成功");
+      isConnected.value = true;
      resolve();
    };

    ws.onerror = (error) => {
-            console.error('[Live Mode] WebSocket 错误:', error);
+      console.error("[Live Mode] WebSocket 错误:", error);
      reject(error);
    };

    ws.onmessage = handleWebSocketMessage;

    ws.onclose = () => {
-            console.log('[Live Mode] WebSocket 连接关闭');
+      console.log("[Live Mode] WebSocket 连接关闭");
+      isConnected.value = false;
    };

    // 超时处理
    setTimeout(() => {
      if (ws?.readyState !== WebSocket.OPEN) {
-                reject(new Error('WebSocket 连接超时'));
+        reject(new Error("WebSocket 连接超时"));
      }
    }, 5000);
  });
@@ -318,61 +406,82 @@ function handleWebSocketMessage(event: MessageEvent) {
    const msgType = message.t;

    switch (msgType) {
-            case 'user_msg':
+      case "user_msg":
        messages.value.push({
-                    type: 'user',
-                    text: message.data.text
+          type: "user",
+          text: message.data.text,
        });
        break;

-            case 'bot_text_chunk':
+      case "bot_text_chunk":
        messages.value.push({
-                    type: 'bot',
-                    text: message.data.text
+          type: "bot",
+          text: message.data.text,
        });
        break;

-            case 'bot_msg':
+      case "bot_msg":
        messages.value.push({
-                    type: 'bot',
-                    text: message.data.text
+          type: "bot",
+          text: message.data.text,
        });
        isProcessing.value = false;
        isListening.value = true;
        break;

-            case 'response':
+      case "response":
        // 音频数据
        playAudioChunk(message.data);
        break;

-            case 'stop_play':
+      case "stop_play":
        // 停止播放
        stopAudioPlayback();
        break;

-            case 'end':
+      case "end":
        // 处理完成
        isProcessing.value = false;
        isListening.value = true;
        break;

-            case 'error':
-                console.error('[Live Mode] 错误:', message.data);
-                alert('处理出错: ' + message.data);
+      case "error":
+        console.error("[Live Mode] 错误:", message.data);
+        alert("处理出错: " + message.data);
        isProcessing.value = false;
        isListening.value = true;
        break;

-            case 'metrics':
+      case "metrics":
        metrics.value = { ...metrics.value, ...message.data };
        break;
    }
  } catch (error) {
-        console.error('[Live Mode] 处理消息失败:', error);
+    console.error("[Live Mode] 处理消息失败:", error);
  }
 }

+function sendTextInput() { // Send the trimmed text-box content as a `text_input` frame over the open WebSocket.
+  const text = textInput.value.trim();
+  if (!isConnected.value || !text || isProcessing.value || !isActive.value) { // re-checks the same conditions as canSendText
+    return;
+  }
+
+  if (!ws || ws.readyState !== WebSocket.OPEN) { // the socket object itself must exist and be open
+    return;
+  }
+
+  ws.send(
+    JSON.stringify({
+      t: "text_input",
+      text,
+    }),
+  );
+
+  isProcessing.value = true; // reset to false by the bot_msg / end / error message handlers
+  textInput.value = ""; // clear the input after sending
+}
+
 function playAudioChunk(base64Data: string) {
  if (!audioContext) return;

@@ -389,9 +498,8 @@ function playAudioChunk(base64Data: string) {

    // 触发解码处理
    processRawAudioQueue();
-
  } catch (error) {
-        console.error('[Live Mode] 接收音频数据失败:', error);
+    console.error("[Live Mode] 接收音频数据失败:", error);
  }
 }

@@ -407,7 +515,9 @@ async function processRawAudioQueue() {

      try {
        // 解码
-                const audioBuffer = await audioContext.decodeAudioData(bytes.buffer as ArrayBuffer);
+        const audioBuffer = await audioContext.decodeAudioData(
+          bytes.buffer as ArrayBuffer,
+        );
        audioBufferQueue.push(audioBuffer);

        // 如果当前没有播放，立即开始播放
@@ -415,7 +525,7 @@ async function processRawAudioQueue() {
          playNextAudio();
        }
      } catch (err) {
-                console.error('[Live Mode] 解码音频失败:', err);
+        console.error("[Live Mode] 解码音频失败:", err);
      }
    }
  } finally {
@@ -461,9 +571,8 @@ function playNextAudio() {
      currentSource = null;
      playNextAudio();
    };
-
  } catch (error) {
-        console.error('[Live Mode] 播放音频失败:', error);
+    console.error("[Live Mode] 播放音频失败:", error);
    isPlayingAudio = false;
    isPlaying.value = false;
    playNextAudio(); // 尝试播放下一个
@@ -521,7 +630,7 @@ function updateBotEnergy() {

 function handleClose() {
  stopLiveMode();
-    emit('close');
+  emit("close");
 }

 function toggleCodeMode() {
@@ -537,7 +646,7 @@ watch(isSpeaking, (newVal) => {
  if (newVal && isPlaying.value) {
    // 用户在播放时开始说话，发送打断信号
    if (ws && ws.readyState === WebSocket.OPEN) {
-            ws.send(JSON.stringify({ t: 'interrupt' }));
+      ws.send(JSON.stringify({ t: "interrupt" }));
    }
    // 本地立即停止播放
    stopAudioPlayback();
@@ -555,7 +664,11 @@ onBeforeUnmount(() => {
  flex-direction: column;
  height: 100%;
  width: 100%;
-    background: linear-gradient(135deg, rgba(103, 58, 183, 0.05) 0%, rgba(63, 81, 181, 0.05) 100%);
+  background: linear-gradient(
+    135deg,
+    rgba(103, 58, 183, 0.05) 0%,
+    rgba(63, 81, 181, 0.05) 100%
+  );
 }

 .header-controls {
@@ -574,6 +687,21 @@ onBeforeUnmount(() => {
  padding: 40px;
 }

+.text-input-panel {
+  position: absolute;
+  top: 16px;
+  left: 16px;
+  right: 16px;
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  z-index: 15;
+}
+
+.text-input-panel .v-text-field {
+  flex: 1;
+}
+
 .center-circle-container {
  position: relative;
  display: flex;
@@ -617,7 +745,12 @@ onBeforeUnmount(() => {
  height: 150px;
  border-radius: 50%;
  opacity: 0.8;
-    background: radial-gradient(circle, transparent 50%, rgba(125, 80, 201, 0.8) 70%, transparent 100%);
+  background: radial-gradient(
+    circle,
+    transparent 50%,
+    rgba(125, 80, 201, 0.8) 70%,
+    transparent 100%
+  );
  animation: explode 3s cubic-bezier(0.16, 1, 0.3, 1) forwards;
  filter: blur(30px);
  z-index: 0;
@@ -640,7 +773,7 @@ onBeforeUnmount(() => {
  font-size: 24px;
  color: var(--v-theme-on-surface);
  margin-bottom: 40px;
-    font-family: 'Outfit', sans-serif;
+  font-family: "Outfit", sans-serif;
 }

 .messages-container {
@@ -98,14 +98,28 @@ axios.interceptors.request.use((config) => {
 // Some parts of the UI use fetch directly; without this, those requests will 401.
 const _origFetch = window.fetch.bind(window);
 window.fetch = (input: RequestInfo | URL, init?: RequestInit) => {
+  const requestUrl = (() => {
+    if (typeof input === 'string') return input;
+    if (input instanceof URL) return input.toString();
+    return input.url;
+  })();
+
+  let shouldAttachAuth = false;
+  try {
+    const resolvedUrl = new URL(requestUrl, window.location.origin);
+    shouldAttachAuth = resolvedUrl.origin === window.location.origin;
+  } catch (_) {
+    shouldAttachAuth = requestUrl.startsWith('/');
+  }
+
  const token = localStorage.getItem('token');
-  if (!token) return _origFetch(input, init);
+  const locale = localStorage.getItem('astrbot-locale');
+  if (!token && !locale) return _origFetch(input, init);

  const headers = new Headers(init?.headers || (typeof input !== 'string' && 'headers' in input ? (input as Request).headers : undefined));
-  if (!headers.has('Authorization')) {
+  if (shouldAttachAuth && token && !headers.has('Authorization')) {
    headers.set('Authorization', `Bearer ${token}`);
  }
-  const locale = localStorage.getItem('astrbot-locale');
  if (locale && !headers.has('Accept-Language')) {
    headers.set('Accept-Language', locale);
  }
@@ -29,6 +29,7 @@ X-API-Key: abk_xxx
 ## Common Endpoints

 - `POST /api/v1/chat`: send chat message (SSE stream, server generates UUID when `session_id` is omitted)
+- `GET /api/v1/live/ws`: Live API WebSocket (API Key auth, requires `username` query parameter, optional `ct=live|chat`)
 - `GET /api/v1/chat/sessions`: list sessions for a specific `username` with pagination
 - `GET /api/v1/configs`: list available config files
 - `POST /api/v1/file`: upload attachment
@@ -49,3 +50,7 @@ curl -N 'http://localhost:6185/api/v1/chat' \
 Use the interactive docs:

 - https://docs.astrbot.app/scalar.html
+
+For the full Live API wire protocol, see:
+
+- `docs/live-api/README.md`
@@ -0,0 +1,434 @@
+# AstrBot Live API Protocol
+
+This document describes the current WebSocket protocol for the AstrBot Live API.
+
+## Endpoint
+
+- Legacy JWT endpoint: `/api/live_chat/ws`
+- Legacy unified JWT endpoint: `/api/unified_chat/ws`
+- Open API endpoint: `/api/v1/live/ws`
+
+## Authentication
+
+### Legacy dashboard endpoints
+
+Pass a dashboard JWT in the `token` query parameter.
+
+Example:
+
+```text
+ws://localhost:6185/api/live_chat/ws?token=<dashboard_jwt>
+```
+
+### Open API endpoint
+
+Use an API key and provide `username` in the query string.
+
+Examples:
+
+```text
+ws://localhost:6185/api/v1/live/ws?api_key=<api_key>&username=alice
+ws://localhost:6185/api/v1/live/ws?api_key=<api_key>&username=alice&ct=chat
+```
+
+`ct` values:
+
+- `live`: voice conversation mode
+- `chat`: unified chat mode over the same WebSocket transport
+
+The Open API endpoint reuses the `chat` API key scope.
+
+## Transport
+
+- Protocol: WebSocket
+- Payload format: UTF-8 JSON text frames
+- Audio upload format in `live` mode:
+  - client sends raw PCM frames encoded as Base64
+  - sample rate: `16000`
+  - channels: `1`
+  - sample width: `16-bit`
+
+## Top-Level Envelope
+
+### Client to server
+
+```json
+{
+  "t": "message_type",
+  "...": "message specific fields"
+}
+```
+
+When using the unified socket, the client can also include:
+
+```json
+{
+  "ct": "live|chat",
+  "t": "message_type"
+}
+```
+
+### Server to client
+
+Legacy `live` mode uses:
+
+```json
+{
+  "t": "message_type",
+  "data": {}
+}
+```
+
+Unified `chat` mode uses:
+
+```json
+{
+  "ct": "chat",
+  "type": "message_type",
+  "data": {}
+}
+```
+
+Some forwarded `chat` frames may also contain `t`, `streaming`, `chain_type`, `message_id`, or `session_id`.
+
+## Live Mode
+
+### Client messages
+
+#### `start_speaking`
+
+Start a voice capture segment.
+
+```json
+{
+  "t": "start_speaking",
+  "stamp": "seg_001"
+}
+```
+
+#### `speaking_part`
+
+Send one audio frame.
+
+```json
+{
+  "t": "speaking_part",
+  "data": "<base64_pcm_bytes>"
+}
+```
+
+#### `end_speaking`
+
+Finish the current voice capture segment.
+
+```json
+{
+  "t": "end_speaking",
+  "stamp": "seg_001"
+}
+```
+
+#### `text_input`
+
+Send plain text directly while using `ct=live`. The server still routes the message through Live mode, including TTS and interrupt handling.
+
+```json
+{
+  "t": "text_input",
+  "text": "Hello, what is the weather today?"
+}
+```
+
+#### `interrupt`
+
+Interrupt the current model or TTS response.
+
+```json
+{
+  "t": "interrupt"
+}
+```
+
+### Server messages
+
+#### `metrics`
+
+Performance and provider metadata.
+
+Example:
+
+```json
+{
+  "t": "metrics",
+  "data": {
+    "wav_assemble_time": 0.12,
+    "stt": "whisper_api",
+    "llm_ttft": 0.84,
+    "tts_total_time": 1.72
+  }
+}
+```
+
+#### `user_msg`
+
+STT result from the uploaded audio.
+
+```json
+{
+  "t": "user_msg",
+  "data": {
+    "text": "Hello there",
+    "ts": 1710000000000
+  }
+}
+```
+
+#### `bot_delta_chunk`
+
+Raw model text delta. This is the token or chunk level stream and is not sentence segmented.
+
+```json
+{
+  "t": "bot_delta_chunk",
+  "data": {
+    "text": "Hel"
+  }
+}
+```
+
+Notes:
+
+- This event is generated directly from the model streaming path.
+- It is independent from TTS chunking.
+- Consumers should append `data.text` to a local buffer.
+
+#### `bot_text_chunk`
+
+Text associated with the current TTS chunk. This is usually sentence or phrase segmented.
+
+```json
+{
+  "t": "bot_text_chunk",
+  "data": {
+    "text": "Hello there."
+  }
+}
+```
+
+Notes:
+
+- This event is aligned to TTS output, not raw token streaming.
+- It may be coarser than `bot_delta_chunk`.
+
+#### `response`
+
+One TTS audio chunk, Base64 encoded.
+
+```json
+{
+  "t": "response",
+  "data": "<base64_audio_bytes>"
+}
+```
+
+#### `bot_msg`
+
+Final bot text, sent when the response completes without audio streaming.
+
+```json
+{
+  "t": "bot_msg",
+  "data": {
+    "text": "Final reply text",
+    "ts": 1710000001234
+  }
+}
+```
+
+#### `stop_play`
+
+Stop client-side audio playback because the response was interrupted.
+
+```json
+{
+  "t": "stop_play"
+}
+```
+
+#### `end`
+
+Marks the end of the current response turn.
+
+```json
+{
+  "t": "end"
+}
+```
+
+#### `error`
+
+Recoverable or terminal processing error.
+
+```json
+{
+  "t": "error",
+  "data": "error message"
+}
+```
+
+## Unified Chat Mode
+
+Set `ct=chat` on the Open API endpoint or include `"ct": "chat"` in each client frame when using `/api/unified_chat/ws`.
+
+### Client messages
+
+#### `bind`
+
+Subscribe to an existing webchat session.
+
+```json
+{
+  "ct": "chat",
+  "t": "bind",
+  "session_id": "session_001"
+}
+```
+
+#### `send`
+
+Send a chat request.
+
+```json
+{
+  "ct": "chat",
+  "t": "send",
+  "username": "alice",
+  "session_id": "session_001",
+  "message_id": "msg_001",
+  "message": [
+    {
+      "type": "plain",
+      "text": "Please summarize this"
+    }
+  ],
+  "selected_provider": "openai_chat_completion",
+  "selected_model": "gpt-4.1-mini",
+  "enable_streaming": true
+}
+```
+
+`message` uses the same message-part schema as `POST /api/v1/chat`.
+
+#### `interrupt`
+
+Interrupt the current chat response.
+
+```json
+{
+  "ct": "chat",
+  "t": "interrupt"
+}
+```
+
+### Server messages
+
+#### `session_bound`
+
+Acknowledges a successful `bind`.
+
+```json
+{
+  "ct": "chat",
+  "type": "session_bound",
+  "session_id": "session_001",
+  "message_id": "ws_sub_xxx"
+}
+```
+
+#### Forwarded streaming events
+
+The server forwards the normal webchat queue payloads. Common examples:
+
+```json
+{
+  "ct": "chat",
+  "type": "plain",
+  "data": "Hello",
+  "streaming": true,
+  "chain_type": null,
+  "message_id": "msg_001"
+}
+```
+
+```json
+{
+  "ct": "chat",
+  "type": "image",
+  "data": "[IMAGE]file.jpg",
+  "streaming": false,
+  "message_id": "msg_001"
+}
+```
+
+```json
+{
+  "ct": "chat",
+  "type": "agent_stats",
+  "data": {
+    "time_to_first_token": 0.8
+  }
+}
+```
+
+```json
+{
+  "ct": "chat",
+  "type": "message_saved",
+  "data": {
+    "id": 123,
+    "created_at": "2026-03-16T10:00:00Z"
+  }
+}
+```
+
+```json
+{
+  "ct": "chat",
+  "type": "end",
+  "data": "",
+  "streaming": false,
+  "message_id": "msg_001"
+}
+```
+
+#### Chat errors
+
+```json
+{
+  "ct": "chat",
+  "t": "error",
+  "code": "INVALID_MESSAGE_FORMAT",
+  "data": "message must be list"
+}
+```
+
+## Recommended Client Strategy
+
+For `live` mode:
+
+1. Append every `bot_delta_chunk.data.text` into a raw transcript buffer.
+2. Use `bot_text_chunk` only when you need text aligned with audio playback.
+3. Decode and play each `response` audio chunk in arrival order.
+4. Reset per-turn buffers after `end`.
+
+For `chat` mode:
+
+1. Treat `plain + streaming=true` as incremental text.
+2. Treat `complete` or `end` as the end of a response turn.
+3. Persist `message_saved` metadata if you need server-side history IDs.
+
+## Compatibility Notes
+
+- `bot_text_chunk` remains sentence or phrase segmented for TTS compatibility.
+- `bot_delta_chunk` is the new delta-level text event for real-time rendering.
+- The legacy JWT endpoints and the new Open API endpoint share the same runtime behavior after authentication.
@@ -257,6 +257,56 @@
        }
      }
    },
+    "/api/v1/live/ws": {
+      "get": {
+        "tags": [
+          "Open API"
+        ],
+        "summary": "Live API WebSocket",
+        "description": "WebSocket endpoint for Live API. Authenticate with API Key using query parameter `api_key` or header `Authorization: Bearer <api_key>`, and pass `username` as a query parameter. Use `ct=live` for voice mode or `ct=chat` for unified chat mode. See docs/live-api/README.md for the full frame-level protocol.",
+        "security": [
+          {
+            "ApiKeyHeader": []
+          }
+        ],
+        "parameters": [
+          {
+            "name": "username",
+            "in": "query",
+            "required": true,
+            "schema": {
+              "type": "string"
+            },
+            "description": "Target username for the live session."
+          },
+          {
+            "name": "ct",
+            "in": "query",
+            "schema": {
+              "type": "string",
+              "enum": [
+                "live",
+                "chat"
+              ],
+              "default": "live"
+            },
+            "description": "Session mode. `live` for voice conversation, `chat` for the unified chat WebSocket."
+          }
+        ],
+        "responses": {
+          "101": {
+            "description": "WebSocket protocol switch"
+          },
+          "401": {
+            "$ref": "#/components/responses/Unauthorized"
+          },
+          "403": {
+            "$ref": "#/components/responses/Forbidden"
+          }
+        },
+        "x-websocket": true
+      }
+    },
    "/api/v1/im/message": {
      "post": {
        "tags": [
@@ -46,6 +46,7 @@ X-API-Key: abk_xxx
 调用 AstrBot 内建的 Agent 进行对话交互。支持插件调用、工具调用等能力，与 IM 端对话能力一致。

 - `POST /api/v1/chat`：发送对话消息（SSE 流式返回，不传 `session_id` 会自动创建 UUID）
+- `GET /api/v1/live/ws`：Live API WebSocket（API Key 鉴权，查询参数必须包含 `username`，可选 `ct=live|chat`）
 - `GET /api/v1/chat/sessions`：分页获取指定 `username` 的会话
 - `GET /api/v1/configs`：获取可用配置文件列表

@@ -148,3 +149,7 @@ curl -N 'http://localhost:6185/api/v1/chat' \
 交互式 API 文档请查看：

 - https://docs.astrbot.app/scalar.html
+
+Live API 协议说明请查看：
+
+- `docs/live-api/README.md`
@@ -257,6 +257,56 @@
        }
      }
    },
+    "/api/v1/live/ws": {
+      "get": {
+        "tags": [
+          "Open API"
+        ],
+        "summary": "Live API WebSocket",
+        "description": "WebSocket endpoint for Live API. Authenticate with API Key using query parameter `api_key` or header `Authorization: Bearer <api_key>`, and pass `username` as a query parameter. Use `ct=live` for voice mode or `ct=chat` for unified chat mode. See docs/live-api/README.md for the full frame-level protocol.",
+        "security": [
+          {
+            "ApiKeyHeader": []
+          }
+        ],
+        "parameters": [
+          {
+            "name": "username",
+            "in": "query",
+            "required": true,
+            "schema": {
+              "type": "string"
+            },
+            "description": "Target username for the live session."
+          },
+          {
+            "name": "ct",
+            "in": "query",
+            "schema": {
+              "type": "string",
+              "enum": [
+                "live",
+                "chat"
+              ],
+              "default": "live"
+            },
+            "description": "Session mode. `live` for voice conversation, `chat` for the unified chat WebSocket."
+          }
+        ],
+        "responses": {
+          "101": {
+            "description": "WebSocket protocol switch"
+          },
+          "401": {
+            "$ref": "#/components/responses/Unauthorized"
+          },
+          "403": {
+            "$ref": "#/components/responses/Forbidden"
+          }
+        },
+        "x-websocket": true
+      }
+    },
    "/api/v1/im/message": {
      "post": {
        "tags": [