From c5a2827def0e69b6c924a199442deef035bcebf6 Mon Sep 17 00:00:00 2001
From: kawayiYokami <289104862@qq.com>
Date: Thu, 25 Dec 2025 03:54:05 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A4=9A=E6=96=87=E6=9C=AC=E5=9D=97?=
 =?UTF-8?q?=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 astrbot/core/provider/entities.py             | 38 ++++++--
 .../core/provider/sources/anthropic_source.py | 91 ++++++++++++-------
 .../core/provider/sources/gemini_source.py    | 37 ++++++--
 .../core/provider/sources/openai_source.py    | 31 +++++--
 packages/astrbot/process_llm_request.py       | 29 ++++--
 5 files changed, 160 insertions(+), 66 deletions(-)

diff --git a/astrbot/core/provider/entities.py b/astrbot/core/provider/entities.py
index d13e9b56a..5f794442e 100644
--- a/astrbot/core/provider/entities.py
+++ b/astrbot/core/provider/entities.py
@@ -92,6 +92,8 @@ class ProviderRequest:
     """会话 ID"""
     image_urls: list[str] = field(default_factory=list)
     """图片 URL 列表"""
+    extra_content_blocks: list[dict] = field(default_factory=list)
+    """额外的内容块列表，用于在用户消息后添加额外的文本块（如系统提醒、指令等）"""
     func_tool: ToolSet | None = None
     """可用的函数工具"""
     contexts: list[dict] = field(default_factory=list)
@@ -166,13 +168,21 @@ class ProviderRequest:
 
     async def assemble_context(self) -> dict:
         """将请求(prompt 和 image_urls)包装成 OpenAI 的消息格式。"""
+        # Build the list of content blocks
+        content_blocks = []
+
+        # 1. User's own message first (the author attributes this to OpenAI advice)
+        if self.prompt and self.prompt.strip():
+            content_blocks.append({"type": "text", "text": self.prompt})
+        elif self.image_urls:
+            # No usable text but there are images: add a placeholder text block
+            content_blocks.append({"type": "text", "text": "[图片]"})
+
+        # 2. Extra content blocks (system reminders, instructions, etc.)
+        content_blocks.extend(self.extra_content_blocks)
+
+        # 3. Image content
         if self.image_urls:
-            user_content = {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": self.prompt if self.prompt else "[图片]"},
-                ],
-            }
             for image_url in self.image_urls:
                 if image_url.startswith("http"):
                     image_path = await download_image_by_url(image_url)
@@ -185,11 +195,21 @@ class ProviderRequest:
                 if not image_data:
                     logger.warning(f"图片 {image_url} 得到的结果为空，将忽略。")
                     continue
-                user_content["content"].append(
+                content_blocks.append(
                     {"type": "image_url", "image_url": {"url": image_data}},
                 )
-            return user_content
-        return {"role": "user", "content": self.prompt}
+
+        # Collapse to a plain string only for a lone prompt text block (backward compat)
+        if (
+            len(content_blocks) == 1
+            and content_blocks[0]["type"] == "text"
+            and not self.extra_content_blocks
+            and not self.image_urls
+        ):
+            return {"role": "user", "content": content_blocks[0]["text"]}
+
+        # Otherwise return the multi-part (list of blocks) format
+        return {"role": "user", "content": content_blocks}
 
     async def _encode_image_bs64(self, image_url: str) -> str:
         """将图片转换为 base64"""
diff --git a/astrbot/core/provider/sources/anthropic_source.py b/astrbot/core/provider/sources/anthropic_source.py
index 0ff61e393..d982af2e4 100644
--- a/astrbot/core/provider/sources/anthropic_source.py
+++ b/astrbot/core/provider/sources/anthropic_source.py
@@ -388,48 +388,71 @@ class ProviderAnthropic(Provider):
         async for llm_response in self._query_stream(payloads, func_tool):
             yield llm_response
 
-    async def assemble_context(self, text: str, image_urls: list[str] | None = None):
+    async def assemble_context(
+        self,
+        text: str,
+        image_urls: list[str] | None = None,
+        extra_content_blocks: list[dict] | None = None,
+    ):
         """组装上下文，支持文本和图片"""
-        if not image_urls:
-            return {"role": "user", "content": text}
-
         content = []
-        content.append({"type": "text", "text": text})
 
-        for image_url in image_urls:
-            if image_url.startswith("http"):
-                image_path = await download_image_by_url(image_url)
-                image_data = await self.encode_image_bs64(image_path)
-            elif image_url.startswith("file:///"):
-                image_path = image_url.replace("file:///", "")
-                image_data = await self.encode_image_bs64(image_path)
-            else:
-                image_data = await self.encode_image_bs64(image_url)
+        # 1. User's own message first (the author attributes this to OpenAI advice)
+        if text:
+            content.append({"type": "text", "text": text})
+        elif image_urls:
+            # No text but there are images: add a placeholder text block
+            content.append({"type": "text", "text": "[图片]"})
 
-            if not image_data:
-                logger.warning(f"图片 {image_url} 得到的结果为空，将忽略。")
-                continue
+        # 2. Extra content blocks (system reminders, instructions, etc.)
+        if extra_content_blocks:
+            # Keep only text blocks (author: Anthropic mainly takes text/images)
+            text_blocks = [
+                block for block in extra_content_blocks if block.get("type") == "text"
+            ]
+            content.extend(text_blocks)
 
-            # Get mime type for the image
-            mime_type, _ = guess_type(image_url)
-            if not mime_type:
-                mime_type = "image/jpeg"  # Default to JPEG if can't determine
+        # 3. 图片内容
+        if image_urls:
+            for image_url in image_urls:
+                if image_url.startswith("http"):
+                    image_path = await download_image_by_url(image_url)
+                    image_data = await self.encode_image_bs64(image_path)
+                elif image_url.startswith("file:///"):
+                    image_path = image_url.replace("file:///", "")
+                    image_data = await self.encode_image_bs64(image_path)
+                else:
+                    image_data = await self.encode_image_bs64(image_url)
 
-            content.append(
-                {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": mime_type,
-                        "data": (
-                            image_data.split("base64,")[1]
-                            if "base64," in image_data
-                            else image_data
-                        ),
+                if not image_data:
+                    logger.warning(f"图片 {image_url} 得到的结果为空，将忽略。")
+                    continue
+
+                # Get mime type for the image
+                mime_type, _ = guess_type(image_url)
+                if not mime_type:
+                    mime_type = "image/jpeg"  # Default to JPEG if can't determine
+
+                content.append(
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": mime_type,
+                            "data": (
+                                image_data.split("base64,")[1]
+                                if "base64," in image_data
+                                else image_data
+                            ),
+                        },
                     },
-                },
-            )
+                )
 
+        # Exactly one text block and nothing else: return a plain string (backward compat)
+        if len(content) == 1 and content[0]["type"] == "text":
+            return {"role": "user", "content": content[0]["text"]}
+
+        # Otherwise return the multi-part (list of blocks) format
         return {"role": "user", "content": content}
 
     async def encode_image_bs64(self, image_url: str) -> str:
diff --git a/astrbot/core/provider/sources/gemini_source.py b/astrbot/core/provider/sources/gemini_source.py
index 7f3700643..487acd431 100644
--- a/astrbot/core/provider/sources/gemini_source.py
+++ b/astrbot/core/provider/sources/gemini_source.py
@@ -797,13 +797,29 @@ class ProviderGoogleGenAI(Provider):
         self.chosen_api_key = key
         self._init_client()
 
-    async def assemble_context(self, text: str, image_urls: list[str] | None = None):
+    async def assemble_context(
+        self,
+        text: str,
+        image_urls: list[str] | None = None,
+        extra_content_blocks: list[dict] | None = None,
+    ):
         """组装上下文。"""
+        # Build the list of content blocks
+        content_blocks = []
+
+        # 1. User's own message first (the author attributes this to OpenAI advice)
+        if text:
+            content_blocks.append({"type": "text", "text": text})
+        elif image_urls:
+            # No text but there are images: add a placeholder text block
+            content_blocks.append({"type": "text", "text": "[图片]"})
+
+        # 2. Extra content blocks (system reminders, instructions, etc.)
+        if extra_content_blocks:
+            content_blocks.extend(extra_content_blocks)
+
+        # 3. Image content
         if image_urls:
-            user_content = {
-                "role": "user",
-                "content": [{"type": "text", "text": text if text else "[图片]"}],
-            }
             for image_url in image_urls:
                 if image_url.startswith("http"):
                     image_path = await download_image_by_url(image_url)
@@ -816,14 +832,19 @@ class ProviderGoogleGenAI(Provider):
                 if not image_data:
                     logger.warning(f"图片 {image_url} 得到的结果为空，将忽略。")
                     continue
-                user_content["content"].append(
+                content_blocks.append(
                     {
                         "type": "image_url",
                         "image_url": {"url": image_data},
                     },
                 )
-            return user_content
-        return {"role": "user", "content": text}
+
+        # Exactly one text block (may be a lone extra block if text is empty): plain string
+        if len(content_blocks) == 1 and content_blocks[0]["type"] == "text":
+            return {"role": "user", "content": content_blocks[0]["text"]}
+
+        # Otherwise return the multi-part (list of blocks) format
+        return {"role": "user", "content": content_blocks}
 
     async def encode_image_bs64(self, image_url: str) -> str:
         """将图片转换为 base64"""
diff --git a/astrbot/core/provider/sources/openai_source.py b/astrbot/core/provider/sources/openai_source.py
index a716d0a5a..97bb992e7 100644
--- a/astrbot/core/provider/sources/openai_source.py
+++ b/astrbot/core/provider/sources/openai_source.py
@@ -624,13 +624,25 @@ class ProviderOpenAIOfficial(Provider):
         self,
         text: str,
         image_urls: list[str] | None = None,
+        extra_content_blocks: list[dict] | None = None,
     ) -> dict:
         """组装成符合 OpenAI 格式的 role 为 user 的消息段"""
+        # Build the list of content blocks
+        content_blocks = []
+
+        # 1. User's own message first (the author attributes this to OpenAI advice)
+        if text:
+            content_blocks.append({"type": "text", "text": text})
+        elif image_urls:
+            # No text but there are images: add a placeholder text block
+            content_blocks.append({"type": "text", "text": "[图片]"})
+
+        # 2. Extra content blocks (system reminders, instructions, etc.)
+        if extra_content_blocks:
+            content_blocks.extend(extra_content_blocks)
+
+        # 3. Image content
         if image_urls:
-            user_content = {
-                "role": "user",
-                "content": [{"type": "text", "text": text if text else "[图片]"}],
-            }
             for image_url in image_urls:
                 if image_url.startswith("http"):
                     image_path = await download_image_by_url(image_url)
@@ -643,14 +655,19 @@ class ProviderOpenAIOfficial(Provider):
                 if not image_data:
                     logger.warning(f"图片 {image_url} 得到的结果为空，将忽略。")
                     continue
-                user_content["content"].append(
+                content_blocks.append(
                     {
                         "type": "image_url",
                         "image_url": {"url": image_data},
                     },
                 )
-            return user_content
-        return {"role": "user", "content": text}
+
+        # Exactly one text block (may be a lone extra block if text is empty): plain string
+        if len(content_blocks) == 1 and content_blocks[0]["type"] == "text":
+            return {"role": "user", "content": content_blocks[0]["text"]}
+
+        # Otherwise return the multi-part (list of blocks) format
+        return {"role": "user", "content": content_blocks}
 
     async def encode_image_bs64(self, image_url: str) -> str:
         """将图片转换为 base64"""
diff --git a/packages/astrbot/process_llm_request.py b/packages/astrbot/process_llm_request.py
index 89a4df3a2..532aac219 100644
--- a/packages/astrbot/process_llm_request.py
+++ b/packages/astrbot/process_llm_request.py
@@ -85,7 +85,12 @@ class ProcessLLMRequest:
                 req.image_urls,
             )
             if caption:
-                req.prompt = f"(Image Caption: {caption})\n\n{req.prompt}"
+                req.extra_content_blocks.append(
+                    {
+                        "type": "text",
+                        "text": f"<image_caption>{caption}</image_caption>",
+                    }
+                )
                 req.image_urls = []
         except Exception as e:
             logger.error(f"处理图片描述失败: {e}")
@@ -129,13 +134,14 @@ class ProcessLLMRequest:
             else:
                 req.prompt = prefix + req.prompt
 
+        # Collect system-reminder entries (appended to extra_content_blocks further down)
+        system_parts = []
+
         # user identifier
         if cfg.get("identifier"):
             user_id = event.message_obj.sender.user_id
             user_nickname = event.message_obj.sender.nickname
-            req.prompt = (
-                f"\n[User ID: {user_id}, Nickname: {user_nickname}]\n{req.prompt}"
-            )
+            system_parts.append(f"User ID: {user_id}, Nickname: {user_nickname}")
 
         # group name identifier
         if cfg.get("group_name_display") and event.message_obj.group_id:
@@ -146,7 +152,7 @@ class ProcessLLMRequest:
                 return
             group_name = event.message_obj.group.group_name
             if group_name:
-                req.system_prompt += f"\nGroup name: {group_name}\n"
+                system_parts.append(f"Group name: {group_name}")
 
         # time info
         if cfg.get("datetime_system_prompt"):
@@ -162,7 +168,7 @@ class ProcessLLMRequest:
                 current_time = (
                     datetime.datetime.now().astimezone().strftime("%Y-%m-%d %H:%M (%Z)")
                 )
-            req.system_prompt += f"\nCurrent datetime: {current_time}\n"
+            system_parts.append(f"Current datetime: {current_time}")
 
         img_cap_prov_id: str = cfg.get("default_image_caption_provider_id") or ""
         if req.conversation:
@@ -225,10 +231,17 @@ class ProcessLLMRequest:
                 except BaseException as e:
                     logger.error(f"处理引用图片失败: {e}")
 
-            # 3. 将所有部分组合成文本并直接注入到当前消息中
+            # 3. 将所有部分组合成文本并添加到 extra_content_blocks 中
             # 确保引用内容被正确的标签包裹
             quoted_content = "\n".join(content_parts)
             # 确保所有内容都在<Quoted Message>标签内
             quoted_text = f"<Quoted Message>\n{quoted_content}\n</Quoted Message>"
 
-            req.prompt = f"{quoted_text}\n\n{req.prompt}"
+            req.extra_content_blocks.append({"type": "text", "text": quoted_text})
+
+        # Wrap all reminders in one block, one per line so entries stay separable
+        if system_parts:
+            system_content = (
+                "<system_reminder>" + "\n".join(system_parts) + "</system_reminder>"
+            )
+            req.extra_content_blocks.append({"type": "text", "text": system_content})