Feature: 增加图片转述提供商配置、支持用户自定义模型模态能力 (#2422)

* feat: 增加图片转述提供商配置、支持用户自定义模型模态能力
* fix: 修复 LLMRequestSubStage 中会话管理方法参数不一致的问题，简化方法调用
2025-08-13 19:11:17 +08:00
parent 0759a11a85
commit 1d766001bb
9 changed files with 124 additions and 38 deletions
@@ -559,6 +559,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "gpt-4o-mini",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Azure OpenAI": {
                        "id": "azure",
@@ -573,6 +574,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "gpt-4o-mini",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "xAI": {
                        "id": "xai",
@@ -586,6 +588,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "grok-2-latest",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Anthropic": {
                        "id": "claude",
@@ -600,6 +603,7 @@ CONFIG_METADATA_2 = {
                            "model": "claude-3-5-sonnet-latest",
                            "max_tokens": 4096,
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Ollama": {
                        "id": "ollama_default",
@@ -612,6 +616,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "llama3.1-8b",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "LM Studio": {
                        "id": "lm_studio",
@@ -624,6 +629,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "llama-3.1-8b",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Gemini(OpenAI兼容)": {
                        "id": "gemini_default",
@@ -637,6 +643,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "gemini-1.5-flash",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Gemini": {
                        "id": "gemini_default",
@@ -663,6 +670,7 @@ CONFIG_METADATA_2 = {
                        "gm_thinking_config": {
                            "budget": 0,
                        },
+                        "modalities": ["text", "image"],
                    },
                    "DeepSeek": {
                        "id": "deepseek_default",
@@ -676,6 +684,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "deepseek-chat",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "302.AI": {
                        "id": "302ai",
@@ -689,6 +698,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "gpt-4.1-mini",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "硅基流动": {
                        "id": "siliconflow",
@@ -702,6 +712,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "deepseek-ai/DeepSeek-V3",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "PPIO派欧云": {
                        "id": "ppio",
@@ -715,6 +726,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "deepseek/deepseek-r1",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Kimi": {
                        "id": "moonshot",
@@ -728,6 +740,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "moonshot-v1-8k",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "智谱 AI": {
                        "id": "zhipu_default",
@@ -741,6 +754,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "glm-4-flash",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "Dify": {
                        "id": "dify_app_default",
@@ -785,6 +799,7 @@ CONFIG_METADATA_2 = {
                        "model_config": {
                            "model": "Qwen/Qwen3-32B",
                        },
+                        "modalities": ["text", "image"],
                    },
                    "FastGPT": {
                        "id": "fastgpt",
@@ -1001,6 +1016,15 @@ CONFIG_METADATA_2 = {
                    },
                },
                "items": {
+                    "modalities": {
+                        "description": "模型能力",
+                        "type": "list",
+                        "items": {"type": "string"},
+                        "options": ["text", "image"],
+                        "labels": ["文本", "图像"],
+                        "render_type": "checkbox",
+                        "hint": "模型支持的模态。如所填写的模型不支持图像，请取消勾选图像。",
+                    },
                    "provider": {
                        "type": "string",
                        "invisible": True,
@@ -65,6 +65,20 @@ class LLMRequestSubStage(Stage):

        return _ctx.get_using_provider(umo=event.unified_msg_origin)

+    async def _get_session_conv(self, event: AstrMessageEvent):  # Return the conversation for this event's message origin, creating one when needed.
+        umo = event.unified_msg_origin
+        conv_mgr = self.conv_manager
+
+        # Get the conversation context
+        cid = await conv_mgr.get_curr_conversation_id(umo)
+        if not cid:  # no current conversation id for this origin: start a new conversation
+            cid = await conv_mgr.new_conversation(umo, event.get_platform_id())
+        conversation = await conv_mgr.get_conversation(umo, cid)
+        if not conversation:  # the id did not resolve to a conversation: create a fresh one and fetch it
+            cid = await conv_mgr.new_conversation(umo, event.get_platform_id())
+            conversation = await conv_mgr.get_conversation(umo, cid)
+        return conversation
+
    async def process(
        self, event: AstrMessageEvent, _nested: bool = False
    ) -> Union[None, AsyncGenerator[None, None]]:
@@ -107,24 +121,7 @@ class LLMRequestSubStage(Stage):
                    image_path = await comp.convert_to_file_path()
                    req.image_urls.append(image_path)

-            # 获取对话上下文
-            conversation_id = await self.conv_manager.get_curr_conversation_id(
-                event.unified_msg_origin
-            )
-            if not conversation_id:
-                conversation_id = await self.conv_manager.new_conversation(
-                    event.unified_msg_origin, event.get_platform_id()
-                )
-            conversation = await self.conv_manager.get_conversation(
-                event.unified_msg_origin, conversation_id
-            )
-            if not conversation:
-                conversation_id = await self.conv_manager.new_conversation(
-                    event.unified_msg_origin, event.get_platform_id()
-                )
-                conversation = await self.conv_manager.get_conversation(
-                    event.unified_msg_origin, conversation_id
-                )
+            conversation = await self._get_session_conv(event)
            req.conversation = conversation
            req.contexts = json.loads(conversation.history)

@@ -168,6 +165,13 @@ class LLMRequestSubStage(Stage):
        # fix messages
        req.contexts = self.fix_messages(req.contexts)

+        # check provider modalities
+        # 如果提供商不支持图像，但请求中包含图像，则清空图像列表。图片转述的检测和调用发生在这之前，因此这里可以这样处理。
+        if req.image_urls:
+            provider_cfg = provider.provider_config.get("modalities", ["text", "image"])
+            if "image" not in provider_cfg:
+                req.image_urls = []
+
        # Call Agent
        tool_loop_agent = ToolLoopAgent(
            provider=provider,
@@ -98,9 +98,35 @@ function saveEditedContent() {

            <v-col cols="12" sm="5" class="config-input">
              <div v-if="metadata[metadataKey].items[key]" class="w-100">
+                <!-- List item with options, rendered as checkboxes (render_type === 'checkbox') -->
+                <div v-if="metadata[metadataKey].items[key]?.type === 'list' && metadata[metadataKey].items[key]?.options && !metadata[metadataKey].items[key]?.invisible && metadata[metadataKey].items[key]?.render_type === 'checkbox'" 
+                  class="d-flex flex-wrap gap-20">
+                  <v-checkbox
+                    v-for="(option, index) in metadata[metadataKey].items[key]?.options"
+                    v-model="iterable[key]"
+                    :label="metadata[metadataKey].items[key]?.labels ? metadata[metadataKey].items[key].labels[index] : option"
+                    :value="option"
+                    class="mr-2"
+                    color="primary"
+                    hide-details
+                  ></v-checkbox>
+                </div>
+                <!-- List item with options, rendered as a multi-select combobox -->
+                <v-combobox
+                  v-else-if="metadata[metadataKey].items[key]?.type === 'list' && metadata[metadataKey].items[key]?.options && !metadata[metadataKey].items[key]?.invisible"
+                  v-model="iterable[key]"
+                  :items="metadata[metadataKey].items[key]?.options"
+                  :disabled="metadata[metadataKey].items[key]?.readonly"
+                  density="compact"
+                  variant="outlined"
+                  class="config-field"
+                  hide-details
+                  chips
+                  multiple
+                ></v-combobox>
                <!-- Select input -->
                <v-select
-                  v-if="metadata[metadataKey].items[key]?.options && !metadata[metadataKey].items[key]?.invisible"
+                  v-else-if="metadata[metadataKey].items[key]?.options && !metadata[metadataKey].items[key]?.invisible"
                  v-model="iterable[key]"
                  :items="metadata[metadataKey].items[key]?.options"
                  :disabled="metadata[metadataKey].items[key]?.readonly"
@@ -4,7 +4,7 @@
      未选择
    </span>
    <span v-else>
-      {{ modelValue }}
+      {{ modelValue === 'default' ? '默认人格' : modelValue }}
    </span>
    <v-btn size="small" color="primary" variant="tonal" @click="openDialog">
      {{ buttonText }}
@@ -30,7 +30,7 @@
            :active="selectedPersona === persona.persona_id"
            rounded="md"
            class="ma-1">
-            <v-list-item-title>{{ persona.persona_id }}</v-list-item-title>
+            <v-list-item-title>{{ persona.persona_id === 'default' ? '默认人格' : persona.persona_id }}</v-list-item-title>
            <v-list-item-subtitle>
              {{ persona.system_prompt ? persona.system_prompt.substring(0, 50) + '...' : '无描述' }}
            </v-list-item-subtitle>
@@ -101,11 +101,24 @@ async function loadPersonas() {
  try {
    const response = await axios.get('/api/persona/list')
    if (response.data.status === 'ok') {
-      personaList.value = response.data.data || []
+      const personas = response.data.data || []
+      // 添加默认人格选项
+      personaList.value = [
+        {
+          persona_id: 'default',
+          system_prompt: 'You are a helpful and friendly assistant.'
+        },
+        ...personas
+      ]
    }
  } catch (error) {
    console.error('加载人格列表失败:', error)
-    personaList.value = []
+    personaList.value = [
+      {
+        persona_id: 'default',
+        system_prompt: 'You are a helpful and friendly assistant.'
+      }
+    ]
  } finally {
    loading.value = false
  }
@@ -457,7 +457,6 @@ export default {
        // Theme is now handled globally by the customizer store.
        // 设置输入框标签
        this.inputFieldLabel = this.tm('input.chatPrompt');
-        this.checkStatus();
        this.getConversations();
        let inputField = document.getElementById('input-field');
        inputField.addEventListener('paste', this.handlePaste);
@@ -487,7 +487,7 @@ export default {
                begin_dialogs: [],
                tools: []
            };
-            this.toolSelectValue = '1'; // 默认选择指定工具
+            this.toolSelectValue = '0';
            this.expandedPanels = [];
            this.showPersonaDialog = true;
        },
@@ -536,11 +536,6 @@ export default {
      this.showAddProviderDialog = false;
    },

-    // 废弃旧方法，保留为兼容
-    addFromDefaultConfigTmpl(index) {
-      this.selectProviderTemplate(index[0]);
-    },
-
    configExistingProvider(provider) {
      this.newSelectedProviderName = provider.id;
      this.newSelectedProviderConfig = {};
@@ -575,11 +570,13 @@ export default {
            if (!(key in target)) {
              target[key] = Array.isArray(reference[key]) ? [] : {};
            }
-            mergeConfigWithOrder(
-              target[key],
-              source && source[key] ? source[key] : {},
-              reference[key]
-            );
+            if (!Array.isArray(reference[key])) {
+              mergeConfigWithOrder(
+                target[key],
+                source && source[key] ? source[key] : {},
+                reference[key]
+              );
+            }
          } else if (!(key in target)) {
            // 只有当target中不存在该键时才从reference复制
            target[key] = reference[key];
@@ -30,8 +30,8 @@ class LongTermMemory:
            logger.error(e)
            max_cnt = 300
        image_caption = cfg["image_caption"]
-        image_caption_prompt = cfg["image_caption_prompt"] # TODO: 去掉这个配置项
-        image_caption_provider_id = cfg["image_caption_provider_id"] # TODO: 去掉这个配置项
+        image_caption_prompt = cfg["image_caption_prompt"]
+        image_caption_provider_id = cfg["image_caption_provider_id"]
        active_reply = cfg["active_reply"]
        enable_active_reply = active_reply.get("enable", False)
        ar_method = active_reply["method"]
@@ -1230,6 +1230,7 @@ UID: {user_id} 此 ID 可用于设置管理员。
            req.system_prompt += f"\nCurrent datetime: {current_time}\n"

        if req.conversation:
+            # persona inject
            persona_id = req.conversation.persona_id
            if not persona_id and persona_id != "[%None]":  # [%None] 为用户取消人格
                persona_id = self.context.persona_manager.selected_default_persona_v3[
@@ -1247,6 +1248,7 @@ UID: {user_id} 此 ID 可用于设置管理员。
                    req.system_prompt += prompt
                if begin_dialogs := persona["_begin_dialogs_processed"]:
                    req.contexts[:0] = begin_dialogs
+
            # tools select
            tmgr = self.context.get_llm_tool_manager()
            if (persona and persona.get("tools") is None) or not persona:
@@ -1261,6 +1263,27 @@ UID: {user_id} 此 ID 可用于设置管理员。
            req.func_tool = toolset
            logger.debug(f"Tool set for persona {persona_id}: {toolset.names()}")

+            # image caption
+            img_cap_prov_id = cfg.get("default_image_caption_provider_id")
+            if img_cap_prov_id and req.image_urls:
+                img_cap_prompt = cfg.get(
+                    "image_caption_prompt", "Please describe the image."
+                )
+                try:
+                    if prov := self.context.get_provider_by_id(img_cap_prov_id):
+                        logger.debug(
+                            f"Processing image caption with provider: {img_cap_prov_id}"
+                        )
+                        llm_resp = await prov.text_chat(
+                            prompt=img_cap_prompt,
+                            image_urls=req.image_urls,
+                        )
+                        if llm_resp.completion_text:
+                            req.prompt = f"(Image Caption: {llm_resp.completion_text})\n\n{req.prompt}"
+                        req.image_urls = []
+                except Exception as e:
+                    logger.error(f"处理图片描述失败: {e}")
+
        if quote:
            sender_info = ""
            if quote.sender_nickname:
@@ -1304,7 +1327,7 @@ UID: {user_id} 此 ID 可用于设置管理员。
        if self.ltm and self.ltm_enabled(event):
            try:
                await self.ltm.after_req_llm(event)
-            except BaseException as e:
+            except Exception as e:
                logger.error(f"ltm: {e}")

    @filter.permission_type(filter.PermissionType.ADMIN)