class ContextCompressor(ABC):
    """Abstract base class for context compressors.

    A compressor takes a message list and returns the list that should be
    used in its place.
    """

    @abstractmethod
    async def compress(self, messages: list[Message]) -> list[Message]:
        """Compress the message list.

        Args:
            messages: The original message list.

        Returns:
            The compressed message list.
        """


class DefaultCompressor(ContextCompressor):
    """No-op compressor: returns the messages it was given."""

    async def compress(self, messages: list[Message]) -> list[Message]:
        return messages


class TruncateByTurnsCompressor(ContextCompressor):
    """Compressor that discards the oldest conversation turns."""

    def __init__(self, truncate_turns: int = 1):
        """Initialize the truncate-by-turns compressor.

        Args:
            truncate_turns: The number of turns to remove when truncating
                (default: 1). Values below 1 are treated as 1.
        """
        self.truncate_turns = truncate_turns

    async def compress(self, messages: list[Message]) -> list[Message]:
        """Drop the oldest ``truncate_turns`` turns (two messages per turn).

        A leading system message is preserved and the newest message is never
        dropped. The remainder is re-aligned to start at a user message (when
        one exists) and passed through ``ContextTruncator.fix_messages``.

        Args:
            messages: The original message list.

        Returns:
            The truncated message list.
        """
        if not messages:
            return messages

        turns = max(self.truncate_turns, 1)
        has_system = messages[0].role == "system"
        head = messages[:1] if has_system else []
        body = messages[1:] if has_system else list(messages)

        # NOTE: the previous implementation delegated to
        # truncate_by_turns(keep_most_recent_turns=0, dequeue_turns=1), whose
        # slice start evaluates to -0, i.e. the whole list was kept.
        drop_count = min(turns * 2, max(len(body) - 1, 0))
        remaining = body[drop_count:]

        first_user = next(
            (i for i, item in enumerate(remaining) if item.role == "user"),
            None,
        )
        if first_user:
            remaining = remaining[first_user:]

        return ContextTruncator().fix_messages(head + remaining)


class LLMSummaryCompressor(ContextCompressor):
    """LLM-based summary compressor.

    Uses an LLM to summarize the old conversation history while keeping the
    latest messages verbatim.
    """

    def __init__(
        self,
        provider: "Provider",
        keep_recent: int = 4,
        instruction_text: str | None = None,
    ):
        """Initialize the LLM summary compressor.

        Args:
            provider: The LLM provider instance used to produce the summary.
            keep_recent: The number of latest messages to keep (default: 4).
                Zero or negative values keep no message verbatim.
            instruction_text: The summarization prompt; a built-in prompt is
                used when empty or None.
        """
        self.provider = provider
        self.keep_recent = keep_recent

        self.instruction_text = instruction_text or (
            "Based on our full conversation history, produce a concise summary of key takeaways and/or project progress.\n"
            "1. Systematically cover all core topics discussed and the final conclusion/outcome for each; clearly highlight the latest primary focus.\n"
            "2. If any tools were used, summarize tool usage (total call count) and extract the most valuable insights from tool outputs.\n"
            "3. If there was an initial user goal, state it first and describe the current progress/status.\n"
            "4. Write the summary in the user's language.\n"
        )

    def _split_messages(
        self, messages: list[Message]
    ) -> tuple[Message | None, list[Message], list[Message]]:
        """Split messages into (system message, messages to summarize, recent messages)."""
        keep = max(self.keep_recent, 0)
        system_msg = messages[0] if messages and messages[0].role == "system" else None
        start_idx = 1 if system_msg is not None else 0
        # An explicit split index avoids the ``-0`` slice pitfall:
        # ``messages[a:-0]`` is empty and ``messages[-0:]`` is the whole list.
        split_idx = max(len(messages) - keep, start_idx)
        return system_msg, messages[start_idx:split_idx], messages[split_idx:]

    async def compress(self, messages: list[Message]) -> list[Message]:
        """Use the LLM to generate a summary of the conversation history.

        Process:
        1. Divide messages: keep the system message and the latest N messages.
        2. Send the old messages + the instruction message to the LLM.
        3. Reconstruct the list: [system message, summary message, latest messages].

        The original list is returned unchanged when there is nothing to
        summarize, when the provider call raises, or when the provider returns
        an empty summary.

        Args:
            messages: The original message list.

        Returns:
            The compressed message list, or ``messages`` itself on fallback.
        """
        if len(messages) <= max(self.keep_recent, 0) + 1:
            return messages

        system_msg, messages_to_summarize, recent_messages = self._split_messages(
            messages
        )
        if not messages_to_summarize:
            return messages

        # build payload
        instruction_message = Message(role="user", content=self.instruction_text)
        llm_payload = [*messages_to_summarize, instruction_message]

        # generate summary
        try:
            response = await self.provider.text_chat(contexts=llm_payload)
            summary_content = response.completion_text
        except Exception as e:
            logger.error(f"Failed to generate summary: {e}")
            return messages

        if not summary_content:
            # Replacing history with an empty summary would silently lose it.
            logger.warning("Summary is empty, keeping the original messages.")
            return messages

        # build result
        result = []
        if system_msg is not None:
            result.append(system_msg)

        result.append(
            Message(
                role="system",
                content=f"History conversation summary: {summary_content}",
            ),
        )
        result.extend(recent_messages)

        return result
class ContextManager:
    """Context compression manager."""

    # Usage ratio (estimated tokens / max_context_tokens) above which
    # compression is triggered.
    COMPRESSION_THRESHOLD = 0.82

    def __init__(
        self,
        max_context_tokens: int = 0,
        truncate_turns: int = 1,
        llm_compress_instruction: str | None = None,
        llm_compress_keep_recent: int = 4,
        llm_compress_provider: "Provider | None" = None,
    ):
        """Initialize the context manager.

        Two strategies exist for handling a context that is too large:
        1. Truncate by turns: remove older messages turn by turn.
        2. LLM-based compression: let an LLM summarize the old messages.

        The LLM strategy is selected when ``llm_compress_provider`` is given;
        otherwise truncation by turns is used.

        Args:
            max_context_tokens: The maximum context tokens. <= 0 means no limit.
            truncate_turns: For the truncate strategy: the number of turns to
                discard when truncating.
            llm_compress_instruction: The instruction text for LLM compression.
            llm_compress_keep_recent: The number of recent messages to keep
                during LLM compression.
            llm_compress_provider: The LLM provider for compression.
        """
        self.max_context_tokens = max_context_tokens
        self.truncate_turns = truncate_turns

        self.token_counter = TokenCounter()
        self.truncator = ContextTruncator()

        if not llm_compress_provider:
            self.compressor = TruncateByTurnsCompressor(truncate_turns=truncate_turns)
        else:
            self.compressor = LLMSummaryCompressor(
                provider=llm_compress_provider,
                keep_recent=llm_compress_keep_recent,
                instruction_text=llm_compress_instruction,
            )

    async def process(self, messages: list[Message]) -> list[Message]:
        """Check the token usage of ``messages`` and compress them if needed.

        Args:
            messages: The original message list.

        Returns:
            The processed message list (the input itself when no limit is set).
        """
        if self.max_context_tokens <= 0:
            return messages

        should_compress, _ = await self._initial_token_check(messages)
        return await self._run_compression(messages, should_compress)

    async def _initial_token_check(
        self, messages: list[Message]
    ) -> tuple[bool, int | None]:
        """Decide whether the messages need to be compressed.

        Args:
            messages: The original message list.

        Returns:
            tuple: (whether to compress, token count when compression is
            needed, otherwise None)
        """
        if not messages or self.max_context_tokens <= 0:
            return False, None

        total_tokens = self.token_counter.count_tokens(messages)

        logger.debug(
            f"ContextManager: total tokens = {total_tokens}, max_context_tokens = {self.max_context_tokens}"
        )

        if total_tokens / self.max_context_tokens > self.COMPRESSION_THRESHOLD:
            return True, total_tokens
        return False, None

    async def _run_compression(
        self, messages: list[Message], needs_compression: bool
    ) -> list[Message]:
        """Compress/truncate the messages if requested.

        Args:
            messages: The original message list.
            needs_compression: Whether to compress.

        Returns:
            The compressed/truncated message list.
        """
        if not needs_compression or self.max_context_tokens <= 0:
            return messages

        compressed = await self.compressor.compress(messages)

        # Re-check after the first pass; if usage is still above the
        # threshold, fall back to halving.
        usage_rate = self.token_counter.count_tokens(compressed) / self.max_context_tokens
        if usage_rate > self.COMPRESSION_THRESHOLD:
            compressed = self._compress_by_halving(compressed)

        return compressed

    def _compress_by_halving(self, messages: list[Message]) -> list[Message]:
        """Halving fallback: delegate to ``self.truncator.truncate_by_halving``.

        Args:
            messages: The original message list.

        Returns:
            The truncated message list.
        """
        return self.truncator.truncate_by_halving(messages)
class TokenCounter:
    """Heuristic token estimator for message lists."""

    def count_tokens(self, messages: list[Message]) -> int:
        """Estimate the total token count of ``messages``.

        Counts string content, the text of ``TextPart`` items in list content,
        and the JSON serialization of every tool call. Other content parts are
        ignored.

        Args:
            messages: The messages to measure.

        Returns:
            The estimated token count.
        """
        total = 0
        for msg in messages:
            content = msg.content
            if isinstance(content, str):
                total += self._estimate_tokens(content)
            elif isinstance(content, list):
                # multimodal content: only text parts are counted
                for part in content:
                    if isinstance(part, TextPart):
                        total += self._estimate_tokens(part.text)

            # tool calls are counted via their JSON form
            if msg.tool_calls:
                for tc in msg.tool_calls:
                    tc_str = json.dumps(tc if isinstance(tc, dict) else tc.model_dump())
                    total += self._estimate_tokens(tc_str)

        return total

    def _estimate_tokens(self, text: str) -> int:
        """Estimate tokens: 0.6 per CJK ideograph (U+4E00..U+9FFF), 0.3 per other char."""
        chinese_count = sum(1 for c in text if "\u4e00" <= c <= "\u9fff")
        other_count = len(text) - chinese_count
        return int(chinese_count * 0.6 + other_count * 0.3)


class ContextTruncator:
    """Context truncator."""

    def fix_messages(self, messages: list[Message]) -> list[Message]:
        """Drop tool messages that lost their preceding context.

        A tool message needs at least two messages before it. When it does
        not (typically because truncation cut the conversation in the middle),
        the tool message and the context collected so far are discarded.
        System messages are kept, so truncation never removes the system
        prompt.

        Args:
            messages: The message list to validate.

        Returns:
            A new, repaired message list.
        """
        fixed_messages = []
        for message in messages:
            if message.role == "tool" and len(fixed_messages) < 2:
                fixed_messages = [m for m in fixed_messages if m.role == "system"]
                continue
            fixed_messages.append(message)
        return fixed_messages

    def truncate_by_turns(
        self,
        messages: list[Message],
        keep_most_recent_turns: int,
        dequeue_turns: int = 1,
    ) -> list[Message]:
        """Truncate the context so that it does not exceed the turn limit.

        One turn is counted as two messages (a user and an assistant message).
        When the limit is exceeded, the most recent
        ``keep_most_recent_turns - dequeue_turns + 1`` turns are kept. That
        window is clamped to at least one turn, and ``dequeue_turns`` below 1
        is treated as 1, so the slice can neither become a no-op (``[-0:]``)
        nor drop messages from the wrong end. The result is re-aligned to
        start at a user message (after an optional leading system message)
        and passed through ``fix_messages``.

        Args:
            messages: The context list.
            keep_most_recent_turns: Maximum number of recent turns to keep;
                negative means no limit.
            dequeue_turns: Number of turns to discard at once when the limit
                is exceeded.

        Returns:
            The truncated context list, or ``messages`` itself when no
            truncation is needed.
        """
        if keep_most_recent_turns < 0:
            return messages
        if len(messages) // 2 <= keep_most_recent_turns:
            return messages

        system_message = None
        if messages[0].role == "system":
            system_message = messages[0]
            messages = messages[1:]

        window_turns = max(keep_most_recent_turns - max(dequeue_turns, 1) + 1, 1)
        truncated_contexts = messages[-window_turns * 2 :]

        # find the first user message so the context starts in a valid state
        index = next(
            (i for i, item in enumerate(truncated_contexts) if item.role == "user"),
            None,
        )
        if index:
            truncated_contexts = truncated_contexts[index:]

        if system_message is not None:
            truncated_contexts = [system_message] + truncated_contexts

        return self.fix_messages(truncated_contexts)

    def truncate_by_halving(
        self,
        messages: list[Message],
    ) -> list[Message]:
        """Halving strategy: delete the older half of the non-system messages.

        Leading system messages are always preserved. The remaining
        non-system tail is re-aligned to start at a user message (when one
        exists) and passed through ``fix_messages``.

        Args:
            messages: The context list.

        Returns:
            The truncated context list.
        """
        if len(messages) <= 2:
            return messages

        # default to len(messages): with only system messages nothing is deleted
        first_non_system = next(
            (i for i, msg in enumerate(messages) if msg.role != "system"),
            len(messages),
        )
        messages_to_delete = (len(messages) - first_non_system) // 2

        head = messages[:first_non_system]
        tail = messages[first_non_system + messages_to_delete :]

        # Re-align only the tail; doing it on head + tail would cut off the
        # leading system messages.
        index = next(
            (i for i, item in enumerate(tail) if item.role == "user"),
            None,
        )
        if index:
            tail = tail[index:]

        return self.fix_messages(head + tail)
async def do_context_compress(self):
    """Run the context manager over the run context's messages and store the result."""
    self.run_context.messages = await self.context_manager.process(
        self.run_context.messages
    )
@@ -110,6 +145,24 @@ class ToolLoopAgentRunner(BaseAgentRunner[TContext]): self._transition_state(AgentState.RUNNING) llm_resp_result = None + # do truncate + if self.enforce_max_turns != -1: + try: + truncated_messages = self.context_truncator.truncate_by_turns( + self.run_context.messages, + keep_most_recent_turns=self.enforce_max_turns, + dequeue_turns=self.truncate_turns, + ) + self.run_context.messages = truncated_messages + except Exception as e: + logger.error(f"Error during context truncation: {e}", exc_info=True) + + # check compress + try: + await self.do_context_compress() + except Exception as e: + logger.error(f"Error during context compression: {e}", exc_info=True) + async for llm_response in self._iter_llm_responses(): if llm_response.is_chunk: # update ttft diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 0d7495b68..38d3eb0d0 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -83,6 +83,16 @@ DEFAULT_CONFIG = { "default_personality": "default", "persona_pool": ["*"], "prompt_prefix": "{{prompt}}", + "context_limit_reached_strategy": "truncate_by_turns", # or llm_compress + "llm_compress_instruction": ( + "Based on our full conversation history, produce a concise summary of key takeaways and/or project progress.\n" + "1. Systematically cover all core topics discussed and the final conclusion/outcome for each; clearly highlight the latest primary focus.\n" + "2. If any tools were used, summarize tool usage (total call count) and extract the most valuable insights from tool outputs.\n" + "3. If there was an initial user goal, state it first and describe the current progress/status.\n" + "4. 
Write the summary in the user's language.\n" + ), + "llm_compress_keep_recent": 4, + "llm_compress_provider_id": "", "max_context_length": -1, "dequeue_context_length": 1, "streaming_response": False, @@ -179,6 +189,7 @@ class ChatProviderTemplate(TypedDict): model: str modalities: list custom_extra_body: dict[str, Any] + max_context_tokens: int CHAT_PROVIDER_TEMPLATE = { @@ -187,6 +198,7 @@ CHAT_PROVIDER_TEMPLATE = { "model": "", "modalities": [], "custom_extra_body": {}, + "max_context_tokens": 0, } """ @@ -2033,6 +2045,11 @@ CONFIG_METADATA_2 = { "type": "string", "hint": "模型名称,如 gpt-4o-mini, deepseek-chat。", }, + "max_context_tokens": { + "description": "模型上下文窗口大小", + "type": "int", + "hint": "模型最大上下文 Token 大小。如果为 0,则会自动从模型元数据填充(如有),也可手动修改。", + }, "dify_api_key": { "description": "API Key", "type": "string", @@ -2540,6 +2557,66 @@ CONFIG_METADATA_3 = { # "provider_settings.enable": True, # }, # }, + "truncate_and_compress": { + "description": "上下文管理策略", + "type": "object", + "items": { + "provider_settings.max_context_length": { + "description": "最多携带对话轮数", + "type": "int", + "hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制", + "condition": { + "provider_settings.agent_runner_type": "local", + }, + }, + "provider_settings.dequeue_context_length": { + "description": "丢弃对话轮数", + "type": "int", + "hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数", + "condition": { + "provider_settings.agent_runner_type": "local", + }, + }, + "provider_settings.context_limit_reached_strategy": { + "description": "超出模型上下文窗口时的处理方式", + "type": "string", + "options": ["truncate_by_turns", "llm_compress"], + "labels": ["按对话轮数截断", "由 LLM 压缩上下文"], + "condition": { + "provider_settings.agent_runner_type": "local", + }, + "hint": "", + }, + "provider_settings.llm_compress_instruction": { + "description": "上下文压缩提示词", + "type": "text", + "hint": "如果为空则使用默认提示词。", + "condition": { + "provider_settings.context_limit_reached_strategy": "llm_compress", + "provider_settings.agent_runner_type": "local", + }, + }, + 
"provider_settings.llm_compress_keep_recent": { + "description": "压缩时保留最近对话轮数", + "type": "int", + "hint": "始终保留的最近 N 轮对话。", + "condition": { + "provider_settings.context_limit_reached_strategy": "llm_compress", + "provider_settings.agent_runner_type": "local", + }, + }, + "provider_settings.llm_compress_provider_id": { + "description": "用于上下文压缩的模型提供商 ID", + "type": "string", + "_special": "select_provider", + "hint": "留空时将降级为“按对话轮数截断”的策略。", + "condition": { + "provider_settings.context_limit_reached_strategy": "llm_compress", + "provider_settings.agent_runner_type": "local", + }, + }, + }, + }, "others": { "description": "其他配置", "type": "object", @@ -2604,22 +2681,6 @@ CONFIG_METADATA_3 = { "provider_settings.streaming_response": True, }, }, - "provider_settings.max_context_length": { - "description": "最多携带对话轮数", - "type": "int", - "hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制", - "condition": { - "provider_settings.agent_runner_type": "local", - }, - }, - "provider_settings.dequeue_context_length": { - "description": "丢弃对话轮数", - "type": "int", - "hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数", - "condition": { - "provider_settings.agent_runner_type": "local", - }, - }, "provider_settings.wake_prefix": { "description": "LLM 聊天额外唤醒前缀 ", "type": "string", diff --git a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py index ed6dc32cf..9c1c1c9a9 100644 --- a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py +++ b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py @@ -1,7 +1,6 @@ """本地 Agent 模式的 LLM 调用 Stage""" import asyncio -import copy import json from collections.abc import AsyncGenerator @@ -24,6 +23,7 @@ from astrbot.core.provider.entities import ( ) from astrbot.core.star.star_handler import EventType, star_map from astrbot.core.utils.file_extract import extract_file_moonshotai +from astrbot.core.utils.llm_metadata import LLM_METADATAS from 
astrbot.core.utils.metrics import Metric from astrbot.core.utils.session_lock import session_lock_manager @@ -41,11 +41,6 @@ class InternalAgentSubStage(Stage): self.ctx = ctx conf = ctx.astrbot_config settings = conf["provider_settings"] - self.max_context_length = settings["max_context_length"] # int - self.dequeue_context_length: int = min( - max(1, settings["dequeue_context_length"]), - self.max_context_length - 1, - ) self.streaming_response: bool = settings["streaming_response"] self.unsupported_streaming_strategy: str = settings[ "unsupported_streaming_strategy" @@ -65,6 +60,23 @@ class InternalAgentSubStage(Stage): "moonshotai_api_key", "" ) + # 上下文管理相关 + self.context_limit_reached_strategy: str = settings.get( + "context_limit_reached_strategy", "truncate_by_turns" + ) + self.llm_compress_instruction: str = settings.get( + "llm_compress_instruction", "" + ) + self.llm_compress_keep_recent: int = settings.get("llm_compress_keep_recent", 4) + self.llm_compress_provider_id: str = settings.get( + "llm_compress_provider_id", "" + ) + self.max_context_length = settings["max_context_length"] # int + self.dequeue_context_length: int = min( + max(1, settings["dequeue_context_length"]), + self.max_context_length - 1, + ) + self.conv_manager = ctx.plugin_manager.context.conversation_manager def _select_provider(self, event: AstrMessageEvent): @@ -167,34 +179,6 @@ class InternalAgentSubStage(Stage): }, ) - def _truncate_contexts( - self, - contexts: list[dict], - ) -> list[dict]: - """截断上下文列表,确保不超过最大长度""" - if self.max_context_length == -1: - return contexts - - if len(contexts) // 2 <= self.max_context_length: - return contexts - - truncated_contexts = contexts[ - -(self.max_context_length - self.dequeue_context_length + 1) * 2 : - ] - # 找到第一个role 为 user 的索引,确保上下文格式正确 - index = next( - ( - i - for i, item in enumerate(truncated_contexts) - if item.get("role") == "user" - ), - None, - ) - if index is not None and index > 0: - truncated_contexts = 
def _get_compress_provider(self) -> Provider | None:
    """Resolve the provider used for LLM context compression.

    Returns:
        The configured provider, or None when the strategy is not
        ``"llm_compress"``, no provider id is configured, the id cannot be
        resolved, or the resolved object is not a ``Provider``. The result is
        passed to the agent runner as ``llm_compress_provider``.
    """
    if self.context_limit_reached_strategy != "llm_compress":
        return None
    if not self.llm_compress_provider_id:
        return None

    provider = self.ctx.plugin_manager.context.get_provider_by_id(
        self.llm_compress_provider_id,
    )
    # The warnings state the real consequence: without a provider the context
    # is still truncated by turns rather than left untouched.
    if provider is None:
        logger.warning(
            f"未找到指定的上下文压缩模型 {self.llm_compress_provider_id},将降级为按对话轮数截断。",
        )
        return None
    if not isinstance(provider, Provider):
        logger.warning(
            f"指定的上下文压缩模型 {self.llm_compress_provider_id} 不是对话模型,将降级为按对话轮数截断。"
        )
        return None
    return provider
InternalAgentSubStage(Stage): context=self.ctx.plugin_manager.context, event=event, ) + + # inject model context length limit + if provider.provider_config.get("max_context_tokens", 0) <= 0: + model = provider.get_model() + if model_info := LLM_METADATAS.get(model): + provider.provider_config["max_context_tokens"] = model_info[ + "limit" + ]["context"] + await agent_runner.reset( provider=provider, request=req, @@ -466,6 +462,11 @@ class InternalAgentSubStage(Stage): tool_executor=FunctionToolExecutor(), agent_hooks=MAIN_AGENT_HOOKS, streaming=streaming_response, + llm_compress_instruction=self.llm_compress_instruction, + llm_compress_keep_recent=self.llm_compress_keep_recent, + llm_compress_provider=self._get_compress_provider(), + truncate_turns=self.dequeue_context_length, + enforce_max_turns=self.max_context_length, ) if streaming_response and not stream_to_general: @@ -511,9 +512,6 @@ class InternalAgentSubStage(Stage): ): yield - # 恢复备份的 contexts - req.contexts = backup_contexts - await self._save_to_history( event, req, diff --git a/dashboard/src/components/shared/ConfigItemRenderer.vue b/dashboard/src/components/shared/ConfigItemRenderer.vue index 88674eb0a..24ea8f9ce 100644 --- a/dashboard/src/components/shared/ConfigItemRenderer.vue +++ b/dashboard/src/components/shared/ConfigItemRenderer.vue @@ -144,7 +144,7 @@ color="primary" density="compact" hide-details - class="flex-grow-1" + style="flex: 1" > @@ -325,4 +325,8 @@ function getSpecialSubtype(value) { .gap-20 { gap: 20px; } + +:deep(.v-field__input) { + font-size: 14px; +} diff --git a/dashboard/src/composables/useProviderSources.ts b/dashboard/src/composables/useProviderSources.ts index e8bf58f45..dc0059b04 100644 --- a/dashboard/src/composables/useProviderSources.ts +++ b/dashboard/src/composables/useProviderSources.ts @@ -510,7 +510,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) { const metadata = getModelMetadata(modelName) let modalities: string[] - + if (!metadata) { 
modalities = ['text', 'image', 'tool_use'] } else { @@ -523,13 +523,19 @@ export function useProviderSources(options: UseProviderSourcesOptions) { } } + let max_context_tokens = 0 + if (metadata?.limit?.context && typeof metadata.limit.context === 'number') { + max_context_tokens = metadata.limit.context + } + const newProvider = { id: newId, enable: false, provider_source_id: sourceId, model: modelName, modalities, - custom_extra_body: {} + custom_extra_body: {}, + max_context_tokens: max_context_tokens } try { diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index ff9a68256..e0f694c33 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -11,7 +11,12 @@ }, "agent_runner_type": { "description": "Runner", - "labels": ["Built-in Agent", "Dify", "Coze", "Alibaba Cloud Bailian Application"] + "labels": [ + "Built-in Agent", + "Dify", + "Coze", + "Alibaba Cloud Bailian Application" + ] }, "coze_agent_runner_provider_id": { "description": "Coze Agent Runner Provider ID" @@ -128,6 +133,39 @@ } } }, + "truncate_and_compress": { + "description": "Context Management Strategy", + "provider_settings": { + "max_context_length": { + "description": "Maximum Conversation Turns", + "hint": "Discards the oldest parts when this count is exceeded. 
One conversation round counts as 1, -1 means unlimited" + }, + "dequeue_context_length": { + "description": "Dequeue Conversation Turns", + "hint": "Number of conversation turns to discard at once when maximum context length is exceeded" + }, + "context_limit_reached_strategy": { + "description": "Handling When Model Context Window is Exceeded", + "labels": [ + "Truncate by Turns", + "Compress by LLM" + ], + "hint": "When 'Truncate by Turns' is selected, the oldest N conversation turns will be discarded based on the 'Dequeue Conversation Turns' setting above. When 'Compress by LLM' is selected, the specified model will be used for context compression." + }, + "llm_compress_instruction": { + "description": "Context Compression Instruction", + "hint": "If empty, the default prompt will be used." + }, + "llm_compress_keep_recent": { + "description": "Keep Recent Turns When Compressing", + "hint": "Always keep the most recent N turns of conversation when compressing context." + }, + "llm_compress_provider_id": { + "description": "Model Provider ID for Context Compression", + "hint": "When left empty, will fall back to the 'Truncate by Turns' strategy." + } + } + }, "others": { "description": "Other Settings", "provider_settings": { @@ -161,15 +199,10 @@ "unsupported_streaming_strategy": { "description": "Platforms Without Streaming Support", "hint": "Select the handling method for platforms that don't support streaming responses. Real-time segmented reply sends content immediately when the system detects segment points like punctuation during streaming reception", - "labels": ["Real-time Segmented Reply", "Disable Streaming Response"] - }, - "max_context_length": { - "description": "Maximum Conversation Rounds", - "hint": "Discards the oldest parts when this count is exceeded. 
One conversation round counts as 1, -1 means unlimited" - }, - "dequeue_context_length": { - "description": "Dequeue Conversation Rounds", - "hint": "Number of conversation rounds to discard at once when maximum context length is exceeded" + "labels": [ + "Real-time Segmented Reply", + "Disable Streaming Response" + ] }, "wake_prefix": { "description": "Additional LLM Chat Wake Prefix", @@ -387,7 +420,10 @@ }, "split_mode": { "description": "Split Mode", - "labels": ["Regex", "Words List"] + "labels": [ + "Regex", + "Words List" + ] }, "regex": { "description": "Segmentation Regular Expression" @@ -488,4 +524,4 @@ } } } -} +} \ No newline at end of file diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index aba9cfd35..589aa54a0 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -133,6 +133,36 @@ } } }, + "truncate_and_compress": { + "description": "上下文管理策略", + "provider_settings": { + "max_context_length": { + "description": "最多携带对话轮数", + "hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制" + }, + "dequeue_context_length": { + "description": "丢弃对话轮数", + "hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数" + }, + "context_limit_reached_strategy": { + "description": "超出模型上下文窗口时的处理方式", + "labels": ["按对话轮数截断", "由 LLM 压缩上下文"], + "hint": "当按对话轮数截断时,会根据上面\"丢弃对话轮数\"的配置丢弃最旧的 N 轮对话。当由 LLM 压缩上下文时,会使用指定的模型进行上下文压缩。" + }, + "llm_compress_instruction": { + "description": "上下文压缩提示词", + "hint": "如果为空则使用默认提示词。" + }, + "llm_compress_keep_recent": { + "description": "压缩时保留最近对话轮数", + "hint": "始终保留的最近 N 轮对话。" + }, + "llm_compress_provider_id": { + "description": "用于上下文压缩的模型提供商 ID", + "hint": "留空时将降级为\"按对话轮数截断\"的策略。" + } + } + }, "others": { "description": "其他配置", "provider_settings": { @@ -171,14 +201,7 @@ "关闭流式回复" ] }, - "max_context_length": { - "description": "最多携带对话轮数", - "hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制" - }, - 
"dequeue_context_length": { - "description": "丢弃对话轮数", - "hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数" - }, + "wake_prefix": { "description": "LLM 聊天额外唤醒前缀", "hint": "如果唤醒前缀为 /, 额外聊天唤醒前缀为 chat,则需要 /chat 才会触发 LLM 请求"