feat: context compressor

Co-authored-by: kawayiYokami <289104862@qq.com>
This commit is contained in:
Soulter
2026-01-05 00:28:54 +08:00
parent 7f5cc7cf1a
commit d842155770
11 changed files with 683 additions and 97 deletions
+140
View File
@@ -0,0 +1,140 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
from astrbot.api import logger
from ..message import Message
if TYPE_CHECKING:
from astrbot.core.provider.provider import Provider
from ..context.truncator import ContextTruncator
class ContextCompressor(ABC):
"""
Abstract base class for context compressors.
Provides an interface for compressing message lists.
"""
@abstractmethod
async def compress(self, messages: list[Message]) -> list[Message]:
"""Compress the message list.
Args:
messages: The original message list.
Returns:
The compressed message list.
"""
pass
class DefaultCompressor(ContextCompressor):
"""Default compressor implementation.
Returns the original messages.
"""
async def compress(self, messages: list[Message]) -> list[Message]:
return messages
class TruncateByTurnsCompressor(ContextCompressor):
"""Truncate by turns compressor implementation.
Truncates the message list by removing older turns.
"""
def __init__(self, truncate_turns: int = 1):
"""Initialize the truncate by turns compressor.
Args:
truncate_turns: The number of turns to remove when truncating (default: 1).
"""
self.truncate_turns = truncate_turns
async def compress(self, messages: list[Message]) -> list[Message]:
truncator = ContextTruncator()
truncated_messages = truncator.truncate_by_turns(
messages,
keep_most_recent_turns=0,
dequeue_turns=self.truncate_turns,
)
return truncated_messages
class LLMSummaryCompressor(ContextCompressor):
"""LLM-based summary compressor.
Uses LLM to summarize the old conversation history, keeping the latest messages.
"""
def __init__(
self,
provider: "Provider",
keep_recent: int = 4,
instruction_text: str | None = None,
):
"""Initialize the LLM summary compressor.
Args:
provider: The LLM provider instance.
keep_recent: The number of latest messages to keep (default: 4).
"""
self.provider = provider
self.keep_recent = keep_recent
self.instruction_text = instruction_text or (
"Based on our full conversation history, produce a concise summary of key takeaways and/or project progress.\n"
"1. Systematically cover all core topics discussed and the final conclusion/outcome for each; clearly highlight the latest primary focus.\n"
"2. If any tools were used, summarize tool usage (total call count) and extract the most valuable insights from tool outputs.\n"
"3. If there was an initial user goal, state it first and describe the current progress/status.\n"
"4. Write the summary in the user's language.\n"
)
async def compress(self, messages: list[Message]) -> list[Message]:
"""Use LLM to generate a summary of the conversation history.
Process:
1. Divide messages: keep the system message and the latest N messages.
2. Send the old messages + the instruction message to the LLM.
3. Reconstruct the message list: [system message, summary message, latest messages].
"""
if len(messages) <= self.keep_recent + 1:
return messages
# keep the system message
system_msg = messages[0] if messages and messages[0].role == "system" else None
start_idx = 1 if system_msg else 0
messages_to_summarize = messages[start_idx : -self.keep_recent]
recent_messages = messages[-self.keep_recent :]
if not messages_to_summarize:
return messages
# build payload
instruction_message = Message(role="user", content=self.instruction_text)
llm_payload = messages_to_summarize + [instruction_message]
# generate summary
try:
response = await self.provider.text_chat(contexts=llm_payload)
summary_content = response.completion_text
except Exception as e:
logger.error(f"Failed to generate summary: {e}")
return messages
# build result
result = []
if system_msg:
result.append(system_msg)
result.append(
Message(
role="system",
content=f"History conversation summary: {summary_content}",
),
)
result.extend(recent_messages)
return result
+141
View File
@@ -0,0 +1,141 @@
from typing import TYPE_CHECKING
from astrbot import logger
from ..message import Message
from .compressor import LLMSummaryCompressor, TruncateByTurnsCompressor
from .token_counter import TokenCounter
from .truncator import ContextTruncator
if TYPE_CHECKING:
from astrbot.core.provider.provider import Provider
class ContextManager:
"""Context compression manager."""
COMPRESSION_THRESHOLD = 0.82
"""compression trigger threshold"""
def __init__(
self,
max_context_tokens: int = 0,
truncate_turns: int = 1,
llm_compress_instruction: str | None = None,
llm_compress_keep_recent: int = 4,
llm_compress_provider: "Provider | None" = None,
):
"""Initialize the context manager.
There are two strategies to handle context limit reached:
1. Truncate by turns: remove older messages by turns.
2. LLM-based compression: use LLM to summarize old messages.
Args:
max_context_tokens: The maximum context tokens. <= 0 means no limit.
truncate_turns: For turncate strategy. The number of turns to discard when truncating.
llm_compress_instruction: The instruction text for LLM compression.
llm_compress_keep_recent: The number of recent messages to keep during LLM compression.
llm_compress_provider: The LLM provider for compression.
"""
self.max_context_tokens = max_context_tokens
self.truncate_turns = truncate_turns
self.token_counter = TokenCounter()
self.truncator = ContextTruncator()
if llm_compress_provider:
self.compressor = LLMSummaryCompressor(
provider=llm_compress_provider,
keep_recent=llm_compress_keep_recent,
instruction_text=llm_compress_instruction,
)
else:
self.compressor = TruncateByTurnsCompressor(truncate_turns=truncate_turns)
async def process(self, messages: list[Message]) -> list[Message]:
"""Process the messages.
Args:
messages: The original message list.
Returns:
The processed message list.
"""
if self.max_context_tokens <= 0:
return messages
# check if the messages need to be compressed
needs_compression, _ = await self._initial_token_check(messages)
# compress/truncate the messages if needed
messages = await self._run_compression(messages, needs_compression)
return messages
async def _initial_token_check(
self, messages: list[Message]
) -> tuple[bool, int | None]:
"""
Check if the messages need to be compressed.
Args:
messages: The original message list.
Returns:
tuple: (whether to compress, initial token count)
"""
if not messages:
return False, None
if self.max_context_tokens <= 0:
return False, None
total_tokens = self.token_counter.count_tokens(messages)
logger.debug(
f"ContextManager: total tokens = {total_tokens}, max_context_tokens = {self.max_context_tokens}"
)
usage_rate = total_tokens / self.max_context_tokens
needs_compression = usage_rate > self.COMPRESSION_THRESHOLD
return needs_compression, total_tokens if needs_compression else None
async def _run_compression(
self, messages: list[Message], needs_compression: bool
) -> list[Message]:
"""
Compress/truncate the messages if needed.
Args:
messages: The original message list.
needs_compression: Whether to compress.
Returns:
The compressed/truncated message list.
"""
if not needs_compression:
return messages
if self.max_context_tokens <= 0:
return messages
messages = await self.compressor.compress(messages)
# double check
tokens_after_summary = self.token_counter.count_tokens(messages)
if tokens_after_summary / self.max_context_tokens > self.COMPRESSION_THRESHOLD:
# still over 82%, truncate by half
messages = self._compress_by_halving(messages)
return messages
def _compress_by_halving(self, messages: list[Message]) -> list[Message]:
"""
对半砍策略:删除中间50%的消息
Args:
messages: 原始消息列表
Returns:
截断后的消息列表
"""
return self.truncator.truncate_by_halving(messages)
@@ -0,0 +1,30 @@
import json
from ..message import Message, TextPart
class TokenCounter:
def count_tokens(self, messages: list[Message]) -> int:
total = 0
for msg in messages:
content = msg.content
if isinstance(content, str):
total += self._estimate_tokens(content)
elif isinstance(content, list):
# 处理多模态内容
for part in content:
if isinstance(part, TextPart):
total += self._estimate_tokens(part.text)
# 处理 Tool Calls
if msg.tool_calls:
for tc in msg.tool_calls:
tc_str = json.dumps(tc if isinstance(tc, dict) else tc.model_dump())
total += self._estimate_tokens(tc_str)
return total
def _estimate_tokens(self, text: str) -> int:
chinese_count = len([c for c in text if "\u4e00" <= c <= "\u9fff"])
other_count = len(text) - chinese_count
return int(chinese_count * 0.6 + other_count * 0.3)
+94
View File
@@ -0,0 +1,94 @@
from ..message import Message
class ContextTruncator:
"""Context truncator."""
def fix_messages(self, messages: list[Message]) -> list[Message]:
fixed_messages = []
for message in messages:
if message.role == "tool":
# tool block 前面必须要有 user 和 assistant block
if len(fixed_messages) < 2:
# 这种情况可能是上下文被截断导致的
# 我们直接将之前的上下文都清空
fixed_messages = []
else:
fixed_messages.append(message)
else:
fixed_messages.append(message)
return fixed_messages
def truncate_by_turns(
self,
messages: list[Message],
keep_most_recent_turns: int,
dequeue_turns: int = 1,
) -> list[Message]:
"""截断上下文列表,确保不超过最大长度。
一个 turn 包含一个 user 消息和一个 assistant 消息。
这个方法会保证截断后的上下文列表符合 OpenAI 的上下文格式。
Args:
messages: 上下文列表
keep_most_recent_turns: 保留最近的对话轮数
dequeue_turns: 一次性丢弃的对话轮数
Returns:
截断后的上下文列表
"""
if keep_most_recent_turns == -1:
return messages
if len(messages) <= keep_most_recent_turns:
return messages
if len(messages) // 2 <= keep_most_recent_turns:
return messages
system_message = None
if messages[0].role == "system":
system_message = messages[0]
messages = messages[1:]
truncated_contexts = messages[
-(keep_most_recent_turns - dequeue_turns + 1) * 2 :
]
# 找到第一个role 为 user 的索引,确保上下文格式正确
index = next(
(i for i, item in enumerate(truncated_contexts) if item.role == "user"),
None,
)
if index is not None and index > 0:
truncated_contexts = truncated_contexts[index:]
if system_message is not None:
truncated_contexts = [system_message] + truncated_contexts
return self.fix_messages(truncated_contexts)
def truncate_by_halving(
self,
messages: list[Message],
) -> list[Message]:
"""对半砍策略,删除 50% 的消息"""
if len(messages) <= 2:
return messages
first_non_system = 0
for i, msg in enumerate(messages):
if msg.role != "system":
first_non_system = i
break
messages_to_delete = (len(messages) - first_non_system) // 2
result = messages[:first_non_system]
result.extend(messages[first_non_system + messages_to_delete :])
index = next(
(i for i, item in enumerate(result) if item.role == "user"),
None,
)
if index is not None:
result = result[index:]
return self.fix_messages(result)
@@ -25,6 +25,8 @@ from astrbot.core.provider.entities import (
)
from astrbot.core.provider.provider import Provider
from ..context.manager import ContextManager
from ..context.truncator import ContextTruncator
from ..hooks import BaseAgentRunHooks
from ..message import AssistantMessageSegment, Message, ToolCallMessageSegment
from ..response import AgentResponseData, AgentStats
@@ -51,6 +53,33 @@ class ToolLoopAgentRunner(BaseAgentRunner[TContext]):
) -> None:
self.req = request
self.streaming = kwargs.get("streaming", False)
# enforce max turns, will discard older turns when exceeded BEFORE compression
# -1 means no limit
self.enforce_max_turns = kwargs.get("enforce_max_turns", -1)
# llm compressor
self.llm_compress_instruction = kwargs.get("llm_compress_instruction", None)
self.llm_compress_keep_recent = kwargs.get("llm_compress_keep_recent", 0)
self.llm_compress_provider: Provider | None = kwargs.get(
"llm_compress_provider", None
)
# truncate by turns compressor
self.truncate_turns = kwargs.get("truncate_turns", 1)
# we will do compress when:
# 1. before requesting LLM
# TODO: 2. after LLM output a tool call
self.context_manager = ContextManager(
# <=0 will never trigger context compression
max_context_tokens=provider.provider_config.get("max_context_tokens", 0),
truncate_turns=self.truncate_turns,
llm_compress_instruction=self.llm_compress_instruction,
llm_compress_keep_recent=self.llm_compress_keep_recent,
llm_compress_provider=self.llm_compress_provider,
)
self.context_truncator = ContextTruncator()
self.provider = provider
self.final_llm_resp = None
self._state = AgentState.IDLE
@@ -92,6 +121,12 @@ class ToolLoopAgentRunner(BaseAgentRunner[TContext]):
else:
yield await self.provider.text_chat(**payload)
async def do_context_compress(self):
"""检查并执行上下文压缩。"""
original_messages = self.run_context.messages
compressed_messages = await self.context_manager.process(original_messages)
self.run_context.messages = compressed_messages
@override
async def step(self):
"""Process a single step of the agent.
@@ -110,6 +145,24 @@ class ToolLoopAgentRunner(BaseAgentRunner[TContext]):
self._transition_state(AgentState.RUNNING)
llm_resp_result = None
# do truncate
if self.enforce_max_turns != -1:
try:
truncated_messages = self.context_truncator.truncate_by_turns(
self.run_context.messages,
keep_most_recent_turns=self.enforce_max_turns,
dequeue_turns=self.truncate_turns,
)
self.run_context.messages = truncated_messages
except Exception as e:
logger.error(f"Error during context truncation: {e}", exc_info=True)
# check compress
try:
await self.do_context_compress()
except Exception as e:
logger.error(f"Error during context compression: {e}", exc_info=True)
async for llm_response in self._iter_llm_responses():
if llm_response.is_chunk:
# update ttft
+77 -16
View File
@@ -83,6 +83,16 @@ DEFAULT_CONFIG = {
"default_personality": "default",
"persona_pool": ["*"],
"prompt_prefix": "{{prompt}}",
"context_limit_reached_strategy": "truncate_by_turns", # or llm_compress
"llm_compress_instruction": (
"Based on our full conversation history, produce a concise summary of key takeaways and/or project progress.\n"
"1. Systematically cover all core topics discussed and the final conclusion/outcome for each; clearly highlight the latest primary focus.\n"
"2. If any tools were used, summarize tool usage (total call count) and extract the most valuable insights from tool outputs.\n"
"3. If there was an initial user goal, state it first and describe the current progress/status.\n"
"4. Write the summary in the user's language.\n"
),
"llm_compress_keep_recent": 4,
"llm_compress_provider_id": "",
"max_context_length": -1,
"dequeue_context_length": 1,
"streaming_response": False,
@@ -179,6 +189,7 @@ class ChatProviderTemplate(TypedDict):
model: str
modalities: list
custom_extra_body: dict[str, Any]
max_context_tokens: int
CHAT_PROVIDER_TEMPLATE = {
@@ -187,6 +198,7 @@ CHAT_PROVIDER_TEMPLATE = {
"model": "",
"modalities": [],
"custom_extra_body": {},
"max_context_tokens": 0,
}
"""
@@ -2033,6 +2045,11 @@ CONFIG_METADATA_2 = {
"type": "string",
"hint": "模型名称,如 gpt-4o-mini, deepseek-chat。",
},
"max_context_tokens": {
"description": "模型上下文窗口大小",
"type": "int",
"hint": "模型最大上下文 Token 大小。如果为 0,则会自动从模型元数据填充(如有),也可手动修改。",
},
"dify_api_key": {
"description": "API Key",
"type": "string",
@@ -2540,6 +2557,66 @@ CONFIG_METADATA_3 = {
# "provider_settings.enable": True,
# },
# },
"truncate_and_compress": {
"description": "上下文管理策略",
"type": "object",
"items": {
"provider_settings.max_context_length": {
"description": "最多携带对话轮数",
"type": "int",
"hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制",
"condition": {
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.dequeue_context_length": {
"description": "丢弃对话轮数",
"type": "int",
"hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数",
"condition": {
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.context_limit_reached_strategy": {
"description": "超出模型上下文窗口时的处理方式",
"type": "string",
"options": ["truncate_by_turns", "llm_compress"],
"labels": ["按对话轮数截断", "由 LLM 压缩上下文"],
"condition": {
"provider_settings.agent_runner_type": "local",
},
"hint": "",
},
"provider_settings.llm_compress_instruction": {
"description": "上下文压缩提示词",
"type": "text",
"hint": "如果为空则使用默认提示词。",
"condition": {
"provider_settings.context_limit_reached_strategy": "llm_compress",
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.llm_compress_keep_recent": {
"description": "压缩时保留最近对话轮数",
"type": "int",
"hint": "始终保留的最近 N 轮对话。",
"condition": {
"provider_settings.context_limit_reached_strategy": "llm_compress",
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.llm_compress_provider_id": {
"description": "用于上下文压缩的模型提供商 ID",
"type": "string",
"_special": "select_provider",
"hint": "留空时将降级为“按对话轮数截断”的策略。",
"condition": {
"provider_settings.context_limit_reached_strategy": "llm_compress",
"provider_settings.agent_runner_type": "local",
},
},
},
},
"others": {
"description": "其他配置",
"type": "object",
@@ -2604,22 +2681,6 @@ CONFIG_METADATA_3 = {
"provider_settings.streaming_response": True,
},
},
"provider_settings.max_context_length": {
"description": "最多携带对话轮数",
"type": "int",
"hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制",
"condition": {
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.dequeue_context_length": {
"description": "丢弃对话轮数",
"type": "int",
"hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数",
"condition": {
"provider_settings.agent_runner_type": "local",
},
},
"provider_settings.wake_prefix": {
"description": "LLM 聊天额外唤醒前缀 ",
"type": "string",
@@ -1,7 +1,6 @@
"""本地 Agent 模式的 LLM 调用 Stage"""
import asyncio
import copy
import json
from collections.abc import AsyncGenerator
@@ -24,6 +23,7 @@ from astrbot.core.provider.entities import (
)
from astrbot.core.star.star_handler import EventType, star_map
from astrbot.core.utils.file_extract import extract_file_moonshotai
from astrbot.core.utils.llm_metadata import LLM_METADATAS
from astrbot.core.utils.metrics import Metric
from astrbot.core.utils.session_lock import session_lock_manager
@@ -41,11 +41,6 @@ class InternalAgentSubStage(Stage):
self.ctx = ctx
conf = ctx.astrbot_config
settings = conf["provider_settings"]
self.max_context_length = settings["max_context_length"] # int
self.dequeue_context_length: int = min(
max(1, settings["dequeue_context_length"]),
self.max_context_length - 1,
)
self.streaming_response: bool = settings["streaming_response"]
self.unsupported_streaming_strategy: str = settings[
"unsupported_streaming_strategy"
@@ -65,6 +60,23 @@ class InternalAgentSubStage(Stage):
"moonshotai_api_key", ""
)
# 上下文管理相关
self.context_limit_reached_strategy: str = settings.get(
"context_limit_reached_strategy", "truncate_by_turns"
)
self.llm_compress_instruction: str = settings.get(
"llm_compress_instruction", ""
)
self.llm_compress_keep_recent: int = settings.get("llm_compress_keep_recent", 4)
self.llm_compress_provider_id: str = settings.get(
"llm_compress_provider_id", ""
)
self.max_context_length = settings["max_context_length"] # int
self.dequeue_context_length: int = min(
max(1, settings["dequeue_context_length"]),
self.max_context_length - 1,
)
self.conv_manager = ctx.plugin_manager.context.conversation_manager
def _select_provider(self, event: AstrMessageEvent):
@@ -167,34 +179,6 @@ class InternalAgentSubStage(Stage):
},
)
def _truncate_contexts(
self,
contexts: list[dict],
) -> list[dict]:
"""截断上下文列表,确保不超过最大长度"""
if self.max_context_length == -1:
return contexts
if len(contexts) // 2 <= self.max_context_length:
return contexts
truncated_contexts = contexts[
-(self.max_context_length - self.dequeue_context_length + 1) * 2 :
]
# 找到第一个role 为 user 的索引,确保上下文格式正确
index = next(
(
i
for i, item in enumerate(truncated_contexts)
if item.get("role") == "user"
),
None,
)
if index is not None and index > 0:
truncated_contexts = truncated_contexts[index:]
return truncated_contexts
def _modalities_fix(
self,
provider: Provider,
@@ -328,21 +312,25 @@ class InternalAgentSubStage(Stage):
history=message_to_save,
)
def _fix_messages(self, messages: list[dict]) -> list[dict]:
"""验证并且修复上下文"""
fixed_messages = []
for message in messages:
if message.get("role") == "tool":
# tool block 前面必须要有 user 和 assistant block
if len(fixed_messages) < 2:
# 这种情况可能是上下文被截断导致的
# 我们直接将之前的上下文都清空
fixed_messages = []
else:
fixed_messages.append(message)
else:
fixed_messages.append(message)
return fixed_messages
def _get_compress_provider(self) -> Provider | None:
if not self.llm_compress_provider_id:
return None
if self.context_limit_reached_strategy != "llm_compress":
return None
provider = self.ctx.plugin_manager.context.get_provider_by_id(
self.llm_compress_provider_id,
)
if provider is None:
logger.warning(
f"未找到指定的上下文压缩模型 {self.llm_compress_provider_id},将跳过压缩。",
)
return None
if not isinstance(provider, Provider):
logger.warning(
f"指定的上下文压缩模型 {self.llm_compress_provider_id} 不是对话模型,将跳过压缩。"
)
return None
return provider
async def process(
self, event: AstrMessageEvent, provider_wake_prefix: str
@@ -426,9 +414,10 @@ class InternalAgentSubStage(Stage):
await self._apply_kb(event, req)
# truncate contexts to fit max length
if req.contexts:
req.contexts = self._truncate_contexts(req.contexts)
self._fix_messages(req.contexts)
# NOW moved to ContextManager inside ToolLoopAgentRunner
# if req.contexts:
# req.contexts = self._truncate_contexts(req.contexts)
# self._fix_messages(req.contexts)
# session_id
if not req.session_id:
@@ -444,8 +433,6 @@ class InternalAgentSubStage(Stage):
self.unsupported_streaming_strategy == "turn_off"
and not event.platform_meta.support_streaming_message
)
# 备份 req.contexts
backup_contexts = copy.deepcopy(req.contexts)
# run agent
agent_runner = AgentRunner()
@@ -456,6 +443,15 @@ class InternalAgentSubStage(Stage):
context=self.ctx.plugin_manager.context,
event=event,
)
# inject model context length limit
if provider.provider_config.get("max_context_tokens", 0) <= 0:
model = provider.get_model()
if model_info := LLM_METADATAS.get(model):
provider.provider_config["max_context_tokens"] = model_info[
"limit"
]["context"]
await agent_runner.reset(
provider=provider,
request=req,
@@ -466,6 +462,11 @@ class InternalAgentSubStage(Stage):
tool_executor=FunctionToolExecutor(),
agent_hooks=MAIN_AGENT_HOOKS,
streaming=streaming_response,
llm_compress_instruction=self.llm_compress_instruction,
llm_compress_keep_recent=self.llm_compress_keep_recent,
llm_compress_provider=self._get_compress_provider(),
truncate_turns=self.dequeue_context_length,
enforce_max_turns=self.max_context_length,
)
if streaming_response and not stream_to_general:
@@ -511,9 +512,6 @@ class InternalAgentSubStage(Stage):
):
yield
# 恢复备份的 contexts
req.contexts = backup_contexts
await self._save_to_history(
event,
req,
@@ -144,7 +144,7 @@
color="primary"
density="compact"
hide-details
class="flex-grow-1"
style="flex: 1"
></v-slider>
<v-text-field
:model-value="modelValue"
@@ -154,7 +154,7 @@
class="config-field"
type="number"
hide-details
style="max-width: 140px;"
style="flex: 1"
></v-text-field>
</div>
@@ -325,4 +325,8 @@ function getSpecialSubtype(value) {
.gap-20 {
gap: 20px;
}
:deep(.v-field__input) {
font-size: 14px;
}
</style>
@@ -510,7 +510,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
const metadata = getModelMetadata(modelName)
let modalities: string[]
if (!metadata) {
modalities = ['text', 'image', 'tool_use']
} else {
@@ -523,13 +523,19 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
}
}
let max_context_tokens = 0
if (metadata?.limit?.context && typeof metadata.limit.context === 'number') {
max_context_tokens = metadata.limit.context
}
const newProvider = {
id: newId,
enable: false,
provider_source_id: sourceId,
model: modelName,
modalities,
custom_extra_body: {}
custom_extra_body: {},
max_context_tokens: max_context_tokens
}
try {
@@ -11,7 +11,12 @@
},
"agent_runner_type": {
"description": "Runner",
"labels": ["Built-in Agent", "Dify", "Coze", "Alibaba Cloud Bailian Application"]
"labels": [
"Built-in Agent",
"Dify",
"Coze",
"Alibaba Cloud Bailian Application"
]
},
"coze_agent_runner_provider_id": {
"description": "Coze Agent Runner Provider ID"
@@ -128,6 +133,39 @@
}
}
},
"truncate_and_compress": {
"description": "Context Management Strategy",
"provider_settings": {
"max_context_length": {
"description": "Maximum Conversation Turns",
"hint": "Discards the oldest parts when this count is exceeded. One conversation round counts as 1, -1 means unlimited"
},
"dequeue_context_length": {
"description": "Dequeue Conversation Turns",
"hint": "Number of conversation turns to discard at once when maximum context length is exceeded"
},
"context_limit_reached_strategy": {
"description": "Handling When Model Context Window is Exceeded",
"labels": [
"Truncate by Turns",
"Compress by LLM"
],
"hint": "When 'Truncate by Turns' is selected, the oldest N conversation turns will be discarded based on the 'Dequeue Conversation Turns' setting above. When 'Compress by LLM' is selected, the specified model will be used for context compression."
},
"llm_compress_instruction": {
"description": "Context Compression Instruction",
"hint": "If empty, the default prompt will be used."
},
"llm_compress_keep_recent": {
"description": "Keep Recent Turns When Compressing",
"hint": "Always keep the most recent N turns of conversation when compressing context."
},
"llm_compress_provider_id": {
"description": "Model Provider ID for Context Compression",
"hint": "When left empty, will fall back to the 'Truncate by Turns' strategy."
}
}
},
"others": {
"description": "Other Settings",
"provider_settings": {
@@ -161,15 +199,10 @@
"unsupported_streaming_strategy": {
"description": "Platforms Without Streaming Support",
"hint": "Select the handling method for platforms that don't support streaming responses. Real-time segmented reply sends content immediately when the system detects segment points like punctuation during streaming reception",
"labels": ["Real-time Segmented Reply", "Disable Streaming Response"]
},
"max_context_length": {
"description": "Maximum Conversation Rounds",
"hint": "Discards the oldest parts when this count is exceeded. One conversation round counts as 1, -1 means unlimited"
},
"dequeue_context_length": {
"description": "Dequeue Conversation Rounds",
"hint": "Number of conversation rounds to discard at once when maximum context length is exceeded"
"labels": [
"Real-time Segmented Reply",
"Disable Streaming Response"
]
},
"wake_prefix": {
"description": "Additional LLM Chat Wake Prefix",
@@ -387,7 +420,10 @@
},
"split_mode": {
"description": "Split Mode",
"labels": ["Regex", "Words List"]
"labels": [
"Regex",
"Words List"
]
},
"regex": {
"description": "Segmentation Regular Expression"
@@ -488,4 +524,4 @@
}
}
}
}
}
@@ -133,6 +133,36 @@
}
}
},
"truncate_and_compress": {
"description": "上下文管理策略",
"provider_settings": {
"max_context_length": {
"description": "最多携带对话轮数",
"hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制"
},
"dequeue_context_length": {
"description": "丢弃对话轮数",
"hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数"
},
"context_limit_reached_strategy": {
"description": "超出模型上下文窗口时的处理方式",
"labels": ["按对话轮数截断", "由 LLM 压缩上下文"],
"hint": "当按对话轮数截断时,会根据上面\"丢弃对话轮数\"的配置丢弃最旧的 N 轮对话。当由 LLM 压缩上下文时,会使用指定的模型进行上下文压缩。"
},
"llm_compress_instruction": {
"description": "上下文压缩提示词",
"hint": "如果为空则使用默认提示词。"
},
"llm_compress_keep_recent": {
"description": "压缩时保留最近对话轮数",
"hint": "始终保留的最近 N 轮对话。"
},
"llm_compress_provider_id": {
"description": "用于上下文压缩的模型提供商 ID",
"hint": "留空时将降级为\"按对话轮数截断\"的策略。"
}
}
},
"others": {
"description": "其他配置",
"provider_settings": {
@@ -171,14 +201,7 @@
"关闭流式回复"
]
},
"max_context_length": {
"description": "最多携带对话轮数",
"hint": "超出这个数量时丢弃最旧的部分,一轮聊天记为 1 条,-1 为不限制"
},
"dequeue_context_length": {
"description": "丢弃对话轮数",
"hint": "超出最多携带对话轮数时, 一次丢弃的聊天轮数"
},
"wake_prefix": {
"description": "LLM 聊天额外唤醒前缀",
"hint": "如果唤醒前缀为 /, 额外聊天唤醒前缀为 chat,则需要 /chat 才会触发 LLM 请求"