Files
AstrBot/astrbot/core/knowledge_base/models.py
T
lxfight ad96d676e6 feat: 实现知识库核心后端模块
- 实现完整的知识库数据模型(知识库、文档、文档块、会话配置)
- 实现基于 SQLite 的向量数据库存储和检索
- 实现文档解析器(PDF、TXT)和固定大小分块器
- 实现混合检索系统(密集向量检索 + BM25 稀疏检索 + RRF 融合)
- 实现知识库生命周期管理和消息注入器
- 支持会话级别的知识库配置和关联
2025-10-19 18:40:55 +08:00

185 lines
6.4 KiB
Python

"""知识库管理功能的数据模型定义
该模块定义了知识库系统所需的数据模型,包括:
- KnowledgeBase: 知识库表 (存储在独立的 kb.db)
- KBDocument: 文档表 (存储在独立的 kb.db)
- KBChunk: 文档块表 (存储在独立的 kb.db)
- KBMedia: 多媒体资源表 (存储在独立的 kb.db)
- KBSessionConfig: 会话配置表 (存储在独立的 kb.db)
注意:
- 所有模型存储在独立的知识库数据库 (data/knowledge_base/kb.db)
- 与主数据库 (astrbot.db) 完全解耦
"""
import uuid
from datetime import datetime, timezone
from typing import Optional
from sqlmodel import Field, SQLModel, Text, UniqueConstraint
class KnowledgeBase(SQLModel, table=True):
    """Knowledge base table.

    Stores the basic information and statistics of a knowledge base.
    """

    __tablename__ = "knowledge_bases"

    # Surrogate primary key; autoincrement is requested on the column.
    id: int | None = Field(
        primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
    )
    # Unique, indexed identifier; a UUID4 string is generated when omitted.
    kb_id: str = Field(
        max_length=36,
        nullable=False,
        unique=True,
        default_factory=lambda: str(uuid.uuid4()),
        index=True,
    )
    kb_name: str = Field(max_length=100, nullable=False)
    description: Optional[str] = Field(default=None, sa_type=Text)
    emoji: Optional[str] = Field(default="📚", max_length=10)
    embedding_provider_id: Optional[str] = Field(default=None, max_length=100)
    rerank_provider_id: Optional[str] = Field(default=None, max_length=100)
    # Chunking configuration parameters.
    chunk_size: Optional[int] = Field(default=512, nullable=True)
    chunk_overlap: Optional[int] = Field(default=50, nullable=True)
    # Retrieval configuration parameters.
    top_k_dense: Optional[int] = Field(default=50, nullable=True)
    top_k_sparse: Optional[int] = Field(default=50, nullable=True)
    top_m_final: Optional[int] = Field(default=5, nullable=True)
    enable_rerank: Optional[bool] = Field(default=True, nullable=True)
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        # "onupdate" must be a callable: a plain datetime value would be
        # computed once when the class is defined and then reused for every
        # UPDATE, freezing updated_at at import time.
        sa_column_kwargs={"onupdate": lambda: datetime.now(timezone.utc)},
    )
    # Counters default to 0; nothing in this class updates them.
    doc_count: int = Field(default=0, nullable=False)
    chunk_count: int = Field(default=0, nullable=False)
class KBDocument(SQLModel, table=True):
    """Document table.

    Stores the metadata of documents uploaded to a knowledge base.
    """

    __tablename__ = "kb_documents"

    # Surrogate primary key; autoincrement is requested on the column.
    id: int | None = Field(
        primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
    )
    # Unique, indexed identifier; a UUID4 string is generated when omitted.
    doc_id: str = Field(
        max_length=36,
        nullable=False,
        unique=True,
        default_factory=lambda: str(uuid.uuid4()),
        index=True,
    )
    # Indexed plain string column; no foreign-key constraint is declared here.
    kb_id: str = Field(max_length=36, nullable=False, index=True)
    doc_name: str = Field(max_length=255, nullable=False)
    file_type: str = Field(max_length=20, nullable=False)
    file_size: int = Field(nullable=False)  # presumably bytes - TODO confirm
    file_path: str = Field(max_length=512, nullable=False)
    chunk_count: int = Field(default=0, nullable=False)
    media_count: int = Field(default=0, nullable=False)
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        # "onupdate" must be a callable: a plain datetime value would be
        # computed once when the class is defined and then reused for every
        # UPDATE, freezing updated_at at import time.
        sa_column_kwargs={"onupdate": lambda: datetime.now(timezone.utc)},
    )
class KBChunk(SQLModel, table=True):
    """Document chunk table.

    Holds the text content of each chunk produced by splitting a document,
    together with the information that associates it with the vector index.
    """

    __tablename__ = "kb_chunks"

    # Surrogate primary key; autoincrement is requested on the column.
    id: int | None = Field(
        default=None,
        primary_key=True,
        sa_column_kwargs={"autoincrement": True},
    )
    # Unique, indexed identifier; a UUID4 string is generated when omitted.
    chunk_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        max_length=36,
        unique=True,
        index=True,
        nullable=False,
    )
    # Indexed plain string columns; no foreign-key constraints are declared.
    doc_id: str = Field(index=True, nullable=False, max_length=36)
    kb_id: str = Field(index=True, nullable=False, max_length=36)
    chunk_index: int = Field(nullable=False)
    content: str = Field(nullable=False, sa_type=Text)
    char_count: int = Field(nullable=False)
    # Presumably the id of the matching vector-store entry - TODO confirm.
    vec_doc_id: str = Field(index=True, nullable=False, max_length=100)
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
class KBMedia(SQLModel, table=True):
    """Multimedia resource table.

    Holds the images, videos and other multimedia resources extracted from
    documents.
    """

    __tablename__ = "kb_media"

    # Surrogate primary key; autoincrement is requested on the column.
    id: int | None = Field(
        default=None,
        primary_key=True,
        sa_column_kwargs={"autoincrement": True},
    )
    # Unique, indexed identifier; a UUID4 string is generated when omitted.
    media_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        max_length=36,
        unique=True,
        index=True,
        nullable=False,
    )
    # Indexed plain string columns; no foreign-key constraints are declared.
    doc_id: str = Field(index=True, nullable=False, max_length=36)
    kb_id: str = Field(index=True, nullable=False, max_length=36)
    media_type: str = Field(nullable=False, max_length=20)
    file_name: str = Field(nullable=False, max_length=255)
    file_path: str = Field(nullable=False, max_length=512)
    file_size: int = Field(nullable=False)  # presumably bytes - TODO confirm
    mime_type: str = Field(nullable=False, max_length=100)
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
class KBSessionConfig(SQLModel, table=True):
    """Session knowledge base configuration table.

    Stores the knowledge base association configured for a session or a
    platform. The table lives in the standalone knowledge base database so
    that it stays fully decoupled.

    Two configuration scopes are supported:
    - platform: platform-level configuration (e.g. 'qq', 'telegram')
    - session: session-level configuration (e.g. 'qq:group:12345')
    """

    __tablename__ = "kb_session_config"

    # Surrogate primary key; autoincrement is requested on the column.
    id: int | None = Field(
        primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
    )
    # Unique identifier; a UUID4 string is generated when omitted.
    config_id: str = Field(
        max_length=36,
        nullable=False,
        unique=True,
        default_factory=lambda: str(uuid.uuid4()),
    )
    # Configuration scope and the identifier within it (see the docstring).
    scope: str = Field(max_length=20, nullable=False)
    scope_id: str = Field(max_length=255, nullable=False, index=True)
    # Associated knowledge base ids stored as text; the serialization format
    # is not visible here - presumably JSON, confirm against the caller.
    kb_ids: str = Field(sa_type=Text, nullable=False)
    # NOTE(review): None presumably means "no override" - confirm with caller.
    top_k: Optional[int] = Field(default=None, nullable=True)
    enable_rerank: Optional[bool] = Field(default=None, nullable=True)
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        # "onupdate" must be a callable: a plain datetime value would be
        # computed once when the class is defined and then reused for every
        # UPDATE, freezing updated_at at import time.
        sa_column_kwargs={"onupdate": lambda: datetime.now(timezone.utc)},
    )

    # At most one configuration row per (scope, scope_id) pair.
    __table_args__ = (
        UniqueConstraint("scope", "scope_id", name="uix_scope_scope_id"),
    )