ad96d676e6
- 实现完整的知识库数据模型(知识库、文档、文档块、会话配置) - 实现基于 SQLite 的向量数据库存储和检索 - 实现文档解析器(PDF、TXT)和固定大小分块器 - 实现混合检索系统(密集向量检索 + BM25 稀疏检索 + RRF 融合) - 实现知识库生命周期管理和消息注入器 - 支持会话级别的知识库配置和关联
185 lines
6.4 KiB
Python
185 lines
6.4 KiB
Python
"""知识库管理功能的数据模型定义
|
|
|
|
该模块定义了知识库系统所需的数据模型,包括:
|
|
- KnowledgeBase: 知识库表 (存储在独立的 kb.db)
|
|
- KBDocument: 文档表 (存储在独立的 kb.db)
|
|
- KBChunk: 文档块表 (存储在独立的 kb.db)
|
|
- KBMedia: 多媒体资源表 (存储在独立的 kb.db)
|
|
- KBSessionConfig: 会话配置表 (存储在独立的 kb.db)
|
|
|
|
注意:
|
|
- 所有模型存储在独立的知识库数据库 (data/knowledge_base/kb.db)
|
|
- 与主数据库 (astrbot.db) 完全解耦
|
|
"""
|
|
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from sqlmodel import Field, SQLModel, Text, UniqueConstraint
|
|
|
|
|
|
class KnowledgeBase(SQLModel, table=True):
|
|
"""知识库表
|
|
|
|
存储知识库的基本信息和统计数据。
|
|
"""
|
|
|
|
__tablename__ = "knowledge_bases"
|
|
|
|
id: int | None = Field(
|
|
primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
|
|
)
|
|
kb_id: str = Field(
|
|
max_length=36,
|
|
nullable=False,
|
|
unique=True,
|
|
default_factory=lambda: str(uuid.uuid4()),
|
|
index=True,
|
|
)
|
|
kb_name: str = Field(max_length=100, nullable=False)
|
|
description: Optional[str] = Field(default=None, sa_type=Text)
|
|
emoji: Optional[str] = Field(default="📚", max_length=10)
|
|
embedding_provider_id: Optional[str] = Field(default=None, max_length=100)
|
|
rerank_provider_id: Optional[str] = Field(default=None, max_length=100)
|
|
# 分块配置参数
|
|
chunk_size: Optional[int] = Field(default=512, nullable=True)
|
|
chunk_overlap: Optional[int] = Field(default=50, nullable=True)
|
|
# 检索配置参数
|
|
top_k_dense: Optional[int] = Field(default=50, nullable=True)
|
|
top_k_sparse: Optional[int] = Field(default=50, nullable=True)
|
|
top_m_final: Optional[int] = Field(default=5, nullable=True)
|
|
enable_rerank: Optional[bool] = Field(default=True, nullable=True)
|
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
updated_at: datetime = Field(
|
|
default_factory=lambda: datetime.now(timezone.utc),
|
|
sa_column_kwargs={"onupdate": datetime.now(timezone.utc)},
|
|
)
|
|
doc_count: int = Field(default=0, nullable=False)
|
|
chunk_count: int = Field(default=0, nullable=False)
|
|
|
|
|
|
class KBDocument(SQLModel, table=True):
|
|
"""文档表
|
|
|
|
存储上传到知识库的文档元数据。
|
|
"""
|
|
|
|
__tablename__ = "kb_documents"
|
|
|
|
id: int | None = Field(
|
|
primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
|
|
)
|
|
doc_id: str = Field(
|
|
max_length=36,
|
|
nullable=False,
|
|
unique=True,
|
|
default_factory=lambda: str(uuid.uuid4()),
|
|
index=True,
|
|
)
|
|
kb_id: str = Field(max_length=36, nullable=False, index=True)
|
|
doc_name: str = Field(max_length=255, nullable=False)
|
|
file_type: str = Field(max_length=20, nullable=False)
|
|
file_size: int = Field(nullable=False)
|
|
file_path: str = Field(max_length=512, nullable=False)
|
|
chunk_count: int = Field(default=0, nullable=False)
|
|
media_count: int = Field(default=0, nullable=False)
|
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
updated_at: datetime = Field(
|
|
default_factory=lambda: datetime.now(timezone.utc),
|
|
sa_column_kwargs={"onupdate": datetime.now(timezone.utc)},
|
|
)
|
|
|
|
|
|
class KBChunk(SQLModel, table=True):
|
|
"""文档块表
|
|
|
|
存储文档分块后的文本内容和向量索引关联信息。
|
|
"""
|
|
|
|
__tablename__ = "kb_chunks"
|
|
|
|
id: int | None = Field(
|
|
primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
|
|
)
|
|
chunk_id: str = Field(
|
|
max_length=36,
|
|
nullable=False,
|
|
unique=True,
|
|
default_factory=lambda: str(uuid.uuid4()),
|
|
index=True,
|
|
)
|
|
doc_id: str = Field(max_length=36, nullable=False, index=True)
|
|
kb_id: str = Field(max_length=36, nullable=False, index=True)
|
|
chunk_index: int = Field(nullable=False)
|
|
content: str = Field(sa_type=Text, nullable=False)
|
|
char_count: int = Field(nullable=False)
|
|
vec_doc_id: str = Field(max_length=100, nullable=False, index=True)
|
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
class KBMedia(SQLModel, table=True):
|
|
"""多媒体资源表
|
|
|
|
存储从文档中提取的图片、视频等多媒体资源。
|
|
"""
|
|
|
|
__tablename__ = "kb_media"
|
|
|
|
id: int | None = Field(
|
|
primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
|
|
)
|
|
media_id: str = Field(
|
|
max_length=36,
|
|
nullable=False,
|
|
unique=True,
|
|
default_factory=lambda: str(uuid.uuid4()),
|
|
index=True,
|
|
)
|
|
doc_id: str = Field(max_length=36, nullable=False, index=True)
|
|
kb_id: str = Field(max_length=36, nullable=False, index=True)
|
|
media_type: str = Field(max_length=20, nullable=False)
|
|
file_name: str = Field(max_length=255, nullable=False)
|
|
file_path: str = Field(max_length=512, nullable=False)
|
|
file_size: int = Field(nullable=False)
|
|
mime_type: str = Field(max_length=100, nullable=False)
|
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
class KBSessionConfig(SQLModel, table=True):
|
|
"""会话知识库配置表
|
|
|
|
存储会话或平台级别的知识库关联配置。
|
|
该表存储在知识库独立数据库中,保持完全解耦。
|
|
|
|
支持两种配置范围:
|
|
- platform: 平台级别配置 (如 'qq', 'telegram')
|
|
- session: 会话级别配置 (如 'qq:group:12345')
|
|
"""
|
|
|
|
__tablename__ = "kb_session_config"
|
|
|
|
id: int | None = Field(
|
|
primary_key=True, sa_column_kwargs={"autoincrement": True}, default=None
|
|
)
|
|
config_id: str = Field(
|
|
max_length=36,
|
|
nullable=False,
|
|
unique=True,
|
|
default_factory=lambda: str(uuid.uuid4()),
|
|
)
|
|
scope: str = Field(max_length=20, nullable=False)
|
|
scope_id: str = Field(max_length=255, nullable=False, index=True)
|
|
kb_ids: str = Field(sa_type=Text, nullable=False)
|
|
top_k: Optional[int] = Field(default=None, nullable=True)
|
|
enable_rerank: Optional[bool] = Field(default=None, nullable=True)
|
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
updated_at: datetime = Field(
|
|
default_factory=lambda: datetime.now(timezone.utc),
|
|
sa_column_kwargs={"onupdate": datetime.now(timezone.utc)},
|
|
)
|
|
|
|
__table_args__ = (
|
|
UniqueConstraint("scope", "scope_id", name="uix_scope_scope_id"),
|
|
)
|