feat: refactor knowledge base parsers and add MarkitdownParser for docx, xls, xlsx support
This commit is contained in:
@@ -8,7 +8,7 @@ from astrbot.core.db.vec_db.base import BaseVecDB
|
||||
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
|
||||
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
|
||||
from astrbot.core.provider.manager import ProviderManager
|
||||
from .parsers.base import BaseParser
|
||||
from .parsers.util import select_parser
|
||||
from .chunking.base import BaseChunker
|
||||
from astrbot.core import logger
|
||||
|
||||
@@ -24,13 +24,11 @@ class KBHelper:
|
||||
provider_manager: ProviderManager,
|
||||
kb_root_dir: str,
|
||||
chunker: BaseChunker,
|
||||
parsers: dict[str, BaseParser],
|
||||
):
|
||||
self.kb_db = kb_db
|
||||
self.kb = kb
|
||||
self.prov_mgr = provider_manager
|
||||
self.kb_root_dir = kb_root_dir
|
||||
self.parsers = parsers
|
||||
self.chunker = chunker
|
||||
|
||||
self.kb_dir = Path(self.kb_root_dir) / self.kb.kb_id
|
||||
@@ -138,9 +136,7 @@ class KBHelper:
|
||||
if progress_callback:
|
||||
await progress_callback("parsing", 0, 100)
|
||||
|
||||
parser = self.parsers.get(file_type)
|
||||
if not parser:
|
||||
raise ValueError(f"不支持的文件类型: {file_type}")
|
||||
parser = await select_parser(f".{file_type}")
|
||||
parse_result = await parser.parse(file_content, file_name)
|
||||
text_content = parse_result.text
|
||||
media_items = parse_result.media
|
||||
|
||||
@@ -8,9 +8,6 @@ from .retrieval.sparse_retriever import SparseRetriever
|
||||
from .retrieval.rank_fusion import RankFusion
|
||||
from .kb_db_sqlite import KBSQLiteDatabase
|
||||
|
||||
from .parsers.text_parser import TextParser
|
||||
from .parsers.pdf_parser import PDFParser
|
||||
|
||||
# from .chunking.fixed_size import FixedSizeChunker
|
||||
from .chunking.recursive import RecursiveCharacterChunker
|
||||
from .kb_helper import KBHelper
|
||||
@@ -21,12 +18,6 @@ from .models import KnowledgeBase
|
||||
FILES_PATH = "data/knowledge_base"
|
||||
DB_PATH = Path(FILES_PATH) / "kb.db"
|
||||
"""Knowledge Base storage root directory"""
|
||||
PARSERS = {
|
||||
"txt": TextParser(),
|
||||
"md": TextParser(),
|
||||
"markdown": TextParser(),
|
||||
"pdf": PDFParser(),
|
||||
}
|
||||
CHUNKER = RecursiveCharacterChunker()
|
||||
|
||||
|
||||
@@ -85,7 +76,6 @@ class KnowledgeBaseManager:
|
||||
provider_manager=self.provider_manager,
|
||||
kb_root_dir=FILES_PATH,
|
||||
chunker=CHUNKER,
|
||||
parsers=PARSERS,
|
||||
)
|
||||
await kb_helper.initialize()
|
||||
self.kb_insts[record.kb_id] = kb_helper
|
||||
@@ -127,7 +117,6 @@ class KnowledgeBaseManager:
|
||||
provider_manager=self.provider_manager,
|
||||
kb_root_dir=FILES_PATH,
|
||||
chunker=CHUNKER,
|
||||
parsers=PARSERS,
|
||||
)
|
||||
await kb_helper.initialize()
|
||||
self.kb_insts[kb.kb_id] = kb_helper
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
import io
|
||||
import os
|
||||
|
||||
from astrbot.core.knowledge_base.parsers.base import (
|
||||
BaseParser,
|
||||
ParseResult,
|
||||
)
|
||||
from markitdown_no_magika import MarkItDown, StreamInfo
|
||||
|
||||
|
||||
class MarkitdownParser(BaseParser):
    """Parse docx, xls and xlsx documents into Markdown text via MarkItDown."""

    async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
        """Convert an in-memory document to Markdown.

        Args:
            file_content: Raw bytes of the document.
            file_name: Name of the document; its lower-cased extension and the
                name itself are handed to MarkItDown through ``StreamInfo``.

        Returns:
            A ``ParseResult`` whose ``text`` is the Markdown produced by
            MarkItDown and whose ``media`` list is always empty.
        """
        # Local import keeps this block self-contained; asyncio is stdlib.
        import asyncio

        md = MarkItDown(enable_plugins=False)
        bio = io.BytesIO(file_content)
        stream_info = StreamInfo(
            extension=os.path.splitext(file_name)[1].lower(),
            filename=file_name,
        )
        # MarkItDown.convert is a synchronous call; running it in a worker
        # thread keeps the event loop responsive while the file is converted.
        result = await asyncio.to_thread(
            md.convert,
            bio,
            stream_info=stream_info,
        )
        return ParseResult(
            text=result.markdown,
            media=[],
        )
|
||||
@@ -0,0 +1,13 @@
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
async def select_parser(ext: str) -> BaseParser:
    """Return a new parser instance for the given file extension.

    Args:
        ext: File extension including the leading dot (for example
            ``".pdf"``). Matching is case-insensitive.

    Returns:
        A ``MarkitdownParser`` for .md, .txt, .markdown, .xlsx, .docx and
        .xls, or a ``PDFParser`` for .pdf.

    Raises:
        ValueError: If no parser is registered for the extension.
    """
    # Compare on the lower-cased form so ".PDF" / ".Docx" are accepted too;
    # the original value is kept for the error message.
    normalized = ext.lower()

    # Each parser module is imported only inside the branch that uses it.
    if normalized in (".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"):
        from .markitdown_parser import MarkitdownParser

        return MarkitdownParser()
    if normalized == ".pdf":
        from .pdf_parser import PDFParser

        return PDFParser()
    raise ValueError(f"暂时不支持的文件格式: {ext}")
|
||||
@@ -45,7 +45,7 @@
|
||||
"title": "Upload Document",
|
||||
"selectFile": "Select File",
|
||||
"dropzone": "Drop files here or click to select",
|
||||
"supportedFormats": "Supported formats: TXT, PDF, Markdown",
|
||||
"supportedFormats": "Supported formats: ",
|
||||
"maxSize": "Max file size: 128MB",
|
||||
"chunkSettings": "Chunk Settings",
|
||||
"batchSettings": "Batch Settings",
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
"title": "上传文档",
|
||||
"selectFile": "选择文件",
|
||||
"dropzone": "拖放文件到这里或点击选择",
|
||||
"supportedFormats": "支持的格式: TXT, PDF, Markdown",
|
||||
"supportedFormats": "支持的格式: ",
|
||||
"maxSize": "最大文件大小: 128MB",
|
||||
"chunkSettings": "分块设置",
|
||||
"batchSettings": "批处理设置",
|
||||
|
||||
@@ -73,10 +73,12 @@
|
||||
@dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()">
|
||||
<v-icon size="64" color="primary">mdi-cloud-upload</v-icon>
|
||||
<p class="mt-4 text-h6">{{ t('upload.dropzone') }}</p>
|
||||
<p class="text-caption text-medium-emphasis mt-2">{{ t('upload.supportedFormats') }}</p>
|
||||
<p class="text-caption text-medium-emphasis mt-2">{{ t('upload.supportedFormats') }}.txt, .md, .pdf, .docx,
|
||||
.xls, .xlsx</p>
|
||||
<p class="text-caption text-medium-emphasis">{{ t('upload.maxSize') }}</p>
|
||||
<p class="text-caption text-medium-emphasis">最多可上传 10 个文件</p>
|
||||
<input ref="fileInput" type="file" multiple hidden accept=".txt,.md,.pdf" @change="handleFileSelect" />
|
||||
<input ref="fileInput" type="file" multiple hidden accept=".txt,.md,.pdf,.docx,.xls,.xlsx"
|
||||
@change="handleFileSelect" />
|
||||
</div>
|
||||
|
||||
<div v-if="selectedFiles.length > 0" class="mt-4">
|
||||
|
||||
@@ -55,6 +55,7 @@ dependencies = [
|
||||
"aiofiles>=25.1.0",
|
||||
"rank-bm25>=0.2.2",
|
||||
"jieba>=0.42.1",
|
||||
"markitdown-no-magika[docx,xls,xlsx]>=0.1.2",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
Reference in New Issue
Block a user