From eb201c042034dad2feb075618bdd0866bf9e5676 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Sat, 25 Oct 2025 22:00:54 +0800 Subject: [PATCH 1/3] feat: refactor knowledge base parsers and add MarkitdownParser for docx, xls, xlsx support --- astrbot/core/knowledge_base/kb_helper.py | 8 ++---- astrbot/core/knowledge_base/kb_mgr.py | 11 -------- .../parsers/markitdown_parser.py | 25 +++++++++++++++++++ astrbot/core/knowledge_base/parsers/util.py | 13 ++++++++++ .../en-US/features/knowledge-base/detail.json | 2 +- .../zh-CN/features/knowledge-base/detail.json | 2 +- .../components/DocumentsTab.vue | 6 +++-- pyproject.toml | 1 + 8 files changed, 47 insertions(+), 21 deletions(-) create mode 100644 astrbot/core/knowledge_base/parsers/markitdown_parser.py create mode 100644 astrbot/core/knowledge_base/parsers/util.py diff --git a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py index 5c07c93b2..09b9c9fc8 100644 --- a/astrbot/core/knowledge_base/kb_helper.py +++ b/astrbot/core/knowledge_base/kb_helper.py @@ -8,7 +8,7 @@ from astrbot.core.db.vec_db.base import BaseVecDB from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider from astrbot.core.provider.manager import ProviderManager -from .parsers.base import BaseParser +from .parsers.util import select_parser from .chunking.base import BaseChunker from astrbot.core import logger @@ -24,13 +24,11 @@ class KBHelper: provider_manager: ProviderManager, kb_root_dir: str, chunker: BaseChunker, - parsers: dict[str, BaseParser], ): self.kb_db = kb_db self.kb = kb self.prov_mgr = provider_manager self.kb_root_dir = kb_root_dir - self.parsers = parsers self.chunker = chunker self.kb_dir = Path(self.kb_root_dir) / self.kb.kb_id @@ -138,9 +136,7 @@ class KBHelper: if progress_callback: await progress_callback("parsing", 0, 100) - parser = self.parsers.get(file_type) - if not parser: - raise ValueError(f"不支持的文件类型: {file_type}") + parser = await select_parser(f".{file_type}") parse_result = await parser.parse(file_content, file_name) text_content = parse_result.text media_items = parse_result.media diff --git a/astrbot/core/knowledge_base/kb_mgr.py b/astrbot/core/knowledge_base/kb_mgr.py index 4f64383c4..c1c63d08a 100644 --- a/astrbot/core/knowledge_base/kb_mgr.py +++ b/astrbot/core/knowledge_base/kb_mgr.py @@ -8,9 +8,6 @@ from .retrieval.sparse_retriever import SparseRetriever from .retrieval.rank_fusion import RankFusion from .kb_db_sqlite import KBSQLiteDatabase -from .parsers.text_parser import TextParser -from .parsers.pdf_parser import PDFParser - # from .chunking.fixed_size import FixedSizeChunker from .chunking.recursive import RecursiveCharacterChunker from .kb_helper import KBHelper @@ -21,12 +18,6 @@ from .models import KnowledgeBase FILES_PATH = "data/knowledge_base" DB_PATH = Path(FILES_PATH) / "kb.db" """Knowledge Base storage root directory""" -PARSERS = { - "txt": TextParser(), - "md": TextParser(), - "markdown": TextParser(), - "pdf": PDFParser(), -} CHUNKER = RecursiveCharacterChunker() @@ -85,7 +76,6 @@ class KnowledgeBaseManager: provider_manager=self.provider_manager, kb_root_dir=FILES_PATH, chunker=CHUNKER, - parsers=PARSERS, ) await kb_helper.initialize() self.kb_insts[record.kb_id] = kb_helper @@ -127,7 +117,6 @@ class KnowledgeBaseManager: provider_manager=self.provider_manager, kb_root_dir=FILES_PATH, chunker=CHUNKER, - parsers=PARSERS, ) await kb_helper.initialize() self.kb_insts[kb.kb_id] = kb_helper diff --git a/astrbot/core/knowledge_base/parsers/markitdown_parser.py b/astrbot/core/knowledge_base/parsers/markitdown_parser.py new file mode 100644 index 000000000..50af984e0 --- /dev/null +++ b/astrbot/core/knowledge_base/parsers/markitdown_parser.py @@ -0,0 +1,25 @@ +import io +import os + +from astrbot.core.knowledge_base.parsers.base import ( + BaseParser, + ParseResult, +) +from markitdown_no_magika import MarkItDown, StreamInfo + + +class MarkitdownParser(BaseParser): + """解析 docx, xls, xlsx 格式""" + + async def parse(self, file_content: bytes, file_name: str) -> ParseResult: + md = MarkItDown(enable_plugins=False) + bio = io.BytesIO(file_content) + stream_info = StreamInfo( + extension=os.path.splitext(file_name)[1].lower(), + filename=file_name, + ) + result = md.convert(bio, stream_info=stream_info) + return ParseResult( + text=result.markdown, + media=[], + ) diff --git a/astrbot/core/knowledge_base/parsers/util.py b/astrbot/core/knowledge_base/parsers/util.py new file mode 100644 index 000000000..e8252b912 --- /dev/null +++ b/astrbot/core/knowledge_base/parsers/util.py @@ -0,0 +1,13 @@ +from .base import BaseParser + + +async def select_parser(ext: str) -> BaseParser: + if ext in [".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"]: + from .markitdown_parser import MarkitdownParser + + return MarkitdownParser() + elif ext == ".pdf": + from .pdf_parser import PDFParser + + return PDFParser() + raise ValueError(f"暂时不支持的文件格式: {ext}") diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json index 9af7ef593..e323a7e8a 100644 --- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json +++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json @@ -45,7 +45,7 @@ "title": "Upload Document", "selectFile": "Select File", "dropzone": "Drop files here or click to select", - "supportedFormats": "Supported formats: TXT, PDF, Markdown", + "supportedFormats": "Supported formats: ", "maxSize": "Max file size: 128MB", "chunkSettings": "Chunk Settings", "batchSettings": "Batch Settings", diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json index 3dd9f751e..81e744b91 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json +++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json @@ -46,7 +46,7 @@ "title": "上传文档", "selectFile": "选择文件", "dropzone": "拖放文件到这里或点击选择", - "supportedFormats": "支持的格式: TXT, PDF, Markdown", + "supportedFormats": "支持的格式: ", "maxSize": "最大文件大小: 128MB", "chunkSettings": "分块设置", "batchSettings": "批处理设置", diff --git a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue index 34d0d54d4..9e146e86a 100644 --- a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue +++ b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue @@ -73,10 +73,12 @@ @dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()"> mdi-cloud-upload

{{ t('upload.dropzone') }}

-

{{ t('upload.supportedFormats') }}

+

{{ t('upload.supportedFormats') }}.txt, .md, .pdf, .docx, + .xls, .xlsx

{{ t('upload.maxSize') }}

最多可上传 10 个文件

- +
diff --git a/pyproject.toml b/pyproject.toml index a14b0c25e..7acc3a813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ "aiofiles>=25.1.0", "rank-bm25>=0.2.2", "jieba>=0.42.1", + "markitdown-no-magika[docx,xls,xlsx]>=0.1.2", ] [project.scripts] From 0823f7aa48ac0da9d48d8f12f387ebc054abd170 Mon Sep 17 00:00:00 2001 From: Soulter <37870767+Soulter@users.noreply.github.com> Date: Sat, 25 Oct 2025 22:04:17 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E5=9C=A8=E6=A3=80=E6=9F=A5=E5=AD=97?= =?UTF-8?q?=E9=9D=A2=E9=87=8F=E9=9B=86=E5=90=88=E7=9A=84=E6=88=90=E5=91=98?= =?UTF-8?q?=E8=B5=84=E6=A0=BC=E6=97=B6=E4=BD=BF=E7=94=A8=20set?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- astrbot/core/knowledge_base/parsers/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/astrbot/core/knowledge_base/parsers/util.py b/astrbot/core/knowledge_base/parsers/util.py index e8252b912..41cc5e4de 100644 --- a/astrbot/core/knowledge_base/parsers/util.py +++ b/astrbot/core/knowledge_base/parsers/util.py @@ -2,7 +2,7 @@ from .base import BaseParser async def select_parser(ext: str) -> BaseParser: - if ext in [".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"]: + if ext in {".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"}: from .markitdown_parser import MarkitdownParser return MarkitdownParser() From 562e62a8c08ead09bde94d9c5a221f95a7670956 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Sun, 26 Oct 2025 13:02:22 +0800 Subject: [PATCH 3/3] feat: add new dependencies for PDF processing, file handling, and text ranking --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index 5af016b28..714676e4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,3 +44,8 @@ sqlmodel deprecated sqlalchemy[asyncio] audioop-lts; python_version>='3.13' +pypdf +aiofiles +rank-bm25 +jieba +markitdown-no-magika[docx,xls,xlsx] \ No newline at end of file