diff --git a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py
index 5c07c93b2..09b9c9fc8 100644
--- a/astrbot/core/knowledge_base/kb_helper.py
+++ b/astrbot/core/knowledge_base/kb_helper.py
@@ -8,7 +8,7 @@ from astrbot.core.db.vec_db.base import BaseVecDB
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
from astrbot.core.provider.manager import ProviderManager
-from .parsers.base import BaseParser
+from .parsers.util import select_parser
from .chunking.base import BaseChunker
from astrbot.core import logger
@@ -24,13 +24,11 @@ class KBHelper:
provider_manager: ProviderManager,
kb_root_dir: str,
chunker: BaseChunker,
- parsers: dict[str, BaseParser],
):
self.kb_db = kb_db
self.kb = kb
self.prov_mgr = provider_manager
self.kb_root_dir = kb_root_dir
- self.parsers = parsers
self.chunker = chunker
self.kb_dir = Path(self.kb_root_dir) / self.kb.kb_id
@@ -138,9 +136,7 @@ class KBHelper:
if progress_callback:
await progress_callback("parsing", 0, 100)
- parser = self.parsers.get(file_type)
- if not parser:
- raise ValueError(f"不支持的文件类型: {file_type}")
+ parser = await select_parser(f".{file_type}")
parse_result = await parser.parse(file_content, file_name)
text_content = parse_result.text
media_items = parse_result.media
diff --git a/astrbot/core/knowledge_base/kb_mgr.py b/astrbot/core/knowledge_base/kb_mgr.py
index 4f64383c4..c1c63d08a 100644
--- a/astrbot/core/knowledge_base/kb_mgr.py
+++ b/astrbot/core/knowledge_base/kb_mgr.py
@@ -8,9 +8,6 @@ from .retrieval.sparse_retriever import SparseRetriever
from .retrieval.rank_fusion import RankFusion
from .kb_db_sqlite import KBSQLiteDatabase
-from .parsers.text_parser import TextParser
-from .parsers.pdf_parser import PDFParser
-
# from .chunking.fixed_size import FixedSizeChunker
from .chunking.recursive import RecursiveCharacterChunker
from .kb_helper import KBHelper
@@ -21,12 +18,6 @@ from .models import KnowledgeBase
FILES_PATH = "data/knowledge_base"
DB_PATH = Path(FILES_PATH) / "kb.db"
"""Knowledge Base storage root directory"""
-PARSERS = {
- "txt": TextParser(),
- "md": TextParser(),
- "markdown": TextParser(),
- "pdf": PDFParser(),
-}
CHUNKER = RecursiveCharacterChunker()
@@ -85,7 +76,6 @@ class KnowledgeBaseManager:
provider_manager=self.provider_manager,
kb_root_dir=FILES_PATH,
chunker=CHUNKER,
- parsers=PARSERS,
)
await kb_helper.initialize()
self.kb_insts[record.kb_id] = kb_helper
@@ -127,7 +117,6 @@ class KnowledgeBaseManager:
provider_manager=self.provider_manager,
kb_root_dir=FILES_PATH,
chunker=CHUNKER,
- parsers=PARSERS,
)
await kb_helper.initialize()
self.kb_insts[kb.kb_id] = kb_helper
diff --git a/astrbot/core/knowledge_base/parsers/markitdown_parser.py b/astrbot/core/knowledge_base/parsers/markitdown_parser.py
new file mode 100644
index 000000000..50af984e0
--- /dev/null
+++ b/astrbot/core/knowledge_base/parsers/markitdown_parser.py
@@ -0,0 +1,43 @@
+import asyncio
+import io
+import os
+
+from astrbot.core.knowledge_base.parsers.base import (
+    BaseParser,
+    ParseResult,
+)
+from markitdown_no_magika import MarkItDown, StreamInfo
+
+
+class MarkitdownParser(BaseParser):
+    """Parse documents by converting them to Markdown with MarkItDown.
+
+    ``select_parser`` routes .md, .txt, .markdown, .docx, .xls and .xlsx
+    uploads to this parser.
+    """
+
+    async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
+        """Convert raw file bytes to Markdown text.
+
+        Args:
+            file_content: Raw bytes of the uploaded file.
+            file_name: Original file name; its lower-cased extension is
+                passed to MarkItDown as a format hint.
+
+        Returns:
+            A ParseResult whose text is the Markdown output and whose media
+            list is empty.
+        """
+        md = MarkItDown(enable_plugins=False)
+        bio = io.BytesIO(file_content)
+        stream_info = StreamInfo(
+            extension=os.path.splitext(file_name)[1].lower(),
+            filename=file_name,
+        )
+        # convert() is synchronous; run it in a worker thread so a large
+        # document does not stall the event loop.
+        result = await asyncio.to_thread(md.convert, bio, stream_info=stream_info)
+        return ParseResult(
+            text=result.markdown,
+            media=[],
+        )
diff --git a/astrbot/core/knowledge_base/parsers/util.py b/astrbot/core/knowledge_base/parsers/util.py
new file mode 100644
index 000000000..e8252b912
--- /dev/null
+++ b/astrbot/core/knowledge_base/parsers/util.py
@@ -0,0 +1,37 @@
+from .base import BaseParser
+
+# Extensions routed to MarkitdownParser.
+_MARKITDOWN_EXTS = frozenset(
+    {".md", ".txt", ".markdown", ".xlsx", ".docx", ".xls"}
+)
+
+
+async def select_parser(ext: str) -> BaseParser:
+    """Return a parser instance for a file extension.
+
+    Args:
+        ext: File extension including the leading dot, e.g. ``".pdf"``.
+            Matching is case-insensitive, consistent with MarkitdownParser,
+            which lower-cases the extension it derives from the file name.
+
+    Returns:
+        A new MarkitdownParser or PDFParser instance.
+
+    Raises:
+        ValueError: If no parser handles the extension.
+    """
+    # Normalize case so that e.g. ".PDF" and ".Docx" are accepted.
+    normalized = ext.lower()
+    if normalized in _MARKITDOWN_EXTS:
+        # Imported inside the branch, so the module is loaded only when
+        # one of its formats is requested.
+        from .markitdown_parser import MarkitdownParser
+
+        return MarkitdownParser()
+    if normalized == ".pdf":
+        from .pdf_parser import PDFParser
+
+        return PDFParser()
+    # Report the extension exactly as the caller supplied it.
+    raise ValueError(f"暂时不支持的文件格式: {ext}")
diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
index 9af7ef593..e323a7e8a 100644
--- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
+++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
@@ -45,7 +45,7 @@
"title": "Upload Document",
"selectFile": "Select File",
"dropzone": "Drop files here or click to select",
- "supportedFormats": "Supported formats: TXT, PDF, Markdown",
+ "supportedFormats": "Supported formats: ",
"maxSize": "Max file size: 128MB",
"chunkSettings": "Chunk Settings",
"batchSettings": "Batch Settings",
diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
index 3dd9f751e..81e744b91 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
@@ -46,7 +46,7 @@
"title": "上传文档",
"selectFile": "选择文件",
"dropzone": "拖放文件到这里或点击选择",
- "supportedFormats": "支持的格式: TXT, PDF, Markdown",
+ "supportedFormats": "支持的格式: ",
"maxSize": "最大文件大小: 128MB",
"chunkSettings": "分块设置",
"batchSettings": "批处理设置",
diff --git a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue
index 34d0d54d4..9e146e86a 100644
--- a/dashboard/src/views/knowledge-base/components/DocumentsTab.vue
+++ b/dashboard/src/views/knowledge-base/components/DocumentsTab.vue
@@ -73,10 +73,12 @@
@dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()">
{{ t('upload.dropzone') }}
-{{ t('upload.supportedFormats') }}
+{{ t('upload.supportedFormats') }}.txt, .md, .pdf, .docx, + .xls, .xlsx
{{ t('upload.maxSize') }}
最多可上传 10 个文件
- +