From faccf70cbd2ba1f67a20856ec04af523b715cb57 Mon Sep 17 00:00:00 2001
From: xiangchang_24 <1476897511@qq.com>
Date: Tue, 16 Jun 2026 13:03:22 +0800
Subject: [PATCH 1/3] =?UTF-8?q?feat(knowledge):=20=E6=96=B0=E5=A2=9E=20que?=
=?UTF-8?q?ry=5Fkeywords=20=E5=B7=A5=E5=85=B7=EF=BC=8C=E5=9F=BA=E4=BA=8E?=
=?UTF-8?q?=E5=85=B3=E9=94=AE=E8=AF=8D=E5=91=BD=E4=B8=AD=E7=9A=84=20BM25?=
=?UTF-8?q?=20=E6=A3=80=E7=B4=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 新增 QueryKeywordsInputSchema,接收 keywords 列表
- 新增 query_keywords 工具函数,强制 search_mode=keyword 走 BM25 通道
- 校验 kb_id 与 keywords 非空,缺失时直接返回提示;空字符串和纯空格关键词的过滤在 PATCH 2/3 中补齐
- 注册到 get_common_kb_tools(),Agent 可自动发现
- 更新 changelog 和 roadmap
---
.../package/yuxi/agents/toolkits/kbs/tools.py | 67 ++++++++++++++++++-
backend/package/yuxi/knowledge/schemas.py | 10 +++
docs/develop-guides/changelog.md | 1 +
docs/develop-guides/roadmap.md | 2 +-
4 files changed, 76 insertions(+), 4 deletions(-)
diff --git a/backend/package/yuxi/agents/toolkits/kbs/tools.py b/backend/package/yuxi/agents/toolkits/kbs/tools.py
index c27f45253..069f212b5 100644
--- a/backend/package/yuxi/agents/toolkits/kbs/tools.py
+++ b/backend/package/yuxi/agents/toolkits/kbs/tools.py
@@ -13,6 +13,7 @@
FindOutputSchema,
OpenInputSchema,
OpenOutputSchema,
+ QueryKeywordsInputSchema,
SearchInputSchema,
SearchOutputSchema,
)
@@ -153,6 +154,7 @@ def mindmap_to_text(node, level=0):
QueryKBInput = SearchInputSchema
+QueryKeywordsInput = QueryKeywordsInputSchema
OpenKBDocumentInput = OpenInputSchema
FindKBDocumentInput = FindInputSchema
@@ -241,6 +243,64 @@ async def query_kb(kb_id: str, query_text: str, file_name: str | None = None, ru
return f"检索失败: {str(e)}"
+@tool(category="knowledge", tags=["知识库"], args_schema=QueryKeywordsInput)
+async def query_keywords(
+ kb_id: str,
+ keywords: list[str],
+ file_name: str | None = None,
+ runtime: ToolRuntime = None,
+) -> Any:
+ """基于关键词在指定知识库中检索内容
+
+ 当用户明确知道要搜索的关键词(如专有名词、技术术语、代码符号、特定指标等)
+ 时使用此工具,走 BM25 关键词命中排序。如果需要语义理解型的模糊检索,请使用 query_kb。
+
+ Args:
+ kb_id: 知识库资源 ID,也就是 kb_id
+ keywords: 关键词列表
+ file_name: 可选文件名关键词过滤
+
+ Returns:
+ 检索结果列表,结构与 query_kb 一致
+ """
+ if not kb_id:
+ return "请提供 kb_id"
+ if not keywords:
+ return "请提供关键词列表"
+
+ knowledge_base = _get_knowledge_base()
+ retrievers = knowledge_base.get_retrievers()
+ visible_kbs = await _resolve_visible_knowledge_bases_for_query(runtime)
+ target_info, target_kb_id, target_error = _find_query_target(
+ kb_id=kb_id,
+ retrievers=retrievers,
+ visible_kbs=visible_kbs,
+ )
+ if target_error:
+ return target_error
+
+ try:
+ retriever = target_info["retriever"]
+ # 拼接关键词为查询文本,强制使用 keyword/BM25 模式
+ query_text = " ".join(keywords)
+ kwargs: dict[str, Any] = {"search_mode": "keyword"}
+ if file_name:
+ kwargs["file_name"] = file_name
+
+ if inspect.iscoroutinefunction(retriever):
+ result = await retriever(query_text, **kwargs)
+ else:
+ result = retriever(query_text, **kwargs)
+
+ if isinstance(result, dict) and result.get("kb_id") == target_kb_id and isinstance(result.get("results"), list):
+ return SearchOutputSchema(**result).model_dump()
+ return KnowledgeBase.build_search_output(target_kb_id, result)
+
+ except Exception as e:
+ logger.error(f"关键词检索失败: {e}")
+ return f"关键词检索失败: {str(e)}"
+
+
@tool(category="knowledge", tags=["知识库"], args_schema=OpenKBDocumentInput)
async def open_kb_document(
kb_id: str,
@@ -358,11 +418,12 @@ async def find_kb_document(
def get_common_kb_tools() -> list:
"""获取通用知识库工具列表
- 返回 5 个通用工具:
+ 返回 6 个通用工具:
- list_kbs: 列出用户可访问的知识库
- get_mindmap: 获取指定知识库的思维导图
- - query_kb: 在指定知识库中检索
+ - query_kb: 在指定知识库中语义检索
+ - query_keywords: 基于关键词在指定知识库中检索
- find_kb_document: 在指定文件内定位关键词或正则模式
- open_kb_document: 按 file_id 分段打开知识库文档
"""
- return [list_kbs, get_mindmap, query_kb, find_kb_document, open_kb_document]
+ return [list_kbs, get_mindmap, query_kb, query_keywords, find_kb_document, open_kb_document]
diff --git a/backend/package/yuxi/knowledge/schemas.py b/backend/package/yuxi/knowledge/schemas.py
index d68e2a57b..2fce10f5a 100644
--- a/backend/package/yuxi/knowledge/schemas.py
+++ b/backend/package/yuxi/knowledge/schemas.py
@@ -68,3 +68,13 @@ class OpenOutputSchema(BaseModel):
has_more_after: bool = Field(description="窗口后是否还有内容")
next_offset: int | None = Field(default=None, description="下一窗口 offset;没有更多内容时为 null")
content: str = Field(description="带行号的窗口内容")
+
+
+class QueryKeywordsInputSchema(BaseModel):
+ """基于关键词检索的输入模型"""
+
+ kb_id: str = Field(description="知识库资源 ID,也就是 kb_id")
+ keywords: list[str] = Field(
+ description="关键词列表,用于 BM25 关键词检索;适合精确匹配专有名词、术语、代码符号等场景"
+ )
+ file_name: str | None = Field(default=None, description="可选文件名关键词过滤,非必要不要使用")
diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md
index 48ed688cd..e363db997 100644
--- a/docs/develop-guides/changelog.md
+++ b/docs/develop-guides/changelog.md
@@ -20,6 +20,7 @@
- 新增 Agent token usage 状态快照,在状态面板中作为普通可折叠分组展示完整 `messages`、当前传给 LLM 的 `messages`、system/tools 构成、输入构成堆叠条和上下文窗口占用估算。
- 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。
- 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。
+- 新增知识库 `query_keywords` 工具:基于关键词列表走 BM25 通道检索,适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充
## v0.7.0 (2026-06-13)
diff --git a/docs/develop-guides/roadmap.md b/docs/develop-guides/roadmap.md
index 65ce589e7..189696fb6 100644
--- a/docs/develop-guides/roadmap.md
+++ b/docs/develop-guides/roadmap.md
@@ -9,7 +9,7 @@
**知识库**
- [ ] office 组件预览,docx/pptx 可以转PDF,然后前端预览
-- [ ] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序
+- [x] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序
- [ ] 调研将当前知识库映射为虚拟文件系统的可行性,先明确文件树映射、权限边界、内容读取与 Agent 工具调用形态,再决定是否实现
- [ ] 增强知识库检索体验:增强 metadata、标签等
- [x] 优化思维导图构建的接口设计,支持增量构建和更新
From f4c8cb1c02d79c493b446016a7b2fca8f92a4980 Mon Sep 17 00:00:00 2001
From: xiangchang_24 <1476897511@qq.com>
Date: Thu, 18 Jun 2026 12:59:17 +0800
Subject: [PATCH 2/3] =?UTF-8?q?feat(knowledge):=20query=5Fkeywords=20?=
=?UTF-8?q?=E6=94=B9=E4=B8=BA=E7=B2=BE=E5=87=86=E5=8C=B9=E9=85=8D=E4=BC=98?=
=?UTF-8?q?=E5=85=88=20+=20BM25=20=E5=85=9C=E5=BA=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
按作者评审反馈,query_keywords 此前纯 BM25 无法保证精准命中排前。改为基于
Milvus 2.6 PHRASE_MATCH 实现「精准优先 + BM25 兜底」检索策略:
- 升级 Milvus v2.5.6 -> v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28),
同步更新 compose 与镜像拉取/打包脚本;客户端 pymilvus 已锁 3.0.0 无需动。
- KB 与图谱 content 字段新增 enable_match=True 以支持 PHRASE_MATCH。
- _collection_supports_bm25 增加 enable_match 自检:存量 KB 集合首次访问时
自动 drop 重建+重索引(懒触发、按 KB);图谱集合仅对新建生效(图谱检索纯
向量、不用 PHRASE_MATCH,重建需重跑 LLM 抽取,成本不成比例)。
- aquery keyword 分支重写:PHRASE_MATCH 过滤的精准命中在前(BM25 降序),
不足 final_top_k 时纯 BM25 兜底,按 chunk_id 去重;新增 expr 构造 helper
(转义防注入、多关键词 or 连接)、_merge_precise_and_backfill。
- _build_chunk_from_hit 新增 is_precise_match 标记写入 metadata(build_search_output
仅透传 metadata,故标记须放 metadata 才能存活到工具输出)。
- query_keywords 传 precise_match/phrase_match_terms,并过滤空/纯空白关键词。
---
.../package/yuxi/agents/toolkits/kbs/tools.py | 14 +-
.../graphs/milvus_graph_vector_store.py | 2 +
.../yuxi/knowledge/implementations/milvus.py | 148 ++++++++++++++++--
docker-compose.prod.yml | 6 +-
docker-compose.yml | 6 +-
docker/save_docker_images.ps1 | 6 +-
docker/save_docker_images.sh | 6 +-
docs/develop-guides/changelog.md | 3 +-
docs/develop-guides/roadmap.md | 2 +-
scripts/init.ps1 | 6 +-
scripts/init.sh | 6 +-
11 files changed, 165 insertions(+), 40 deletions(-)
diff --git a/backend/package/yuxi/agents/toolkits/kbs/tools.py b/backend/package/yuxi/agents/toolkits/kbs/tools.py
index 069f212b5..899ab20e7 100644
--- a/backend/package/yuxi/agents/toolkits/kbs/tools.py
+++ b/backend/package/yuxi/agents/toolkits/kbs/tools.py
@@ -253,7 +253,10 @@ async def query_keywords(
"""基于关键词在指定知识库中检索内容
当用户明确知道要搜索的关键词(如专有名词、技术术语、代码符号、特定指标等)
- 时使用此工具,走 BM25 关键词命中排序。如果需要语义理解型的模糊检索,请使用 query_kb。
+ 时使用此工具。检索采用「精准优先 + BM25 兜底」策略:包含完整关键词短语的 chunk
+ 排在前面(基于 Milvus PHRASE_MATCH,分词后 token 相邻即算精准命中),精准命中
+ 不足时由 BM25 模糊命中补齐,结果 metadata 中以 is_precise_match 标记。如果需要
+ 语义理解型的模糊检索,请使用 query_kb。
Args:
kb_id: 知识库资源 ID,也就是 kb_id
@@ -265,6 +268,7 @@ async def query_keywords(
"""
if not kb_id:
return "请提供 kb_id"
+ keywords = [k.strip() for k in keywords if k and k.strip()]
if not keywords:
return "请提供关键词列表"
@@ -281,9 +285,13 @@ async def query_keywords(
try:
retriever = target_info["retriever"]
- # 拼接关键词为查询文本,强制使用 keyword/BM25 模式
+ # 拼接关键词为查询文本,强制使用 keyword/BM25 模式并启用精准匹配
query_text = " ".join(keywords)
- kwargs: dict[str, Any] = {"search_mode": "keyword"}
+ kwargs: dict[str, Any] = {
+ "search_mode": "keyword",
+ "precise_match": True,
+ "phrase_match_terms": keywords,
+ }
if file_name:
kwargs["file_name"] = file_name
diff --git a/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py b/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py
index d27ebc7ca..2d7d8bea8 100644
--- a/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py
+++ b/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py
@@ -205,6 +205,7 @@ def _get_or_create_entity_collection(self, kb_id: str, embedding_info: Any) -> C
max_length=65535,
enable_analyzer=True,
analyzer_params=CONTENT_ANALYZER_PARAMS,
+ enable_match=True,
),
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_info.dimension or 1024),
FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR),
@@ -221,6 +222,7 @@ def _get_or_create_triple_collection(self, kb_id: str, embedding_info: Any) -> C
max_length=65535,
enable_analyzer=True,
analyzer_params=CONTENT_ANALYZER_PARAMS,
+ enable_match=True,
),
FieldSchema(name="source_id", dtype=DataType.VARCHAR, max_length=100),
FieldSchema(name="target_id", dtype=DataType.VARCHAR, max_length=100),
diff --git a/backend/package/yuxi/knowledge/implementations/milvus.py b/backend/package/yuxi/knowledge/implementations/milvus.py
index 61b221aec..9161ea711 100644
--- a/backend/package/yuxi/knowledge/implementations/milvus.py
+++ b/backend/package/yuxi/knowledge/implementations/milvus.py
@@ -334,7 +334,9 @@ async def _create_kb_instance(self, kb_id: str, kb_config: dict) -> Any:
return self._create_new_collection(collection_name, embedding_info, kb_id)
if not self._collection_supports_bm25(collection):
- logger.warning(f"Collection {collection_name} schema does not support BM25, recreating")
+ logger.warning(
+ f"Collection {collection_name} schema does not support BM25/phrase-match, recreating"
+ )
utility.drop_collection(collection_name, using=self.connection_alias)
return self._create_new_collection(collection_name, embedding_info, kb_id)
@@ -366,6 +368,7 @@ def _create_new_collection(self, collection_name: str, embedding_info: Any, kb_i
max_length=65535,
enable_analyzer=True,
analyzer_params=CONTENT_ANALYZER_PARAMS,
+ enable_match=True,
),
FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=100),
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=100),
@@ -404,7 +407,7 @@ def _create_new_collection(self, collection_name: str, embedding_info: Any, kb_i
return collection
def _collection_supports_bm25(self, collection: Collection) -> bool:
- """检查集合是否具备 Milvus 内置 BM25 所需的 schema。"""
+ """检查集合是否具备 Milvus 内置 BM25 与 PHRASE_MATCH 所需的 schema。"""
fields = {field.name: field for field in collection.schema.fields}
content_field = fields.get("content")
sparse_field = fields.get(CONTENT_SPARSE_FIELD)
@@ -412,6 +415,8 @@ def _collection_supports_bm25(self, collection: Collection) -> bool:
return False
if content_field.params.get("enable_analyzer") is not True:
return False
+ if content_field.params.get("enable_match") is not True:
+ return False
if not sparse_field or sparse_field.dtype != DataType.SPARSE_FLOAT_VECTOR:
return False
@@ -598,6 +603,34 @@ def _build_file_name_expr(self, kb_id: str, file_name: str | None) -> str | None
joined_ids = '", "'.join(escaped_ids)
return f'file_id in ["{joined_ids}"]'
+ @staticmethod
+ def _escape_expr_literal(s: str) -> str:
+ """转义 Milvus 表达式字符串字面量中的反斜杠与双引号。"""
+ return s.replace("\\", "\\\\").replace('"', '\\"')
+
+ def _build_phrase_match_expr(self, terms: list[str], slop: int) -> str | None:
+ """构建 PHRASE_MATCH 表达式:任一关键词精准命中即算精准(grep 语义,取 or)。
+
+ slop=0 要求分词后 token 相邻(精确短语),>0 允许 token 间有间隔/乱序。
+ 全部 term 为空时返回 None,表示无法应用精准匹配。
+ """
+ clauses: list[str] = []
+ for term in terms:
+ cleaned = (term or "").strip()
+ if not cleaned:
+ continue
+ escaped = self._escape_expr_literal(cleaned)
+ clauses.append(f'PHRASE_MATCH(content, "{escaped}", {int(slop)})')
+ if not clauses:
+ return None
+ return " or ".join(clauses) if len(clauses) > 1 else clauses[0]
+
+ @staticmethod
+ def _combine_exprs(*exprs: str | None) -> str | None:
+ """用 and 连接若干过滤表达式,跳过 None/空串;全空返回 None。"""
+ parts = [e for e in exprs if e]
+ return " and ".join(parts) if parts else None
+
async def index_file(
self, kb_id: str, file_id: str, operator_id: str | None = None, params: dict | None = None
) -> dict:
@@ -828,6 +861,7 @@ def _build_chunk_from_hit(
score: float,
include_distances: bool,
score_field: str | None = None,
+ is_precise_match: bool | None = None,
) -> dict:
"""将 Milvus Hit 转成知识库统一返回的 Chunk 结构。"""
entity = hit.entity
@@ -838,6 +872,8 @@ def _build_chunk_from_hit(
"file_id": file_id,
"chunk_index": entity.get("chunk_index"),
}
+ if is_precise_match is not None:
+ metadata["is_precise_match"] = is_precise_match
chunk = {"content": entity.get("content", ""), "metadata": metadata, "score": float(score or 0.0)}
if score_field:
chunk[score_field] = float(score or 0.0)
@@ -845,6 +881,37 @@ def _build_chunk_from_hit(
chunk["distance"] = hit.distance
return chunk
+ @staticmethod
+ def _merge_precise_and_backfill(
+ precise_hits: list[dict], backfill_hits: list[dict], final_top_k: int
+ ) -> list[dict]:
+ """合并精准命中与 BM25 兜底命中:精准块在前(已按 BM25 降序),兜底块在后,按 chunk_id 去重,截 final_top_k。"""
+ seen: set[str] = set()
+ merged: list[dict] = []
+
+ def _chunk_id(chunk: dict) -> str | None:
+ return chunk.get("metadata", {}).get("chunk_id")
+
+ for chunk in precise_hits:
+ cid = _chunk_id(chunk)
+ if cid is None or cid in seen:
+ continue
+ seen.add(cid)
+ merged.append(chunk)
+ if len(merged) >= final_top_k:
+ return merged
+
+ for chunk in backfill_hits:
+ cid = _chunk_id(chunk)
+ if cid is None or cid in seen:
+ continue
+ seen.add(cid)
+ merged.append(chunk)
+ if len(merged) >= final_top_k:
+ break
+ return merged
+
+
async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **kwargs) -> list[dict]:
"""异步查询知识库"""
collection = await self._get_milvus_collection(kb_id)
@@ -919,22 +986,69 @@ async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **
"params": {"drop_ratio_search": bm25_drop_ratio_search},
}
- results = collection.search(
- data=[query_text],
- anns_field=CONTENT_SPARSE_FIELD,
- param=bm25_search_params,
- limit=bm25_top_k,
- expr=file_expr,
- output_fields=output_fields,
- )
-
- if results and len(results) > 0 and len(results[0]) > 0:
- for hit in results[0]:
- retrieved_chunks.append(
- self._build_chunk_from_hit(hit, hit.distance, include_distances, score_field="bm25_score")
+ precise_match = bool(merged_kwargs.get("precise_match", False))
+ precise_hits: list[dict] = []
+ backfill_hits: list[dict] = []
+
+ if precise_match:
+ phrase_slop = int(merged_kwargs.get("phrase_slop", 0))
+ terms = merged_kwargs.get("phrase_match_terms") or [query_text]
+ phrase_expr = self._build_phrase_match_expr(list(terms), phrase_slop)
+ if phrase_expr is None:
+ logger.warning("precise_match requested but no valid terms; falling back to pure BM25")
+ precise_match = False
+ else:
+ precise_expr = self._combine_exprs(file_expr, phrase_expr)
+ results = collection.search(
+ data=[query_text],
+ anns_field=CONTENT_SPARSE_FIELD,
+ param=bm25_search_params,
+ limit=bm25_top_k,
+ expr=precise_expr,
+ output_fields=output_fields,
)
-
- logger.debug(f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found")
+ if results and len(results) > 0 and len(results[0]) > 0:
+ for hit in results[0]:
+ precise_hits.append(
+ self._build_chunk_from_hit(
+ hit,
+ hit.distance,
+ include_distances,
+ score_field="bm25_score",
+ is_precise_match=True,
+ )
+ )
+ logger.debug(f"PHRASE_MATCH+BM25: {len(precise_hits)} precise hits")
+
+ # 精准命中不足时用纯 BM25 兜底;精准命中已够则短路跳过
+ if not precise_match or len(precise_hits) < final_top_k:
+ results = collection.search(
+ data=[query_text],
+ anns_field=CONTENT_SPARSE_FIELD,
+ param=bm25_search_params,
+ limit=bm25_top_k,
+ expr=file_expr,
+ output_fields=output_fields,
+ )
+ if results and len(results) > 0 and len(results[0]) > 0:
+ for hit in results[0]:
+ backfill_hits.append(
+ self._build_chunk_from_hit(
+ hit,
+ hit.distance,
+ include_distances,
+ score_field="bm25_score",
+ is_precise_match=False if precise_match else None,
+ )
+ )
+ logger.debug(f"BM25 backfill: {len(backfill_hits)} hits")
+
+ retrieved_chunks = self._merge_precise_and_backfill(
+ precise_hits, backfill_hits, final_top_k
+ )
+ logger.debug(
+ f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found (precise={len(precise_hits)})"
+ )
else:
embedding_model_spec = self.databases_meta[kb_id].get("embedding_model_spec")
embedding_function = self._get_embedding_function(embedding_model_spec, sync=True)
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index badf555d0..d7afcc79d 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -182,7 +182,7 @@ services:
etcd:
container_name: milvus-etcd
- image: quay.io/coreos/etcd:v3.5.5
+ image: quay.io/coreos/etcd:v3.5.25
environment:
- ETCD_AUTO_COMPACTION_MODE=revision
- ETCD_AUTO_COMPACTION_RETENTION=1000
@@ -203,7 +203,7 @@ services:
minio:
container_name: minio
- image: minio/minio:RELEASE.2023-03-20T20-16-18Z
+ image: minio/minio:RELEASE.2024-05-28T17-19-04Z
environment:
MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin}
MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin}
@@ -222,7 +222,7 @@ services:
restart: unless-stopped
milvus:
- image: milvusdb/milvus:v2.5.6
+ image: milvusdb/milvus:v2.6.16
container_name: milvus
command: ["milvus", "run", "standalone"]
security_opt:
diff --git a/docker-compose.yml b/docker-compose.yml
index 95ec4c9d5..19a38f1ca 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -228,7 +228,7 @@ services:
etcd:
container_name: milvus-etcd-dev
- image: quay.io/coreos/etcd:v3.5.5
+ image: quay.io/coreos/etcd:v3.5.25
environment:
- ETCD_AUTO_COMPACTION_MODE=revision
- ETCD_AUTO_COMPACTION_RETENTION=1000
@@ -249,7 +249,7 @@ services:
minio:
container_name: minio
- image: minio/minio:RELEASE.2023-03-20T20-16-18Z
+ image: minio/minio:RELEASE.2024-05-28T17-19-04Z
environment:
MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin}
MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin}
@@ -271,7 +271,7 @@ services:
restart: unless-stopped
milvus:
- image: milvusdb/milvus:v2.5.6
+ image: milvusdb/milvus:v2.6.16
container_name: milvus
command: ["milvus", "run", "standalone"]
security_opt:
diff --git a/docker/save_docker_images.ps1 b/docker/save_docker_images.ps1
index 1c6a8fe43..00a887655 100644
--- a/docker/save_docker_images.ps1
+++ b/docker/save_docker_images.ps1
@@ -20,9 +20,9 @@ $Images = @(
"node:24-slim",
"nginx:alpine",
"neo4j:5.26",
- "quay.io/coreos/etcd:v3.5.5",
- "minio/minio:RELEASE.2023-03-20T20-16-18Z",
- "milvusdb/milvus:v2.5.6",
+ "quay.io/coreos/etcd:v3.5.25",
+ "minio/minio:RELEASE.2024-05-28T17-19-04Z",
+ "milvusdb/milvus:v2.6.16",
# "lmsysorg/sglang:v0.4.9.post3-cu126",
# "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.0.1-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6"
)
diff --git a/docker/save_docker_images.sh b/docker/save_docker_images.sh
index fb7e23c38..a07a6d8c7 100644
--- a/docker/save_docker_images.sh
+++ b/docker/save_docker_images.sh
@@ -17,9 +17,9 @@ IMAGES=(
"node:24-slim",
"nginx:alpine",
"neo4j:5.26",
- "quay.io/coreos/etcd:v3.5.5",
- "minio/minio:RELEASE.2023-03-20T20-16-18Z",
- "milvusdb/milvus:v2.5.6",
+ "quay.io/coreos/etcd:v3.5.25",
+ "minio/minio:RELEASE.2024-05-28T17-19-04Z",
+ "milvusdb/milvus:v2.6.16",
)
# 确保所有镜像都已下载
diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md
index e363db997..54b68a954 100644
--- a/docs/develop-guides/changelog.md
+++ b/docs/develop-guides/changelog.md
@@ -20,7 +20,8 @@
- 新增 Agent token usage 状态快照,在状态面板中作为普通可折叠分组展示完整 `messages`、当前传给 LLM 的 `messages`、system/tools 构成、输入构成堆叠条和上下文窗口占用估算。
- 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。
- 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。
-- 新增知识库 `query_keywords` 工具:基于关键词列表走 BM25 通道检索,适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充
+- 新增知识库 `query_keywords` 工具:采用「精准优先 + BM25 兜底」策略,基于 Milvus 2.6 `PHRASE_MATCH` 让包含完整关键词短语的 chunk 排在前面,精准命中不足时由 BM25 模糊命中补齐,结果以 `metadata.is_precise_match` 标记;适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充
+- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动检测并 drop 重建+重索引(懒触发、按 KB),图谱集合仅对新建生效
## v0.7.0 (2026-06-13)
diff --git a/docs/develop-guides/roadmap.md b/docs/develop-guides/roadmap.md
index 189696fb6..53f7a9109 100644
--- a/docs/develop-guides/roadmap.md
+++ b/docs/develop-guides/roadmap.md
@@ -9,7 +9,7 @@
**知识库**
- [ ] office 组件预览,docx/pptx 可以转PDF,然后前端预览
-- [x] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序
+- [x] 知识库工具新增 query_keywords 工具,基于 PHRASE_MATCH 精准匹配优先 + BM25 兜底的关键词检索
- [ ] 调研将当前知识库映射为虚拟文件系统的可行性,先明确文件树映射、权限边界、内容读取与 Agent 工具调用形态,再决定是否实现
- [ ] 增强知识库检索体验:增强 metadata、标签等
- [x] 优化思维导图构建的接口设计,支持增量构建和更新
diff --git a/scripts/init.ps1 b/scripts/init.ps1
index c4a3986da..5b5aabcea 100644
--- a/scripts/init.ps1
+++ b/scripts/init.ps1
@@ -121,12 +121,12 @@ $images = @(
"python:3.12-slim",
"node:24-slim",
"node:24-alpine",
- "milvusdb/milvus:v2.5.6",
+ "milvusdb/milvus:v2.6.16",
"neo4j:5.26",
- "minio/minio:RELEASE.2023-03-20T20-16-18Z",
+ "minio/minio:RELEASE.2024-05-28T17-19-04Z",
"ghcr.io/astral-sh/uv:0.7.2",
"nginx:alpine",
- "quay.io/coreos/etcd:v3.5.5",
+ "quay.io/coreos/etcd:v3.5.25",
"postgres:16",
"redis:7-alpine"
)
diff --git a/scripts/init.sh b/scripts/init.sh
index d742a68f8..c752f599e 100644
--- a/scripts/init.sh
+++ b/scripts/init.sh
@@ -117,12 +117,12 @@ images=(
"python:3.12-slim"
"node:24-slim"
"node:24-alpine"
- "milvusdb/milvus:v2.5.6"
+ "milvusdb/milvus:v2.6.16"
"neo4j:5.26"
- "minio/minio:RELEASE.2023-03-20T20-16-18Z"
+ "minio/minio:RELEASE.2024-05-28T17-19-04Z"
"ghcr.io/astral-sh/uv:0.7.2"
"nginx:alpine"
- "quay.io/coreos/etcd:v3.5.5"
+ "quay.io/coreos/etcd:v3.5.25"
"postgres:16"
"redis:7-alpine"
)
From b0dd76036748ea3df063c34c6b414f76c0d75523 Mon Sep 17 00:00:00 2001
From: xiangchang_24 <1476897511@qq.com>
Date: Thu, 18 Jun 2026 22:30:42 +0800
Subject: [PATCH 3/3] =?UTF-8?q?fix(knowledge):=20query=5Fkeywords=20?=
=?UTF-8?q?=E6=94=B9=E5=90=91=E9=87=8F=E8=BF=81=E7=A7=BB=E5=B9=B6=E4=BF=AE?=
=?UTF-8?q?=E6=A3=80=E7=B4=A2=E9=93=BE=E8=B7=AF=20bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
按评审反馈调整存量集合升级策略并修复 Codex 指出的正确性问题:
- 向量迁移替代空重建:存量集合自检缺 enable_match 时,drop 前用
query_iterator 读出全量 embedding 原样回灌新集合,不重算;
content_sparse 由新集合 BM25 Function 自动生成,迁移后 flush 保证
PHRASE_MATCH 倒排可见。embedding 模型变更分支仍走重算不迁移。
- 修复 or/and 优先级:多关键词 PHRASE_MATCH 的 or 子句整体加括号,
避免与 file_name 的 and 拼接时 file_name 仅约束首个关键词。
- 修复重排前截断:keyword 分支 _merge_precise_and_backfill 改传
recall_top_k 而非 final_top_k,开启重排/图检索时候选池不再失效。
- 补齐 enable_match 自检单测 fixture,新增迁移/优先级/截断单测,
新增精准匹配集成测试。
测试:test_milvus_kb 22 + test_kbs_tools 12 + 集成精准匹配 2 全绿
---
.../yuxi/knowledge/implementations/milvus.py | 92 +++++-
.../api/test_knowledge_precise_match.py | 158 ++++++++++
backend/test/unit/plugins/test_milvus_kb.py | 279 +++++++++++++++++-
backend/test/unit/toolkits/test_kbs_tools.py | 65 +++-
docs/develop-guides/changelog.md | 3 +-
5 files changed, 579 insertions(+), 18 deletions(-)
create mode 100644 backend/test/integration/api/test_knowledge_precise_match.py
diff --git a/backend/package/yuxi/knowledge/implementations/milvus.py b/backend/package/yuxi/knowledge/implementations/milvus.py
index 9161ea711..2a7142f76 100644
--- a/backend/package/yuxi/knowledge/implementations/milvus.py
+++ b/backend/package/yuxi/knowledge/implementations/milvus.py
@@ -35,6 +35,7 @@
CONTENT_ANALYZER_PARAMS = {"type": "chinese"}
VECTOR_METRIC_TYPE = "COSINE"
MILVUS_CHUNK_EMBED_BATCH_SIZE = 200
+MILVUS_MIGRATE_BATCH_SIZE = 1000
@dataclass(kw_only=True)
@@ -335,10 +336,9 @@ async def _create_kb_instance(self, kb_id: str, kb_config: dict) -> Any:
if not self._collection_supports_bm25(collection):
logger.warning(
- f"Collection {collection_name} schema does not support BM25/phrase-match, recreating"
+ f"Collection {collection_name} schema does not support BM25/phrase-match, migrating data"
)
- utility.drop_collection(collection_name, using=self.connection_alias)
- return self._create_new_collection(collection_name, embedding_info, kb_id)
+ return await self._migrate_collection_for_match(collection_name, collection, embedding_info, kb_id)
logger.info(f"Retrieved existing collection: {collection_name}")
return collection
@@ -429,6 +429,79 @@ def _collection_supports_bm25(self, collection: Collection) -> bool:
return True
return False
+ async def _migrate_collection_for_match(
+ self,
+ collection_name: str,
+ old_collection: Collection,
+ embedding_info: Any,
+ kb_id: str,
+ ) -> Collection:
+ """迁移存量集合到支持 PHRASE_MATCH 的新 schema(enable_match=True)。
+
+ 升级路径:把旧集合的 embedding 原样读出 → drop → 按新 schema 建集合 → 回灌,
+ 不重算 embedding。content_sparse 由新集合的 BM25 Function 在 insert 时自动生成。
+
+ 风险:drop 后若 insert 失败,该 KB 数据会丢失,需人工重建。故应在维护窗口预热,
+ 且升级前对重要 KB 做备份。任一步失败直接抛错,不静默回退。
+ """
+ logger.warning(f"Collection {collection_name} missing enable_match, migrating embeddings to new schema")
+ return await asyncio.to_thread(
+ self._migrate_collection_for_match_sync,
+ collection_name,
+ old_collection,
+ embedding_info,
+ kb_id,
+ )
+
+ def _migrate_collection_for_match_sync(
+ self,
+ collection_name: str,
+ old_collection: Collection,
+ embedding_info: Any,
+ kb_id: str,
+ ) -> Collection:
+ """同步执行向量迁移(在 to_thread 中调用)。"""
+ # 1. load 旧集合(query 前置条件;已 load 时 Milvus 幂等)
+ try:
+ old_collection.load()
+ except Exception as e:
+ logger.warning(f"Load old collection {collection_name} for migration failed: {e}")
+
+ # 2. 分批读出全量 records(不含 content_sparse,新集合 BM25 Function 自动生成)
+ output_fields = ["id", "content", "chunk_id", "file_id", "chunk_index", "embedding"]
+ records: list[dict] = []
+ iterator = old_collection.query_iterator(batch_size=MILVUS_MIGRATE_BATCH_SIZE, output_fields=output_fields)
+ while True:
+ batch = iterator.next()
+ if not batch:
+ break
+ records.extend(batch)
+ logger.info(f"Migrating {len(records)} records for collection {collection_name}")
+
+ # 3. drop 旧集合
+ utility.drop_collection(collection_name, using=self.connection_alias)
+
+ # 4. 建新集合(带 enable_match)
+ new_collection = self._create_new_collection(collection_name, embedding_info, kb_id)
+
+ # 5. 分批回灌:列格式与 _insert_chunks_to_stores 一致,不传 sparse
+ for start in range(0, len(records), MILVUS_MIGRATE_BATCH_SIZE):
+ batch = records[start : start + MILVUS_MIGRATE_BATCH_SIZE]
+ entities = [
+ [r.get("id") for r in batch],
+ [r.get("content") for r in batch],
+ [r.get("chunk_id") for r in batch],
+ [r.get("file_id") for r in batch],
+ [r.get("chunk_index") for r in batch],
+ [r.get("embedding") for r in batch],
+ ]
+ new_collection.insert(entities)
+
+ # 6. flush 确保 PHRASE_MATCH 倒排在 growing segment 上可见,迁移后立即可查
+ new_collection.flush()
+ logger.info(f"Migrated collection {collection_name}: {len(records)} records re-inserted")
+ return new_collection
+
async def _initialize_kb_instance(self, instance: Any) -> None:
"""初始化 Milvus 集合(加载到内存)"""
try:
@@ -623,7 +696,11 @@ def _build_phrase_match_expr(self, terms: list[str], slop: int) -> str | None:
clauses.append(f'PHRASE_MATCH(content, "{escaped}", {int(slop)})')
if not clauses:
return None
- return " or ".join(clauses) if len(clauses) > 1 else clauses[0]
+ # 多关键词整体加括号:与 file_expr 经 _combine_exprs 拼成 `file and (PM(a) or PM(b))`。
+ # Milvus 中 and 优先级高于 or,不加括号 file_name 过滤只会约束第一个关键词。
+ if len(clauses) > 1:
+ return f"({' or '.join(clauses)})"
+ return clauses[0]
@staticmethod
def _combine_exprs(*exprs: str | None) -> str | None:
@@ -911,7 +988,6 @@ def _chunk_id(chunk: dict) -> str | None:
break
return merged
-
async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **kwargs) -> list[dict]:
"""异步查询知识库"""
collection = await self._get_milvus_collection(kb_id)
@@ -1043,9 +1119,9 @@ async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **
)
logger.debug(f"BM25 backfill: {len(backfill_hits)} hits")
- retrieved_chunks = self._merge_precise_and_backfill(
- precise_hits, backfill_hits, final_top_k
- )
+ # 用 recall_top_k 而非 final_top_k:开启 reranker/graph 时 recall_top_k 是重排候选池,
+ # 提前截到 final_top_k 会让重排只拿到少量候选、recall 退化。最终截断由 [:final_top_k] 负责。
+ retrieved_chunks = self._merge_precise_and_backfill(precise_hits, backfill_hits, recall_top_k)
logger.debug(
f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found (precise={len(precise_hits)})"
)
diff --git a/backend/test/integration/api/test_knowledge_precise_match.py b/backend/test/integration/api/test_knowledge_precise_match.py
new file mode 100644
index 000000000..974f947cb
--- /dev/null
+++ b/backend/test/integration/api/test_knowledge_precise_match.py
@@ -0,0 +1,158 @@
+"""知识库精准匹配检索 integration 测试。
+
+覆盖 query_keywords 工具背后「精准优先 + BM25 兜底」检索策略的端到端链路:
+建 KB -> 上传文件 -> 异步索引 -> 通过 /query-test 接口验证 PHRASE_MATCH 精准命中
+排在 BM25 模糊命中之前,且 metadata.is_precise_match 标记正确。
+
+依赖 docker compose up -d 后的运行环境与 TEST_USERNAME/TEST_PASSWORD 超管凭据。
+"""
+
+import asyncio
+import os
+
+import pytest
+from pymilvus import Collection, connections, utility
+
+pytestmark = [pytest.mark.asyncio, pytest.mark.integration]
+
+_MILVUS_FLUSH_ALIAS = "_test_flush_kb"
+
+
+# Paragraphs containing the full phrase 「扭转减振器」 are expected to be precise PHRASE_MATCH hits;
+# paragraphs that only mention 「减振器/振动」 can only be fuzzy BM25 hits.
+PRECISE_MATCH_DOC = """# 扭转减振器技术说明
+
+扭转减振器是汽车传动系统中的关键部件,用于衰减发动机曲轴产生的扭转振动,保护传动系免受过大动载荷。扭转减振器通常安装在飞轮与离合器之间,通过匹配刚度与阻尼来吸收扭转振动能量。
+
+减振器的设计需要兼顾工作频段与共振点位置。当系统振动频率接近共振点时,减振装置能够有效降低振幅,避免传动系结构件因疲劳而损坏。普通减振器一般通过橡胶元件或弹簧组提供阻尼。
+
+维护保养方面,应定期检查减振元件的老化程度与连接紧固情况。一旦发现橡胶开裂或弹簧失效,需及时更换,否则会削弱减振能力并放大振动。日常使用中若出现异常振动噪声,应优先排查减振装置。
+
+安装扭矩与配合间隙必须符合厂家规范。过紧会加剧磨损,过松则无法有效传递阻尼。建议在专业场地由技术人员操作,并使用专用工具校核安装尺寸。
+"""
+
+
+async def _wait_for_task(test_client, admin_headers, task_id: str, timeout: float = 120.0) -> dict:
+    """Poll a task until it reaches a terminal status and return the task dict.
+
+    Args:
+        test_client: Async HTTP client used to call ``GET /api/tasks/{task_id}``.
+        admin_headers: Headers sent with every polling request.
+        task_id: Identifier of the task to poll.
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        The ``task`` object of the first response whose status is one of
+        ``success``, ``failed`` or ``cancelled``.
+
+    The test is failed through ``pytest.fail`` when no terminal status is
+    observed before the deadline.
+    """
+    # Inside a coroutine get_running_loop() is the preferred way to reach the loop;
+    # look it up once instead of on every iteration.
+    loop = asyncio.get_running_loop()
+    deadline = loop.time() + timeout
+    while loop.time() < deadline:
+        resp = await test_client.get(f"/api/tasks/{task_id}", headers=admin_headers)
+        assert resp.status_code == 200, resp.text
+        task = resp.json().get("task", {})
+        if task.get("status") in {"success", "failed", "cancelled"}:
+            return task
+        # Back off briefly between polls.
+        await asyncio.sleep(0.5)
+    pytest.fail(f"Task {task_id} did not reach terminal status within {timeout}s")
+
+
+async def _index_markdown(test_client, admin_headers, kb_id: str, filename: str, content: str) -> None:
+ """Upload a markdown file, enqueue it with auto_index to trigger parsing + indexing, and wait for completion.
+
+ After the indexing task reports success, the KB collection is flushed via _flush_kb_collection.
+ """
+ # Step 1: upload the raw file; the response supplies file_path, content_hash and size.
+ upload_resp = await test_client.post(
+ "/api/knowledge/files/upload",
+ params={"kb_id": kb_id},
+ files={"file": (filename, content.encode("utf-8"), "text/markdown")},
+ headers=admin_headers,
+ )
+ assert upload_resp.status_code == 200, upload_resp.text
+ upload_json = upload_resp.json()
+ minio_url = upload_json["file_path"]
+
+ # Step 2: register the uploaded file as a KB document; the response carries the task_id to poll.
+ enqueue_resp = await test_client.post(
+ f"/api/knowledge/databases/{kb_id}/documents",
+ json={
+ "items": [minio_url],
+ "params": {
+ "content_type": "file",
+ "auto_index": True,
+ "content_hashes": {minio_url: upload_json["content_hash"]},
+ "file_sizes": {minio_url: upload_json["size"]},
+ },
+ },
+ headers=admin_headers,
+ )
+ assert enqueue_resp.status_code == 200, enqueue_resp.text
+ task_id = enqueue_resp.json()["task_id"]
+
+ # Step 3: wait for the task, require success, then flush the collection.
+ task = await _wait_for_task(test_client, admin_headers, task_id)
+ assert task["status"] == "success", f"indexing task failed: {task.get('error') or task.get('result')}"
+ await _flush_kb_collection(kb_id)
+
+
+async def _flush_kb_collection(kb_id: str) -> None:
+ """Explicitly flush the collection so the PHRASE_MATCH inverted index on growing segments is queryable.
+
+ index_file does not flush after insert, and the enable_match inverted index only becomes reliably
+ visible once the segment is sealed; otherwise a query issued right after indexing can intermittently
+ return nothing. Flushing is a test-side way to guarantee data visibility and does not change
+ retrieval semantics.
+ """
+ # The Milvus endpoint is taken from MILVUS_URI when set.
+ uri = os.getenv("MILVUS_URI", "http://milvus:19530")
+ # Drop any existing connection registered under the dedicated alias before reconnecting.
+ if connections.has_connection(_MILVUS_FLUSH_ALIAS):
+ connections.disconnect(_MILVUS_FLUSH_ALIAS)
+ connections.connect(alias=_MILVUS_FLUSH_ALIAS, uri=uri)
+ try:
+ # The collection is looked up by kb_id; nothing is flushed when it does not exist.
+ if utility.has_collection(kb_id, using=_MILVUS_FLUSH_ALIAS):
+ Collection(kb_id, using=_MILVUS_FLUSH_ALIAS).flush()
+ finally:
+ connections.disconnect(_MILVUS_FLUSH_ALIAS)
+
+
+async def _query_test(test_client, admin_headers, kb_id: str, query: str, meta: dict) -> list[dict]:
+ """POST query and meta to the KB's /query-test endpoint and return the decoded JSON response."""
+ resp = await test_client.post(
+ f"/api/knowledge/databases/{kb_id}/query-test",
+ json={"query": query, "meta": meta},
+ headers=admin_headers,
+ )
+ assert resp.status_code == 200, resp.text
+ return resp.json()
+
+
+async def test_query_test_precise_match_ranks_phrase_hits_first(test_client, admin_headers, knowledge_database):
+    """Precise match: full-phrase chunks are flagged is_precise_match=True and rank before fuzzy hits.
+
+    Indexes PRECISE_MATCH_DOC, runs a keyword query with precise_match enabled
+    through /query-test, and checks flagging, ordering and content of the hits.
+    """
+    kb_id = knowledge_database["kb_id"]
+    await _index_markdown(test_client, admin_headers, kb_id, "torsional_damper.md", PRECISE_MATCH_DOC)
+
+    chunks = await _query_test(
+        test_client,
+        admin_headers,
+        kb_id,
+        "扭转减振器",
+        {
+            "search_mode": "keyword",
+            "precise_match": True,
+            "phrase_match_terms": ["扭转减振器"],
+            "final_top_k": 10,
+        },
+    )
+
+    assert chunks, "精准匹配检索应返回非空结果"
+
+    precise_flags = [bool(c.get("metadata", {}).get("is_precise_match")) for c in chunks]
+    assert any(precise_flags), "应至少有一个精准命中(is_precise_match=True)的 chunk"
+
+    # Precise chunks must form a contiguous prefix of the result list: once the first
+    # non-precise chunk appears, no precise chunk may follow it. (Checking for a False
+    # after the last True would reject the correct ordering [True, ..., False, ...].)
+    first_fuzzy = next((i for i, flag in enumerate(precise_flags) if not flag), len(precise_flags))
+    assert not any(precise_flags[first_fuzzy:]), "精准命中必须排在 BM25 兜底命中之前"
+
+    # Precise chunks must contain the full phrase and carry a float bm25_score.
+    precise_chunks = [c for c in chunks if c.get("metadata", {}).get("is_precise_match")]
+    assert all("扭转减振器" in c.get("content", "") for c in precise_chunks)
+    assert all(isinstance(c.get("bm25_score"), float) for c in precise_chunks)
+
+
+async def test_query_test_pure_bm25_omits_precise_flag(test_client, admin_headers, knowledge_database):
+ """Pure BM25 (precise match not enabled): returned chunks must not carry the is_precise_match flag."""
+ kb_id = knowledge_database["kb_id"]
+ await _index_markdown(test_client, admin_headers, kb_id, "torsional_damper_plain.md", PRECISE_MATCH_DOC)
+
+ chunks = await _query_test(
+ test_client,
+ admin_headers,
+ kb_id,
+ "扭转减振器",
+ {"search_mode": "keyword", "final_top_k": 10},
+ )
+
+ assert chunks, "纯 BM25 检索应返回非空结果"
+ assert all("is_precise_match" not in c.get("metadata", {}) for c in chunks), (
+ "未启用 precise_match 时不应写入 is_precise_match 标记"
+ )
diff --git a/backend/test/unit/plugins/test_milvus_kb.py b/backend/test/unit/plugins/test_milvus_kb.py
index df0ff1181..1f500ae70 100644
--- a/backend/test/unit/plugins/test_milvus_kb.py
+++ b/backend/test/unit/plugins/test_milvus_kb.py
@@ -19,25 +19,28 @@
class FakeHit:
- def __init__(self, content: str, distance: float):
+ def __init__(self, content: str, distance: float, chunk_id: str = "chunk-1"):
self.distance = distance
self.entity = {
"content": content,
- "chunk_id": "chunk-1",
+ "chunk_id": chunk_id,
"file_id": "file-1",
"chunk_index": 0,
}
class FakeCollection:
- def __init__(self, distance: float = 0.8):
+ def __init__(self, distance: float = 0.8, search_results: list | None = None):
self.search_calls = []
self.hybrid_calls = []
self.insert_calls = []
self.distance = distance
+ self._search_results = list(search_results) if search_results else None
def search(self, **kwargs):
self.search_calls.append(kwargs)
+ if self._search_results:
+ return self._search_results.pop(0)
return [[FakeHit("BM25 result", self.distance)]]
def hybrid_search(self, **kwargs):
@@ -420,6 +423,102 @@ async def test_keyword_mode_uses_milvus_bm25_search():
assert search_call["limit"] == 7
+async def test_keyword_mode_precise_match_uses_phrase_match_filter_and_backfill():
+ """Precise match: PHRASE_MATCH-filtered hits come first, BM25 backfill follows, de-duplicated by chunk_id."""
+ precise_results = [
+ [
+ FakeHit("precise-1", 0.9, chunk_id="p1"),
+ FakeHit("precise-2", 0.7, chunk_id="p2"),
+ ]
+ ]
+ backfill_results = [
+ [
+ FakeHit("backfill-1", 0.5, chunk_id="b1"),
+ FakeHit("backfill-2", 0.3, chunk_id="b2"),
+ ]
+ ]
+ collection = FakeCollection(search_results=[precise_results, backfill_results])
+ kb = make_kb(collection)
+
+ chunks = await kb.aquery(
+ "扭转减振器",
+ "db",
+ search_mode="keyword",
+ precise_match=True,
+ phrase_match_terms=["扭转减振器"],
+ final_top_k=10,
+ )
+
+ # The first search carries the PHRASE_MATCH filter.
+ precise_call = collection.search_calls[0]
+ assert precise_call["anns_field"] == CONTENT_SPARSE_FIELD
+ assert 'PHRASE_MATCH(content, "扭转减振器", 0)' in precise_call["expr"]
+
+ # The second search is the plain BM25 backfill (expr is None when no file_name is given).
+ backfill_call = collection.search_calls[1]
+ assert backfill_call["expr"] is None
+
+ # Merge order: precise hits first, backfill after; 4 chunks remain after de-duplication.
+ assert [c["metadata"]["chunk_id"] for c in chunks] == ["p1", "p2", "b1", "b2"]
+ assert chunks[0]["metadata"]["is_precise_match"] is True
+ assert chunks[1]["metadata"]["is_precise_match"] is True
+ assert chunks[2]["metadata"]["is_precise_match"] is False
+ assert chunks[3]["metadata"]["is_precise_match"] is False
+
+
+async def test_precise_match_short_circuits_when_enough_hits():
+ """When the precise hits already satisfy final_top_k, no backfill query is issued."""
+ precise_results = [
+ [
+ FakeHit("precise-1", 0.9, chunk_id="p1"),
+ FakeHit("precise-2", 0.7, chunk_id="p2"),
+ ]
+ ]
+ collection = FakeCollection(search_results=[precise_results])
+ kb = make_kb(collection)
+
+ chunks = await kb.aquery(
+ "term",
+ "db",
+ search_mode="keyword",
+ precise_match=True,
+ phrase_match_terms=["term"],
+ final_top_k=1,
+ )
+
+ # Exactly one search call means the backfill query was skipped.
+ assert len(collection.search_calls) == 1
+ assert chunks[0]["metadata"]["chunk_id"] == "p1"
+ assert chunks[0]["metadata"]["is_precise_match"] is True
+
+
+async def test_precise_match_degrades_when_no_valid_terms():
+ """When every phrase_match_terms entry is blank, degrade to plain BM25: no error, no is_precise_match."""
+ collection = FakeCollection()
+ kb = make_kb(collection)
+
+ chunks = await kb.aquery(
+ "fallback query",
+ "db",
+ search_mode="keyword",
+ precise_match=True,
+ phrase_match_terms=["", " "],
+ final_top_k=5,
+ )
+
+ # A single unfiltered search is expected, and the result metadata has no precise-match key.
+ assert len(collection.search_calls) == 1
+ assert collection.search_calls[0]["expr"] is None
+ assert "is_precise_match" not in chunks[0]["metadata"]
+
+
+def test_build_phrase_match_expr_or_joins_and_escapes():
+ """_build_phrase_match_expr: or-joins terms, escapes double quotes, skips blank terms, None when none remain."""
+ kb = MilvusKB.__new__(MilvusKB)
+ expr = kb._build_phrase_match_expr(["扭转减振器", '含"引号', ""], 0)
+ # Multiple keywords are parenthesized as a whole to avoid or-precedence issues when and-ed with file_expr.
+ assert expr == '(PHRASE_MATCH(content, "扭转减振器", 0) or PHRASE_MATCH(content, "含\\"引号", 0))'
+ assert kb._build_phrase_match_expr(["", " "], 0) is None
+ assert kb._build_phrase_match_expr(["单关键词"], 2) == 'PHRASE_MATCH(content, "单关键词", 2)'
+
+
async def test_vector_mode_ignores_metric_type_override():
collection = FakeCollection()
kb = make_kb(collection)
@@ -509,6 +608,7 @@ def test_collection_supports_bm25_requires_analyzed_content_sparse_field_and_fun
max_length=65535,
enable_analyzer=True,
analyzer_params=CONTENT_ANALYZER_PARAMS,
+ enable_match=True,
),
FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR),
],
@@ -525,3 +625,176 @@ def test_collection_supports_bm25_requires_analyzed_content_sparse_field_and_fun
collection = type("Collection", (), {"schema": schema})()
assert kb._collection_supports_bm25(collection)
+
+
+def test_collection_supports_bm25_requires_enable_match():
+ """A legacy collection lacking enable_match should be judged unsupported, triggering an automatic rebuild."""
+ kb = MilvusKB.__new__(MilvusKB)
+ # The content field below has the analyzer configured but no enable_match argument.
+ schema = CollectionSchema(
+ fields=[
+ FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=100, is_primary=True),
+ FieldSchema(
+ name="content",
+ dtype=DataType.VARCHAR,
+ max_length=65535,
+ enable_analyzer=True,
+ analyzer_params=CONTENT_ANALYZER_PARAMS,
+ ),
+ FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR),
+ ],
+ functions=[
+ Function(
+ name="content_bm25",
+ input_field_names=["content"],
+ output_field_names=[CONTENT_SPARSE_FIELD],
+ function_type=FunctionType.BM25,
+ )
+ ],
+ )
+
+ # Minimal stand-in object exposing only the schema attribute.
+ collection = type("Collection", (), {"schema": schema})()
+
+ assert not kb._collection_supports_bm25(collection)
+
+
+async def test_migrate_collection_for_match_reuses_embeddings(monkeypatch):
+ """Vector migration: old-collection embeddings are re-inserted as-is (not recomputed); content_sparse is not read."""
+ from yuxi.knowledge.implementations import milvus as milvus_mod
+
+ records = [
+ {"id": "id-1", "content": "c1", "chunk_id": "c1", "file_id": "f1", "chunk_index": 0, "embedding": [0.1, 0.2]},
+ {"id": "id-2", "content": "c2", "chunk_id": "c2", "file_id": "f1", "chunk_index": 1, "embedding": [0.3, 0.4]},
+ ]
+
+ # Fake iterator: returns each prepared batch once, then an empty list.
+ class FakeIterator:
+ def __init__(self, batches):
+ self._batches = batches
+ self._i = 0
+
+ def next(self):
+ if self._i >= len(self._batches):
+ return []
+ batch = self._batches[self._i]
+ self._i += 1
+ return batch
+
+ # Fake source collection: records query_iterator kwargs and serves the records above as one batch.
+ class OldCollection:
+ def __init__(self):
+ self.query_iterator_calls = []
+
+ def load(self):
+ pass
+
+ def query_iterator(self, **kwargs):
+ self.query_iterator_calls.append(kwargs)
+ return FakeIterator([records])
+
+ # Fake target collection: records inserted entities and whether flush was called.
+ class NewCollection:
+ def __init__(self):
+ self.insert_calls = []
+ self.flushed = False
+
+ def insert(self, entities):
+ self.insert_calls.append(entities)
+
+ def flush(self):
+ self.flushed = True
+
+ old = OldCollection()
+ new_collection = NewCollection()
+ kb = MilvusKB.__new__(MilvusKB)
+ kb.connection_alias = "default"
+ kb._create_new_collection = lambda name, info, kb_id: new_collection
+
+ drop_calls = []
+ monkeypatch.setattr(milvus_mod.utility, "drop_collection", lambda name, using=None: drop_calls.append(name))
+ # __del__ calls disconnect on instances that have connection_alias; mock it to avoid pymilvus deprecation noise.
+ monkeypatch.setattr(milvus_mod.connections, "disconnect", lambda *a, **k: None)
+
+ result = await kb._migrate_collection_for_match("col", old, None, "db")
+
+ assert result is new_collection
+ assert drop_calls == ["col"]
+ assert old.query_iterator_calls[0]["output_fields"] == [
+ "id",
+ "content",
+ "chunk_id",
+ "file_id",
+ "chunk_index",
+ "embedding",
+ ]
+ assert "content_sparse" not in old.query_iterator_calls[0]["output_fields"]
+ assert len(new_collection.insert_calls) == 1
+ entities = new_collection.insert_calls[0]
+ # Column layout: [ids, contents, chunk_ids, file_ids, chunk_indexes, embeddings]
+ assert entities[0] == ["id-1", "id-2"]
+ assert entities[2] == ["c1", "c2"]
+ assert entities[5] == [[0.1, 0.2], [0.3, 0.4]]
+ assert new_collection.flushed is True
+
+
+async def test_keyword_mode_reranker_keeps_recall_pool(monkeypatch):
+ """bug #2: with the reranker on, the keyword branch must not cut to final_top_k early; the pool keeps recall_top_k."""
+ precise_results = [
+ [
+ FakeHit("p1", 0.9, chunk_id="p1"),
+ FakeHit("p2", 0.8, chunk_id="p2"),
+ FakeHit("p3", 0.7, chunk_id="p3"),
+ ]
+ ]
+ collection = FakeCollection(search_results=[precise_results])
+ kb = make_kb(collection)
+
+ captured = {}
+
+ # Fake reranker: records the docs it is asked to score and returns fixed scores.
+ class FakeReranker:
+ async def acompute_score(self, pairs, normalize=True):
+ _, docs = pairs
+ captured["docs"] = list(docs)
+ return [0.9, 0.5, 0.7][: len(docs)]
+
+ async def aclose(self):
+ pass
+
+ import yuxi.models.rerank as rerank_mod
+
+ monkeypatch.setattr(rerank_mod, "get_reranker", lambda model: FakeReranker())
+
+ chunks = await kb.aquery(
+ "term",
+ "db",
+ search_mode="keyword",
+ precise_match=True,
+ phrase_match_terms=["term"],
+ final_top_k=2,
+ use_reranker=True,
+ reranker_model="fake",
+ recall_top_k=50,
+ )
+
+ # 3 precise hits short-circuit the backfill; the reranker must get all 3 rather than be cut by final_top_k=2.
+ assert len(captured["docs"]) == 3
+ assert len(chunks) == 2 # finally truncated to final_top_k
+
+
+async def test_keyword_mode_precise_match_with_file_name_wraps_or(monkeypatch):
+ """bug #3: with multiple keywords plus a file_name filter, the or-clause is parenthesized as a whole.
+
+ The file_name constraint therefore applies to every keyword.
+ """
+ precise_results = [[FakeHit("precise-1", 0.9, chunk_id="p1")]]
+ collection = FakeCollection(search_results=[precise_results])
+ kb = make_kb(collection)
+
+ await kb.aquery(
+ "kw",
+ "db",
+ search_mode="keyword",
+ precise_match=True,
+ phrase_match_terms=["扭转减振器", "减振器"],
+ file_name="demo.md",
+ final_top_k=5,
+ )
+
+ precise_call = collection.search_calls[0]
+ expr = precise_call["expr"]
+ # file_expr comes first, and the PHRASE_MATCH or-clause is wrapped in parentheses as a whole.
+ assert expr.startswith('file_id == "file-1" and (PHRASE_MATCH(content, "扭转减振器", 0) or ')
+ assert expr.endswith(")")
diff --git a/backend/test/unit/toolkits/test_kbs_tools.py b/backend/test/unit/toolkits/test_kbs_tools.py
index 3034c3814..c27d977a0 100644
--- a/backend/test/unit/toolkits/test_kbs_tools.py
+++ b/backend/test/unit/toolkits/test_kbs_tools.py
@@ -4,7 +4,6 @@
from types import SimpleNamespace
import pytest
-
from yuxi.agents.toolkits.kbs import tools
@@ -24,6 +23,10 @@ def _query_kb_callable():
return _tool_callable(tools.query_kb)
+def _query_keywords_callable():
+ """Resolve tools.query_keywords through _tool_callable."""
+ return _tool_callable(tools.query_keywords)
+
+
def _find_kb_document_callable():
return _tool_callable(tools.find_kb_document)
@@ -43,6 +46,10 @@ async def _run_query_kb(**kwargs):
return await _run_tool(_query_kb_callable(), **kwargs)
+async def _run_query_keywords(**kwargs):
+ """Run the query_keywords callable through _run_tool, forwarding all keyword arguments."""
+ return await _run_tool(_query_keywords_callable(), **kwargs)
+
+
async def _run_find_kb_document(**kwargs):
return await _run_tool(_find_kb_document_callable(), **kwargs)
@@ -71,7 +78,7 @@ def _build_test_window(content: str, offset: int = 0, limit: int = 1800) -> dict
def _patch_retrievers(monkeypatch, *, kb_type: str = "milvus", retriever=None):
monkeypatch.setattr(
- tools.knowledge_base,
+ tools._get_knowledge_base(),
"get_retrievers",
lambda: {
"db-1": {
@@ -80,6 +87,7 @@ def _patch_retrievers(monkeypatch, *, kb_type: str = "milvus", retriever=None):
"metadata": {"kb_type": kb_type},
}
},
+ raising=False,
)
@@ -204,6 +212,51 @@ async def _fake_retriever(query_text: str, **kwargs):
}
+@pytest.mark.asyncio
+async def test_query_keywords_forwards_precise_match_kwargs(monkeypatch) -> None:
+ """query_keywords passes the joined query text and the precise-match kwargs to the retriever."""
+ captured: dict = {}
+
+ # Fake retriever: records what it was called with and returns one precise hit.
+ async def _fake_retriever(query_text: str, **kwargs):
+ captured["query_text"] = query_text
+ captured["kwargs"] = kwargs
+ return [
+ {
+ "content": "precise hit",
+ "metadata": {"file_id": "file-1", "source": "doc.md", "is_precise_match": True},
+ }
+ ]
+
+ _patch_retrievers(monkeypatch, retriever=_fake_retriever)
+ monkeypatch.setattr(tools, "_resolve_visible_knowledge_bases_for_query", _fake_visible_kbs)
+
+ runtime = SimpleNamespace(context=SimpleNamespace())
+ result = await _run_query_keywords(kb_id="db-1", keywords=["扭转减振器", "故障"], runtime=runtime)
+
+ # Keywords are joined into a space-separated query text; keyword mode, precise match and the
+ # original keyword list are forced.
+ assert captured["query_text"] == "扭转减振器 故障"
+ assert captured["kwargs"] == {
+ "search_mode": "keyword",
+ "precise_match": True,
+ "phrase_match_terms": ["扭转减振器", "故障"],
+ }
+ assert result["kb_id"] == "db-1"
+ assert result["results"][0]["metadata"]["is_precise_match"] is True
+
+
+@pytest.mark.asyncio
+async def test_query_keywords_rejects_empty_or_whitespace_keywords(monkeypatch) -> None:
+ """Empty or whitespace-only keyword lists yield the "请提供关键词列表" message."""
+ # The patched retriever raises if it is ever invoked.
+ async def _must_not_be_called(*args, **kwargs):
+ raise AssertionError("retriever should not be called for empty keywords")
+
+ _patch_retrievers(monkeypatch, retriever=_must_not_be_called)
+ monkeypatch.setattr(tools, "_resolve_visible_knowledge_bases_for_query", _fake_visible_kbs)
+
+ runtime = SimpleNamespace(context=SimpleNamespace())
+
+ assert await _run_query_keywords(kb_id="db-1", keywords=[], runtime=runtime) == "请提供关键词列表"
+ assert await _run_query_keywords(kb_id="db-1", keywords=["", " "], runtime=runtime) == "请提供关键词列表"
+
+
@pytest.mark.asyncio
async def test_find_kb_document_returns_context_windows(monkeypatch) -> None:
_patch_retrievers(monkeypatch)
@@ -240,7 +293,7 @@ async def _fake_find_file_content(
],
}
- monkeypatch.setattr(tools.knowledge_base, "find_file_content", _fake_find_file_content)
+ monkeypatch.setattr(tools._get_knowledge_base(), "find_file_content", _fake_find_file_content, raising=False)
runtime = SimpleNamespace(context=SimpleNamespace())
result = await _run_find_kb_document(
@@ -295,7 +348,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim
assert file_id == "file-1"
return _build_test_window("\n".join(lines), offset=offset, limit=limit)
- monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content)
+ monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False)
runtime = SimpleNamespace(context=SimpleNamespace())
result = await _run_open_kb_document(kb_id="db-1", file_id="file-1", runtime=runtime)
@@ -325,7 +378,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim
assert file_id == "file-1"
return _build_test_window("\n".join(lines), offset=offset, limit=limit)
- monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content)
+ monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False)
runtime = SimpleNamespace(context=SimpleNamespace())
result = await _run_open_kb_document(
@@ -369,7 +422,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim
del kb_id, file_id, offset, limit
raise Exception("文件 file-1 没有解析后的 Markdown 内容")
- monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content)
+ monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False)
runtime = SimpleNamespace(context=SimpleNamespace())
result = await _run_open_kb_document(kb_id="db-1", file_id="file-1", runtime=runtime)
diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md
index 54b68a954..e90502bd3 100644
--- a/docs/develop-guides/changelog.md
+++ b/docs/develop-guides/changelog.md
@@ -21,7 +21,8 @@
- 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。
- 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。
- 新增知识库 `query_keywords` 工具:采用「精准优先 + BM25 兜底」策略,基于 Milvus 2.6 `PHRASE_MATCH` 让包含完整关键词短语的 chunk 排在前面,精准命中不足时由 BM25 模糊命中补齐,结果以 `metadata.is_precise_match` 标记;适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充
-- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动检测并 drop 重建+重索引(懒触发、按 KB),图谱集合仅对新建生效
+- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动迁移向量到新 schema(drop 前用 `query_iterator` 读出全量 embedding 原样回灌,不重算;`content_sparse` 由新集合 BM25 Function 重新生成,迁移后 flush 保证精准匹配可见),懒触发、按 KB;图谱集合仅对新建生效
+- 修复 `query_keywords` 检索链路三处问题:多关键词 `PHRASE_MATCH` 以 `or` 拼接后与 `file_name` 过滤表达式用 `and` 连接未加括号,受 Milvus 运算符优先级影响 `file_name` 仅约束首个关键词,现对多关键词子句整体加括号;keyword 模式开启重排/图检索时候选池被提前截断至 `final_top_k` 导致 recall 退化,改为截到 `recall_top_k`、最终截断交由统一出口;存量集合自检缺 `enable_match` 时原 drop+空重建会使检索变空,改为向量迁移路径
## v0.7.0 (2026-06-13)