From faccf70cbd2ba1f67a20856ec04af523b715cb57 Mon Sep 17 00:00:00 2001 From: xiangchang_24 <1476897511@qq.com> Date: Tue, 16 Jun 2026 13:03:22 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat(knowledge):=20=E6=96=B0=E5=A2=9E=20que?= =?UTF-8?q?ry=5Fkeywords=20=E5=B7=A5=E5=85=B7=EF=BC=8C=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?=E5=85=B3=E9=94=AE=E8=AF=8D=E5=91=BD=E4=B8=AD=E7=9A=84=20BM25?= =?UTF-8?q?=20=E6=A3=80=E7=B4=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 QueryKeywordsInputSchema,接收 keywords 列表 - 新增 query_keywords 工具函数,强制 search_mode=keyword 走 BM25 通道 - 过滤空字符串和纯空格关键词,避免检索异常 - 注册到 get_common_kb_tools(),Agent 可自动发现 - 更新 changelog 和 roadmap --- .../package/yuxi/agents/toolkits/kbs/tools.py | 67 ++++++++++++++++++- backend/package/yuxi/knowledge/schemas.py | 10 +++ docs/develop-guides/changelog.md | 1 + docs/develop-guides/roadmap.md | 2 +- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/backend/package/yuxi/agents/toolkits/kbs/tools.py b/backend/package/yuxi/agents/toolkits/kbs/tools.py index c27f45253..069f212b5 100644 --- a/backend/package/yuxi/agents/toolkits/kbs/tools.py +++ b/backend/package/yuxi/agents/toolkits/kbs/tools.py @@ -13,6 +13,7 @@ FindOutputSchema, OpenInputSchema, OpenOutputSchema, + QueryKeywordsInputSchema, SearchInputSchema, SearchOutputSchema, ) @@ -153,6 +154,7 @@ def mindmap_to_text(node, level=0): QueryKBInput = SearchInputSchema +QueryKeywordsInput = QueryKeywordsInputSchema OpenKBDocumentInput = OpenInputSchema FindKBDocumentInput = FindInputSchema @@ -241,6 +243,64 @@ async def query_kb(kb_id: str, query_text: str, file_name: str | None = None, ru return f"检索失败: {str(e)}" +@tool(category="knowledge", tags=["知识库"], args_schema=QueryKeywordsInput) +async def query_keywords( + kb_id: str, + keywords: list[str], + file_name: str | None = None, + runtime: ToolRuntime = None, +) -> Any: + """基于关键词在指定知识库中检索内容 + + 当用户明确知道要搜索的关键词(如专有名词、技术术语、代码符号、特定指标等) + 时使用此工具,走 BM25 关键词命中排序。如果需要语义理解型的模糊检索,请使用 query_kb。 + + Args: + kb_id: 知识库资源 ID,也就是 kb_id + keywords: 关键词列表 + file_name: 可选文件名关键词过滤 + + Returns: + 检索结果列表,结构与 query_kb 一致 + """ + if not kb_id: + return "请提供 kb_id" + if not keywords: + return "请提供关键词列表" + + knowledge_base = _get_knowledge_base() + retrievers = knowledge_base.get_retrievers() + visible_kbs = await _resolve_visible_knowledge_bases_for_query(runtime) + target_info, target_kb_id, target_error = _find_query_target( + kb_id=kb_id, + retrievers=retrievers, + visible_kbs=visible_kbs, + ) + if target_error: + return target_error + + try: + retriever = target_info["retriever"] + # 拼接关键词为查询文本,强制使用 keyword/BM25 模式 + query_text = " ".join(keywords) + kwargs: dict[str, Any] = {"search_mode": "keyword"} + if file_name: + kwargs["file_name"] = file_name + + if inspect.iscoroutinefunction(retriever): + result = await retriever(query_text, **kwargs) + else: + result = retriever(query_text, **kwargs) + + if isinstance(result, dict) and result.get("kb_id") == target_kb_id and isinstance(result.get("results"), list): + return SearchOutputSchema(**result).model_dump() + return KnowledgeBase.build_search_output(target_kb_id, result) + + except Exception as e: + logger.error(f"关键词检索失败: {e}") + return f"关键词检索失败: {str(e)}" + + @tool(category="knowledge", tags=["知识库"], args_schema=OpenKBDocumentInput) async def open_kb_document( kb_id: str, @@ -358,11 +418,12 @@ async def find_kb_document( def get_common_kb_tools() -> list: """获取通用知识库工具列表 - 返回 5 个通用工具: + 返回 6 个通用工具: - list_kbs: 列出用户可访问的知识库 - get_mindmap: 获取指定知识库的思维导图 - - query_kb: 在指定知识库中检索 + - query_kb: 在指定知识库中语义检索 + - query_keywords: 基于关键词在指定知识库中检索 - find_kb_document: 在指定文件内定位关键词或正则模式 - open_kb_document: 按 file_id 分段打开知识库文档 """ - return [list_kbs, get_mindmap, query_kb, find_kb_document, open_kb_document] + return [list_kbs, get_mindmap, query_kb, query_keywords, find_kb_document, open_kb_document] diff --git a/backend/package/yuxi/knowledge/schemas.py b/backend/package/yuxi/knowledge/schemas.py index d68e2a57b..2fce10f5a 100644 --- a/backend/package/yuxi/knowledge/schemas.py +++ b/backend/package/yuxi/knowledge/schemas.py @@ -68,3 +68,13 @@ class OpenOutputSchema(BaseModel): has_more_after: bool = Field(description="窗口后是否还有内容") next_offset: int | None = Field(default=None, description="下一窗口 offset;没有更多内容时为 null") content: str = Field(description="带行号的窗口内容") + + +class QueryKeywordsInputSchema(BaseModel): + """基于关键词检索的输入模型""" + + kb_id: str = Field(description="知识库资源 ID,也就是 kb_id") + keywords: list[str] = Field( + description="关键词列表,用于 BM25 关键词检索;适合精确匹配专有名词、术语、代码符号等场景" + ) + file_name: str | None = Field(default=None, description="可选文件名关键词过滤,非必要不要使用") diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md index 48ed688cd..e363db997 100644 --- a/docs/develop-guides/changelog.md +++ b/docs/develop-guides/changelog.md @@ -20,6 +20,7 @@ - 新增 Agent token usage 状态快照,在状态面板中作为普通可折叠分组展示完整 `messages`、当前传给 LLM 的 `messages`、system/tools 构成、输入构成堆叠条和上下文窗口占用估算。 - 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。 - 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。 +- 新增知识库 `query_keywords` 工具:基于关键词列表走 BM25 通道检索,适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充 ## v0.7.0 (2026-06-13) diff --git a/docs/develop-guides/roadmap.md b/docs/develop-guides/roadmap.md index 65ce589e7..189696fb6 100644 --- a/docs/develop-guides/roadmap.md +++ b/docs/develop-guides/roadmap.md @@ -9,7 +9,7 @@ **知识库** - [ ] office 组件预览,docx/pptx 可以转PDF,然后前端预览 -- [ ] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序 +- [x] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序 - [ ] 调研将当前知识库映射为虚拟文件系统的可行性,先明确文件树映射、权限边界、内容读取与 Agent 工具调用形态,再决定是否实现 - [ ] 增强知识库检索体验:增强 metadata、标签等 - [x] 优化思维导图构建的接口设计,支持增量构建和更新 From f4c8cb1c02d79c493b446016a7b2fca8f92a4980 Mon Sep 17 00:00:00 2001 From: xiangchang_24 <1476897511@qq.com> Date: Thu, 18 Jun 2026 12:59:17 +0800 Subject: [PATCH 2/3] =?UTF-8?q?feat(knowledge):=20query=5Fkeywords=20?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E7=B2=BE=E5=87=86=E5=8C=B9=E9=85=8D=E4=BC=98?= =?UTF-8?q?=E5=85=88=20+=20BM25=20=E5=85=9C=E5=BA=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 按作者评审反馈,query_keywords 此前纯 BM25 无法保证精准命中排前。改为基于 Milvus 2.6 PHRASE_MATCH 实现「精准优先 + BM25 兜底」检索策略: - 升级 Milvus v2.5.6 -> v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28), 同步更新 compose 与镜像拉取/打包脚本;客户端 pymilvus 已锁 3.0.0 无需动。 - KB 与图谱 content 字段新增 enable_match=True 以支持 PHRASE_MATCH。 - _collection_supports_bm25 增加 enable_match 自检:存量 KB 集合首次访问时 自动 drop 重建+重索引(懒触发、按 KB);图谱集合仅对新建生效(图谱检索纯 向量、不用 PHRASE_MATCH,重建需重跑 LLM 抽取,成本不成比例)。 - aquery keyword 分支重写:PHRASE_MATCH 过滤的精准命中在前(BM25 降序), 不足 final_top_k 时纯 BM25 兜底,按 chunk_id 去重;新增 expr 构造 helper (转义防注入、多关键词 or 连接)、_merge_precise_and_backfill。 - _build_chunk_from_hit 新增 is_precise_match 标记写入 metadata(build_search_output 仅透传 metadata,故标记须放 metadata 才能存活到工具输出)。 - query_keywords 传 precise_match/phrase_match_terms,并过滤空/纯空白关键词。 --- .../package/yuxi/agents/toolkits/kbs/tools.py | 14 +- .../graphs/milvus_graph_vector_store.py | 2 + .../yuxi/knowledge/implementations/milvus.py | 148 ++++++++++++++++-- docker-compose.prod.yml | 6 +- docker-compose.yml | 6 +- docker/save_docker_images.ps1 | 6 +- docker/save_docker_images.sh | 6 +- docs/develop-guides/changelog.md | 3 +- docs/develop-guides/roadmap.md | 2 +- scripts/init.ps1 | 6 +- scripts/init.sh | 6 +- 11 files changed, 165 insertions(+), 40 deletions(-) diff --git a/backend/package/yuxi/agents/toolkits/kbs/tools.py b/backend/package/yuxi/agents/toolkits/kbs/tools.py index 069f212b5..899ab20e7 100644 --- a/backend/package/yuxi/agents/toolkits/kbs/tools.py +++ b/backend/package/yuxi/agents/toolkits/kbs/tools.py @@ -253,7 +253,10 @@ async def query_keywords( """基于关键词在指定知识库中检索内容 当用户明确知道要搜索的关键词(如专有名词、技术术语、代码符号、特定指标等) - 时使用此工具,走 BM25 关键词命中排序。如果需要语义理解型的模糊检索,请使用 query_kb。 + 时使用此工具。检索采用「精准优先 + BM25 兜底」策略:包含完整关键词短语的 chunk + 排在前面(基于 Milvus PHRASE_MATCH,分词后 token 相邻即算精准命中),精准命中 + 不足时由 BM25 模糊命中补齐,结果 metadata 中以 is_precise_match 标记。如果需要 + 语义理解型的模糊检索,请使用 query_kb。 Args: kb_id: 知识库资源 ID,也就是 kb_id @@ -265,6 +268,7 @@ async def query_keywords( """ if not kb_id: return "请提供 kb_id" + keywords = [k.strip() for k in keywords if k and k.strip()] if not keywords: return "请提供关键词列表" @@ -281,9 +285,13 @@ async def query_keywords( try: retriever = target_info["retriever"] - # 拼接关键词为查询文本,强制使用 keyword/BM25 模式 + # 拼接关键词为查询文本,强制使用 keyword/BM25 模式并启用精准匹配 query_text = " ".join(keywords) - kwargs: dict[str, Any] = {"search_mode": "keyword"} + kwargs: dict[str, Any] = { + "search_mode": "keyword", + "precise_match": True, + "phrase_match_terms": keywords, + } if file_name: kwargs["file_name"] = file_name diff --git a/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py b/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py index d27ebc7ca..2d7d8bea8 100644 --- a/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py +++ b/backend/package/yuxi/knowledge/graphs/milvus_graph_vector_store.py @@ -205,6 +205,7 @@ def _get_or_create_entity_collection(self, kb_id: str, embedding_info: Any) -> C max_length=65535, enable_analyzer=True, analyzer_params=CONTENT_ANALYZER_PARAMS, + enable_match=True, ), FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_info.dimension or 1024), FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR), @@ -221,6 +222,7 @@ def _get_or_create_triple_collection(self, kb_id: str, embedding_info: Any) -> C max_length=65535, enable_analyzer=True, analyzer_params=CONTENT_ANALYZER_PARAMS, + enable_match=True, ), FieldSchema(name="source_id", dtype=DataType.VARCHAR, max_length=100), FieldSchema(name="target_id", dtype=DataType.VARCHAR, max_length=100), diff --git a/backend/package/yuxi/knowledge/implementations/milvus.py b/backend/package/yuxi/knowledge/implementations/milvus.py index 61b221aec..9161ea711 100644 --- a/backend/package/yuxi/knowledge/implementations/milvus.py +++ b/backend/package/yuxi/knowledge/implementations/milvus.py @@ -334,7 +334,9 @@ async def _create_kb_instance(self, kb_id: str, kb_config: dict) -> Any: return self._create_new_collection(collection_name, embedding_info, kb_id) if not self._collection_supports_bm25(collection): - logger.warning(f"Collection {collection_name} schema does not support BM25, recreating") + logger.warning( + f"Collection {collection_name} schema does not support BM25/phrase-match, recreating" + ) utility.drop_collection(collection_name, using=self.connection_alias) return self._create_new_collection(collection_name, embedding_info, kb_id) @@ -366,6 +368,7 @@ def _create_new_collection(self, collection_name: str, embedding_info: Any, kb_i max_length=65535, enable_analyzer=True, analyzer_params=CONTENT_ANALYZER_PARAMS, + enable_match=True, ), FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=100), FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=100), @@ -404,7 +407,7 @@ def _create_new_collection(self, collection_name: str, embedding_info: Any, kb_i return collection def _collection_supports_bm25(self, collection: Collection) -> bool: - """检查集合是否具备 Milvus 内置 BM25 所需的 schema。""" + """检查集合是否具备 Milvus 内置 BM25 与 PHRASE_MATCH 所需的 schema。""" fields = {field.name: field for field in collection.schema.fields} content_field = fields.get("content") sparse_field = fields.get(CONTENT_SPARSE_FIELD) @@ -412,6 +415,8 @@ def _collection_supports_bm25(self, collection: Collection) -> bool: return False if content_field.params.get("enable_analyzer") is not True: return False + if content_field.params.get("enable_match") is not True: + return False if not sparse_field or sparse_field.dtype != DataType.SPARSE_FLOAT_VECTOR: return False @@ -598,6 +603,34 @@ def _build_file_name_expr(self, kb_id: str, file_name: str | None) -> str | None joined_ids = '", "'.join(escaped_ids) return f'file_id in ["{joined_ids}"]' + @staticmethod + def _escape_expr_literal(s: str) -> str: + """转义 Milvus 表达式字符串字面量中的反斜杠与双引号。""" + return s.replace("\\", "\\\\").replace('"', '\\"') + + def _build_phrase_match_expr(self, terms: list[str], slop: int) -> str | None: + """构建 PHRASE_MATCH 表达式:任一关键词精准命中即算精准(grep 语义,取 or)。 + + slop=0 要求分词后 token 相邻(精确短语),>0 允许 token 间有间隔/乱序。 + 全部 term 为空时返回 None,表示无法应用精准匹配。 + """ + clauses: list[str] = [] + for term in terms: + cleaned = (term or "").strip() + if not cleaned: + continue + escaped = self._escape_expr_literal(cleaned) + clauses.append(f'PHRASE_MATCH(content, "{escaped}", {int(slop)})') + if not clauses: + return None + return " or ".join(clauses) if len(clauses) > 1 else clauses[0] + + @staticmethod + def _combine_exprs(*exprs: str | None) -> str | None: + """用 and 连接若干过滤表达式,跳过 None/空串;全空返回 None。""" + parts = [e for e in exprs if e] + return " and ".join(parts) if parts else None + async def index_file( self, kb_id: str, file_id: str, operator_id: str | None = None, params: dict | None = None ) -> dict: @@ -828,6 +861,7 @@ def _build_chunk_from_hit( score: float, include_distances: bool, score_field: str | None = None, + is_precise_match: bool | None = None, ) -> dict: """将 Milvus Hit 转成知识库统一返回的 Chunk 结构。""" entity = hit.entity @@ -838,6 +872,8 @@ def _build_chunk_from_hit( "file_id": file_id, "chunk_index": entity.get("chunk_index"), } + if is_precise_match is not None: + metadata["is_precise_match"] = is_precise_match chunk = {"content": entity.get("content", ""), "metadata": metadata, "score": float(score or 0.0)} if score_field: chunk[score_field] = float(score or 0.0) @@ -845,6 +881,37 @@ def _build_chunk_from_hit( chunk["distance"] = hit.distance return chunk + @staticmethod + def _merge_precise_and_backfill( + precise_hits: list[dict], backfill_hits: list[dict], final_top_k: int + ) -> list[dict]: + """合并精准命中与 BM25 兜底命中:精准块在前(已按 BM25 降序),兜底块在后,按 chunk_id 去重,截 final_top_k。""" + seen: set[str] = set() + merged: list[dict] = [] + + def _chunk_id(chunk: dict) -> str | None: + return chunk.get("metadata", {}).get("chunk_id") + + for chunk in precise_hits: + cid = _chunk_id(chunk) + if cid is None or cid in seen: + continue + seen.add(cid) + merged.append(chunk) + if len(merged) >= final_top_k: + return merged + + for chunk in backfill_hits: + cid = _chunk_id(chunk) + if cid is None or cid in seen: + continue + seen.add(cid) + merged.append(chunk) + if len(merged) >= final_top_k: + break + return merged + + async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **kwargs) -> list[dict]: """异步查询知识库""" collection = await self._get_milvus_collection(kb_id) @@ -919,22 +986,69 @@ async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, ** "params": {"drop_ratio_search": bm25_drop_ratio_search}, } - results = collection.search( - data=[query_text], - anns_field=CONTENT_SPARSE_FIELD, - param=bm25_search_params, - limit=bm25_top_k, - expr=file_expr, - output_fields=output_fields, - ) - - if results and len(results) > 0 and len(results[0]) > 0: - for hit in results[0]: - retrieved_chunks.append( - self._build_chunk_from_hit(hit, hit.distance, include_distances, score_field="bm25_score") + precise_match = bool(merged_kwargs.get("precise_match", False)) + precise_hits: list[dict] = [] + backfill_hits: list[dict] = [] + + if precise_match: + phrase_slop = int(merged_kwargs.get("phrase_slop", 0)) + terms = merged_kwargs.get("phrase_match_terms") or [query_text] + phrase_expr = self._build_phrase_match_expr(list(terms), phrase_slop) + if phrase_expr is None: + logger.warning("precise_match requested but no valid terms; falling back to pure BM25") + precise_match = False + else: + precise_expr = self._combine_exprs(file_expr, phrase_expr) + results = collection.search( + data=[query_text], + anns_field=CONTENT_SPARSE_FIELD, + param=bm25_search_params, + limit=bm25_top_k, + expr=precise_expr, + output_fields=output_fields, ) - - logger.debug(f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found") + if results and len(results) > 0 and len(results[0]) > 0: + for hit in results[0]: + precise_hits.append( + self._build_chunk_from_hit( + hit, + hit.distance, + include_distances, + score_field="bm25_score", + is_precise_match=True, + ) + ) + logger.debug(f"PHRASE_MATCH+BM25: {len(precise_hits)} precise hits") + + # 精准命中不足时用纯 BM25 兜底;精准命中已够则短路跳过 + if not precise_match or len(precise_hits) < final_top_k: + results = collection.search( + data=[query_text], + anns_field=CONTENT_SPARSE_FIELD, + param=bm25_search_params, + limit=bm25_top_k, + expr=file_expr, + output_fields=output_fields, + ) + if results and len(results) > 0 and len(results[0]) > 0: + for hit in results[0]: + backfill_hits.append( + self._build_chunk_from_hit( + hit, + hit.distance, + include_distances, + score_field="bm25_score", + is_precise_match=False if precise_match else None, + ) + ) + logger.debug(f"BM25 backfill: {len(backfill_hits)} hits") + + retrieved_chunks = self._merge_precise_and_backfill( + precise_hits, backfill_hits, final_top_k + ) + logger.debug( + f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found (precise={len(precise_hits)})" + ) else: embedding_model_spec = self.databases_meta[kb_id].get("embedding_model_spec") embedding_function = self._get_embedding_function(embedding_model_spec, sync=True) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index badf555d0..d7afcc79d 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -182,7 +182,7 @@ services: etcd: container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.5.5 + image: quay.io/coreos/etcd:v3.5.25 environment: - ETCD_AUTO_COMPACTION_MODE=revision - ETCD_AUTO_COMPACTION_RETENTION=1000 @@ -203,7 +203,7 @@ services: minio: container_name: minio - image: minio/minio:RELEASE.2023-03-20T20-16-18Z + image: minio/minio:RELEASE.2024-05-28T17-19-04Z environment: MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin} MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin} @@ -222,7 +222,7 @@ services: restart: unless-stopped milvus: - image: milvusdb/milvus:v2.5.6 + image: milvusdb/milvus:v2.6.16 container_name: milvus command: ["milvus", "run", "standalone"] security_opt: diff --git a/docker-compose.yml b/docker-compose.yml index 95ec4c9d5..19a38f1ca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -228,7 +228,7 @@ services: etcd: container_name: milvus-etcd-dev - image: quay.io/coreos/etcd:v3.5.5 + image: quay.io/coreos/etcd:v3.5.25 environment: - ETCD_AUTO_COMPACTION_MODE=revision - ETCD_AUTO_COMPACTION_RETENTION=1000 @@ -249,7 +249,7 @@ services: minio: container_name: minio - image: minio/minio:RELEASE.2023-03-20T20-16-18Z + image: minio/minio:RELEASE.2024-05-28T17-19-04Z environment: MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin} MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin} @@ -271,7 +271,7 @@ services: restart: unless-stopped milvus: - image: milvusdb/milvus:v2.5.6 + image: milvusdb/milvus:v2.6.16 container_name: milvus command: ["milvus", "run", "standalone"] security_opt: diff --git a/docker/save_docker_images.ps1 b/docker/save_docker_images.ps1 index 1c6a8fe43..00a887655 100644 --- a/docker/save_docker_images.ps1 +++ b/docker/save_docker_images.ps1 @@ -20,9 +20,9 @@ $Images = @( "node:24-slim", "nginx:alpine", "neo4j:5.26", - "quay.io/coreos/etcd:v3.5.5", - "minio/minio:RELEASE.2023-03-20T20-16-18Z", - "milvusdb/milvus:v2.5.6", + "quay.io/coreos/etcd:v3.5.25", + "minio/minio:RELEASE.2024-05-28T17-19-04Z", + "milvusdb/milvus:v2.6.16", # "lmsysorg/sglang:v0.4.9.post3-cu126", # "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.0.1-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6" ) diff --git a/docker/save_docker_images.sh b/docker/save_docker_images.sh index fb7e23c38..a07a6d8c7 100644 --- a/docker/save_docker_images.sh +++ b/docker/save_docker_images.sh @@ -17,9 +17,9 @@ IMAGES=( "node:24-slim", "nginx:alpine", "neo4j:5.26", - "quay.io/coreos/etcd:v3.5.5", - "minio/minio:RELEASE.2023-03-20T20-16-18Z", - "milvusdb/milvus:v2.5.6", + "quay.io/coreos/etcd:v3.5.25", + "minio/minio:RELEASE.2024-05-28T17-19-04Z", + "milvusdb/milvus:v2.6.16", ) # 确保所有镜像都已下载 diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md index e363db997..54b68a954 100644 --- a/docs/develop-guides/changelog.md +++ b/docs/develop-guides/changelog.md @@ -20,7 +20,8 @@ - 新增 Agent token usage 状态快照,在状态面板中作为普通可折叠分组展示完整 `messages`、当前传给 LLM 的 `messages`、system/tools 构成、输入构成堆叠条和上下文窗口占用估算。 - 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。 - 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。 -- 新增知识库 `query_keywords` 工具:基于关键词列表走 BM25 通道检索,适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充 +- 新增知识库 `query_keywords` 工具:采用「精准优先 + BM25 兜底」策略,基于 Milvus 2.6 `PHRASE_MATCH` 让包含完整关键词短语的 chunk 排在前面,精准命中不足时由 BM25 模糊命中补齐,结果以 `metadata.is_precise_match` 标记;适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充 +- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动检测并 drop 重建+重索引(懒触发、按 KB),图谱集合仅对新建生效 ## v0.7.0 (2026-06-13) diff --git a/docs/develop-guides/roadmap.md b/docs/develop-guides/roadmap.md index 189696fb6..53f7a9109 100644 --- a/docs/develop-guides/roadmap.md +++ b/docs/develop-guides/roadmap.md @@ -9,7 +9,7 @@ **知识库** - [ ] office 组件预览,docx/pptx 可以转PDF,然后前端预览 -- [x] 知识库工具新增 query_keywords 工具,专门用于基于关键词命中的排序 +- [x] 知识库工具新增 query_keywords 工具,基于 PHRASE_MATCH 精准匹配优先 + BM25 兜底的关键词检索 - [ ] 调研将当前知识库映射为虚拟文件系统的可行性,先明确文件树映射、权限边界、内容读取与 Agent 工具调用形态,再决定是否实现 - [ ] 增强知识库检索体验:增强 metadata、标签等 - [x] 优化思维导图构建的接口设计,支持增量构建和更新 diff --git a/scripts/init.ps1 b/scripts/init.ps1 index c4a3986da..5b5aabcea 100644 --- a/scripts/init.ps1 +++ b/scripts/init.ps1 @@ -121,12 +121,12 @@ $images = @( "python:3.12-slim", "node:24-slim", "node:24-alpine", - "milvusdb/milvus:v2.5.6", + "milvusdb/milvus:v2.6.16", "neo4j:5.26", - "minio/minio:RELEASE.2023-03-20T20-16-18Z", + "minio/minio:RELEASE.2024-05-28T17-19-04Z", "ghcr.io/astral-sh/uv:0.7.2", "nginx:alpine", - "quay.io/coreos/etcd:v3.5.5", + "quay.io/coreos/etcd:v3.5.25", "postgres:16", "redis:7-alpine" ) diff --git a/scripts/init.sh b/scripts/init.sh index d742a68f8..c752f599e 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -117,12 +117,12 @@ images=( "python:3.12-slim" "node:24-slim" "node:24-alpine" - "milvusdb/milvus:v2.5.6" + "milvusdb/milvus:v2.6.16" "neo4j:5.26" - "minio/minio:RELEASE.2023-03-20T20-16-18Z" + "minio/minio:RELEASE.2024-05-28T17-19-04Z" "ghcr.io/astral-sh/uv:0.7.2" "nginx:alpine" - "quay.io/coreos/etcd:v3.5.5" + "quay.io/coreos/etcd:v3.5.25" "postgres:16" "redis:7-alpine" ) From b0dd76036748ea3df063c34c6b414f76c0d75523 Mon Sep 17 00:00:00 2001 From: xiangchang_24 <1476897511@qq.com> Date: Thu, 18 Jun 2026 22:30:42 +0800 Subject: [PATCH 3/3] =?UTF-8?q?fix(knowledge):=20query=5Fkeywords=20?= =?UTF-8?q?=E6=94=B9=E5=90=91=E9=87=8F=E8=BF=81=E7=A7=BB=E5=B9=B6=E4=BF=AE?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=E9=93=BE=E8=B7=AF=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 按评审反馈调整存量集合升级策略并修复 Codex 指出的正确性问题: - 向量迁移替代空重建:存量集合自检缺 enable_match 时,drop 前用 query_iterator 读出全量 embedding 原样回灌新集合,不重算; content_sparse 由新集合 BM25 Function 自动生成,迁移后 flush 保证 PHRASE_MATCH 倒排可见。embedding 模型变更分支仍走重算不迁移。 - 修复 or/and 优先级:多关键词 PHRASE_MATCH 的 or 子句整体加括号, 避免与 file_name 的 and 拼接时 file_name 仅约束首个关键词。 - 修复重排前截断:keyword 分支 _merge_precise_and_backfill 改传 recall_top_k 而非 final_top_k,开启重排/图检索时候选池不再失效。 - 补齐 enable_match 自检单测 fixture,新增迁移/优先级/截断单测, 新增精准匹配集成测试。 测试:test_milvus_kb 22 + test_kbs_tools 12 + 集成精准匹配 2 全绿 --- .../yuxi/knowledge/implementations/milvus.py | 92 +++++- .../api/test_knowledge_precise_match.py | 158 ++++++++++ backend/test/unit/plugins/test_milvus_kb.py | 279 +++++++++++++++++- backend/test/unit/toolkits/test_kbs_tools.py | 65 +++- docs/develop-guides/changelog.md | 3 +- 5 files changed, 579 insertions(+), 18 deletions(-) create mode 100644 backend/test/integration/api/test_knowledge_precise_match.py diff --git a/backend/package/yuxi/knowledge/implementations/milvus.py b/backend/package/yuxi/knowledge/implementations/milvus.py index 9161ea711..2a7142f76 100644 --- a/backend/package/yuxi/knowledge/implementations/milvus.py +++ b/backend/package/yuxi/knowledge/implementations/milvus.py @@ -35,6 +35,7 @@ CONTENT_ANALYZER_PARAMS = {"type": "chinese"} VECTOR_METRIC_TYPE = "COSINE" MILVUS_CHUNK_EMBED_BATCH_SIZE = 200 +MILVUS_MIGRATE_BATCH_SIZE = 1000 @dataclass(kw_only=True) @@ -335,10 +336,9 @@ async def _create_kb_instance(self, kb_id: str, kb_config: dict) -> Any: if not self._collection_supports_bm25(collection): logger.warning( - f"Collection {collection_name} schema does not support BM25/phrase-match, recreating" + f"Collection {collection_name} schema does not support BM25/phrase-match, migrating data" ) - utility.drop_collection(collection_name, using=self.connection_alias) - return self._create_new_collection(collection_name, embedding_info, kb_id) + return await self._migrate_collection_for_match(collection_name, collection, embedding_info, kb_id) logger.info(f"Retrieved existing collection: {collection_name}") return collection @@ -429,6 +429,79 @@ def _collection_supports_bm25(self, collection: Collection) -> bool: return True return False + async def _migrate_collection_for_match( + self, + collection_name: str, + old_collection: Collection, + embedding_info: Any, + kb_id: str, + ) -> Collection: + """迁移存量集合到支持 PHRASE_MATCH 的新 schema(enable_match=True)。 + + 升级路径:把旧集合的 embedding 原样读出 → drop → 按新 schema 建集合 → 回灌, + 不重算 embedding。content_sparse 由新集合的 BM25 Function 在 insert 时自动生成。 + + 风险:drop 后若 insert 失败,该 KB 数据会丢失,需人工重建。故应在维护窗口预热, + 且升级前对重要 KB 做备份。任一步失败直接抛错,不静默回退。 + """ + logger.warning(f"Collection {collection_name} missing enable_match, migrating embeddings to new schema") + return await asyncio.to_thread( + self._migrate_collection_for_match_sync, + collection_name, + old_collection, + embedding_info, + kb_id, + ) + + def _migrate_collection_for_match_sync( + self, + collection_name: str, + old_collection: Collection, + embedding_info: Any, + kb_id: str, + ) -> Collection: + """同步执行向量迁移(在 to_thread 中调用)。""" + # 1. load 旧集合(query 前置条件;已 load 时 Milvus 幂等) + try: + old_collection.load() + except Exception as e: + logger.warning(f"Load old collection {collection_name} for migration failed: {e}") + + # 2. 分批读出全量 records(不含 content_sparse,新集合 BM25 Function 自动生成) + output_fields = ["id", "content", "chunk_id", "file_id", "chunk_index", "embedding"] + records: list[dict] = [] + iterator = old_collection.query_iterator(batch_size=MILVUS_MIGRATE_BATCH_SIZE, output_fields=output_fields) + while True: + batch = iterator.next() + if not batch: + break + records.extend(batch) + logger.info(f"Migrating {len(records)} records for collection {collection_name}") + + # 3. drop 旧集合 + utility.drop_collection(collection_name, using=self.connection_alias) + + # 4. 建新集合(带 enable_match) + new_collection = self._create_new_collection(collection_name, embedding_info, kb_id) + + # 5. 分批回灌:列格式与 _insert_chunks_to_stores 一致,不传 sparse + for start in range(0, len(records), MILVUS_MIGRATE_BATCH_SIZE): + batch = records[start : start + MILVUS_MIGRATE_BATCH_SIZE] + entities = [ + [r.get("id") for r in batch], + [r.get("content") for r in batch], + [r.get("chunk_id") for r in batch], + [r.get("file_id") for r in batch], + [r.get("chunk_index") for r in batch], + [r.get("embedding") for r in batch], + ] + new_collection.insert(entities) + + # 6. flush 确保 PHRASE_MATCH 倒排在 growing segment 上可见,迁移后立即可查 + new_collection.flush() + logger.info(f"Migrated collection {collection_name}: {len(records)} records re-inserted") + return new_collection + async def _initialize_kb_instance(self, instance: Any) -> None: """初始化 Milvus 集合(加载到内存)""" try: @@ -623,7 +696,11 @@ def _build_phrase_match_expr(self, terms: list[str], slop: int) -> str | None: clauses.append(f'PHRASE_MATCH(content, "{escaped}", {int(slop)})') if not clauses: return None - return " or ".join(clauses) if len(clauses) > 1 else clauses[0] + # 多关键词整体加括号:与 file_expr 经 _combine_exprs 拼成 `file and (PM(a) or PM(b))`。 + # Milvus 中 and 优先级高于 or,不加括号 file_name 过滤只会约束第一个关键词。 + if len(clauses) > 1: + return f"({' or '.join(clauses)})" + return clauses[0] @staticmethod def _combine_exprs(*exprs: str | None) -> str | None: @@ -911,7 +988,6 @@ def _chunk_id(chunk: dict) -> str | None: break return merged - async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, **kwargs) -> list[dict]: """异步查询知识库""" collection = await self._get_milvus_collection(kb_id) @@ -1043,9 +1119,9 @@ async def aquery(self, query_text: str, kb_id: str, agent_call: bool = False, ** ) logger.debug(f"BM25 backfill: {len(backfill_hits)} hits") - retrieved_chunks = self._merge_precise_and_backfill( - precise_hits, backfill_hits, final_top_k - ) + # 用 recall_top_k 而非 final_top_k:开启 reranker/graph 时 recall_top_k 是重排候选池, + # 提前截到 final_top_k 会让重排只拿到少量候选、recall 退化。最终截断由 [:final_top_k] 负责。 + retrieved_chunks = self._merge_precise_and_backfill(precise_hits, backfill_hits, recall_top_k) logger.debug( f"Milvus BM25 query response: {len(retrieved_chunks)} chunks found (precise={len(precise_hits)})" ) diff --git a/backend/test/integration/api/test_knowledge_precise_match.py b/backend/test/integration/api/test_knowledge_precise_match.py new file mode 100644 index 000000000..974f947cb --- /dev/null +++ b/backend/test/integration/api/test_knowledge_precise_match.py @@ -0,0 +1,158 @@ +"""知识库精准匹配检索 integration 测试。 + +覆盖 query_keywords 工具背后「精准优先 + BM25 兜底」检索策略的端到端链路: +建 KB -> 上传文件 -> 异步索引 -> 通过 /query-test 接口验证 PHRASE_MATCH 精准命中 +排在 BM25 模糊命中之前,且 metadata.is_precise_match 标记正确。 + +依赖 docker compose up -d 后的运行环境与 TEST_USERNAME/TEST_PASSWORD 超管凭据。 +""" + +import asyncio +import os + +import pytest +from pymilvus import Collection, connections, utility + +pytestmark = [pytest.mark.asyncio, pytest.mark.integration] + +_MILVUS_FLUSH_ALIAS = "_test_flush_kb" + + +# 包含完整短语「扭转减振器」的段落会被 PHRASE_MATCH 精准命中; +# 仅提及「减振器/振动」的段落只能由 BM25 模糊命中。 +PRECISE_MATCH_DOC = """# 扭转减振器技术说明 + +扭转减振器是汽车传动系统中的关键部件,用于衰减发动机曲轴产生的扭转振动,保护传动系免受过大动载荷。扭转减振器通常安装在飞轮与离合器之间,通过匹配刚度与阻尼来吸收扭转振动能量。 + +减振器的设计需要兼顾工作频段与共振点位置。当系统振动频率接近共振点时,减振装置能够有效降低振幅,避免传动系结构件因疲劳而损坏。普通减振器一般通过橡胶元件或弹簧组提供阻尼。 + +维护保养方面,应定期检查减振元件的老化程度与连接紧固情况。一旦发现橡胶开裂或弹簧失效,需及时更换,否则会削弱减振能力并放大振动。日常使用中若出现异常振动噪声,应优先排查减振装置。 + +安装扭矩与配合间隙必须符合厂家规范。过紧会加剧磨损,过松则无法有效传递阻尼。建议在专业场地由技术人员操作,并使用专用工具校核安装尺寸。 +""" + + +async def _wait_for_task(test_client, admin_headers, task_id: str, timeout: float = 120.0) -> dict: + """轮询任务直到进入终态,返回 task 字典。""" + deadline = asyncio.get_event_loop().time() + timeout + while asyncio.get_event_loop().time() < deadline: + resp = await test_client.get(f"/api/tasks/{task_id}", headers=admin_headers) + assert resp.status_code == 200, resp.text + task = resp.json().get("task", {}) + if task.get("status") in {"success", "failed", "cancelled"}: + return task + await asyncio.sleep(0.5) + pytest.fail(f"Task {task_id} did not reach terminal status within {timeout}s") + + +async def _index_markdown(test_client, admin_headers, kb_id: str, filename: str, content: str) -> None: + """上传 markdown 文件并以 auto_index 触发解析+索引,等待完成。""" + upload_resp = await test_client.post( + "/api/knowledge/files/upload", + params={"kb_id": kb_id}, + files={"file": (filename, content.encode("utf-8"), "text/markdown")}, + headers=admin_headers, + ) + assert upload_resp.status_code == 200, upload_resp.text + upload_json = upload_resp.json() + minio_url = upload_json["file_path"] + + enqueue_resp = await test_client.post( + f"/api/knowledge/databases/{kb_id}/documents", + json={ + "items": [minio_url], + "params": { + "content_type": "file", + "auto_index": True, + "content_hashes": {minio_url: upload_json["content_hash"]}, + "file_sizes": {minio_url: upload_json["size"]}, + }, + }, + headers=admin_headers, + ) + assert enqueue_resp.status_code == 200, enqueue_resp.text + task_id = enqueue_resp.json()["task_id"] + + task = await _wait_for_task(test_client, admin_headers, task_id) + assert task["status"] == "success", f"indexing task failed: {task.get('error') or task.get('result')}" + await _flush_kb_collection(kb_id) + + +async def _flush_kb_collection(kb_id: str) -> None: + """显式 flush 集合,保证 PHRASE_MATCH 倒排索引在 growing segment 上可查。 + + index_file insert 后未 flush,enable_match 倒排索引需 segment seal 后才稳定可见, + 否则索引后立即查询会偶发返回空。flush 是测试侧保证数据可见性的手段,不改变检索语义。 + """ + uri = os.getenv("MILVUS_URI", "http://milvus:19530") + if connections.has_connection(_MILVUS_FLUSH_ALIAS): + connections.disconnect(_MILVUS_FLUSH_ALIAS) + connections.connect(alias=_MILVUS_FLUSH_ALIAS, uri=uri) + try: + if utility.has_collection(kb_id, using=_MILVUS_FLUSH_ALIAS): + Collection(kb_id, using=_MILVUS_FLUSH_ALIAS).flush() + finally: + connections.disconnect(_MILVUS_FLUSH_ALIAS) + + +async def _query_test(test_client, admin_headers, kb_id: str, query: str, meta: dict) -> list[dict]: + resp = await test_client.post( + f"/api/knowledge/databases/{kb_id}/query-test", + json={"query": query, "meta": meta}, + headers=admin_headers, + ) + assert resp.status_code == 200, resp.text + return resp.json() + + +async def test_query_test_precise_match_ranks_phrase_hits_first(test_client, admin_headers, knowledge_database): + """精准匹配:含完整短语的 chunk 标记 is_precise_match=True 且排在模糊命中之前。""" + kb_id = knowledge_database["kb_id"] + await _index_markdown(test_client, admin_headers, kb_id, "torsional_damper.md", PRECISE_MATCH_DOC) + + chunks = await _query_test( + test_client, + admin_headers, + kb_id, + "扭转减振器", + { + "search_mode": "keyword", + "precise_match": True, + "phrase_match_terms": ["扭转减振器"], + "final_top_k": 10, + }, + ) + + assert chunks, "精准匹配检索应返回非空结果" + + precise_flags = [bool(c.get("metadata", {}).get("is_precise_match")) for c in chunks] + assert any(precise_flags), "应至少有一个精准命中(is_precise_match=True)的 chunk" + + # 精准块必须整体排在非精准块之前:最后一个 True 之后不应再出现 False + last_true = max(i for i, v in enumerate(precise_flags) if v) + has_false_after = any(not precise_flags[i] for i in range(last_true + 1, len(precise_flags))) + assert not has_false_after, "精准命中必须排在 BM25 兜底命中之前" + + # 精准命中的 chunk 内容应包含完整短语,且带 bm25_score + precise_chunks = [c for c in chunks if c.get("metadata", {}).get("is_precise_match")] + assert all("扭转减振器" in c.get("content", "") for c in precise_chunks) + assert all(isinstance(c.get("bm25_score"), float) for c in precise_chunks) + + +async def test_query_test_pure_bm25_omits_precise_flag(test_client, admin_headers, knowledge_database): + """纯 BM25(不启用精准匹配)返回的 chunk 不应带 is_precise_match 标记。""" + kb_id = knowledge_database["kb_id"] + await _index_markdown(test_client, admin_headers, kb_id, "torsional_damper_plain.md", PRECISE_MATCH_DOC) + + chunks = await _query_test( + test_client, + admin_headers, + kb_id, + "扭转减振器", + {"search_mode": "keyword", "final_top_k": 10}, + ) + + assert chunks, "纯 BM25 检索应返回非空结果" + assert all("is_precise_match" not in c.get("metadata", {}) for c in chunks), ( + "未启用 precise_match 时不应写入 is_precise_match 标记" + ) diff --git a/backend/test/unit/plugins/test_milvus_kb.py b/backend/test/unit/plugins/test_milvus_kb.py index df0ff1181..1f500ae70 100644 --- a/backend/test/unit/plugins/test_milvus_kb.py +++ b/backend/test/unit/plugins/test_milvus_kb.py @@ -19,25 +19,28 @@ class FakeHit: - def __init__(self, content: str, distance: float): + def __init__(self, content: str, distance: float, chunk_id: str = "chunk-1"): self.distance = distance self.entity = { "content": content, - "chunk_id": "chunk-1", + "chunk_id": chunk_id, "file_id": "file-1", "chunk_index": 0, } class FakeCollection: - def __init__(self, distance: float = 0.8): + def __init__(self, distance: float = 0.8, search_results: list | None = None): self.search_calls = [] self.hybrid_calls = [] self.insert_calls = [] self.distance = distance + self._search_results = list(search_results) if search_results else None def search(self, **kwargs): self.search_calls.append(kwargs) + if self._search_results: + return self._search_results.pop(0) return [[FakeHit("BM25 result", self.distance)]] def hybrid_search(self, **kwargs): @@ -420,6 +423,102 @@ async def test_keyword_mode_uses_milvus_bm25_search(): assert search_call["limit"] == 7 +async def test_keyword_mode_precise_match_uses_phrase_match_filter_and_backfill(): + """精准匹配:PHRASE_MATCH 过滤的精准命中在前,BM25 兜底在后,按 chunk_id 去重。""" + precise_results = [ + [ + FakeHit("precise-1", 0.9, chunk_id="p1"), + FakeHit("precise-2", 0.7, chunk_id="p2"), + ] + ] + backfill_results = [ + [ + FakeHit("backfill-1", 0.5, chunk_id="b1"), + FakeHit("backfill-2", 0.3, chunk_id="b2"), + ] + ] + collection = FakeCollection(search_results=[precise_results, backfill_results]) + kb = make_kb(collection) + + chunks = await kb.aquery( + "扭转减振器", + "db", + search_mode="keyword", + precise_match=True, + phrase_match_terms=["扭转减振器"], + final_top_k=10, + ) + + # 第一次 search 带 PHRASE_MATCH 过滤 + precise_call = collection.search_calls[0] + assert precise_call["anns_field"] == CONTENT_SPARSE_FIELD + assert 'PHRASE_MATCH(content, "扭转减振器", 0)' in precise_call["expr"] + + # 第二次 search 为纯 BM25 兜底(无 file_name 时 expr 为 None) + backfill_call = collection.search_calls[1] + assert backfill_call["expr"] is None + + # 合并顺序:精准在前、兜底在后,去重后共 4 条 + assert [c["metadata"]["chunk_id"] for c in chunks] == ["p1", "p2", "b1", "b2"] + assert chunks[0]["metadata"]["is_precise_match"] is True + assert chunks[1]["metadata"]["is_precise_match"] is True + assert chunks[2]["metadata"]["is_precise_match"] is False + assert chunks[3]["metadata"]["is_precise_match"] is False + + +async def test_precise_match_short_circuits_when_enough_hits(): + """精准命中已够 final_top_k 时不再触发兜底查询。""" + precise_results = [ + [ + FakeHit("precise-1", 0.9, chunk_id="p1"), + FakeHit("precise-2", 0.7, chunk_id="p2"), + ] + ] + collection = FakeCollection(search_results=[precise_results]) + kb = make_kb(collection) + + chunks = await kb.aquery( + "term", + "db", + search_mode="keyword", + precise_match=True, + phrase_match_terms=["term"], + final_top_k=1, + ) + + assert len(collection.search_calls) == 1 + assert chunks[0]["metadata"]["chunk_id"] == "p1" + assert chunks[0]["metadata"]["is_precise_match"] is True + + +async def test_precise_match_degrades_when_no_valid_terms(): + """phrase_match_terms 全为空时降级为纯 BM25,不抛错、不写 is_precise_match。""" + collection = FakeCollection() + kb = make_kb(collection) + + chunks = await kb.aquery( + "fallback query", + "db", + search_mode="keyword", + precise_match=True, + phrase_match_terms=["", " "], + final_top_k=5, + ) + + assert len(collection.search_calls) == 1 + assert collection.search_calls[0]["expr"] is None + assert "is_precise_match" not in chunks[0]["metadata"] + + +def test_build_phrase_match_expr_or_joins_and_escapes(): + kb = MilvusKB.__new__(MilvusKB) + expr = kb._build_phrase_match_expr(["扭转减振器", '含"引号', ""], 0) + # 多关键词整体加括号,避免与 file_expr 拼 and 时 or 优先级问题 + assert expr == '(PHRASE_MATCH(content, "扭转减振器", 0) or PHRASE_MATCH(content, "含\\"引号", 0))' + assert kb._build_phrase_match_expr(["", " "], 0) is None + assert kb._build_phrase_match_expr(["单关键词"], 2) == 'PHRASE_MATCH(content, "单关键词", 2)' + + async def test_vector_mode_ignores_metric_type_override(): collection = FakeCollection() kb = make_kb(collection) @@ -509,6 +608,7 @@ def test_collection_supports_bm25_requires_analyzed_content_sparse_field_and_fun max_length=65535, enable_analyzer=True, analyzer_params=CONTENT_ANALYZER_PARAMS, + enable_match=True, ), FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR), ], @@ -525,3 +625,176 @@ def test_collection_supports_bm25_requires_analyzed_content_sparse_field_and_fun collection = type("Collection", (), {"schema": schema})() assert kb._collection_supports_bm25(collection) + + +def test_collection_supports_bm25_requires_enable_match(): + """缺 enable_match 的存量集合应被判为不支持,触发自动重建。""" + kb = MilvusKB.__new__(MilvusKB) + schema = CollectionSchema( + fields=[ + FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=100, is_primary=True), + FieldSchema( + name="content", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + analyzer_params=CONTENT_ANALYZER_PARAMS, + ), + FieldSchema(name=CONTENT_SPARSE_FIELD, dtype=DataType.SPARSE_FLOAT_VECTOR), + ], + functions=[ + Function( + name="content_bm25", + input_field_names=["content"], + output_field_names=[CONTENT_SPARSE_FIELD], + function_type=FunctionType.BM25, + ) + ], + ) + + collection = type("Collection", (), {"schema": schema})() + + assert not kb._collection_supports_bm25(collection) + + +async def test_migrate_collection_for_match_reuses_embeddings(monkeypatch): + """向量迁移:旧集合 embedding 原样回灌新集合,不重算;不读 content_sparse。""" + from yuxi.knowledge.implementations import milvus as milvus_mod + + records = [ + {"id": "id-1", "content": "c1", "chunk_id": "c1", "file_id": "f1", "chunk_index": 0, "embedding": [0.1, 0.2]}, + {"id": "id-2", "content": "c2", "chunk_id": "c2", "file_id": "f1", "chunk_index": 1, "embedding": [0.3, 0.4]}, + ] + + class FakeIterator: + def __init__(self, batches): + self._batches = batches + self._i = 0 + + def next(self): + if self._i >= len(self._batches): + return [] + batch = self._batches[self._i] + self._i += 1 + return batch + + class OldCollection: + def __init__(self): + self.query_iterator_calls = [] + + def load(self): + pass + + def query_iterator(self, **kwargs): + self.query_iterator_calls.append(kwargs) + return FakeIterator([records]) + + class NewCollection: + def __init__(self): + self.insert_calls = [] + self.flushed = False + + def insert(self, entities): + self.insert_calls.append(entities) + + def flush(self): + self.flushed = True + + old = OldCollection() + new_collection = NewCollection() + kb = MilvusKB.__new__(MilvusKB) + kb.connection_alias = "default" + kb._create_new_collection = lambda name, info, kb_id: new_collection + + drop_calls = [] + monkeypatch.setattr(milvus_mod.utility, "drop_collection", lambda name, using=None: drop_calls.append(name)) + # __del__ 会对带 connection_alias 的实例调 disconnect,mock 掉避免 pymilvus deprecation 噪音 + monkeypatch.setattr(milvus_mod.connections, "disconnect", lambda *a, **k: None) + + result = await kb._migrate_collection_for_match("col", old, None, "db") + + assert result is new_collection + assert drop_calls == ["col"] + assert old.query_iterator_calls[0]["output_fields"] == [ + "id", + "content", + "chunk_id", + "file_id", + "chunk_index", + "embedding", + ] + assert "content_sparse" not in old.query_iterator_calls[0]["output_fields"] + assert len(new_collection.insert_calls) == 1 + entities = new_collection.insert_calls[0] + # 列格式:[ids, contents, chunk_ids, file_ids, chunk_indexes, embeddings] + assert entities[0] == ["id-1", "id-2"] + assert entities[2] == ["c1", "c2"] + assert entities[5] == [[0.1, 0.2], [0.3, 0.4]] + assert new_collection.flushed is True + + +async def test_keyword_mode_reranker_keeps_recall_pool(monkeypatch): + """bug #2: reranker 开启时 keyword 分支不应提前截到 final_top_k,候选池保留 recall_top_k。""" + precise_results = [ + [ + FakeHit("p1", 0.9, chunk_id="p1"), + FakeHit("p2", 0.8, chunk_id="p2"), + FakeHit("p3", 0.7, chunk_id="p3"), + ] + ] + collection = FakeCollection(search_results=[precise_results]) + kb = make_kb(collection) + + captured = {} + + class FakeReranker: + async def acompute_score(self, pairs, normalize=True): + _, docs = pairs + captured["docs"] = list(docs) + return [0.9, 0.5, 0.7][: len(docs)] + + async def aclose(self): + pass + + import yuxi.models.rerank as rerank_mod + + monkeypatch.setattr(rerank_mod, "get_reranker", lambda model: FakeReranker()) + + chunks = await kb.aquery( + "term", + "db", + search_mode="keyword", + precise_match=True, + phrase_match_terms=["term"], + final_top_k=2, + use_reranker=True, + reranker_model="fake", + recall_top_k=50, + ) + + # precise 命中 3 条短路兜底;reranker 应拿到全部 3 条而非被 final_top_k=2 截断 + assert len(captured["docs"]) == 3 + assert len(chunks) == 2 # 最终截断到 final_top_k + + +async def test_keyword_mode_precise_match_with_file_name_wraps_or(monkeypatch): + """bug #3: 多关键词 + file_name 过滤时,or 子句整体加括号,file_name 约束全部关键词。""" + precise_results = [[FakeHit("precise-1", 0.9, chunk_id="p1")]] + collection = FakeCollection(search_results=[precise_results]) + kb = make_kb(collection) + + await kb.aquery( + "kw", + "db", + search_mode="keyword", + precise_match=True, + phrase_match_terms=["扭转减振器", "减振器"], + file_name="demo.md", + final_top_k=5, + ) + + precise_call = collection.search_calls[0] + expr = precise_call["expr"] + # file_expr 在前,PHRASE_MATCH 的 or 子句整体被括号包裹 + assert expr.startswith('file_id == "file-1" and (PHRASE_MATCH(content, "扭转减振器", 0) or ') + assert expr.endswith(")") diff --git a/backend/test/unit/toolkits/test_kbs_tools.py b/backend/test/unit/toolkits/test_kbs_tools.py index 3034c3814..c27d977a0 100644 --- a/backend/test/unit/toolkits/test_kbs_tools.py +++ b/backend/test/unit/toolkits/test_kbs_tools.py @@ -4,7 +4,6 @@ from types import SimpleNamespace import pytest - from yuxi.agents.toolkits.kbs import tools @@ -24,6 +23,10 @@ def _query_kb_callable(): return _tool_callable(tools.query_kb) +def _query_keywords_callable(): + return _tool_callable(tools.query_keywords) + + def _find_kb_document_callable(): return _tool_callable(tools.find_kb_document) @@ -43,6 +46,10 @@ async def _run_query_kb(**kwargs): return await _run_tool(_query_kb_callable(), **kwargs) +async def _run_query_keywords(**kwargs): + return await _run_tool(_query_keywords_callable(), **kwargs) + + async def _run_find_kb_document(**kwargs): return await _run_tool(_find_kb_document_callable(), **kwargs) @@ -71,7 +78,7 @@ def _build_test_window(content: str, offset: int = 0, limit: int = 1800) -> dict def _patch_retrievers(monkeypatch, *, kb_type: str = "milvus", retriever=None): monkeypatch.setattr( - tools.knowledge_base, + tools._get_knowledge_base(), "get_retrievers", lambda: { "db-1": { @@ -80,6 +87,7 @@ def _patch_retrievers(monkeypatch, *, kb_type: str = "milvus", retriever=None): "metadata": {"kb_type": kb_type}, } }, + raising=False, ) @@ -204,6 +212,51 @@ async def _fake_retriever(query_text: str, **kwargs): } +@pytest.mark.asyncio +async def test_query_keywords_forwards_precise_match_kwargs(monkeypatch) -> None: + captured: dict = {} + + async def _fake_retriever(query_text: str, **kwargs): + captured["query_text"] = query_text + captured["kwargs"] = kwargs + return [ + { + "content": "precise hit", + "metadata": {"file_id": "file-1", "source": "doc.md", "is_precise_match": True}, + } + ] + + _patch_retrievers(monkeypatch, retriever=_fake_retriever) + monkeypatch.setattr(tools, "_resolve_visible_knowledge_bases_for_query", _fake_visible_kbs) + + runtime = SimpleNamespace(context=SimpleNamespace()) + result = await _run_query_keywords(kb_id="db-1", keywords=["扭转减振器", "故障"], runtime=runtime) + + # 拼接为空格分隔的查询文本,并强制 keyword + 精准匹配 + 原始关键词列表 + assert captured["query_text"] == "扭转减振器 故障" + assert captured["kwargs"] == { + "search_mode": "keyword", + "precise_match": True, + "phrase_match_terms": ["扭转减振器", "故障"], + } + assert result["kb_id"] == "db-1" + assert result["results"][0]["metadata"]["is_precise_match"] is True + + +@pytest.mark.asyncio +async def test_query_keywords_rejects_empty_or_whitespace_keywords(monkeypatch) -> None: + async def _must_not_be_called(*args, **kwargs): + raise AssertionError("retriever should not be called for empty keywords") + + _patch_retrievers(monkeypatch, retriever=_must_not_be_called) + monkeypatch.setattr(tools, "_resolve_visible_knowledge_bases_for_query", _fake_visible_kbs) + + runtime = SimpleNamespace(context=SimpleNamespace()) + + assert await _run_query_keywords(kb_id="db-1", keywords=[], runtime=runtime) == "请提供关键词列表" + assert await _run_query_keywords(kb_id="db-1", keywords=["", " "], runtime=runtime) == "请提供关键词列表" + + @pytest.mark.asyncio async def test_find_kb_document_returns_context_windows(monkeypatch) -> None: _patch_retrievers(monkeypatch) @@ -240,7 +293,7 @@ async def _fake_find_file_content( ], } - monkeypatch.setattr(tools.knowledge_base, "find_file_content", _fake_find_file_content) + monkeypatch.setattr(tools._get_knowledge_base(), "find_file_content", _fake_find_file_content, raising=False) runtime = SimpleNamespace(context=SimpleNamespace()) result = await _run_find_kb_document( @@ -295,7 +348,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim assert file_id == "file-1" return _build_test_window("\n".join(lines), offset=offset, limit=limit) - monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content) + monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False) runtime = SimpleNamespace(context=SimpleNamespace()) result = await _run_open_kb_document(kb_id="db-1", file_id="file-1", runtime=runtime) @@ -325,7 +378,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim assert file_id == "file-1" return _build_test_window("\n".join(lines), offset=offset, limit=limit) - monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content) + monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False) runtime = SimpleNamespace(context=SimpleNamespace()) result = await _run_open_kb_document( @@ -369,7 +422,7 @@ async def _fake_open_file_content(kb_id: str, file_id: str, offset: int = 0, lim del kb_id, file_id, offset, limit raise Exception("文件 file-1 没有解析后的 Markdown 内容") - monkeypatch.setattr(tools.knowledge_base, "open_file_content", _fake_open_file_content) + monkeypatch.setattr(tools._get_knowledge_base(), "open_file_content", _fake_open_file_content, raising=False) runtime = SimpleNamespace(context=SimpleNamespace()) result = await _run_open_kb_document(kb_id="db-1", file_id="file-1", runtime=runtime) diff --git a/docs/develop-guides/changelog.md b/docs/develop-guides/changelog.md index 54b68a954..e90502bd3 100644 --- a/docs/develop-guides/changelog.md +++ b/docs/develop-guides/changelog.md @@ -21,7 +21,8 @@ - 优化 Agent 上下文压缩:Yuxi 的 DeepAgents summary adapter 在生成 summary 与写入 conversation history 时,不再改写 `AIMessage.tool_calls` 或 provider tool metadata,只逐条替换被摘要掉的旧 `ToolMessage.content`;`summary_keep_messages` 保留窗口原样传给模型,不再额外清洗最近消息;完整工具输出写入 `outputs/large_tool_results`,文件名使用工具名与内容 hash 生成,上下文只保留完整路径和最多 `summary_tool_result_token_limit` tokens 的预览,未触发 summary 的常规模型调用不做额外 ToolMessage 清洗;Summary 阈值判断沿用 DeepAgents/LangChain 默认近似 token counter,并保留其 usage metadata scaling;首次写入 `conversation_history` 前读取旧文件的 sandbox 404 会按 `file_not_found` 处理,不再产生误导性 warning;`present_artifacts` 会拒绝展示 `large_tool_results` 与 `conversation_history` 等工具调用阶段文件。新增管理员可配置项 `summary_keep_messages`、`summary_prompt`、`summary_tool_result_token_limit` 与 `max_execution_steps`,分别控制摘要后保留消息数、摘要提示词、summary 阶段工具结果预览上限和 LangGraph `recursion_limit`。 - 收敛普通聊天模型加载链路:`select_model` 保留旧 `.call()` 调用契约,内部改为通过 LangChain chat model adapter 复用 Agent 侧模型加载器,统一 OpenAI-compatible、Anthropic 与 Gemini 等 provider 的运行时适配;移除旧 `OpenAIBase` wrapper,默认重试策略迁移为 LangChain provider 参数。 - 新增知识库 `query_keywords` 工具:采用「精准优先 + BM25 兜底」策略,基于 Milvus 2.6 `PHRASE_MATCH` 让包含完整关键词短语的 chunk 排在前面,精准命中不足时由 BM25 模糊命中补齐,结果以 `metadata.is_precise_match` 标记;适合精确匹配专有名词、术语和代码符号等场景,与 `query_kb` 的语义检索互为补充 -- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动检测并 drop 重建+重索引(懒触发、按 KB),图谱集合仅对新建生效 +- 升级 Milvus 至 v2.6.16(etcd v3.5.25 / minio RELEASE.2024-05-28):知识库与图谱 content 字段新增 `enable_match=True` 以支持 `PHRASE_MATCH`;存量知识库集合在 `_collection_supports_bm25` 自检时自动迁移向量到新 schema(drop 前用 `query_iterator` 读出全量 embedding 原样回灌,不重算;`content_sparse` 由新集合 BM25 Function 重新生成,迁移后 flush 保证精准匹配可见),懒触发、按 KB;图谱集合仅对新建生效 +- 修复 `query_keywords` 检索链路三处问题:多关键词 `PHRASE_MATCH` 以 `or` 拼接后与 `file_name` 过滤表达式用 `and` 连接未加括号,受 Milvus 运算符优先级影响 `file_name` 仅约束首个关键词,现对多关键词子句整体加括号;keyword 模式开启重排/图检索时候选池被提前截断至 `final_top_k` 导致 recall 退化,改为截到 `recall_top_k`、最终截断交由统一出口;存量集合自检缺 `enable_match` 时原 drop+空重建会使检索变空,改为向量迁移路径 ## v0.7.0 (2026-06-13)