|
| 1 | +""" |
| 2 | +热门问题:按数据源聚合,并在同一数据源内做语义相近合并(非纯字面 group_by)。 |
| 3 | +
|
| 4 | +1. 意图桶:库表/数据概览类中文问法合并为同一主题(见 META_OVERVIEW_PATTERN)。 |
| 5 | +2. 向量聚类:对其余问句用本地中文 embedding 做余弦相似度合并(可选,失败则回退)。 |
| 6 | +3. 回退:归一化 + difflib 合并相近字面。 |
| 7 | +""" |
| 8 | + |
| 9 | +from __future__ import annotations |
| 10 | + |
| 11 | +import re |
| 12 | +from difflib import SequenceMatcher |
| 13 | +from typing import Any, Dict, List, Tuple |
| 14 | + |
| 15 | +import numpy as np |
| 16 | + |
# "Meta-information" questions -- how many tables there are, how large each
# table is, what data is available, and similar (taken from user examples) --
# are all treated as one topic. The match is case-insensitive so that the
# ASCII keyword "schema" is recognised in any casing.
META_OVERVIEW_PATTERN = re.compile(
    r"(几张表|哪些表|多少张表|有多少表|表.*数据量|数据量.*表|分别.*数据量|数据量.*多大|"
    r"哪些数据|有什么数据|有哪些数据|什么数据|库表|schema|多少条数据|统计.*表|表的.*数量)",
    re.IGNORECASE,
)
| 23 | + |
| 24 | + |
def normalize_question(s: str) -> str:
    """Return a canonical form of a question for literal-similarity matching.

    Two transformations are applied:

    1. Every whitespace character is removed (``\\s`` is Unicode-aware for
       ``str`` patterns, so the ideographic space U+3000 is included).
    2. Any run of sentence punctuation at the very end is dropped. Both the
       ASCII forms (``. ! ? ; ,``) and their CJK / full-width counterparts
       (U+3002, U+FF0E, U+FF01, U+FF1F, U+FF1B, U+FF0C, U+3001) are handled,
       so a question ending in a full-width question mark normalises to the
       same text as the same question without it.

    Punctuation that is not at the end of the string is left untouched.

    Args:
        s: Raw question text. Falsy values (``""`` or ``None``) are accepted.

    Returns:
        The normalised text, or ``""`` for falsy input.
    """
    if not s:
        return ""
    compact = re.sub(r"\s+", "", s)
    # Escapes are spelled out so the full-width characters survive any
    # editor/tooling that folds them to their ASCII look-alikes.
    return re.sub(
        r"[\u3002\uff0e.\uff01!\uff1f?\uff1b;\uff0c,\u3001]+$",
        "",
        compact,
    )
| 32 | + |
| 33 | + |
| 34 | +def _split_meta_overview( |
| 35 | + weighted: List[Tuple[str, int]], |
| 36 | +) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]: |
| 37 | + meta: List[Tuple[str, int]] = [] |
| 38 | + rest: List[Tuple[str, int]] = [] |
| 39 | + for q, c in weighted: |
| 40 | + if META_OVERVIEW_PATTERN.search(q): |
| 41 | + meta.append((q, c)) |
| 42 | + else: |
| 43 | + rest.append((q, c)) |
| 44 | + out: List[Tuple[str, int]] = [] |
| 45 | + if meta: |
| 46 | + rep = max(meta, key=lambda x: x[1])[0] |
| 47 | + total = sum(c for _, c in meta) |
| 48 | + out.append((rep, total)) |
| 49 | + return out, rest |
| 50 | + |
| 51 | + |
| 52 | +def _merge_difflib(weighted: List[Tuple[str, int]], threshold: float = 0.78) -> List[Tuple[str, int]]: |
| 53 | + if not weighted: |
| 54 | + return [] |
| 55 | + items = sorted(weighted, key=lambda x: -x[1]) |
| 56 | + clusters: List[Dict[str, Any]] = [] |
| 57 | + for q, c in items: |
| 58 | + nq = normalize_question(q) |
| 59 | + best_i = -1 |
| 60 | + best_r = 0.0 |
| 61 | + for i, cl in enumerate(clusters): |
| 62 | + r = SequenceMatcher(None, nq, cl["norm"]).ratio() |
| 63 | + if r >= threshold and r > best_r: |
| 64 | + best_r = r |
| 65 | + best_i = i |
| 66 | + if best_i >= 0: |
| 67 | + clusters[best_i]["count"] += c |
| 68 | + if c > clusters[best_i].get("max_w", 0): |
| 69 | + clusters[best_i]["rep"] = q |
| 70 | + clusters[best_i]["max_w"] = c |
| 71 | + else: |
| 72 | + clusters.append({"rep": q, "count": c, "norm": nq, "max_w": c}) |
| 73 | + return [(c["rep"], int(c["count"])) for c in clusters] |
| 74 | + |
| 75 | + |
| 76 | +def _merge_embedding(weighted: List[Tuple[str, int]], threshold: float = 0.76) -> List[Tuple[str, int]]: |
| 77 | + if len(weighted) <= 1: |
| 78 | + return weighted |
| 79 | + try: |
| 80 | + from apps.ai_model.embedding import EmbeddingModelCache |
| 81 | + |
| 82 | + texts = [w[0] for w in weighted] |
| 83 | + model = EmbeddingModelCache.get_model() |
| 84 | + embs = model.embed_documents(texts) |
| 85 | + arr = np.array(embs, dtype=np.float32) |
| 86 | + norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-9 |
| 87 | + arr = arr / norms |
| 88 | + n = len(weighted) |
| 89 | + parent = list(range(n)) |
| 90 | + |
| 91 | + def find(a: int) -> int: |
| 92 | + while parent[a] != a: |
| 93 | + parent[a] = parent[parent[a]] |
| 94 | + a = parent[a] |
| 95 | + return a |
| 96 | + |
| 97 | + def union(a: int, b: int) -> None: |
| 98 | + ra, rb = find(a), find(b) |
| 99 | + if ra != rb: |
| 100 | + parent[rb] = ra |
| 101 | + |
| 102 | + sim = arr @ arr.T |
| 103 | + for i in range(n): |
| 104 | + for j in range(i + 1, n): |
| 105 | + if float(sim[i, j]) >= threshold: |
| 106 | + union(i, j) |
| 107 | + groups: Dict[int, List[int]] = {} |
| 108 | + for i in range(n): |
| 109 | + r = find(i) |
| 110 | + groups.setdefault(r, []).append(i) |
| 111 | + out: List[Tuple[str, int]] = [] |
| 112 | + for idxs in groups.values(): |
| 113 | + total = sum(weighted[i][1] for i in idxs) |
| 114 | + rep_q = max((weighted[i] for i in idxs), key=lambda x: x[1])[0] |
| 115 | + out.append((rep_q, int(total))) |
| 116 | + return out |
| 117 | + except Exception: |
| 118 | + return _merge_difflib(weighted, threshold=0.78) |
| 119 | + |
| 120 | + |
def cluster_questions_for_datasource(weighted: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    """Merge the ``(question, count)`` pairs of one datasource into topics.

    Questions matching the data-overview pattern are collapsed into a single
    entry by ``_split_meta_overview``; the remaining questions are clustered
    by ``_merge_embedding``.

    Args:
        weighted: ``(original question text, count)`` pairs for one datasource.

    Returns:
        ``(representative question, total count)`` pairs: the overview entry
        first (if any), followed by the clusters of the remaining questions.
    """
    if not weighted:
        return []

    overview, remaining = _split_meta_overview(weighted)
    if remaining:
        return overview + _merge_embedding(remaining)
    return overview
0 commit comments