diff --git a/backend/app/services/fetcher_service.py b/backend/app/services/fetcher_service.py index b41a15e..85ebc69 100644 --- a/backend/app/services/fetcher_service.py +++ b/backend/app/services/fetcher_service.py @@ -35,7 +35,7 @@ def generate_md5(text: str) -> str: def generate_embedding_json(text: str) -> str: """辅助函数:调用大模型生成向量,并序列化为 JSON 字符串""" - raw_vec = embedder_model.encode([text], normalize_embeddings=True)[0] + raw_vec = embedder_model.encode([text], normalize_embeddings=True, show_progress_bar=False)[0] truncated_vec = [round(float(x), 5) for x in raw_vec] return json.dumps(truncated_vec, separators=(',', ':')) diff --git a/backend/app/services/matching_service.py b/backend/app/services/matching_service.py index f7dacc4..d7a3832 100644 --- a/backend/app/services/matching_service.py +++ b/backend/app/services/matching_service.py @@ -37,18 +37,48 @@ def _normalize_text(text: str) -> str: return text.strip().casefold() +_EMBEDDING_CACHE: dict[str, np.ndarray] = {} +MAX_CACHE_SIZE = 10000 + def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]: """ - 批量生成关键词向量,并返回原词到向量的映射。 - 这里要求向量已归一化,后续可直接用点积表示余弦相似度。 + 批量生成或从缓存获取关键词向量,并返回原词到向量的映射。 + 结合了批量推理(Batching)的极速优势和内存缓存的 O(1) 读取优势。 """ - if not keywords: - return {} - - vectors = embedder_model.encode(keywords, normalize_embeddings=True) result: dict[str, np.ndarray] = {} - for keyword, vec in zip(keywords, vectors): - result[keyword] = np.asarray(vec, dtype=np.float32) + if not keywords: + return result + + uncached_keywords = [] + + # 1. 尝试从缓存获取 + for keyword in keywords: + if not keyword: + continue + if keyword in _EMBEDDING_CACHE: + result[keyword] = _EMBEDDING_CACHE[keyword] + else: + uncached_keywords.append(keyword) + + # 2. 对未命中的词进行统一的批量推理 + if uncached_keywords: + # 去重,避免同一个未缓存的词被计算多次 + unique_uncached = list(dict.fromkeys(uncached_keywords)) + + vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False) + + # 防止缓存无限增长:超过阈值时清空最早存入的一半(简单粗暴的内存控制) + if len(_EMBEDDING_CACHE) > MAX_CACHE_SIZE: + keys_to_delete = list(_EMBEDDING_CACHE.keys())[: MAX_CACHE_SIZE // 2] + for k in keys_to_delete: + del _EMBEDDING_CACHE[k] + + # 3. 将新计算的向量存入缓存并回填结果 + for keyword, vec in zip(unique_uncached, vectors): + vec_array = np.asarray(vec, dtype=np.float32) + _EMBEDDING_CACHE[keyword] = vec_array + result[keyword] = vec_array + return result diff --git a/backend/app/services/summary_service.py b/backend/app/services/summary_service.py index ebadd81..136b81e 100644 --- a/backend/app/services/summary_service.py +++ b/backend/app/services/summary_service.py @@ -108,7 +108,7 @@ def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dic return [] keywords = [item["keyword"] for item in topic_candidates] - vectors = embedder_model.encode(keywords, normalize_embeddings=True) + vectors = embedder_model.encode(keywords, normalize_embeddings=True, show_progress_bar=False) clusters: list[dict[str, Any]] = [] for item, vector in zip(topic_candidates, vectors): diff --git a/frontend/index.html b/frontend/index.html index 1f5209f..ad5abb2 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -2,7 +2,7 @@
- +