From e28b893a12a122cc2e60fd5b184bd4f11b33c8d5 Mon Sep 17 00:00:00 2001 From: stardrophere <1925008984@qq.com> Date: Thu, 12 Mar 2026 01:50:08 +0800 Subject: [PATCH] optimize --- backend/app/services/fetcher_service.py | 2 +- backend/app/services/matching_service.py | 46 +++++++++++++--- backend/app/services/summary_service.py | 2 +- frontend/index.html | 2 +- frontend/public/favicon.svg | 69 ++++++++++++++++++++++++ frontend/src/views/DashboardView.vue | 3 ++ frontend/src/views/RevisionsView.vue | 13 +++-- 7 files changed, 123 insertions(+), 14 deletions(-) create mode 100644 frontend/public/favicon.svg diff --git a/backend/app/services/fetcher_service.py b/backend/app/services/fetcher_service.py index b41a15e..85ebc69 100644 --- a/backend/app/services/fetcher_service.py +++ b/backend/app/services/fetcher_service.py @@ -35,7 +35,7 @@ def generate_md5(text: str) -> str: def generate_embedding_json(text: str) -> str: """辅助函数:调用大模型生成向量,并序列化为 JSON 字符串""" - raw_vec = embedder_model.encode([text], normalize_embeddings=True)[0] + raw_vec = embedder_model.encode([text], normalize_embeddings=True, show_progress_bar=False)[0] truncated_vec = [round(float(x), 5) for x in raw_vec] return json.dumps(truncated_vec, separators=(',', ':')) diff --git a/backend/app/services/matching_service.py b/backend/app/services/matching_service.py index f7dacc4..d7a3832 100644 --- a/backend/app/services/matching_service.py +++ b/backend/app/services/matching_service.py @@ -37,18 +37,48 @@ def _normalize_text(text: str) -> str: return text.strip().casefold() +_EMBEDDING_CACHE: dict[str, np.ndarray] = {} +MAX_CACHE_SIZE = 10000 + def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]: """ - 批量生成关键词向量,并返回原词到向量的映射。 - 这里要求向量已归一化,后续可直接用点积表示余弦相似度。 + 批量生成或从缓存获取关键词向量,并返回原词到向量的映射。 + 结合了批量推理(Batching)的极速优势和内存缓存的 O(1) 读取优势。 """ - if not keywords: - return {} - - vectors = embedder_model.encode(keywords, normalize_embeddings=True) result: dict[str, np.ndarray] = {} - for keyword, vec in zip(keywords, vectors): - result[keyword] = np.asarray(vec, dtype=np.float32) + if not keywords: + return result + + uncached_keywords = [] + + # 1. 尝试从缓存获取 + for keyword in keywords: + if not keyword: + continue + if keyword in _EMBEDDING_CACHE: + result[keyword] = _EMBEDDING_CACHE[keyword] + else: + uncached_keywords.append(keyword) + + # 2. 对未命中的词进行统一的批量推理 + if uncached_keywords: + # 去重,避免同一个未缓存的词被计算多次 + unique_uncached = list(dict.fromkeys(uncached_keywords)) + + vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False) + + # 防止缓存无限增长:超过阈值时清空最早存入的一半(简单粗暴的内存控制) + if len(_EMBEDDING_CACHE) > MAX_CACHE_SIZE: + keys_to_delete = list(_EMBEDDING_CACHE.keys())[: MAX_CACHE_SIZE // 2] + for k in keys_to_delete: + del _EMBEDDING_CACHE[k] + + # 3. 将新计算的向量存入缓存并回填结果 + for keyword, vec in zip(unique_uncached, vectors): + vec_array = np.asarray(vec, dtype=np.float32) + _EMBEDDING_CACHE[keyword] = vec_array + result[keyword] = vec_array + return result diff --git a/backend/app/services/summary_service.py b/backend/app/services/summary_service.py index ebadd81..136b81e 100644 --- a/backend/app/services/summary_service.py +++ b/backend/app/services/summary_service.py @@ -108,7 +108,7 @@ def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dic return [] keywords = [item["keyword"] for item in topic_candidates] - vectors = embedder_model.encode(keywords, normalize_embeddings=True) + vectors = embedder_model.encode(keywords, normalize_embeddings=True, show_progress_bar=False) clusters: list[dict[str, Any]] = [] for item, vector in zip(topic_candidates, vectors): diff --git a/frontend/index.html b/frontend/index.html index 1f5209f..ad5abb2 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -2,7 +2,7 @@ - + InsightRadar - 全网热点监控中枢 diff --git a/frontend/public/favicon.svg b/frontend/public/favicon.svg new file mode 100644 index 0000000..0fda3ec --- /dev/null +++ b/frontend/public/favicon.svg @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/views/DashboardView.vue b/frontend/src/views/DashboardView.vue index bc315d1..a86a0cc 100644 --- a/frontend/src/views/DashboardView.vue +++ b/frontend/src/views/DashboardView.vue @@ -135,6 +135,9 @@ function getHotLevel(score: number): { label: string; color: string; bg: string } function formatRelativeTime(dateStr: string): string { + if (!dateStr.endsWith('Z') && !dateStr.includes('+')) { + dateStr += 'Z' // 补偿 SQLite 丢失的 UTC 时区标识 + } const now = Date.now() const target = new Date(dateStr).getTime() const diff = now - target diff --git a/frontend/src/views/RevisionsView.vue b/frontend/src/views/RevisionsView.vue index f9b3c32..aa0c498 100644 --- a/frontend/src/views/RevisionsView.vue +++ b/frontend/src/views/RevisionsView.vue @@ -47,8 +47,15 @@ function getPlatformIcon(name: string): string { } /** 格式化时间 */ +function safeParseTime(dateStr: string): number { + if (!dateStr.endsWith('Z') && !dateStr.includes('+')) { + dateStr += 'Z' + } + return new Date(dateStr).getTime() +} + function formatTime(dateStr: string): string { - const d = new Date(dateStr) + const d = new Date(safeParseTime(dateStr)) const now = Date.now() const diff = now - d.getTime() const minutes = Math.floor(diff / 60000) @@ -75,7 +82,7 @@ const revisionChains = computed(() => { const chains: RevisionChain[] = [] for (const [event_id, items] of groups) { // 组内按时间升序 - items.sort((a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime()) + items.sort((a, b) => safeParseTime(a.created_at) - safeParseTime(b.created_at)) // 拼接标题链,避免重复(相邻记录的 revised 与下一条 previous 通常相同) const titles: string[] = [items[0].previous_headline] @@ -102,7 +109,7 @@ const revisionChains = computed(() => { } // 最终按最新修改时间降序 - chains.sort((a, b) => new Date(b.last_at).getTime() - new Date(a.last_at).getTime()) + chains.sort((a, b) => safeParseTime(b.last_at) - safeParseTime(a.last_at)) return chains })