mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:00:05 +08:00
backend 去ai化
This commit is contained in:
@@ -1,7 +1,3 @@
|
||||
"""
|
||||
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
|
||||
打分融合:标签/标题匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
@@ -13,6 +9,7 @@ from sqlalchemy.orm import Session
|
||||
from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
|
||||
from app.services.fetcher_service import embedder_model
|
||||
|
||||
# AI辅助生成:deepseek-v3-2,2026年3月20日
|
||||
|
||||
# 语义匹配阈值:用户关键词和事件标签/标题向量相似度达到该值才计入语义命中
|
||||
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
|
||||
@@ -35,6 +32,7 @@ class MatchedEventResult:
|
||||
semantic_hits: list[dict[str, Any]]
|
||||
tags: list[str]
|
||||
|
||||
# AI生成结束
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
"""统一小写与首尾空白,便于做稳定匹配。"""
|
||||
@@ -80,7 +78,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
||||
|
||||
uncached_keywords = []
|
||||
|
||||
# 1. 尝试从缓存获取
|
||||
for keyword in keywords:
|
||||
if not keyword:
|
||||
continue
|
||||
@@ -89,9 +86,7 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
||||
else:
|
||||
uncached_keywords.append(keyword)
|
||||
|
||||
# 2. 对未命中的词进行统一的批量推理
|
||||
if uncached_keywords:
|
||||
# 去重,避免同一个未缓存的词被计算多次
|
||||
unique_uncached = list(dict.fromkeys(uncached_keywords))
|
||||
|
||||
vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False)
|
||||
@@ -102,7 +97,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
||||
for k in keys_to_delete:
|
||||
del _EMBEDDING_CACHE[k]
|
||||
|
||||
# 3. 将新计算的向量存入缓存并回填结果
|
||||
for keyword, vec in zip(unique_uncached, vectors):
|
||||
vec_array = np.asarray(vec, dtype=np.float32)
|
||||
_EMBEDDING_CACHE[keyword] = vec_array
|
||||
@@ -172,7 +166,6 @@ def recommend_events_for_user(
|
||||
else PREFERENCE_SEMANTIC_THRESHOLD
|
||||
)
|
||||
|
||||
# 1. 读取用户兴趣词
|
||||
preferences = (
|
||||
db.query(UserTopicPreference)
|
||||
.filter(UserTopicPreference.user_id == user_id)
|
||||
@@ -185,7 +178,6 @@ def recommend_events_for_user(
|
||||
if not preference_keywords:
|
||||
return []
|
||||
|
||||
# 2. 读取候选事件(时间 + 热度过滤,避免全表扫描)
|
||||
time_limit = utcnow() - timedelta(hours=hours)
|
||||
events = (
|
||||
db.query(UnifiedEvent)
|
||||
@@ -213,20 +205,17 @@ def recommend_events_for_user(
|
||||
.all()
|
||||
)
|
||||
|
||||
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
|
||||
event_topics: dict[int, list[tuple[str, float | None]]] = {}
|
||||
for event_id, topic_keyword, relevance_score in topic_rows:
|
||||
if not topic_keyword:
|
||||
continue
|
||||
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
|
||||
|
||||
# 3. 批量编码用户词与标签词,减少模型调用次数
|
||||
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
||||
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
||||
pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
|
||||
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
|
||||
|
||||
# 预先建立“标准化后用户词集合”,用于精确匹配
|
||||
normalized_preference_pairs = [
|
||||
(word, _normalize_text(word))
|
||||
for word in unique_preference_keywords
|
||||
@@ -246,20 +235,15 @@ def recommend_events_for_user(
|
||||
exact_hits: list[str] = []
|
||||
semantic_hits: list[dict[str, Any]] = []
|
||||
score = 0.0
|
||||
|
||||
# 对每个事件标签做精确匹配或语义匹配
|
||||
for topic_keyword, topic_relevance in topic_list:
|
||||
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
||||
|
||||
# 1) 精确命中(包括完全相等与包含关系)
|
||||
|
||||
matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
|
||||
if matched_pref is not None:
|
||||
exact_hits.append(topic_keyword)
|
||||
# 精确命中给较高基础分,标签自身相关度作为增益
|
||||
score += 45.0 + topic_relevance_score * 0.2
|
||||
continue
|
||||
|
||||
# 2) 语义命中(未精确命中时再算)
|
||||
best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)
|
||||
|
||||
if best_pref is not None and best_sim >= similarity_threshold:
|
||||
@@ -270,10 +254,8 @@ def recommend_events_for_user(
|
||||
"similarity": round(best_sim, 4),
|
||||
}
|
||||
)
|
||||
# 语义命中分略低于精确命中,并由相似度放大
|
||||
score += best_sim * 35.0 + topic_relevance_score * 0.12
|
||||
|
||||
# 标题也参与匹配,但权重低于结构化标签,避免长标题过度主导排序。
|
||||
event_title = (event.unified_title or "").strip()
|
||||
if event_title:
|
||||
title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
|
||||
@@ -292,15 +274,12 @@ def recommend_events_for_user(
|
||||
)
|
||||
score += best_sim * 24.0
|
||||
|
||||
# 如果精确和语义都没命中,直接跳过
|
||||
if not exact_hits and not semantic_hits:
|
||||
continue
|
||||
|
||||
# 融合事件热度和新鲜度,避免只看语义分
|
||||
score += min(event.hot_score, 100) * 0.3
|
||||
score += _calc_freshness_bonus(event)
|
||||
|
||||
# 返回标签时做去重,保证接口稳定
|
||||
tags = list(dict.fromkeys([item[0] for item in topic_list]))
|
||||
scored_results.append(
|
||||
MatchedEventResult(
|
||||
|
||||
Reference in New Issue
Block a user