big update

2026-06-06 00:00:05 +08:00 · 2026-03-11 20:52:58 +08:00
parent 8ed819a580
commit 966bcfbba4
44 changed files with 7124 additions and 650 deletions
@@ -0,0 +1,238 @@
+import os
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import numpy as np
+from sqlalchemy.orm import Session
+
+from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
+from app.services.fetcher_service import embedder_model
+
+
+# 语义匹配阈值：用户关键词和事件标签向量相似度达到该值才计入语义命中
+DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
+PREFERENCE_SEMANTIC_THRESHOLD = float(
+    os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
+)
+# 推荐列表最大返回条数
+DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT = 50
+PREFERENCE_RECOMMEND_MAX_LIMIT = int(
+    os.getenv("PREFERENCE_RECOMMEND_MAX_LIMIT", str(DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT))
+)
+
+
+@dataclass
+class MatchedEventResult:
+    """One event that matched a user's interest keywords, plus its match details."""
+    # The matched event row.
+    event: UnifiedEvent
+    # Combined ranking score (tag matches + capped hot score + freshness bonus),
+    # rounded to two decimals by recommend_events_for_user.
+    match_score: float
+    # De-duplicated event tags that hit a user keyword by text comparison.
+    exact_hits: list[str]
+    # One dict per semantic hit, with the keys "preference_keyword",
+    # "topic_keyword" and "similarity".
+    semantic_hits: list[dict[str, Any]]
+    # De-duplicated tags of the event, in the order they were read.
+    tags: list[str]
+
+
+def _normalize_text(text: str) -> str:
+    """统一小写与首尾空白，便于做稳定匹配。"""
+    return text.strip().casefold()
+
+
+def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
+    """
+    批量生成关键词向量，并返回原词到向量的映射。
+    这里要求向量已归一化，后续可直接用点积表示余弦相似度。
+    """
+    if not keywords:
+        return {}
+
+    vectors = embedder_model.encode(keywords, normalize_embeddings=True)
+    result: dict[str, np.ndarray] = {}
+    for keyword, vec in zip(keywords, vectors):
+        result[keyword] = np.asarray(vec, dtype=np.float32)
+    return result
+
+
+def _ensure_aware(dt: datetime) -> datetime:
+    """SQLite 读出的 datetime 不带时区信息，统一补上 UTC 后才能和 utcnow() 做减法。"""
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+
+
+def _calc_freshness_bonus(event: UnifiedEvent) -> float:
+    """根据事件新鲜度给一个小额加分，避免旧热点长期占据推荐位。"""
+    age_hours = max((utcnow() - _ensure_aware(event.created_at)).total_seconds() / 3600.0, 0.0)
+    if age_hours <= 6:
+        return 12.0
+    if age_hours <= 24:
+        return 8.0
+    if age_hours <= 72:
+        return 4.0
+    return 0.0
+
+
+def recommend_events_for_user(
+    db: Session,
+    *,
+    user_id: int,
+    min_hot: int = 3,
+    hours: int = 72,
+    limit: int = 20,
+    semantic_threshold: float | None = None,
+) -> list[MatchedEventResult]:
+    """
+    用户兴趣推荐主流程：
+    1) 精确匹配：用户词 == EVENT 标签
+    2) 语义匹配：用户词向量 vs EVENT 标签向量（超过阈值）
+    3) 打分融合：匹配分 + 标签相关度 + 热度 + 新鲜度
+    """
+    final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
+    similarity_threshold = (
+        semantic_threshold
+        if semantic_threshold is not None
+        else PREFERENCE_SEMANTIC_THRESHOLD
+    )
+
+    # 读取用户兴趣词
+    preferences = (
+        db.query(UserTopicPreference)
+        .filter(UserTopicPreference.user_id == user_id)
+        .all()
+    )
+    if not preferences:
+        return []
+
+    preference_keywords = [pref.interested_keyword.strip() for pref in preferences if pref.interested_keyword.strip()]
+    if not preference_keywords:
+        return []
+
+    # 读取候选事件（先做时间和热度过滤，避免全表扫描）
+    time_limit = utcnow() - timedelta(hours=hours)
+    events = (
+        db.query(UnifiedEvent)
+        .filter(
+            UnifiedEvent.hot_score >= min_hot,
+            UnifiedEvent.created_at >= time_limit,
+        )
+        .order_by(UnifiedEvent.hot_score.desc(), UnifiedEvent.created_at.desc())
+        .all()
+    )
+    if not events:
+        return []
+
+    event_id_list = [event.id for event in events]
+    topic_rows = (
+        db.query(
+            ExtractedTopic.target_id,
+            ExtractedTopic.topic_keyword,
+            ExtractedTopic.relevance_score,
+        )
+        .filter(
+            ExtractedTopic.target_type == TargetType.EVENT,
+            ExtractedTopic.target_id.in_(event_id_list),
+        )
+        .all()
+    )
+    if not topic_rows:
+        return []
+
+    # 组织事件标签映射：event_id -> [(tag, relevance_score), ...]
+    event_topics: dict[int, list[tuple[str, float | None]]] = {}
+    for event_id, topic_keyword, relevance_score in topic_rows:
+        if not topic_keyword:
+            continue
+        event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
+
+    # 如果某事件没有标签，就不参与推荐
+    if not event_topics:
+        return []
+
+    # 批量编码用户词和标签词，避免逐条调用模型
+    unique_preference_keywords = list(dict.fromkeys(preference_keywords))
+    unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
+    pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
+    topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
+
+    # 预先建立“标准化后用户词集合”，用于精确匹配
+    normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
+
+    scored_results: list[MatchedEventResult] = []
+    for event in events:
+        topic_list = event_topics.get(event.id, [])
+        if not topic_list:
+            continue
+
+        exact_hits: list[str] = []
+        semantic_hits: list[dict[str, Any]] = []
+        score = 0.0
+
+        # 对事件标签逐个匹配用户兴趣
+        for topic_keyword, topic_relevance in topic_list:
+            normalized_topic = _normalize_text(topic_keyword)
+            topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
+
+            # 1) 精确命中（包括完全相等与包含关系）
+            matched_exact = False
+            if normalized_topic in normalized_pref_set:
+                matched_exact = True
+            else:
+                for pref_word in normalized_pref_set:
+                    if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
+                        matched_exact = True
+                        break
+
+            if matched_exact:
+                exact_hits.append(topic_keyword)
+                # 精确命中给较高基础分，标签自身相关度作为增益
+                score += 45.0 + topic_relevance_score * 0.2
+                continue
+
+            # 2) 语义命中（未精确命中时再算）
+            topic_vec = topic_vec_map.get(topic_keyword)
+            if topic_vec is None:
+                continue
+
+            best_pref = None
+            best_sim = -1.0
+            for pref_keyword, pref_vec in pref_vec_map.items():
+                sim = float(np.dot(topic_vec, pref_vec))
+                if sim > best_sim:
+                    best_sim = sim
+                    best_pref = pref_keyword
+
+            if best_pref is not None and best_sim >= similarity_threshold:
+                semantic_hits.append(
+                    {
+                        "preference_keyword": best_pref,
+                        "topic_keyword": topic_keyword,
+                        "similarity": round(best_sim, 4),
+                    }
+                )
+                # 语义命中分略低于精确命中，并由相似度放大
+                score += best_sim * 35.0 + topic_relevance_score * 0.12
+
+        # 如果精确和语义都没命中，直接跳过
+        if not exact_hits and not semantic_hits:
+            continue
+
+        # 融合事件热度和新鲜度，避免只看语义分
+        score += min(event.hot_score, 100) * 0.3
+        score += _calc_freshness_bonus(event)
+
+        # 返回标签时做去重，保证接口稳定
+        tags = list(dict.fromkeys([item[0] for item in topic_list]))
+        scored_results.append(
+            MatchedEventResult(
+                event=event,
+                match_score=round(score, 2),
+                exact_hits=list(dict.fromkeys(exact_hits)),
+                semantic_hits=semantic_hits,
+                tags=tags,
+            )
+        )
+
+    scored_results.sort(
+        key=lambda item: (item.match_score, item.event.hot_score, item.event.created_at),
+        reverse=True,
+    )
+    return scored_results[:final_limit]