import os
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any

import numpy as np
from sqlalchemy.orm import Session

from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
from app.services.fetcher_service import embedder_model

# AI-assisted generation: deepseek-v3-2, 2026-03-20
# Semantic-match threshold: a user keyword and an event tag/title only count as a
# semantic hit when their vector similarity reaches this value.
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
PREFERENCE_SEMANTIC_THRESHOLD = float(
    os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
)

# Maximum number of entries the recommendation list may return.
DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT = 50
PREFERENCE_RECOMMEND_MAX_LIMIT = int(
    os.getenv("PREFERENCE_RECOMMEND_MAX_LIMIT", str(DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT))
)


@dataclass
class MatchedEventResult:
    """An event that matched the user's interests, together with its score details."""

    # The matched event row.
    event: UnifiedEvent
    # Final fused score, rounded to two decimals by the recommender.
    match_score: float
    # Topic keywords (or "标题:"-prefixed preference words) that matched exactly.
    exact_hits: list[str]
    # One dict per semantic hit: preference_keyword / topic_keyword / similarity.
    semantic_hits: list[dict[str, Any]]
    # De-duplicated topic keywords attached to the event.
    tags: list[str]


# End of AI-generated section


def _normalize_text(text: str) -> str:
    """Strip surrounding whitespace and casefold so comparisons are stable."""
    return text.strip().casefold()


def _find_exact_preference_match(
    target_text: str,
    normalized_preferences: list[tuple[str, str]],
) -> str | None:
    """
    Decide whether the target text is an "exact hit" for one of the user's interest words.

    A hit means that, after normalization, either:
    1. the two strings are equal, or
    2. one of them contains the other.

    Args:
        target_text: Topic keyword or event title to test.
        normalized_preferences: ``(raw_preference, normalized_preference)`` pairs.

    Returns:
        The raw interest word of the first pair that hits, or ``None`` when nothing hits.
    """
    normalized_target = _normalize_text(target_text)
    if not normalized_target:
        return None

    for raw_pref, normalized_pref in normalized_preferences:
        if not normalized_pref:
            continue
        # Equality is a special case of containment, so one bidirectional
        # substring test covers both hit conditions.
        if normalized_pref in normalized_target or normalized_target in normalized_pref:
            return raw_pref
    return None


# Process-wide keyword -> embedding cache. Dict insertion order is relied on to
# evict the oldest entries first.
_EMBEDDING_CACHE: dict[str, np.ndarray] = {}
MAX_CACHE_SIZE = 10000


def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
    """
    Return a mapping from each non-empty keyword to its embedding vector.

    Vectors already in the module-level cache are reused; the remaining keywords
    are de-duplicated and encoded in a single batched ``embedder_model.encode``
    call, then stored in the cache.

    Args:
        keywords: Keywords to embed. Empty entries are skipped.

    Returns:
        Dict keyed by the original keyword with ``float32`` vectors for newly
        encoded entries (cached entries are returned as stored).
    """
    result: dict[str, np.ndarray] = {}
    if not keywords:
        return result

    uncached_keywords: list[str] = []
    for keyword in keywords:
        if not keyword:
            continue
        cached_vec = _EMBEDDING_CACHE.get(keyword)
        if cached_vec is not None:
            result[keyword] = cached_vec
        else:
            uncached_keywords.append(keyword)

    if not uncached_keywords:
        return result

    unique_uncached = list(dict.fromkeys(uncached_keywords))
    # NOTE(review): the keyword arguments look like the sentence-transformers
    # ``encode`` API with L2-normalized output - confirm against fetcher_service.
    vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False)

    # Keep the cache bounded: make room *before* inserting, sized to the incoming
    # batch. Evicting at least half of the limit at a time keeps eviction rare;
    # evicting at least the overflow guarantees a large batch cannot push the
    # cache past the limit (beyond the size of the batch itself).
    overflow = len(_EMBEDDING_CACHE) + len(unique_uncached) - MAX_CACHE_SIZE
    if overflow > 0:
        evict_count = max(overflow, MAX_CACHE_SIZE // 2)
        for stale_key in list(_EMBEDDING_CACHE)[:evict_count]:
            del _EMBEDDING_CACHE[stale_key]

    for keyword, vec in zip(unique_uncached, vectors):
        vec_array = np.asarray(vec, dtype=np.float32)
        _EMBEDDING_CACHE[keyword] = vec_array
        result[keyword] = vec_array

    return result


def _find_best_semantic_match(
    target_text: str,
    target_vec_map: dict[str, np.ndarray],
    pref_vec_map: dict[str, np.ndarray],
) -> tuple[str | None, float]:
    """
    Return the interest word closest to the target text and their similarity.

    Similarity is the dot product of the two vectors, which equals cosine
    similarity when both vectors are normalized.

    Args:
        target_text: Key to look up in ``target_vec_map``.
        target_vec_map: Text -> vector map for topics or titles.
        pref_vec_map: Preference keyword -> vector map.

    Returns:
        ``(best_preference, similarity)``; ``(None, -1.0)`` when the target has no
        vector or no preference scores above ``-1.0``.
    """
    target_vec = target_vec_map.get(target_text)
    if target_vec is None:
        return None, -1.0

    best_pref = None
    best_sim = -1.0
    for pref_keyword, pref_vec in pref_vec_map.items():
        sim = float(np.dot(target_vec, pref_vec))
        # Strict ">" keeps the first preference on ties.
        if sim > best_sim:
            best_sim = sim
            best_pref = pref_keyword
    return best_pref, best_sim


def _ensure_aware(dt: datetime) -> datetime:
    """
    Attach UTC to a naive datetime; return aware datetimes unchanged.

    Datetimes read back from SQLite carry no timezone info, so they must be made
    aware before they can be subtracted from ``utcnow()``.
    """
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt


def _calc_freshness_bonus(event: UnifiedEvent) -> float:
    """
    Give a small bonus based on event age so old hot topics do not hold slots forever.

    Returns 12.0 for events up to 6 hours old, 8.0 up to 24 hours, 4.0 up to
    72 hours, and 0.0 otherwise. Negative ages are clamped to zero.
    """
    age_hours = max((utcnow() - _ensure_aware(event.created_at)).total_seconds() / 3600.0, 0.0)
    if age_hours <= 6:
        return 12.0
    if age_hours <= 24:
        return 8.0
    if age_hours <= 72:
        return 4.0
    return 0.0


def recommend_events_for_user(
    db: Session,
    *,
    user_id: int,
    min_hot: int = 3,
    hours: int = 72,
    limit: int = 20,
    semantic_threshold: float | None = None,
) -> list[MatchedEventResult]:
    """
    Main flow for interest-based event recommendation.

    1) Exact match: user words vs. event tags / title.
    2) Semantic match: user word vectors vs. event tag / title vectors (above threshold).
    3) Score fusion: match score + tag relevance + hotness + freshness.

    Args:
        db: SQLAlchemy session used for all queries.
        user_id: Owner of the ``UserTopicPreference`` rows to match against.
        min_hot: Minimum ``hot_score`` an event needs to be considered.
        hours: Only events created within this many hours are considered.
        limit: Requested result count; clamped to ``[1, PREFERENCE_RECOMMEND_MAX_LIMIT]``.
        semantic_threshold: Overrides ``PREFERENCE_SEMANTIC_THRESHOLD`` when given.

    Returns:
        Matched events sorted by ``(match_score, hot_score, created_at)`` descending,
        truncated to the clamped limit. Empty when the user has no usable
        preferences or no event qualifies.
    """
    final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
    similarity_threshold = (
        semantic_threshold if semantic_threshold is not None else PREFERENCE_SEMANTIC_THRESHOLD
    )

    preferences = (
        db.query(UserTopicPreference)
        .filter(UserTopicPreference.user_id == user_id)
        .all()
    )
    if not preferences:
        return []

    # Strip each keyword once; tolerate a NULL keyword instead of raising
    # AttributeError on ``None.strip()``.
    stripped_keywords = ((pref.interested_keyword or "").strip() for pref in preferences)
    preference_keywords = [keyword for keyword in stripped_keywords if keyword]
    if not preference_keywords:
        return []

    time_limit = utcnow() - timedelta(hours=hours)
    events = (
        db.query(UnifiedEvent)
        .filter(
            UnifiedEvent.hot_score >= min_hot,
            UnifiedEvent.created_at >= time_limit,
        )
        .order_by(UnifiedEvent.hot_score.desc(), UnifiedEvent.created_at.desc())
        .all()
    )
    if not events:
        return []

    event_id_list = [event.id for event in events]
    topic_rows = (
        db.query(
            ExtractedTopic.target_id,
            ExtractedTopic.topic_keyword,
            ExtractedTopic.relevance_score,
        )
        .filter(
            ExtractedTopic.target_type == TargetType.EVENT,
            ExtractedTopic.target_id.in_(event_id_list),
        )
        .all()
    )

    # Group (keyword, relevance) pairs by event id, skipping empty keywords.
    event_topics: dict[int, list[tuple[str, float | None]]] = {}
    for event_id, topic_keyword, relevance_score in topic_rows:
        if not topic_keyword:
            continue
        event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))

    unique_preference_keywords = list(dict.fromkeys(preference_keywords))
    unique_topic_keywords = list(dict.fromkeys(row[1] for row in topic_rows if row[1]))

    pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
    topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)

    # Normalize each preference once and drop any that normalize to empty.
    normalized_candidates = ((word, _normalize_text(word)) for word in unique_preference_keywords)
    normalized_preference_pairs = [
        (word, normalized) for word, normalized in normalized_candidates if normalized
    ]

    unique_event_titles = list(
        dict.fromkeys(
            event.unified_title.strip()
            for event in events
            if event.unified_title and event.unified_title.strip()
        )
    )
    title_vec_map = _build_keyword_embedding_map(unique_event_titles)

    scored_results: list[MatchedEventResult] = []
    for event in events:
        topic_list = event_topics.get(event.id, [])
        exact_hits: list[str] = []
        semantic_hits: list[dict[str, Any]] = []
        score = 0.0

        for topic_keyword, topic_relevance in topic_list:
            # Missing relevance falls back to a neutral 50.
            topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0

            matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
            if matched_pref is not None:
                exact_hits.append(topic_keyword)
                score += 45.0 + topic_relevance_score * 0.2
                # An exact hit suppresses the semantic check for the same topic.
                continue

            best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)
            if best_pref is not None and best_sim >= similarity_threshold:
                semantic_hits.append(
                    {
                        "preference_keyword": best_pref,
                        "topic_keyword": topic_keyword,
                        "similarity": round(best_sim, 4),
                    }
                )
                score += best_sim * 35.0 + topic_relevance_score * 0.12

        event_title = (event.unified_title or "").strip()
        if event_title:
            title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
            if title_exact_pref is not None:
                exact_hits.append(f"标题:{title_exact_pref}")
                score += 30.0
            else:
                best_pref, best_sim = _find_best_semantic_match(event_title, title_vec_map, pref_vec_map)
                if best_pref is not None and best_sim >= similarity_threshold:
                    semantic_hits.append(
                        {
                            "preference_keyword": best_pref,
                            "topic_keyword": f"标题:{best_pref}",
                            "similarity": round(best_sim, 4),
                        }
                    )
                    score += best_sim * 24.0

        # Events with no hit of either kind are not recommended at all.
        if not exact_hits and not semantic_hits:
            continue

        # Hotness contribution is capped at 100 before weighting.
        score += min(event.hot_score, 100) * 0.3
        score += _calc_freshness_bonus(event)

        tags = list(dict.fromkeys(keyword for keyword, _ in topic_list))
        scored_results.append(
            MatchedEventResult(
                event=event,
                match_score=round(score, 2),
                exact_hits=list(dict.fromkeys(exact_hits)),
                semantic_hits=semantic_hits,
                tags=tags,
            )
        )

    scored_results.sort(
        key=lambda item: (item.match_score, item.event.hot_score, item.event.created_at),
        reverse=True,
    )
    return scored_results[:final_limit]