big update

2026-06-06 00:57:51 +08:00 · 2026-03-11 20:52:58 +08:00
parent 8ed819a580
commit 966bcfbba4
44 changed files with 7124 additions and 650 deletions
@@ -0,0 +1,454 @@
+# 定时推送调度服务
+# 由 APScheduler 每分钟调用，检查当前时刻是否有用户需要接收推送，
+# 如匹配则生成摘要邮件并发送，同时写入 DeliveryHistory 防重复。
+import logging
+from logging.handlers import TimedRotatingFileHandler
+from dataclasses import dataclass, field
+from datetime import datetime, time as dt_time, timedelta, timezone, tzinfo
+from pathlib import Path
+from typing import Any
+from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+
+from sqlalchemy.orm import Session
+
+from app.database import SessionLocal
+from app.models.models import (
+    AppUser,
+    DeliveryHistory,
+    ExtractedTopic,
+    InfoSource,
+    TargetType,
+    TaskStatus,
+    TrendingEvent,
+    UnifiedEvent,
+    UserDeliverySchedule,
+    UserPushEndpoint,
+    UserTopicPreference,
+    utcnow,
+)
+from app.prompts.digest_email_template import build_digest_html
+from app.services.matching_service import recommend_events_for_user
+from app.utils.email_utils import send_html_email
+
+logger = logging.getLogger("delivery_service")
+
+# delivery_service 日志单独写文件，不再输出到控制台
+_delivery_log_dir = Path(__file__).resolve().parents[2] / "logs"
+_delivery_log_dir.mkdir(parents=True, exist_ok=True)
+_delivery_log_file = _delivery_log_dir / "delivery_check.log"
+if not logger.handlers:
+    _file_handler = TimedRotatingFileHandler(
+        filename=str(_delivery_log_file),
+        when="midnight",
+        interval=1,
+        backupCount=14,
+        encoding="utf-8",
+    )
+    _file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
+    logger.addHandler(_file_handler)
+logger.setLevel(logging.INFO)
+logger.propagate = False
+
+# 推送时间窗口：实际执行时刻与设定时间的最大容差（分钟）
+DELIVERY_WINDOW_MINUTES = 2
+# 同一用户两次推送之间的最小间隔（分钟）
+MIN_PUSH_INTERVAL_MINUTES = 30
+# 单次推送最多携带的事件数
+MAX_EVENTS_PER_PUSH = 12
+# 默认模式热度阈值（无关键词或无匹配时使用）
+DEFAULT_MODE_HOT_THRESHOLD = 3
+# 默认模式查询时间窗口（小时）
+DEFAULT_MODE_HOURS = 48
+# 用户时区无效时的兜底时区
+DEFAULT_FALLBACK_TIMEZONE = "Asia/Shanghai"
+
+
+# ==========================================
+# 默认热点事件容器（无关键词时使用）
+# ==========================================
+@dataclass
+class _DefaultEventItem:
+    """
+    无关键词订阅或关键词无匹配时的默认热点包装器，
+    接口与 MatchedEventResult 保持一致，方便统一传给模板。
+    """
+    event: UnifiedEvent
+    match_score: float = 0.0
+    exact_hits: list[str] = field(default_factory=list)
+    semantic_hits: list[dict[str, Any]] = field(default_factory=list)
+    tags: list[str] = field(default_factory=list)
+    is_default: bool = True
+
+
+# ==========================================
+# 时区工具
+# ==========================================
+def _time_to_minutes(t: dt_time) -> int:
+    return t.hour * 60 + t.minute
+
+
+def _is_within_window(schedule_time: dt_time, current_time: dt_time, window: int = DELIVERY_WINDOW_MINUTES) -> bool:
+    """判断 schedule_time 是否在 current_time ± window 分钟范围内（跨午夜安全）。"""
+    s = _time_to_minutes(schedule_time)
+    c = _time_to_minutes(current_time)
+    diff = abs(s - c)
+    return min(diff, 1440 - diff) <= window
+
+
+def _resolve_user_timezone(user_timezone: str | None) -> tzinfo:
+    """解析用户时区，异常时回退到默认时区。"""
+    tz_name = (user_timezone or "").strip() or DEFAULT_FALLBACK_TIMEZONE
+    try:
+        return ZoneInfo(tz_name)
+    except ZoneInfoNotFoundError:
+        logger.warning(
+            "用户时区无效，已回退默认时区。timezone=%s fallback=%s",
+            tz_name, DEFAULT_FALLBACK_TIMEZONE,
+        )
+        try:
+            return ZoneInfo(DEFAULT_FALLBACK_TIMEZONE)
+        except ZoneInfoNotFoundError:
+            logger.warning("系统缺少时区数据库，最终回退为 UTC。建议安装 tzdata 包。")
+            return timezone.utc
+
+
+def _user_local_time(now_utc: datetime, user_timezone: str | None) -> dt_time:
+    """把 UTC 当前时刻转换为用户本地时间（仅取 HH:MM）。"""
+    local_dt = now_utc.astimezone(_resolve_user_timezone(user_timezone))
+    return local_dt.time().replace(second=0, microsecond=0)
+
+
+def _ensure_aware(dt: datetime) -> datetime:
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+
+
+# ==========================================
+# 数据库查询辅助
+# ==========================================
+def _should_skip_by_interval(db: Session, user_id: int) -> bool:
+    """检查用户是否仍在 30 分钟冷却期内。"""
+    row = (
+        db.query(DeliveryHistory.created_at)
+        .filter(
+            DeliveryHistory.user_id == user_id,
+            DeliveryHistory.status == TaskStatus.SUCCESS,
+        )
+        .order_by(DeliveryHistory.created_at.desc())
+        .first()
+    )
+    if row is None:
+        return False
+    last_time = _ensure_aware(row[0])
+    elapsed = (utcnow() - last_time).total_seconds() / 60.0
+    return elapsed < MIN_PUSH_INTERVAL_MINUTES
+
+
+def _get_user_email_endpoints(db: Session, user_id: int) -> list[UserPushEndpoint]:
+    """获取用户已启用的邮件类型推送渠道，按优先级排序。"""
+    return (
+        db.query(UserPushEndpoint)
+        .filter(
+            UserPushEndpoint.user_id == user_id,
+            UserPushEndpoint.channel_type == "EMAIL",
+            UserPushEndpoint.is_active == True,
+        )
+        .order_by(UserPushEndpoint.priority_level.asc())
+        .all()
+    )
+
+
+def _get_already_pushed_event_ids(db: Session, user_id: int) -> set[int]:
+    """获取已经推送过的事件 ID 集合，避免重复轰炸。"""
+    rows = (
+        db.query(DeliveryHistory.target_id)
+        .filter(
+            DeliveryHistory.user_id == user_id,
+            DeliveryHistory.target_type == TargetType.EVENT,
+            DeliveryHistory.status == TaskStatus.SUCCESS,
+        )
+        .all()
+    )
+    return {r[0] for r in rows}
+
+
+def _load_event_platforms(db: Session, event_ids: list[int]) -> dict[int, list[dict]]:
+    """
+    批量加载事件的平台来源数据。
+    返回：event_id → [{source_name, headline, url, ranking, icon_url}, ...]
+    按排名升序排列（rank 1 最靠前）。
+    """
+    if not event_ids:
+        return {}
+
+    rows = (
+        db.query(
+            TrendingEvent.unified_event_id,
+            TrendingEvent.current_headline,
+            TrendingEvent.event_url,
+            TrendingEvent.current_ranking,
+            TrendingEvent.icon_url,
+            InfoSource.source_name,
+        )
+        .join(InfoSource, TrendingEvent.source_id == InfoSource.id)
+        .filter(TrendingEvent.unified_event_id.in_(event_ids))
+        .order_by(
+            TrendingEvent.unified_event_id,
+            TrendingEvent.current_ranking.asc().nulls_last(),
+        )
+        .all()
+    )
+
+    result: dict[int, list[dict]] = {}
+    for event_id, headline, url, ranking, icon_url, source_name in rows:
+        result.setdefault(event_id, []).append({
+            "source_name": source_name or "未知",
+            "headline": headline or "",
+            "url": url or "",
+            "ranking": ranking,
+            "icon_url": icon_url or "",
+        })
+    return result
+
+
+def _load_event_tags(db: Session, event_ids: list[int]) -> dict[int, list[str]]:
+    """批量加载事件的标签，返回 event_id → [tag, ...]。"""
+    if not event_ids:
+        return {}
+    rows = (
+        db.query(ExtractedTopic.target_id, ExtractedTopic.topic_keyword)
+        .filter(
+            ExtractedTopic.target_type == TargetType.EVENT,
+            ExtractedTopic.target_id.in_(event_ids),
+        )
+        .all()
+    )
+    tags_map: dict[int, list[str]] = {}
+    for eid, kw in rows:
+        if kw:
+            tags_map.setdefault(eid, []).append(kw)
+    return tags_map
+
+
+def _user_has_keywords(db: Session, user_id: int) -> bool:
+    """判断用户是否配置了关键词订阅。"""
+    return (
+        db.query(UserTopicPreference.id)
+        .filter(UserTopicPreference.user_id == user_id)
+        .first()
+    ) is not None
+
+
+def _get_default_hot_events(
+    db: Session,
+    pushed_ids: set[int],
+) -> list[_DefaultEventItem]:
+    """
+    默认模式：获取热度 >= DEFAULT_MODE_HOT_THRESHOLD 的近期热点，
+    排除已推送过的，封装成与 MatchedEventResult 接口相同的对象。
+    """
+    time_limit = utcnow() - timedelta(hours=DEFAULT_MODE_HOURS)
+    events = (
+        db.query(UnifiedEvent)
+        .filter(
+            UnifiedEvent.hot_score >= DEFAULT_MODE_HOT_THRESHOLD,
+            UnifiedEvent.created_at >= time_limit,
+        )
+        .order_by(UnifiedEvent.hot_score.desc())
+        .limit(MAX_EVENTS_PER_PUSH * 2)
+        .all()
+    )
+
+    event_ids = [e.id for e in events if e.id not in pushed_ids]
+    tags_map = _load_event_tags(db, event_ids)
+
+    result: list[_DefaultEventItem] = []
+    for ev in events:
+        if ev.id in pushed_ids:
+            continue
+        result.append(_DefaultEventItem(
+            event=ev,
+            tags=list(dict.fromkeys(tags_map.get(ev.id, [])))[:6],
+        ))
+        if len(result) >= MAX_EVENTS_PER_PUSH:
+            break
+
+    return result
+
+
+def _record_delivery(
+    db: Session,
+    user_id: int,
+    event_ids: list[int],
+    status: TaskStatus,
+) -> None:
+    """批量写入推送历史记录。"""
+    for eid in event_ids:
+        record = DeliveryHistory(
+            user_id=user_id,
+            target_type=TargetType.EVENT,
+            target_id=eid,
+            status=status,
+        )
+        db.add(record)
+    db.commit()
+
+
+# ==========================================
+# 推送准备
+# ==========================================
+@dataclass
+class _PendingPush:
+    """暂存需要发送邮件的信息，便于在 async 上下文中发送。"""
+    user_id: int
+    email_targets: list[str]
+    subject: str
+    html_body: str
+    event_ids: list[int]
+
+
+def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedule) -> _PendingPush | None:
+    """
+    同步准备单个用户的推送数据（DB 操作），不实际发送邮件。
+    推送优先级：
+    1. 有关键词 且 有匹配 → 发送匹配事件
+    2. 有关键词 但 无匹配 → 发送默认热点（热度 >= 3）
+    3. 无关键词           → 发送默认热点（热度 >= 3）
+    """
+    user_id = user.id
+
+    if _should_skip_by_interval(db, user_id):
+        logger.info(f"用户 {user_id} 仍在 {MIN_PUSH_INTERVAL_MINUTES} 分钟冷却期内，跳过")
+        return None
+
+    email_endpoints = _get_user_email_endpoints(db, user_id)
+    if not email_endpoints:
+        logger.info(f"用户 {user_id} 无可用邮件渠道，跳过")
+        return None
+
+    pushed_ids = _get_already_pushed_event_ids(db, user_id)
+
+    # ——— 决策：匹配模式 or 默认模式 ———
+    items: list = []
+    is_default = False
+
+    has_keywords = _user_has_keywords(db, user_id)
+    if has_keywords:
+        matched = recommend_events_for_user(
+            db,
+            user_id=user_id,
+            min_hot=1,
+            hours=72,
+            limit=MAX_EVENTS_PER_PUSH * 2,
+        )
+        fresh_matched = [m for m in matched if m.event.id not in pushed_ids]
+        if fresh_matched:
+            items = fresh_matched[:MAX_EVENTS_PER_PUSH]
+            logger.info(f"用户 {user_id} 关键词匹配，推送 {len(items)} 条事件")
+        else:
+            logger.info(f"用户 {user_id} 关键词无匹配结果，切换为默认热点模式")
+            is_default = True
+    else:
+        logger.info(f"用户 {user_id} 未配置关键词，使用默认热点模式")
+        is_default = True
+
+    if is_default:
+        items = _get_default_hot_events(db, pushed_ids)
+        if not items:
+            logger.info(f"用户 {user_id} 默认热点无可推送内容，跳过")
+            return None
+
+    # 批量加载平台数据（来源名、标题、URL、排名）
+    event_ids = [item.event.id for item in items]
+    platforms_map = _load_event_platforms(db, event_ids)
+
+    time_str = schedule.delivery_time.strftime("%H:%M")
+    html_body = build_digest_html(
+        items=items,
+        delivery_time_str=time_str,
+        platforms_map=platforms_map,
+        is_default_push=is_default,
+    )
+
+    subject_suffix = "全网热点快报" if is_default else "个性化简报"
+    return _PendingPush(
+        user_id=user_id,
+        email_targets=[ep.channel_account for ep in email_endpoints],
+        subject=f"InsightRadar {subject_suffix} · {time_str}",
+        html_body=html_body,
+        event_ids=event_ids,
+    )
+
+
+# ==========================================
+# 调度主入口
+# ==========================================
+async def check_and_deliver() -> None:
+    """
+    定时推送主入口，由 APScheduler 每分钟调用。
+    流程：
+    1. 获取当前 UTC 时间
+    2. 查询所有启用的推送计划
+    3. 对每个计划，按用户本地时区判断是否在推送窗口
+    4. 同步准备推送数据 → 异步发送邮件 → 记录结果
+    """
+    now = datetime.now(timezone.utc)
+    current_utc = now.time().replace(second=0, microsecond=0)
+    logger.debug(f"推送调度检查 @ UTC {current_utc.strftime('%H:%M')}")
+
+    db: Session = SessionLocal()
+    try:
+        active_schedules = (
+            db.query(UserDeliverySchedule)
+            .filter(UserDeliverySchedule.is_active == True)
+            .all()
+        )
+
+        for schedule in active_schedules:
+            user = db.query(AppUser).filter(AppUser.id == schedule.user_id).first()
+            if not user:
+                continue
+
+            # 用户本地时间对比（核心时区修正）
+            user_current = _user_local_time(now, user.timezone)
+            if not _is_within_window(schedule.delivery_time, user_current):
+                continue
+
+            try:
+                pending = _prepare_user_push(db, user, schedule)
+                if pending is None:
+                    continue
+
+                # 异步按优先级尝试各邮件渠道
+                sent = False
+                for target_email in pending.email_targets:
+                    try:
+                        success = await send_html_email(
+                            to_email=target_email,
+                            subject=pending.subject,
+                            html_content=pending.html_body,
+                        )
+                        if success:
+                            sent = True
+                            logger.info(f"用户 {pending.user_id} 邮件发送成功 → {target_email}")
+                            break
+                        else:
+                            logger.warning(f"用户 {pending.user_id} 渠道 {target_email} 发送失败，尝试下一个")
+                    except Exception as e:
+                        logger.error(f"用户 {pending.user_id} 发送至 {target_email} 异常: {e}")
+
+                _record_delivery(
+                    db,
+                    user_id=pending.user_id,
+                    event_ids=pending.event_ids,
+                    status=TaskStatus.SUCCESS if sent else TaskStatus.ERROR,
+                )
+
+            except Exception as e:
+                logger.error(f"推送用户 {schedule.user_id} 时异常: {e}", exc_info=True)
+
+    except Exception as e:
+        logger.error(f"推送调度主循环异常: {e}", exc_info=True)
+    finally:
+        db.close()
@@ -0,0 +1,238 @@
+import os
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import numpy as np
+from sqlalchemy.orm import Session
+
+from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
+from app.services.fetcher_service import embedder_model
+
+
+# 语义匹配阈值：用户关键词和事件标签向量相似度达到该值才计入语义命中
+DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
+PREFERENCE_SEMANTIC_THRESHOLD = float(
+    os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
+)
+# 推荐列表最大返回条数
+DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT = 50
+PREFERENCE_RECOMMEND_MAX_LIMIT = int(
+    os.getenv("PREFERENCE_RECOMMEND_MAX_LIMIT", str(DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT))
+)
+
+
+@dataclass
+class MatchedEventResult:
+    """用户兴趣匹配后的事件结果。"""
+    event: UnifiedEvent
+    match_score: float
+    exact_hits: list[str]
+    semantic_hits: list[dict[str, Any]]
+    tags: list[str]
+
+
+def _normalize_text(text: str) -> str:
+    """统一小写与首尾空白，便于做稳定匹配。"""
+    return text.strip().casefold()
+
+
+def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
+    """
+    批量生成关键词向量，并返回原词到向量的映射。
+    这里要求向量已归一化，后续可直接用点积表示余弦相似度。
+    """
+    if not keywords:
+        return {}
+
+    vectors = embedder_model.encode(keywords, normalize_embeddings=True)
+    result: dict[str, np.ndarray] = {}
+    for keyword, vec in zip(keywords, vectors):
+        result[keyword] = np.asarray(vec, dtype=np.float32)
+    return result
+
+
+def _ensure_aware(dt: datetime) -> datetime:
+    """SQLite 读出的 datetime 不带时区信息，统一补上 UTC 后才能和 utcnow() 做减法。"""
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+
+
+def _calc_freshness_bonus(event: UnifiedEvent) -> float:
+    """根据事件新鲜度给一个小额加分，避免旧热点长期占据推荐位。"""
+    age_hours = max((utcnow() - _ensure_aware(event.created_at)).total_seconds() / 3600.0, 0.0)
+    if age_hours <= 6:
+        return 12.0
+    if age_hours <= 24:
+        return 8.0
+    if age_hours <= 72:
+        return 4.0
+    return 0.0
+
+
+def recommend_events_for_user(
+    db: Session,
+    *,
+    user_id: int,
+    min_hot: int = 3,
+    hours: int = 72,
+    limit: int = 20,
+    semantic_threshold: float | None = None,
+) -> list[MatchedEventResult]:
+    """
+    用户兴趣推荐主流程：
+    1) 精确匹配：用户词 == EVENT 标签
+    2) 语义匹配：用户词向量 vs EVENT 标签向量（超过阈值）
+    3) 打分融合：匹配分 + 标签相关度 + 热度 + 新鲜度
+    """
+    final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
+    similarity_threshold = (
+        semantic_threshold
+        if semantic_threshold is not None
+        else PREFERENCE_SEMANTIC_THRESHOLD
+    )
+
+    # 读取用户兴趣词
+    preferences = (
+        db.query(UserTopicPreference)
+        .filter(UserTopicPreference.user_id == user_id)
+        .all()
+    )
+    if not preferences:
+        return []
+
+    preference_keywords = [pref.interested_keyword.strip() for pref in preferences if pref.interested_keyword.strip()]
+    if not preference_keywords:
+        return []
+
+    # 读取候选事件（先做时间和热度过滤，避免全表扫描）
+    time_limit = utcnow() - timedelta(hours=hours)
+    events = (
+        db.query(UnifiedEvent)
+        .filter(
+            UnifiedEvent.hot_score >= min_hot,
+            UnifiedEvent.created_at >= time_limit,
+        )
+        .order_by(UnifiedEvent.hot_score.desc(), UnifiedEvent.created_at.desc())
+        .all()
+    )
+    if not events:
+        return []
+
+    event_id_list = [event.id for event in events]
+    topic_rows = (
+        db.query(
+            ExtractedTopic.target_id,
+            ExtractedTopic.topic_keyword,
+            ExtractedTopic.relevance_score,
+        )
+        .filter(
+            ExtractedTopic.target_type == TargetType.EVENT,
+            ExtractedTopic.target_id.in_(event_id_list),
+        )
+        .all()
+    )
+    if not topic_rows:
+        return []
+
+    # 组织事件标签映射：event_id -> [(tag, relevance_score), ...]
+    event_topics: dict[int, list[tuple[str, float | None]]] = {}
+    for event_id, topic_keyword, relevance_score in topic_rows:
+        if not topic_keyword:
+            continue
+        event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
+
+    # 如果某事件没有标签，就不参与推荐
+    if not event_topics:
+        return []
+
+    # 批量编码用户词和标签词，避免逐条调用模型
+    unique_preference_keywords = list(dict.fromkeys(preference_keywords))
+    unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
+    pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
+    topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
+
+    # 预先建立“标准化后用户词集合”，用于精确匹配
+    normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
+
+    scored_results: list[MatchedEventResult] = []
+    for event in events:
+        topic_list = event_topics.get(event.id, [])
+        if not topic_list:
+            continue
+
+        exact_hits: list[str] = []
+        semantic_hits: list[dict[str, Any]] = []
+        score = 0.0
+
+        # 对事件标签逐个匹配用户兴趣
+        for topic_keyword, topic_relevance in topic_list:
+            normalized_topic = _normalize_text(topic_keyword)
+            topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
+
+            # 1) 精确命中（包括完全相等与包含关系）
+            matched_exact = False
+            if normalized_topic in normalized_pref_set:
+                matched_exact = True
+            else:
+                for pref_word in normalized_pref_set:
+                    if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
+                        matched_exact = True
+                        break
+
+            if matched_exact:
+                exact_hits.append(topic_keyword)
+                # 精确命中给较高基础分，标签自身相关度作为增益
+                score += 45.0 + topic_relevance_score * 0.2
+                continue
+
+            # 2) 语义命中（未精确命中时再算）
+            topic_vec = topic_vec_map.get(topic_keyword)
+            if topic_vec is None:
+                continue
+
+            best_pref = None
+            best_sim = -1.0
+            for pref_keyword, pref_vec in pref_vec_map.items():
+                sim = float(np.dot(topic_vec, pref_vec))
+                if sim > best_sim:
+                    best_sim = sim
+                    best_pref = pref_keyword
+
+            if best_pref is not None and best_sim >= similarity_threshold:
+                semantic_hits.append(
+                    {
+                        "preference_keyword": best_pref,
+                        "topic_keyword": topic_keyword,
+                        "similarity": round(best_sim, 4),
+                    }
+                )
+                # 语义命中分略低于精确命中，并由相似度放大
+                score += best_sim * 35.0 + topic_relevance_score * 0.12
+
+        # 如果精确和语义都没命中，直接跳过
+        if not exact_hits and not semantic_hits:
+            continue
+
+        # 融合事件热度和新鲜度，避免只看语义分
+        score += min(event.hot_score, 100) * 0.3
+        score += _calc_freshness_bonus(event)
+
+        # 返回标签时做去重，保证接口稳定
+        tags = list(dict.fromkeys([item[0] for item in topic_list]))
+        scored_results.append(
+            MatchedEventResult(
+                event=event,
+                match_score=round(score, 2),
+                exact_hits=list(dict.fromkeys(exact_hits)),
+                semantic_hits=semantic_hits,
+                tags=tags,
+            )
+        )
+
+    scored_results.sort(
+        key=lambda item: (item.match_score, item.event.hot_score, item.event.created_at),
+        reverse=True,
+    )
+    return scored_results[:final_limit]
@@ -1,104 +1,241 @@
 # app/services/summary_service.py
-import os
 import json
+import os
 from datetime import timedelta
+from typing import Any
+
+import numpy as np
 from openai import AsyncOpenAI

 from app.database import SessionLocal
-from app.models.models import UnifiedEvent, TrendingEvent, InfoSource, utcnow
+from app.models.models import (
+    ExtractedTopic,
+    InfoSource,
+    TargetType,
+    TrendingEvent,
+    UnifiedEvent,
+    utcnow,
+)
 from app.prompts.summary_prompts import (
    SUMMARY_SYSTEM_PROMPT,
    SUMMARY_USER_PROMPT_TEMPLATE,
 )
+from app.services.fetcher_service import embedder_model

 HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
-AI_API_KEY = os.getenv("AI_API_KEY", '')
+TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
+TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
+TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
+AI_API_KEY = os.getenv("AI_API_KEY", "")
+

-# 1. 初始化异步客户端 (全局复用)
 deepseek_client = AsyncOpenAI(
    api_key=AI_API_KEY,
-    base_url="https://api.deepseek.com"
+    base_url="https://api.deepseek.com",
 )


 async def call_llm_for_summary(platform_data_text: str) -> dict:
-    """调用 DeepSeek 生成统一标题和多平台视角摘要"""
-    prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
-        platform_data_text=platform_data_text
-    )
+    """Call LLM for unified title, summary and topic candidates."""
+    prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)

-    # await
    response = await deepseek_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
-            {"role": "user", "content": prompt}
+            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
-        temperature=1
+        temperature=1,
    )

    result_text = response.choices[0].message.content
    return json.loads(result_text)


+def _normalize_score(raw_score: Any) -> float | None:
+    try:
+        score = float(raw_score)
+    except (TypeError, ValueError):
+        return None
+
+    if score <= 1:
+        score *= 100
+
+    return max(0.0, min(100.0, score))
+
+
+def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
+    """Parse topic keywords from LLM response; support list[str] and list[object]."""
+    raw_topics = llm_result.get("topic_keywords") or []
+    parsed: list[dict[str, Any]] = []
+    seen: set[str] = set()
+
+    for item in raw_topics:
+        keyword = ""
+        score = None
+
+        if isinstance(item, str):
+            keyword = item.strip()
+        elif isinstance(item, dict):
+            raw_keyword = (
+                item.get("keyword")
+                or item.get("topic_keyword")
+                or item.get("name")
+                or item.get("topic")
+                or ""
+            )
+            keyword = str(raw_keyword).strip()
+            score = _normalize_score(item.get("relevance_score") or item.get("score"))
+
+        if not keyword:
+            continue
+
+        keyword = keyword[:100]
+        normalized_key = keyword.casefold()
+        if normalized_key in seen:
+            continue
+
+        seen.add(normalized_key)
+        parsed.append({"keyword": keyword, "score": score})
+
+    return parsed
+
+
+def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Deduplicate semantically similar tags using embedding similarity."""
+    if not topic_candidates:
+        return []
+
+    keywords = [item["keyword"] for item in topic_candidates]
+    vectors = embedder_model.encode(keywords, normalize_embeddings=True)
+
+    clusters: list[dict[str, Any]] = []
+    for item, vector in zip(topic_candidates, vectors):
+        vec = np.asarray(vector, dtype=np.float32)
+
+        best_idx = -1
+        best_sim = -1.0
+        for idx, cluster in enumerate(clusters):
+            sim = float(np.dot(vec, cluster["vector"]))
+            if sim > best_sim:
+                best_sim = sim
+                best_idx = idx
+
+        if best_idx >= 0 and best_sim >= TOPIC_SIMILARITY_THRESHOLD:
+            cluster = clusters[best_idx]
+            merged = cluster["vector"] * cluster["count"] + vec
+            norm = float(np.linalg.norm(merged))
+            if norm > 0:
+                cluster["vector"] = merged / norm
+
+            cluster["count"] += 1
+            if item["score"] is not None and (
+                cluster["score"] is None or item["score"] > cluster["score"]
+            ):
+                cluster["score"] = item["score"]
+
+            # Prefer shorter tag as canonical keyword.
+            if len(item["keyword"]) < len(cluster["keyword"]):
+                cluster["keyword"] = item["keyword"]
+        else:
+            clusters.append(
+                {
+                    "keyword": item["keyword"],
+                    "score": item["score"],
+                    "vector": vec,
+                    "count": 1,
+                }
+            )
+
+    if any(cluster["score"] is not None for cluster in clusters):
+        clusters.sort(key=lambda x: x["score"] if x["score"] is not None else -1.0, reverse=True)
+
+    result = [
+        {"keyword": cluster["keyword"], "score": cluster["score"]}
+        for cluster in clusters[:TOPIC_TAG_MAX_COUNT]
+    ]
+    return result
+
+
+def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
+    """Replace EVENT tags for one unified event atomically within current transaction."""
+    db.query(ExtractedTopic).filter(
+        ExtractedTopic.target_type == TargetType.EVENT,
+        ExtractedTopic.target_id == event_id,
+    ).delete(synchronize_session=False)
+
+    for item in normalized_topics:
+        db.add(
+            ExtractedTopic(
+                target_type=TargetType.EVENT,
+                target_id=event_id,
+                topic_keyword=item["keyword"],
+                relevance_score=item["score"],
+            )
+        )
+
+
 async def generate_unified_summaries():
-    """定时任务：扫描高热度事件并生成/更新摘要"""
-    print(f"[{utcnow()}] 开始执行 DeepSeek 摘要生成任务...")
+    """Scheduled task: refresh summaries and topic tags for hot unified events."""
+    print(f"[{utcnow()}] Start unified summary generation task...")

    with SessionLocal() as db:
        recent_threshold = utcnow() - timedelta(days=3)

-        # 必须满足：热度达标 AND (当前热度 > 上次生成摘要时的热度) AND 近期活跃
        events = db.query(UnifiedEvent).filter(
            UnifiedEvent.hot_score >= HOT_SCORE_THRESHOLD,
            UnifiedEvent.hot_score > UnifiedEvent.last_summarized_trends_count,
-            UnifiedEvent.created_at >= recent_threshold
+            UnifiedEvent.created_at >= recent_threshold,
        ).all()

        if not events:
-            print("当前没有需要更新摘要的大事件，任务结束。")
+            print("No events require summary update in this round.")
            return

        for event in events:
-            # 联合查询获取该事件在各平台的子新闻
-            trends = db.query(TrendingEvent, InfoSource.source_name) \
-                .join(InfoSource, TrendingEvent.source_id == InfoSource.id) \
-                .filter(TrendingEvent.unified_event_id == event.id) \
+            trends = (
+                db.query(TrendingEvent, InfoSource.source_name)
+                .join(InfoSource, TrendingEvent.source_id == InfoSource.id)
+                .filter(TrendingEvent.unified_event_id == event.id)
                .all()
+            )

            if not trends:
                continue

-            # 按平台归类标题并去重
-            platform_dict = {}
+            platform_dict: dict[str, set[str]] = {}
            for trend_record, source_name in trends:
-                if source_name not in platform_dict:
-                    platform_dict[source_name] = set()
-                platform_dict[source_name].add(trend_record.current_headline)
+                platform_dict.setdefault(source_name, set()).add(trend_record.current_headline)

-            # 组装给大模型的 Prompt 数据
-            prompt_lines = [f"【{platform}】: {', '.join(headlines)}" for platform, headlines in platform_dict.items()]
+            prompt_lines = [
+                f"[{platform}] {', '.join(sorted(headlines))}"
+                for platform, headlines in platform_dict.items()
+            ]
            platform_data_text = "\n".join(prompt_lines)

            try:
-                # 调用封装好的异步函数
                llm_result = await call_llm_for_summary(platform_data_text)

-                if "unified_title" in llm_result:
+                if "unified_title" in llm_result and llm_result["unified_title"]:
                    event.unified_title = llm_result["unified_title"]
-                if "ai_comprehensive_summary" in llm_result:
+                if "ai_comprehensive_summary" in llm_result and llm_result["ai_comprehensive_summary"]:
                    event.ai_comprehensive_summary = llm_result["ai_comprehensive_summary"]

-                # 成功后更新水位线
-                # 将最后一次总结时的热搜数量，更新为当前最新的 hot_score
+                if event.hot_score >= TOPIC_TAG_MIN_HOT_SCORE:
+                    topic_candidates = parse_topic_keywords(llm_result)
+                    normalized_topics = normalize_topic_keywords(topic_candidates)
+                    if normalized_topics:
+                        replace_event_topics(db, event.id, normalized_topics)
+
                event.last_summarized_trends_count = event.hot_score
+                print(
+                    f"Updated event {event.id} summary"
+                    f" (hot_score={event.hot_score})."
+                )

-                print(f"成功更新大事件 ID {event.id} 的深度摘要 (当前热度: {event.hot_score})。")
-
-            except Exception as e:
-                print(f"大事件 ID {event.id} 摘要生成失败: {e}")
+            except Exception as exc:
+                print(f"Event {event.id} summary generation failed: {exc}")
                continue

-        # 提交事务
        db.commit()