backend 去ai化

2026-06-06 00:57:51 +08:00 · 2026-04-20 15:53:02 +08:00
parent 7a34fc0079
commit bba6de25ac
28 changed files with 161 additions and 228 deletions
@@ -1,7 +1,3 @@
-# 定时推送调度服务
-# 由 APScheduler 每分钟调用，检查当前时刻是否有用户需要接收推送，
-# 如匹配则生成摘要邮件并发送，同时写入 DeliveryHistory 防重复。
-# 推送优先级：有关键词且匹配 → 个性化简报；无关键词或无匹配 → 默认热点快报
 import logging
 import os
 from logging.handlers import TimedRotatingFileHandler
@@ -34,7 +30,7 @@ from app.utils.email_utils import send_html_email

 logger = logging.getLogger("delivery_service")

-# delivery_service 日志单独写文件
+
 _delivery_log_dir = Path(__file__).resolve().parents[2] / "logs"
 _delivery_log_dir.mkdir(parents=True, exist_ok=True)
 _delivery_log_file = _delivery_log_dir / "delivery_check.log"
@@ -51,6 +47,8 @@ if not logger.handlers:
 logger.setLevel(logging.INFO)
 logger.propagate = False

+# AI辅助生成：deepseek-v3-2，2026年3月20日
+
 # 推送时间窗口：实际执行时刻与设定时间的最大容差（分钟）
 DELIVERY_WINDOW_MINUTES = int(os.getenv("DELIVERY_WINDOW_MINUTES", 2))
 # 同一用户两次推送之间的最小间隔（分钟）
@@ -64,13 +62,10 @@ DEFAULT_MODE_HOURS = int(os.getenv("DEFAULT_MODE_HOURS", 24))
 # 用户时区无效时的兜底时区
 DEFAULT_FALLBACK_TIMEZONE = os.getenv("DEFAULT_FALLBACK_TIMEZONE", "Asia/Shanghai")

-
-# ==========================================
-# 默认热点事件容器（无关键词时使用）
-# ==========================================
@dataclass
 class _DefaultEventItem:
    """
+    默认热点事件容器
    无关键词订阅或关键词无匹配时的默认热点包装器，
    接口与 MatchedEventResult 保持一致，方便统一传给模板。
    """
@@ -81,10 +76,6 @@ class _DefaultEventItem:
    tags: list[str] = field(default_factory=list)
    is_default: bool = True

-
-# ==========================================
-# 时区工具
-# ==========================================
 def _time_to_minutes(t: dt_time) -> int:
    return t.hour * 60 + t.minute

@@ -125,10 +116,10 @@ def _ensure_aware(dt: datetime) -> datetime:
        return dt.replace(tzinfo=timezone.utc)
    return dt

+# AI辅助生成结束
+

-# ==========================================
 # 数据库查询辅助
-# ==========================================
 def _should_skip_by_interval(db: Session, user_id: int) -> bool:
    """检查用户是否仍在冷却期内，避免短时间内重复推送"""
    row = (
@@ -297,9 +288,9 @@ def _record_delivery(
    db.commit()


-# ==========================================
+# AI辅助生成：deepseek-v3-2，2026年3月20日
+
 # 推送准备
-# ==========================================
@dataclass
 class _PendingPush:
    """暂存需要发送邮件的信息，便于在 async 上下文中发送。"""
@@ -309,6 +300,7 @@ class _PendingPush:
    html_body: str
    event_ids: list[int]

+# AI生成结束

 def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedule) -> _PendingPush | None:
    """
@@ -331,7 +323,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul

    pushed_ids = _get_already_pushed_event_ids(db, user_id)

-    # 决策：有关键词且有匹配 → 匹配模式；否则 → 默认热点模式
    items: list = []
    is_default = False

@@ -361,7 +352,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
            logger.info(f"用户 {user_id} 默认热点无可推送内容，跳过")
            return None

-    # 批量加载平台数据（来源名、标题、URL、排名）
    event_ids = [item.event.id for item in items]
    platforms_map = _load_event_platforms(db, event_ids)

@@ -383,9 +373,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
    )


-# ==========================================
-# 调度主入口
-# ==========================================
 async def check_and_deliver() -> None:
    """
    定时推送主入口，由 APScheduler 每分钟调用。
@@ -412,7 +399,6 @@ async def check_and_deliver() -> None:
            if not user:
                continue

-            # 将 UTC 转为用户本地时间，判断是否落在推送窗口内
            user_current = _user_local_time(now, user.timezone)
            if not _is_within_window(schedule.delivery_time, user_current):
                continue
@@ -422,7 +408,6 @@ async def check_and_deliver() -> None:
                if pending is None:
                    continue

-                # 异步按优先级尝试各邮件渠道
                sent = False
                for target_email in pending.email_targets:
                    try:
@@ -1,8 +1,3 @@
-# app/services/fetcher_service.py
-"""
-抓取服务：从外部 API 拉取热搜/RSS 数据，做查重、向量聚类、入库
-热搜分支：语义聚类到 UnifiedEvent；RSS 分支：写入 NewsArticle
-"""
 import os
 import hashlib
 from datetime import timedelta
@@ -19,6 +14,8 @@ from app.models.models import (
    HeadlineRevision, RankingLog, SourceType, utcnow, UnifiedEvent
 )

+# AI辅助生成：deepseek-v3-2，2026年3月20日
+
 # 加载环境变量
 load_dotenv()
 hf_token = os.getenv("HF_TOKEN")
@@ -31,6 +28,8 @@ print("正在加载模型...")
 embedder_model = SentenceTransformer(EMBEDDING_MODEL_PATH, local_files_only=True)
 print("模型加载完成。")

+# AI生成结束
+

 def generate_md5(text: str) -> str:
    """生成 32 位 MD5 作为 external_id，用于跨平台去重"""
@@ -88,10 +87,10 @@ class UnifiedEventClusterer:
        new_unified = UnifiedEvent(
            unified_title=title,
            center_embedding=embedding_json,
-            hot_score=1  # 初始热度
+            hot_score=1
        )
        self.db.add(new_unified)
-        self.db.flush()  # 获取自增的主键 ID
+        self.db.flush()
        
        # 更新缓存
        self.event_vectors.append(new_vec)
@@ -109,11 +108,8 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist

    event_to_log = None

-    # 查重：已存在则可能只需更新标题/排名；不存在则需聚类并新建
    if existing_event:
-        # 场景 A1：老熟人
        if existing_event.current_headline != title:
-            # 标题被暗改，此时需要重新算一次 Embedding
            new_embedding_json, _ = embeddings_dict[title]

            revision = HeadlineRevision(
@@ -123,30 +119,25 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist
            )
            db.add(revision)
            existing_event.current_headline = title
-            existing_event.title_embedding = new_embedding_json  # 更新为新标题的语义向量
-            # 注：这里不改变它所属的 unified_event_id，因为大体还是同一件事
+            existing_event.title_embedding = new_embedding_json

        existing_event.current_ranking = index
        existing_event.event_url = item_url
        event_to_log = existing_event

    else:
-        # 场景 A2：这是一条彻底的全新热搜
-        # 1. 计算向量
-        new_embedding_json, new_vec = embeddings_dict[title]

-        # 2. 扔进聚类中枢找归宿
+        new_embedding_json, new_vec = embeddings_dict[title]
        matched_event_id = clusterer.match_or_create(title, new_embedding_json, new_vec)

-        # 3. 落库
        new_event = TrendingEvent(
            source_id=source.id,
            external_id=external_id,
            current_headline=title,
            event_url=item_url,
            current_ranking=index,
-            title_embedding=new_embedding_json,  # 存入向量
-            unified_event_id=matched_event_id  # 挂载到大事件下
+            title_embedding=new_embedding_json,
+            unified_event_id=matched_event_id
        )
        db.add(new_event)
        db.flush()
@@ -192,7 +183,6 @@ def process_source_data(db, source, items: list) -> int:
    saved_count = 0
    platform_id = source.home_url

-    # 1. 批量计算外部 ID 并聚合要计算的文本
    valid_items = []
    external_ids = []
    for item in items:
@@ -209,7 +199,6 @@ def process_source_data(db, source, items: list) -> int:
    if not valid_items:
        return 0

-    # 批量查重：按 external_id 判断是更新还是新增
    existing_events_dict = {}
    existing_articles_dict = {}
    
@@ -226,7 +215,6 @@ def process_source_data(db, source, items: list) -> int:
        ).all()
        existing_articles_dict = {art.external_id: art for art in existing_articles}

-    # 仅对需要算向量的标题做批量 embedding，避免重复计算
    texts_to_embed = []
    if source.source_type in (SourceType.HOT_TREND, SourceType.API):
        for item, external_id in valid_items:
@@ -238,15 +226,12 @@ def process_source_data(db, source, items: list) -> int:
            else:
                texts_to_embed.append(title)
                
-    # 4. 批量执行大模型推理
    embeddings_dict = generate_embeddings_batch(texts_to_embed)
    
-    # 初始化聚类器（只在热搜模式下需要，且只初始化一次）
    clusterer = None
    if source.source_type in (SourceType.HOT_TREND, SourceType.API):
        clusterer = UnifiedEventClusterer(db)

-    # 按来源类型分流：热搜/API → TrendingEvent + 聚类；RSS → NewsArticle
    for index, (item, external_id) in enumerate(valid_items, 1):
        if source.source_type in (SourceType.HOT_TREND, SourceType.API):
            existing_event = existing_events_dict.get(external_id)
@@ -269,14 +254,12 @@ async def fetch_and_save_trending_data():
    """
    print(f"[{utcnow()}] 开始执行定时抓取任务...")

-    # 获取启用的信息源 - 这个只读操作用一个短连接
    with SessionLocal() as db:
        sources = db.query(InfoSource).filter(InfoSource.is_enabled == True).all()
        if not sources:
            print("没有找到启用的信息源，任务结束。")
            return
            
-        # 我们把 source 的信息提前提取出来，避免在异步中长期持有 session
        source_configs = [
            {
                "id": s.id,
@@ -287,7 +270,6 @@ async def fetch_and_save_trending_data():
            for s in sources
        ]

-    # 伪装请求头，规避反爬
    custom_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
@@ -304,13 +286,11 @@ async def fetch_and_save_trending_data():
            url = f"{API_BASE_URL}?id={platform_id}&latest"

            try:
-                # 1. 网络请求（可能耗时较长，不要包在 db session 里）
                response = await client.get(url)
                response.raise_for_status()
                data_json = response.json()
                items = data_json.get("items", [])
                
-                # 2. 数据库事务操作（尽量短，单独使用 session）
                with SessionLocal() as db:
                    # 重新从短 session 中获取 source 实例，以免 detached
                    source = db.query(InfoSource).get(s_config["id"])
@@ -319,10 +299,8 @@ async def fetch_and_save_trending_data():
                        
                    task_log = DataSyncTask(source_id=source.id, items_fetched=0)
                    try:
-                        # 调用数据处理层
                        saved_count = process_source_data(db, source, items)

-                        # 业务事务成功提交
                        task_log.items_fetched = saved_count
                        task_log.task_status = TaskStatus.SUCCESS
                        db.add(task_log)
@@ -330,10 +308,9 @@ async def fetch_and_save_trending_data():
                        print(f"[{source.source_name}] ({source.source_type}) 成功抓取并更新了 {saved_count} 条数据")
                    except Exception as e:
                        db.rollback()
-                        raise e # 抛出给外层捕获记录日志
+                        raise e
                        
            except Exception as e:
-                # 异常拦截与错误隔离，另起一个超短事务记录日志
                with SessionLocal() as log_db:
                    try:
                        new_task_log = DataSyncTask(source_id=s_config["id"], items_fetched=0)
@@ -1,7 +1,3 @@
-"""
-匹配服务：根据用户兴趣关键词（精确 + 语义）推荐事件
-打分融合：标签/标题匹配分 + 标签相关度 + 热度 + 新鲜度加成
-"""
 import os
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
@@ -13,6 +9,7 @@ from sqlalchemy.orm import Session
 from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
 from app.services.fetcher_service import embedder_model

+# AI辅助生成：deepseek-v3-2，2026年3月20日

 # 语义匹配阈值：用户关键词和事件标签/标题向量相似度达到该值才计入语义命中
 DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
@@ -35,6 +32,7 @@ class MatchedEventResult:
    semantic_hits: list[dict[str, Any]]
    tags: list[str]

+# AI生成结束

 def _normalize_text(text: str) -> str:
    """统一小写与首尾空白，便于做稳定匹配。"""
@@ -80,7 +78,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
        
    uncached_keywords = []
    
-    # 1. 尝试从缓存获取
    for keyword in keywords:
        if not keyword:
            continue
@@ -89,9 +86,7 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
        else:
            uncached_keywords.append(keyword)
            
-    # 2. 对未命中的词进行统一的批量推理
    if uncached_keywords:
-        # 去重，避免同一个未缓存的词被计算多次
        unique_uncached = list(dict.fromkeys(uncached_keywords))
        
        vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False)
@@ -102,7 +97,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
            for k in keys_to_delete:
                del _EMBEDDING_CACHE[k]
                
-        # 3. 将新计算的向量存入缓存并回填结果
        for keyword, vec in zip(unique_uncached, vectors):
            vec_array = np.asarray(vec, dtype=np.float32)
            _EMBEDDING_CACHE[keyword] = vec_array
@@ -172,7 +166,6 @@ def recommend_events_for_user(
        else PREFERENCE_SEMANTIC_THRESHOLD
    )

-    # 1. 读取用户兴趣词
    preferences = (
        db.query(UserTopicPreference)
        .filter(UserTopicPreference.user_id == user_id)
@@ -185,7 +178,6 @@ def recommend_events_for_user(
    if not preference_keywords:
        return []

-    # 2. 读取候选事件（时间 + 热度过滤，避免全表扫描）
    time_limit = utcnow() - timedelta(hours=hours)
    events = (
        db.query(UnifiedEvent)
@@ -213,20 +205,17 @@ def recommend_events_for_user(
        .all()
    )

-    # 组织事件标签映射：event_id -> [(tag, relevance_score), ...]
    event_topics: dict[int, list[tuple[str, float | None]]] = {}
    for event_id, topic_keyword, relevance_score in topic_rows:
        if not topic_keyword:
            continue
        event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))

-    # 3. 批量编码用户词与标签词，减少模型调用次数
    unique_preference_keywords = list(dict.fromkeys(preference_keywords))
    unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
    pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
    topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)

-    # 预先建立“标准化后用户词集合”，用于精确匹配
    normalized_preference_pairs = [
        (word, _normalize_text(word))
        for word in unique_preference_keywords
@@ -246,20 +235,15 @@ def recommend_events_for_user(
        exact_hits: list[str] = []
        semantic_hits: list[dict[str, Any]] = []
        score = 0.0
-
-        # 对每个事件标签做精确匹配或语义匹配
        for topic_keyword, topic_relevance in topic_list:
            topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
-
-            # 1) 精确命中（包括完全相等与包含关系）
+            
            matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
            if matched_pref is not None:
                exact_hits.append(topic_keyword)
-                # 精确命中给较高基础分，标签自身相关度作为增益
                score += 45.0 + topic_relevance_score * 0.2
                continue

-            # 2) 语义命中（未精确命中时再算）
            best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)

            if best_pref is not None and best_sim >= similarity_threshold:
@@ -270,10 +254,8 @@ def recommend_events_for_user(
                        "similarity": round(best_sim, 4),
                    }
                )
-                # 语义命中分略低于精确命中，并由相似度放大
                score += best_sim * 35.0 + topic_relevance_score * 0.12

-        # 标题也参与匹配，但权重低于结构化标签，避免长标题过度主导排序。
        event_title = (event.unified_title or "").strip()
        if event_title:
            title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
@@ -292,15 +274,12 @@ def recommend_events_for_user(
                    )
                    score += best_sim * 24.0

-        # 如果精确和语义都没命中，直接跳过
        if not exact_hits and not semantic_hits:
            continue

-        # 融合事件热度和新鲜度，避免只看语义分
        score += min(event.hot_score, 100) * 0.3
        score += _calc_freshness_bonus(event)

-        # 返回标签时做去重，保证接口稳定
        tags = list(dict.fromkeys([item[0] for item in topic_list]))
        scored_results.append(
            MatchedEventResult(
@@ -1,8 +1,3 @@
-# app/services/summary_service.py
-"""
-摘要服务：调用 LLM 生成统一标题、综合摘要、话题标签
-定时任务：对热度达标且未摘要的事件批量处理
-"""
 import json
 import os
 from datetime import timedelta
@@ -26,12 +21,16 @@ from app.prompts.summary_prompts import (
 )
 from app.services.fetcher_service import embedder_model

+# AI辅助生成：deepseek-v3-2，2026年3月20日
+
 HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
 TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
 TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
 TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
 AI_API_KEY = os.getenv("AI_API_KEY", "")

+# AI生成结束
+

 deepseek_client = AsyncOpenAI(
    api_key=AI_API_KEY,
@@ -184,7 +183,6 @@ async def generate_unified_summaries():
    """定时任务：对热度达标且未摘要的事件刷新标题、摘要、标签"""
    print(f"[{utcnow()}] Start unified summary generation task...")

-    # 先提取需要处理的事件 ID，尽早释放 session，不长期占用 db session
    with SessionLocal() as db:
        recent_threshold = utcnow() - timedelta(days=3)
        events = db.query(UnifiedEvent).filter(
@@ -197,11 +195,9 @@ async def generate_unified_summaries():
            print("No events require summary update in this round.")
            return
            
-        # 复制出需要的信息，脱离 session
        event_ids = [e.id for e in events]
        event_hot_scores = {e.id: e.hot_score for e in events}

-    # 外层循环：针对每个 event_id 开启一个极短生命周期的 session 获取依赖数据
    for event_id in event_ids:
        platform_dict: dict[str, set[str]] = {}
        with SessionLocal() as db: