refresh

2026-06-06 00:57:51 +08:00 · 2026-03-09 18:13:35 +08:00
parent 3c57dd0cce
commit 5b541bbea3
11 changed files with 464 additions and 75 deletions
@@ -0,0 +1,167 @@
+# app/services/fetcher_service.py
+import hashlib
+import os
+from urllib.parse import quote
+
+import httpx
+from dotenv import load_dotenv
+
+from app.database import SessionLocal
+from app.models.models import (
+    InfoSource, TrendingEvent, DataSyncTask, TaskStatus,
+    HeadlineRevision, RankingLog, utcnow
+)
+
+# Load environment variables from the .env file.
+load_dotenv()
+
+# Base URL of the upstream API, read from the environment with a default fallback.
+API_BASE_URL = os.getenv("API_BASE_URL", "https://newsnow.busiyi.world/api/s")
+
+
+def generate_md5(text: str) -> str:
+    """Return the 32-character hexadecimal MD5 digest of ``text``.
+
+    The input is encoded as UTF-8 before hashing. In this module the digest
+    serves as a fixed-length ``external_id`` derived from platform-specific
+    identifiers, which is then used to look up an already stored trending
+    event so the same item is not inserted twice.
+    """
+    encoded = text.encode('utf-8')
+    digest = hashlib.md5()
+    digest.update(encoded)
+    return digest.hexdigest()
+
+
+def _record_trending_item(db, source_id, platform_id, position, item) -> bool:
+    """Insert or update one fetched item and append its ranking sample.
+
+    Args:
+        db: Open database session. The caller owns commit and rollback.
+        source_id: Primary key of the ``InfoSource`` being synchronised.
+        platform_id: Platform identifier mixed into the de-duplication hash.
+        position: 1-based position of the item in the API response.
+        item: One entry of the response's ``items`` list.
+
+    Returns:
+        ``True`` when the item was stored, ``False`` when it was skipped
+        because it carries no title.
+    """
+    title = item.get("title")
+    if not title:
+        return False
+
+    # A missing or null URL is normalised to "" so the column never receives None.
+    item_url = item.get("url") or ""
+
+    # De-duplication key: prefer the API's own id, then the URL, then the title.
+    raw_id = item.get("id") or item_url or title
+    external_id = generate_md5(f"{platform_id}_{raw_id}")
+
+    event = db.query(TrendingEvent).filter(
+        TrendingEvent.source_id == source_id,
+        TrendingEvent.external_id == external_id,
+    ).first()
+
+    if event is not None:
+        # Known event: keep a revision record whenever the headline changed.
+        if event.current_headline != title:
+            db.add(HeadlineRevision(
+                event_id=event.id,
+                previous_headline=event.current_headline,
+                revised_headline=title,
+            ))
+            event.current_headline = title
+
+        event.current_ranking = position
+        # Keep the last known link when this fetch did not provide one.
+        if item_url:
+            event.event_url = item_url
+    else:
+        # New event: flush (without committing) so ``event.id`` is populated
+        # before it is used as the foreign key of the ranking log below.
+        event = TrendingEvent(
+            source_id=source_id,
+            external_id=external_id,
+            current_headline=title,
+            event_url=item_url,
+            current_ranking=position,
+        )
+        db.add(event)
+        db.flush()
+
+    # Every sighting, new or existing, records the current position so the
+    # ranking history of an event can be reconstructed later.
+    db.add(RankingLog(event_id=event.id, ranking_position=position))
+    return True
+
+
+async def fetch_and_save_trending_data():
+    """Scheduled job: fetch trending items for every enabled source and store them.
+
+    Workflow:
+    1. Query all info sources with ``is_enabled == True``.
+    2. Request each source from the upstream API with browser-like headers.
+    3. Upsert every item, de-duplicated by an MD5 ``external_id``.
+    4. Record headline changes (``HeadlineRevision``) and a ranking sample
+       (``RankingLog``) for every item.
+    5. Commit once per source. On any exception roll the source's changes
+       back and persist a ``DataSyncTask`` row with status ``ERROR`` instead.
+
+    A failure in one source never prevents the remaining sources from being
+    processed. The function returns ``None``.
+    """
+    print(f"[{utcnow()}] 开始执行定时抓取任务...")
+
+    # NOTE(review): the session is used through synchronous calls inside this
+    # coroutine, so database round-trips run on the event loop thread.
+    with SessionLocal() as db:
+        # Sources are read on every run, so toggling one takes effect without a restart.
+        sources = db.query(InfoSource).filter(InfoSource.is_enabled == True).all()  # noqa: E712
+        if not sources:
+            print("没有找到启用的信息源，任务结束。")
+            return
+
+        # Browser-like headers sent with every request.
+        custom_headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Referer": "https://newsnow.busiyi.world/",
+            "Origin": "https://newsnow.busiyi.world"
+        }
+
+        # One client is shared by all sources.
+        async with httpx.AsyncClient(timeout=15.0, headers=custom_headers) as client:
+            for source in sources:
+                # The platform identifier (e.g. "weibo") is stored in ``home_url``.
+                platform_id = source.home_url
+                if not platform_id:
+                    continue
+
+                # Read these now so the error handler does not have to touch
+                # the ORM object again after a rollback.
+                source_id = source.id
+                source_name = source.source_name
+
+                # Percent-encode the identifier so reserved characters cannot
+                # corrupt the query string.
+                url = f"{API_BASE_URL}?id={quote(str(platform_id), safe='')}&latest"
+                task_log = DataSyncTask(source_id=source_id, items_fetched=0)
+
+                try:
+                    response = await client.get(url)
+                    response.raise_for_status()
+                    # A missing or null "items" field is treated as an empty list.
+                    items = response.json().get("items") or []
+
+                    saved_count = 0
+                    for index, item in enumerate(items, 1):
+                        if _record_trending_item(db, source_id, platform_id, index, item):
+                            saved_count += 1
+
+                    # Commit only after the whole source was processed without error.
+                    task_log.items_fetched = saved_count
+                    task_log.task_status = TaskStatus.SUCCESS
+                    db.add(task_log)
+                    db.commit()
+                    print(f"[{source_name}] 成功抓取并更新了 {saved_count} 条数据")
+
+                except Exception as e:
+                    # Discard everything staged for this source so no
+                    # half-applied update remains.
+                    db.rollback()
+
+                    # Include the exception type: some exceptions have an empty message.
+                    error_detail = f"{type(e).__name__}: {e}"
+
+                    # Nothing was persisted, so the log row must not report saved items.
+                    task_log.items_fetched = 0
+                    task_log.task_status = TaskStatus.ERROR
+                    task_log.error_trace = error_detail
+                    try:
+                        # The error record is committed as its own transaction.
+                        db.add(task_log)
+                        db.commit()
+                    except Exception as log_error:
+                        # Failing to write the log must not abort the remaining sources.
+                        db.rollback()
+                        print(f"[{source_name}] 错误日志写入失败: {log_error}")
+                    print(f"[{source_name}] 抓取失败: {error_detail}")