# InsightRadar/backend/app/services/fetcher_service.py

# app/services/fetcher_service.py
import os
import hashlib
import httpx
from dotenv import load_dotenv

from app.database import SessionLocal
from app.models.models import (
    InfoSource, TrendingEvent, DataSyncTask, TaskStatus,
    HeadlineRevision, RankingLog, utcnow
)

# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# Base URL of the upstream trending API. The API_BASE_URL environment
# variable takes precedence; the literal below is the fallback.
API_BASE_URL = os.environ.get(
    "API_BASE_URL",
    "https://newsnow.busiyi.world/api/s",
)


def generate_md5(text: str) -> str:
    """
    生成32位MD5哈希
    用途：为不同平台的数据生成统一的、长度固定的外部唯一标识（external_id），
    方便建立数据库的唯一索引，防止同一条热搜重复插入。
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def _record_trending_item(db, source_id, platform_id: str, index: int, item) -> bool:
    """Insert or refresh one trending item and log its current rank.

    Args:
        db: The open database session; nothing is committed here.
        source_id: Primary key of the InfoSource the item belongs to.
        platform_id: Platform identifier mixed into the de-duplication hash.
        index: 1-based position of the item in the API response.
        item: One entry of the response's ``items`` list (read via ``.get``).

    Returns:
        True when the item was stored, False when it was skipped because it
        has no title.
    """
    title = item.get("title")
    if not title:
        return False

    item_url = item.get("url", "")

    # De-duplication key: prefer the API's own id, then the URL, and fall
    # back to the title. Prefixing the platform keeps ids from different
    # platforms from colliding.
    raw_id = item.get("id") or item_url or title
    external_id = generate_md5(f"{platform_id}_{raw_id}")

    event = db.query(TrendingEvent).filter(
        TrendingEvent.source_id == source_id,
        TrendingEvent.external_id == external_id,
    ).first()

    if event:
        # Known item: keep an audit trail when the headline was reworded,
        # then refresh the mutable fields.
        if event.current_headline != title:
            db.add(HeadlineRevision(
                event_id=event.id,
                previous_headline=event.current_headline,
                revised_headline=title,
            ))
            event.current_headline = title
        event.current_ranking = index
        event.event_url = item_url
    else:
        # New item: flush (without committing) so the database assigns the
        # primary key that the RankingLog row below needs as foreign key.
        event = TrendingEvent(
            source_id=source_id,
            external_id=external_id,
            current_headline=title,
            event_url=item_url,
            current_ranking=index,
        )
        db.add(event)
        db.flush()

    # Every sighting, new or not, gets a ranking sample so the rank of an
    # item can be followed over time.
    db.add(RankingLog(event_id=event.id, ranking_position=index))
    return True


async def _sync_source(db, client, source) -> None:
    """Fetch one source, store its items and write a DataSyncTask row.

    All item changes of a source are committed together with a SUCCESS task
    row. On any exception they are rolled back and an ERROR task row is
    committed on its own instead. Exceptions raised inside the ``try`` are
    not re-raised, so one failing source does not stop the others.
    """
    # The platform identifier (e.g. "weibo") is stored in the home_url column.
    platform_id = source.home_url
    if not platform_id:
        return

    # Read these before the try block: after rollback()/commit() the
    # session may expire loaded attributes, and reloading them inside the
    # exception handler would need another database round trip.
    source_id = source.id
    source_name = source.source_name

    url = f"{API_BASE_URL}?id={platform_id}&latest"
    task_log = DataSyncTask(source_id=source_id, items_fetched=0)

    try:
        response = await client.get(url)
        response.raise_for_status()
        items = response.json().get("items", [])

        saved_count = 0
        for index, item in enumerate(items, 1):
            if _record_trending_item(db, source_id, platform_id, index, item):
                saved_count += 1

        # Commit only after the whole source was processed without error.
        task_log.items_fetched = saved_count
        task_log.task_status = TaskStatus.SUCCESS
        db.add(task_log)
        db.commit()
        print(f"[{source_name}] 成功抓取并更新了 {saved_count} 条数据")

    except Exception as e:
        # Drop the partially processed batch so the source is never left
        # half-updated.
        db.rollback()

        # Include the exception type: some exceptions (httpx timeouts in
        # particular) can stringify to an empty message, which would leave
        # the stored trace blank.
        error_detail = f"{type(e).__name__}: {e}"
        task_log.task_status = TaskStatus.ERROR
        task_log.error_trace = error_detail

        # The error row is committed as its own transaction. Guard it so a
        # failure while writing the log cannot abort the remaining sources
        # or leave the session in a failed state.
        try:
            db.add(task_log)
            db.commit()
        except Exception as log_error:
            db.rollback()
            print(f"[{source_name}] 无法写入错误日志: {type(log_error).__name__}: {log_error}")

        print(f"[{source_name}] 抓取失败: {error_detail}")


async def fetch_and_save_trending_data():
    """Scheduled job: fetch trending items for every enabled source.

    Steps:
        1. Load all InfoSource rows with ``is_enabled`` set.
        2. Request each source's items from the API with browser-like headers.
        3. De-duplicate items via an MD5 ``external_id``; insert new events
           and update known ones.
        4. Record headline changes (HeadlineRevision) and one ranking sample
           per item (RankingLog).
        5. Commit per source, or roll back and store an ERROR task row when
           that source fails.

    Sources are processed one after another over a single shared HTTP
    client. Takes no arguments and returns None.

    NOTE(review): the database calls are not awaited, so they run
    synchronously on the event loop while this coroutine executes.
    """
    print(f"[{utcnow()}] 开始执行定时抓取任务...")

    # The context manager closes the session on exit.
    with SessionLocal() as db:
        # Sources are read on every run, so enabling/disabling one takes
        # effect without restarting the service. ``== True`` is intentional:
        # it builds a SQL expression rather than a Python comparison.
        sources = db.query(InfoSource).filter(InfoSource.is_enabled == True).all()  # noqa: E712
        if not sources:
            print("没有找到启用的信息源，任务结束。")
            return

        # Browser-like headers; the Referer/Origin point at the API's own site.
        custom_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://newsnow.busiyi.world/",
            "Origin": "https://newsnow.busiyi.world"
        }

        # One client is shared by all sources.
        async with httpx.AsyncClient(timeout=15.0, headers=custom_headers) as client:
            for source in sources:
                await _sync_source(db, client, source)