mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-05 23:07:51 +08:00
refresh
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
# app/api/endpoints/sources.py
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List
|
||||
|
||||
from app.database import get_db
|
||||
from app.schemas.schemas import (
|
||||
InfoSourceCreate, InfoSourceUpdate, InfoSourceResponse, PaginatedResponse
|
||||
)
|
||||
from app.crud import crud_source
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/", response_model=InfoSourceResponse, status_code=status.HTTP_201_CREATED)
|
||||
async def create_info_source(source_in: InfoSourceCreate, db: Session = Depends(get_db)):
|
||||
"""新建一个信息源"""
|
||||
return crud_source.create(db=db, obj_in=source_in)
|
||||
|
||||
|
||||
@router.get("/", response_model=PaginatedResponse[InfoSourceResponse])
|
||||
async def get_all_sources(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)):
|
||||
"""获取所有信息源列表(加入了分页参数)"""
|
||||
sources = crud_source.get_multi(db=db, skip=skip, limit=limit)
|
||||
return {"total": len(sources), "data": sources}
|
||||
|
||||
|
||||
@router.get("/{source_id}", response_model=InfoSourceResponse)
|
||||
async def get_info_source(source_id: int, db: Session = Depends(get_db)):
|
||||
"""获取单个信息源详情"""
|
||||
source = crud_source.get(db=db, source_id=source_id)
|
||||
if not source:
|
||||
raise HTTPException(status_code=404, detail="该信息源不存在")
|
||||
return source
|
||||
|
||||
|
||||
@router.patch("/{source_id}", response_model=InfoSourceResponse)
|
||||
async def update_info_source(source_id: int, source_in: InfoSourceUpdate, db: Session = Depends(get_db)):
|
||||
"""更新信息源"""
|
||||
source = crud_source.get(db=db, source_id=source_id)
|
||||
if not source:
|
||||
raise HTTPException(status_code=404, detail="该信息源不存在")
|
||||
|
||||
# 直接把查出来的数据库对象和前端传来的 Pydantic 对象丢给 CRUD 处理
|
||||
return crud_source.update(db=db, db_obj=source, obj_in=source_in)
|
||||
@@ -0,0 +1,8 @@
|
||||
# app/api/router.py
|
||||
from fastapi import APIRouter
|
||||
from app.api.endpoints import sources
|
||||
|
||||
api_router = APIRouter()
|
||||
|
||||
# 信息源管理
|
||||
api_router.include_router(sources.router, prefix="/sources", tags=["信息源管理"])
|
||||
@@ -0,0 +1,40 @@
|
||||
# app/crud/crud_source.py
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List, Optional
|
||||
|
||||
from app.models.models import InfoSource
|
||||
from app.schemas.schemas import InfoSourceCreate, InfoSourceUpdate
|
||||
|
||||
|
||||
def get(db: Session, source_id: int) -> Optional[InfoSource]:
|
||||
"""通过 ID 获取单条信息源"""
|
||||
return db.query(InfoSource).filter(InfoSource.id == source_id).first()
|
||||
|
||||
|
||||
def get_multi(db: Session, skip: int = 0, limit: int = 100) -> List[InfoSource]:
|
||||
"""获取所有信息源列表(支持分页)"""
|
||||
return db.query(InfoSource).offset(skip).limit(limit).all()
|
||||
|
||||
|
||||
def create(db: Session, obj_in: InfoSourceCreate) -> InfoSource:
|
||||
"""创建新的信息源"""
|
||||
db_obj = InfoSource(**obj_in.model_dump())
|
||||
db.add(db_obj)
|
||||
db.commit()
|
||||
db.refresh(db_obj)
|
||||
return db_obj
|
||||
|
||||
|
||||
def update(db: Session, db_obj: InfoSource, obj_in: InfoSourceUpdate) -> InfoSource:
|
||||
"""更新信息源"""
|
||||
# 提取前端真正要求更新的字段
|
||||
update_data = obj_in.model_dump(exclude_unset=True)
|
||||
|
||||
# 遍历更新模型对象的属性
|
||||
for field, value in update_data.items():
|
||||
setattr(db_obj, field, value)
|
||||
|
||||
db.add(db_obj)
|
||||
db.commit()
|
||||
db.refresh(db_obj)
|
||||
return db_obj
|
||||
+44
-50
@@ -1,71 +1,65 @@
|
||||
# app/main.py
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
from fastapi import FastAPI
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from app.services.fetcher_service import fetch_and_save_trending_data
|
||||
from app.database import engine
|
||||
from app.models.models import Base
|
||||
|
||||
from app.database import engine, get_db
|
||||
from app.models.models import Base, InfoSource, SourceType
|
||||
# 路由总线
|
||||
from app.api.router import api_router
|
||||
|
||||
load_dotenv()
|
||||
CRAWL_INTERVAL = int(os.getenv("CRAWL_INTERVAL_MINUTES", 10))
|
||||
|
||||
scheduler = AsyncIOScheduler()
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 1. 生命周期管理:App 启动时自动建表
|
||||
# 1. 生命周期管理:App 启动时自动建表 & 启动调度器
|
||||
# ==========================================
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# 启动时执行:检查模型,如果表不存在,自动在 SQLite 中建表!
|
||||
# 1. 数据库建表
|
||||
print("正在初始化数据库表...")
|
||||
# ---> 核心修改 2:直接使用 Base,而不是 models.Base <---
|
||||
Base.metadata.create_all(bind=engine)
|
||||
print("数据库表初始化完成!")
|
||||
yield
|
||||
|
||||
# 2. 配置并启动定时任务
|
||||
scheduler.add_job(
|
||||
fetch_and_save_trending_data,
|
||||
'interval',
|
||||
minutes=CRAWL_INTERVAL,
|
||||
id='trending_fetch_job',
|
||||
replace_existing=True
|
||||
)
|
||||
scheduler.start()
|
||||
print(f"定时抓取任务已启动,每 {CRAWL_INTERVAL} 分钟执行一次")
|
||||
|
||||
# 为了测试方便,启动时立即执行一次
|
||||
await fetch_and_save_trending_data()
|
||||
|
||||
yield # 此时 FastAPI 开始接受请求
|
||||
|
||||
# 3. 优雅关闭
|
||||
scheduler.shutdown()
|
||||
print("定时任务已安全关闭")
|
||||
|
||||
|
||||
|
||||
# 初始化 FastAPI,挂载生命周期
|
||||
# 初始化 FastAPI
|
||||
app = FastAPI(title="AI 新闻聚合引擎 API", lifespan=lifespan)
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 2. 路由 API 定义
|
||||
# 2. 挂载路由总线
|
||||
# ==========================================
|
||||
# 版本控制
|
||||
app.include_router(api_router, prefix="/api/v1")
|
||||
|
||||
@app.get("/")
|
||||
|
||||
# 健康检查
|
||||
@app.get("/", tags=["健康检查"])
|
||||
async def root():
|
||||
return {"message": "Welcome to AI News Aggregator API"}
|
||||
|
||||
|
||||
@app.get("/hello/{name}")
|
||||
async def say_hello(name: str):
|
||||
return {"message": f"Hello {name}"}
|
||||
|
||||
|
||||
# --->与数据库交互的真实接口
|
||||
|
||||
@app.post("/sources/")
|
||||
async def create_info_source(name: str, url: str, db: Session = Depends(get_db)):
|
||||
"""
|
||||
测试接口:向数据库中添加一个新闻信息源
|
||||
"""
|
||||
# ---> 核心修改 3:直接使用 InfoSource 和 SourceType <---
|
||||
new_source = InfoSource(
|
||||
source_name=name,
|
||||
source_type=SourceType.RSS_FEED, # 默认用 RSS 测试
|
||||
home_url=url
|
||||
)
|
||||
|
||||
db.add(new_source)
|
||||
db.commit()
|
||||
db.refresh(new_source) # 刷新以获取自动生成的 ID
|
||||
|
||||
return {"message": "创建成功!", "data": new_source}
|
||||
|
||||
|
||||
@app.get("/sources/")
|
||||
async def get_all_sources(db: Session = Depends(get_db)):
|
||||
"""
|
||||
测试接口:查询数据库中所有的信息源
|
||||
"""
|
||||
# ---> 核心修改 4:直接使用 InfoSource <---
|
||||
sources = db.query(InfoSource).all()
|
||||
return {"total": len(sources), "data": sources}
|
||||
return {"message": "Welcome to AI News Aggregator API", "status": "ok"}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# models.py
|
||||
from datetime import datetime, timezone, time
|
||||
from typing import Optional, Any
|
||||
import enum
|
||||
@@ -10,32 +11,47 @@ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 0. 全局基类与枚举定义
|
||||
# 0. 全局基类、枚举定义与动态类型
|
||||
# ==========================================
|
||||
|
||||
class Base(DeclarativeBase):
|
||||
"""SQLAlchemy 2.0 声明式基类"""
|
||||
"""
|
||||
SQLAlchemy 2.0 声明式基类
|
||||
所有的表模型都必须继承这个基类。
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
# 让代码在 SQLite 环境下自动降级为 Integer 以保证自增正常工作,
|
||||
# 而在生产环境部署到 PostgreSQL 或 MySQL 时,依然会使用容量更大的 BigInteger。
|
||||
BigIntType = BigInteger().with_variant(Integer, "sqlite")
|
||||
|
||||
|
||||
class SourceType(str, enum.Enum):
|
||||
HOT_TREND = "HOT_TREND"
|
||||
RSS_FEED = "RSS_FEED"
|
||||
API = "API"
|
||||
"""信息源的抓取方式"""
|
||||
HOT_TREND = "HOT_TREND" # 热搜榜单类
|
||||
RSS_FEED = "RSS_FEED" # 传统RSS订阅
|
||||
API = "API" # 接口抓取类
|
||||
|
||||
|
||||
class TargetType(str, enum.Enum):
|
||||
EVENT = "EVENT"
|
||||
TREND = "TREND"
|
||||
ARTICLE = "ARTICLE"
|
||||
"""
|
||||
多态目标类型 (Polymorphic Target)
|
||||
用于标记一条评论或一个标签到底是挂载在哪个实体下的。
|
||||
"""
|
||||
EVENT = "EVENT" # 挂载在单个热搜事件下
|
||||
TREND = "TREND" # 挂载在宏观趋势下
|
||||
ARTICLE = "ARTICLE" # 挂载在具体新闻文章下
|
||||
|
||||
|
||||
class TaskStatus(str, enum.Enum):
|
||||
"""后台任务状态"""
|
||||
SUCCESS = "SUCCESS"
|
||||
ERROR = "ERROR"
|
||||
|
||||
|
||||
class GenderType(str, enum.Enum):
|
||||
"""用户性别枚举"""
|
||||
MALE = "MALE"
|
||||
FEMALE = "FEMALE"
|
||||
OTHER = "OTHER"
|
||||
@@ -43,7 +59,10 @@ class GenderType(str, enum.Enum):
|
||||
|
||||
|
||||
def utcnow():
|
||||
"""获取带UTC时区的当前时间 (推荐实践)"""
|
||||
"""
|
||||
获取带UTC时区的当前时间 (最佳实践)
|
||||
服务器内部和数据库统一存储 UTC 时间,只在前端展示时转为用户本地时区,避免时区错乱。
|
||||
"""
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
@@ -51,6 +70,10 @@ def utcnow():
|
||||
# 模块一:信息源管理
|
||||
# ==========================================
|
||||
class InfoSource(Base):
|
||||
"""
|
||||
抓取源配置表
|
||||
充当爬虫的“任务清单”,后台可以随时开关特定的信息源,而不需要重启代码。
|
||||
"""
|
||||
__tablename__ = "info_sources"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
@@ -67,14 +90,19 @@ class InfoSource(Base):
|
||||
# 模块二:AI 语义聚类中枢 (大事件池)
|
||||
# ==========================================
|
||||
class UnifiedEvent(Base):
|
||||
"""
|
||||
AI 统一事件表
|
||||
核心业务逻辑:比如微博热搜叫“苹果发布会”,知乎热搜叫“iPhone 16 测评”,
|
||||
它们在子表(TrendingEvent)是两条记录,但通过 AI 语义向量对比后,
|
||||
会将它们统一挂载到这个表的一个 UnifiedEvent ID 下,实现跨平台事件聚合。
|
||||
"""
|
||||
__tablename__ = "unified_events"
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
unified_title: Mapped[str] = mapped_column(String(255), comment="AI统一标题")
|
||||
ai_comprehensive_summary: Mapped[Optional[str]] = mapped_column(Text, comment="AI全局深度总结")
|
||||
|
||||
# SQLite 没有原生 Vector 类型,存为用逗号分隔的字符串或JSON,Postgres可换成 PGVector
|
||||
center_embedding: Mapped[Optional[str]] = mapped_column(Text, comment="中心向量")
|
||||
center_embedding: Mapped[Optional[str]] = mapped_column(Text, comment="中心向量") # 用于高维空间相似度计算
|
||||
hot_score: Mapped[int] = mapped_column(Integer, default=0, comment="聚合热度得分")
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
|
||||
@@ -85,12 +113,16 @@ class UnifiedEvent(Base):
|
||||
# 模块三:内容存储库 (热搜 & 新闻子节点)
|
||||
# ==========================================
|
||||
class TrendingEvent(Base):
|
||||
"""
|
||||
各平台热搜数据明细表
|
||||
"""
|
||||
__tablename__ = "trending_events"
|
||||
__table_args__ = (
|
||||
# 联合唯一索引:同一个来源(比如微博)的同一条外部ID(MD5)只能存在一条记录,防重插核心保障
|
||||
UniqueConstraint("source_id", "external_id", name="idx_unique_external_trend"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
source_id: Mapped[int] = mapped_column(ForeignKey("info_sources.id"))
|
||||
unified_event_id: Mapped[Optional[int]] = mapped_column(ForeignKey("unified_events.id"))
|
||||
|
||||
@@ -109,12 +141,13 @@ class TrendingEvent(Base):
|
||||
|
||||
|
||||
class NewsArticle(Base):
|
||||
"""新闻文章明细表 (与 TrendingEvent 类似,但侧重长文本阅读)"""
|
||||
__tablename__ = "news_articles"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("source_id", "external_id", name="idx_unique_external_article"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
source_id: Mapped[int] = mapped_column(ForeignKey("info_sources.id"))
|
||||
unified_event_id: Mapped[Optional[int]] = mapped_column(ForeignKey("unified_events.id"))
|
||||
|
||||
@@ -136,9 +169,13 @@ class NewsArticle(Base):
|
||||
# 模块四:热度与轨迹追踪
|
||||
# ==========================================
|
||||
class HeadlineRevision(Base):
|
||||
"""
|
||||
标题修订历史表
|
||||
用于记录平台方暗戳戳修改热搜词条的行为(例如公关介入改标题)。
|
||||
"""
|
||||
__tablename__ = "headline_revisions"
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
event_id: Mapped[int] = mapped_column(ForeignKey("trending_events.id"))
|
||||
previous_headline: Mapped[str] = mapped_column(String(255))
|
||||
revised_headline: Mapped[str] = mapped_column(String(255))
|
||||
@@ -147,12 +184,17 @@ class HeadlineRevision(Base):
|
||||
|
||||
|
||||
class RankingLog(Base):
|
||||
"""
|
||||
热搜排名时间序列化日志
|
||||
每一次抓取都会生成一条记录,可以用于前端绘制热搜“排名起伏折线图”。
|
||||
"""
|
||||
__tablename__ = "ranking_logs"
|
||||
__table_args__ = (
|
||||
# 针对时间序列查询优化的复合索引,加速类似 "查询某事件在过去24小时内的排名变化" 的操作
|
||||
Index("idx_event_time", "event_id", "observed_at"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
event_id: Mapped[int] = mapped_column(ForeignKey("trending_events.id"))
|
||||
ranking_position: Mapped[int] = mapped_column(Integer)
|
||||
observed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
|
||||
@@ -163,16 +205,22 @@ class RankingLog(Base):
|
||||
# ==========================================
|
||||
# 模块五:多态话题与多态评论
|
||||
# ==========================================
|
||||
# 【设计模式】:多态设计
|
||||
# 通过 target_type (存表名/类型) + target_id (存主键ID) 的组合,
|
||||
# 让这两个表既能挂载在"单一热搜"下,也能挂载在"新闻文章"下,甚至挂在"统一大事件"下,避免了建立无数个外键的冗余。
|
||||
|
||||
class ExtractedTopic(Base):
|
||||
"""AI 提取的核心话题标签表"""
|
||||
__tablename__ = "extracted_topics"
|
||||
__table_args__ = (
|
||||
Index("idx_topic_keyword", "topic_keyword"),
|
||||
# 多态查询索引,加速 target_type + target_id 的组合查询
|
||||
Index("idx_polymorphic_topics", "target_type", "target_id"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
target_type: Mapped[TargetType] = mapped_column(Enum(TargetType))
|
||||
target_id: Mapped[int] = mapped_column(BigInteger)
|
||||
target_id: Mapped[int] = mapped_column(BigIntType)
|
||||
topic_keyword: Mapped[str] = mapped_column(String(100))
|
||||
relevance_score: Mapped[Optional[float]] = mapped_column(Float)
|
||||
|
||||
@@ -180,14 +228,15 @@ class ExtractedTopic(Base):
|
||||
|
||||
|
||||
class DiscussionComment(Base):
|
||||
"""全平台统一评论表"""
|
||||
__tablename__ = "discussion_comments"
|
||||
__table_args__ = (
|
||||
Index("idx_polymorphic_comments", "target_type", "target_id"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
target_type: Mapped[TargetType] = mapped_column(Enum(TargetType))
|
||||
target_id: Mapped[int] = mapped_column(BigInteger)
|
||||
target_id: Mapped[int] = mapped_column(BigIntType)
|
||||
|
||||
commenter_name: Mapped[Optional[str]] = mapped_column(String(100))
|
||||
comment_content: Mapped[str] = mapped_column(Text)
|
||||
@@ -202,6 +251,7 @@ class DiscussionComment(Base):
|
||||
# 模块六:用户画像与多渠道高可用推送系统
|
||||
# ==========================================
|
||||
class AppUser(Base):
|
||||
"""系统核心用户表"""
|
||||
__tablename__ = "app_users"
|
||||
|
||||
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
|
||||
@@ -212,7 +262,7 @@ class AppUser(Base):
|
||||
avatar_url: Mapped[Optional[str]] = mapped_column(String(500))
|
||||
gender: Mapped[GenderType] = mapped_column(Enum(GenderType), default=GenderType.UNKNOWN)
|
||||
|
||||
# 核心:万能扩展收纳箱 (SQLite 完美支持通过 SQLAlchemy 存储 JSON)
|
||||
# 预留的 JSON 字段,可以存放未来灵活变化的用户配置,避免频繁修改表结构
|
||||
metadata_: Mapped[Optional[Any]] = mapped_column("metadata", JSON, comment="自定义扩展偏好")
|
||||
|
||||
timezone: Mapped[str] = mapped_column(String(50), default="Asia/Shanghai")
|
||||
@@ -221,6 +271,10 @@ class AppUser(Base):
|
||||
|
||||
|
||||
class UserPushEndpoint(Base):
|
||||
"""
|
||||
多渠道推送端点配置表
|
||||
一个用户可能绑定了邮箱(EMAIL)和微信(WECHAT),支持配置降级重试(priority_level)。
|
||||
"""
|
||||
__tablename__ = "user_push_endpoints"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("user_id", "channel_type", name="idx_unique_user_channel"),
|
||||
@@ -238,6 +292,7 @@ class UserPushEndpoint(Base):
|
||||
|
||||
|
||||
class UserTopicPreference(Base):
|
||||
"""用户订阅的兴趣标签库"""
|
||||
__tablename__ = "user_topic_preferences"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("user_id", "interested_keyword", name="idx_unique_preference"),
|
||||
@@ -251,6 +306,7 @@ class UserTopicPreference(Base):
|
||||
|
||||
|
||||
class UserDeliverySchedule(Base):
|
||||
"""用户勿扰/定时推送时间表"""
|
||||
__tablename__ = "user_delivery_schedules"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("user_id", "delivery_time", name="idx_unique_schedule"),
|
||||
@@ -265,15 +321,20 @@ class UserDeliverySchedule(Base):
|
||||
|
||||
|
||||
class DeliveryHistory(Base):
|
||||
"""
|
||||
推送历史防刷表
|
||||
核心用途:一旦给某个用户推送过某条新闻/事件,就记录在这里。
|
||||
下次再触发推荐时,检查这个表,防止给同一个用户反复发送相同的内容。
|
||||
"""
|
||||
__tablename__ = "delivery_history"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("user_id", "target_type", "target_id", name="idx_prevent_duplicate_push"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
user_id: Mapped[int] = mapped_column(ForeignKey("app_users.id"))
|
||||
target_type: Mapped[TargetType] = mapped_column(Enum(TargetType))
|
||||
target_id: Mapped[int] = mapped_column(BigInteger)
|
||||
target_id: Mapped[int] = mapped_column(BigIntType)
|
||||
status: Mapped[TaskStatus] = mapped_column(Enum(TaskStatus))
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
|
||||
@@ -283,9 +344,13 @@ class DeliveryHistory(Base):
|
||||
# 模块七:系统任务监控
|
||||
# ==========================================
|
||||
class DataSyncTask(Base):
|
||||
"""
|
||||
数据同步健康度监控表
|
||||
这就是爬虫脚本每次运行都要写入记录的地方,用于后台 Dashboard 监控爬虫健康状态和错误堆栈。
|
||||
"""
|
||||
__tablename__ = "data_sync_tasks"
|
||||
|
||||
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
source_id: Mapped[int] = mapped_column(ForeignKey("info_sources.id"))
|
||||
items_fetched: Mapped[int] = mapped_column(Integer, default=0)
|
||||
task_status: Mapped[TaskStatus] = mapped_column(Enum(TaskStatus))
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
# app/schemas/schemas.py
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
# 枚举
|
||||
from app.models.models import SourceType
|
||||
|
||||
|
||||
# ==========================================
|
||||
# InfoSource (信息源) 相关的 Schemas
|
||||
# ==========================================
|
||||
|
||||
class InfoSourceBase(BaseModel):
|
||||
"""
|
||||
信息源的基础 Schema,包含各个操作通用的字段。
|
||||
使用 Field() 可以为 Swagger UI 接口文档提供明确的注释和校验规则。
|
||||
"""
|
||||
source_name: str = Field(..., max_length=100, description="信息源名称,例如:微博热搜、知乎热榜", examples=["微博热搜"])
|
||||
source_type: SourceType = Field(default=SourceType.API, description="抓取方式枚举")
|
||||
home_url: Optional[str] = Field(default=None, max_length=255, description="平台主页或接口的基础URL", examples=["weibo"])
|
||||
is_enabled: bool = Field(default=True, description="是否启用该信息源的定时抓取")
|
||||
|
||||
|
||||
class InfoSourceCreate(InfoSourceBase):
|
||||
"""
|
||||
创建信息源 (POST) 时接收的请求体。
|
||||
目前直接继承 Base 即可,如果以后有“只有创建时才能传的字段”,可以写在这里。
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class InfoSourceUpdate(BaseModel):
|
||||
"""
|
||||
更新信息源 (PATCH/PUT) 时接收的请求体。
|
||||
更新时,所有的字段都应该是可选的 (Optional),因为前端可能只想改其中一个字段。
|
||||
"""
|
||||
source_name: Optional[str] = Field(default=None, max_length=100)
|
||||
source_type: Optional[SourceType] = None
|
||||
home_url: Optional[str] = Field(default=None, max_length=255)
|
||||
is_enabled: Optional[bool] = None
|
||||
|
||||
|
||||
class InfoSourceResponse(InfoSourceBase):
|
||||
"""
|
||||
返回给前端的响应体格式 (GET 或 POST/PATCH 成功后的返回)。
|
||||
包含了数据库自动生成的 ID 和时间戳字段。
|
||||
"""
|
||||
id: int = Field(..., description="数据库主键ID")
|
||||
created_at: datetime = Field(..., description="创建时间")
|
||||
updated_at: datetime = Field(..., description="最后一次更新时间")
|
||||
|
||||
# 核心配置:告诉 Pydantic,允许传入 SQLAlchemy 的 ORM 类实例并自动提取属性
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 分页响应包装器 (泛型设计,方便以后复用)
|
||||
# ==========================================
|
||||
from typing import Generic, TypeVar, List
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
class PaginatedResponse(BaseModel, Generic[T]):
|
||||
"""
|
||||
通用的列表响应格式,带总数统计,方便前端分页。
|
||||
"""
|
||||
total: int = Field(..., description="符合条件的数据总条数")
|
||||
data: List[T] = Field(..., description="当前页的数据列表")
|
||||
@@ -0,0 +1,167 @@
|
||||
# app/services/fetcher_service.py
|
||||
import os
|
||||
import hashlib
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from app.database import SessionLocal
|
||||
from app.models.models import (
|
||||
InfoSource, TrendingEvent, DataSyncTask, TaskStatus,
|
||||
HeadlineRevision, RankingLog, utcnow
|
||||
)
|
||||
|
||||
# 加载 .env 文件中的环境变量
|
||||
load_dotenv()
|
||||
|
||||
# 从环境变量获取 API 基础地址,提供默认回退地址
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "https://newsnow.busiyi.world/api/s")
|
||||
|
||||
|
||||
def generate_md5(text: str) -> str:
|
||||
"""
|
||||
生成32位MD5哈希
|
||||
用途:为不同平台的数据生成统一的、长度固定的外部唯一标识(external_id),
|
||||
方便建立数据库的唯一索引,防止同一条热搜重复插入。
|
||||
"""
|
||||
return hashlib.md5(text.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
async def fetch_and_save_trending_data():
|
||||
"""
|
||||
核心定时任务:从数据库读取信息源 -> 抓取API -> 解析并存入数据库
|
||||
|
||||
业务流程:
|
||||
1. 查询所有已启用的信息源 (is_enabled == True)
|
||||
2. 伪装成浏览器发起异步 HTTP 请求
|
||||
3. 遍历解析数据,进行去重判断 (MD5)
|
||||
4. 记录标题变更轨迹 (HeadlineRevision) 和 热搜排名轨迹 (RankingLog)
|
||||
5. 统一提交或在发生异常时回滚脏数据
|
||||
"""
|
||||
print(f"[{utcnow()}] 开始执行定时抓取任务...")
|
||||
|
||||
# 使用上下文管理器确保数据库连接池正确归还连接
|
||||
with SessionLocal() as db:
|
||||
# 1. 动态获取抓取源,这样在后台开关信息源不需要重启服务
|
||||
sources = db.query(InfoSource).filter(InfoSource.is_enabled == True).all()
|
||||
if not sources:
|
||||
print("没有找到启用的信息源,任务结束。")
|
||||
return
|
||||
|
||||
# 2. 伪装 HTTP 请求头,绕过目标服务器的反爬/防盗链机制
|
||||
custom_headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"Referer": "https://newsnow.busiyi.world/",
|
||||
"Origin": "https://newsnow.busiyi.world"
|
||||
}
|
||||
|
||||
# 复用异步客户端,提高并发抓取效率
|
||||
async with httpx.AsyncClient(timeout=15.0, headers=custom_headers) as client:
|
||||
for source in sources:
|
||||
# 平台标识,如 "weibo", "zhihu" 等,这里复用了 home_url 字段存储
|
||||
platform_id = source.home_url
|
||||
if not platform_id:
|
||||
continue
|
||||
|
||||
# 拼装最终的抓取 URL
|
||||
url = f"{API_BASE_URL}?id={platform_id}&latest"
|
||||
# 初始化本次抓取任务的日志记录
|
||||
task_log = DataSyncTask(source_id=source.id, items_fetched=0)
|
||||
|
||||
try:
|
||||
# 发起请求并校验 HTTP 状态码
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
data_json = response.json()
|
||||
|
||||
items = data_json.get("items", [])
|
||||
saved_count = 0
|
||||
|
||||
for index, item in enumerate(items, 1):
|
||||
title = item.get("title")
|
||||
if not title:
|
||||
continue
|
||||
|
||||
item_url = item.get("url", "")
|
||||
|
||||
# 3. ID 去重策略:优先用接口自带 ID,没有则用 URL,最差情况用标题兜底
|
||||
raw_id = item.get("id") or item_url or title
|
||||
# 组合平台标识和原始 ID 算出全局唯一的 MD5
|
||||
external_id = generate_md5(f"{platform_id}_{raw_id}")
|
||||
|
||||
# 4. 在数据库中查询是否已经存在这条热搜
|
||||
existing_event = db.query(TrendingEvent).filter(
|
||||
TrendingEvent.source_id == source.id,
|
||||
TrendingEvent.external_id == external_id
|
||||
).first()
|
||||
|
||||
event_to_log = None # 留个指针,用来后续记录名次历史
|
||||
|
||||
if existing_event:
|
||||
# -------- 分支 A:热搜已存在,执行更新逻辑 --------
|
||||
|
||||
# 监控标题变化(例如:微博热搜经常会改词条名字)
|
||||
if existing_event.current_headline != title:
|
||||
# 标题发生改变!立刻记录到修订历史表
|
||||
revision = HeadlineRevision(
|
||||
event_id=existing_event.id,
|
||||
previous_headline=existing_event.current_headline,
|
||||
revised_headline=title
|
||||
)
|
||||
db.add(revision)
|
||||
existing_event.current_headline = title # 更新为主表最新标题
|
||||
|
||||
# 更新当前最新的排名和链接
|
||||
existing_event.current_ranking = index
|
||||
existing_event.event_url = item_url
|
||||
|
||||
event_to_log = existing_event
|
||||
else:
|
||||
# -------- 分支 B:全新热搜,执行插入逻辑 --------
|
||||
new_event = TrendingEvent(
|
||||
source_id=source.id,
|
||||
external_id=external_id,
|
||||
current_headline=title,
|
||||
event_url=item_url,
|
||||
current_ranking=index,
|
||||
)
|
||||
db.add(new_event)
|
||||
# 核心操作!flush 会将数据推入数据库生成自增的 ID,但不提交事务 (commit)。
|
||||
# 这样接下来的 RankingLog 就能立刻拿到 `new_event.id` 作为外键。
|
||||
db.flush()
|
||||
event_to_log = new_event
|
||||
|
||||
# -------- 无论新旧,统一记录排名轨迹 --------
|
||||
# 只要抓取到,不管新旧,必须记一笔当前的排名! <---
|
||||
# 借助这个表,后续可以画出某条热搜随时间变化的“排名上升/下降曲线”
|
||||
rank_log = RankingLog(
|
||||
event_id=event_to_log.id,
|
||||
ranking_position=index
|
||||
)
|
||||
db.add(rank_log)
|
||||
|
||||
saved_count += 1
|
||||
|
||||
# 如果这一个平台的数据全部处理顺利,标记成功并整体提交
|
||||
task_log.items_fetched = saved_count
|
||||
task_log.task_status = TaskStatus.SUCCESS
|
||||
db.add(task_log)
|
||||
db.commit() # ✅ 只在 try 顺利走完整个平台的数据时,才统一提交业务数据到硬盘
|
||||
print(f"[{source.source_name}] 成功抓取并更新了 {saved_count} 条数据")
|
||||
|
||||
except Exception as e:
|
||||
# -------- 异常处理机制 --------
|
||||
# 独立日志记录
|
||||
|
||||
# 第一步:遇到报错(如网络中断、解析错误),立刻回滚。
|
||||
# 丢弃这批脏数据,防止数据库出现一半更新一半没更新的“不一致状态”。
|
||||
db.rollback()
|
||||
|
||||
# 第二步:记录错误日志并独立提交。
|
||||
# 因为上面执行了 rollback,之前 add 的 task_log 也被清空了状态,
|
||||
# 此时重新设置状态,并作为一次新的独立事务提交到数据库,方便后台监控报错。
|
||||
task_log.task_status = TaskStatus.ERROR
|
||||
task_log.error_trace = str(e)
|
||||
db.add(task_log)
|
||||
db.commit() # 提交错误日志本身
|
||||
print(f"[{source.source_name}] 抓取失败: {e}")
|
||||
Binary file not shown.
Reference in New Issue
Block a user