backend 去ai化

This commit is contained in:
2026-04-20 15:53:02 +08:00
parent 7a34fc0079
commit bba6de25ac
28 changed files with 161 additions and 228 deletions
+9 -24
View File
@@ -1,7 +1,3 @@
# 定时推送调度服务
# 由 APScheduler 每分钟调用,检查当前时刻是否有用户需要接收推送,
# 如匹配则生成摘要邮件并发送,同时写入 DeliveryHistory 防重复。
# 推送优先级:有关键词且匹配 → 个性化简报;无关键词或无匹配 → 默认热点快报
import logging
import os
from logging.handlers import TimedRotatingFileHandler
@@ -34,7 +30,7 @@ from app.utils.email_utils import send_html_email
# Dedicated logger for the delivery service.
logger = logging.getLogger("delivery_service")
# delivery_service logs are written to their own file.
# The "logs" directory sits under parents[2] of this file's resolved path
# and is created at import time if it does not exist yet.
_delivery_log_dir = Path(__file__).resolve().parents[2] / "logs"
_delivery_log_dir.mkdir(parents=True, exist_ok=True)
# Full path of the delivery-check log file inside that directory.
_delivery_log_file = _delivery_log_dir / "delivery_check.log"
@@ -51,6 +47,8 @@ if not logger.handlers:
logger.setLevel(logging.INFO)
logger.propagate = False
# AI辅助生成:deepseek-v3-2,2026年3月20日
# Delivery time window: maximum tolerance (in minutes) between the actual
# execution time and the configured delivery time.
# Read from the DELIVERY_WINDOW_MINUTES environment variable; defaults to 2.
DELIVERY_WINDOW_MINUTES = int(os.getenv("DELIVERY_WINDOW_MINUTES", 2))
# 同一用户两次推送之间的最小间隔(分钟)
@@ -64,13 +62,10 @@ DEFAULT_MODE_HOURS = int(os.getenv("DEFAULT_MODE_HOURS", 24))
# Fallback timezone used when a user's timezone is invalid.
# Read from the DEFAULT_FALLBACK_TIMEZONE environment variable;
# defaults to "Asia/Shanghai".
DEFAULT_FALLBACK_TIMEZONE = os.getenv("DEFAULT_FALLBACK_TIMEZONE", "Asia/Shanghai")
# ==========================================
# 默认热点事件容器(无关键词时使用)
# ==========================================
@dataclass
class _DefaultEventItem:
"""
默认热点事件容器
无关键词订阅或关键词无匹配时的默认热点包装器,
接口与 MatchedEventResult 保持一致,方便统一传给模板。
"""
@@ -81,10 +76,6 @@ class _DefaultEventItem:
tags: list[str] = field(default_factory=list)
is_default: bool = True
# ==========================================
# 时区工具
# ==========================================
def _time_to_minutes(t: dt_time) -> int:
return t.hour * 60 + t.minute
@@ -125,10 +116,10 @@ def _ensure_aware(dt: datetime) -> datetime:
return dt.replace(tzinfo=timezone.utc)
return dt
# AI辅助生成结束
# ==========================================
# 数据库查询辅助
# ==========================================
def _should_skip_by_interval(db: Session, user_id: int) -> bool:
"""检查用户是否仍在冷却期内,避免短时间内重复推送"""
row = (
@@ -297,9 +288,9 @@ def _record_delivery(
db.commit()
# ==========================================
# AI辅助生成:deepseek-v3-2,2026年3月20日
# 推送准备
# ==========================================
@dataclass
class _PendingPush:
"""暂存需要发送邮件的信息,便于在 async 上下文中发送。"""
@@ -309,6 +300,7 @@ class _PendingPush:
html_body: str
event_ids: list[int]
# AI生成结束
def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedule) -> _PendingPush | None:
"""
@@ -331,7 +323,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
pushed_ids = _get_already_pushed_event_ids(db, user_id)
# 决策:有关键词且有匹配 → 匹配模式;否则 → 默认热点模式
items: list = []
is_default = False
@@ -361,7 +352,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
logger.info(f"用户 {user_id} 默认热点无可推送内容,跳过")
return None
# 批量加载平台数据(来源名、标题、URL、排名)
event_ids = [item.event.id for item in items]
platforms_map = _load_event_platforms(db, event_ids)
@@ -383,9 +373,6 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
)
# ==========================================
# 调度主入口
# ==========================================
async def check_and_deliver() -> None:
"""
定时推送主入口,由 APScheduler 每分钟调用。
@@ -412,7 +399,6 @@ async def check_and_deliver() -> None:
if not user:
continue
# 将 UTC 转为用户本地时间,判断是否落在推送窗口内
user_current = _user_local_time(now, user.timezone)
if not _is_within_window(schedule.delivery_time, user_current):
continue
@@ -422,7 +408,6 @@ async def check_and_deliver() -> None:
if pending is None:
continue
# 异步按优先级尝试各邮件渠道
sent = False
for target_email in pending.email_targets:
try:
+11 -34
View File
@@ -1,8 +1,3 @@
# app/services/fetcher_service.py
"""
抓取服务:从外部 API 拉取热搜/RSS 数据,做查重、向量聚类、入库
热搜分支:语义聚类到 UnifiedEvent;RSS 分支:写入 NewsArticle
"""
import os
import hashlib
from datetime import timedelta
@@ -19,6 +14,8 @@ from app.models.models import (
HeadlineRevision, RankingLog, SourceType, utcnow, UnifiedEvent
)
# AI辅助生成:deepseek-v3-2,2026年3月20日
# Load environment variables via load_dotenv(), then read HF_TOKEN.
# hf_token is None when HF_TOKEN is not set (os.getenv with no default).
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
@@ -31,6 +28,8 @@ print("正在加载模型...")
embedder_model = SentenceTransformer(EMBEDDING_MODEL_PATH, local_files_only=True)
print("模型加载完成。")
# AI生成结束
def generate_md5(text: str) -> str:
"""生成 32 位 MD5 作为 external_id,用于跨平台去重"""
@@ -88,10 +87,10 @@ class UnifiedEventClusterer:
new_unified = UnifiedEvent(
unified_title=title,
center_embedding=embedding_json,
hot_score=1 # 初始热度
hot_score=1
)
self.db.add(new_unified)
self.db.flush() # 获取自增的主键 ID
self.db.flush()
# 更新缓存
self.event_vectors.append(new_vec)
@@ -109,11 +108,8 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist
event_to_log = None
# 查重:已存在则可能只需更新标题/排名;不存在则需聚类并新建
if existing_event:
# 场景 A1:老熟人
if existing_event.current_headline != title:
# 标题被暗改,此时需要重新算一次 Embedding
new_embedding_json, _ = embeddings_dict[title]
revision = HeadlineRevision(
@@ -123,30 +119,25 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist
)
db.add(revision)
existing_event.current_headline = title
existing_event.title_embedding = new_embedding_json # 更新为新标题的语义向量
# 注:这里不改变它所属的 unified_event_id,因为大体还是同一件事
existing_event.title_embedding = new_embedding_json
existing_event.current_ranking = index
existing_event.event_url = item_url
event_to_log = existing_event
else:
# 场景 A2:这是一条彻底的全新热搜
# 1. 计算向量
new_embedding_json, new_vec = embeddings_dict[title]
# 2. 扔进聚类中枢找归宿
new_embedding_json, new_vec = embeddings_dict[title]
matched_event_id = clusterer.match_or_create(title, new_embedding_json, new_vec)
# 3. 落库
new_event = TrendingEvent(
source_id=source.id,
external_id=external_id,
current_headline=title,
event_url=item_url,
current_ranking=index,
title_embedding=new_embedding_json, # 存入向量
unified_event_id=matched_event_id # 挂载到大事件下
title_embedding=new_embedding_json,
unified_event_id=matched_event_id
)
db.add(new_event)
db.flush()
@@ -192,7 +183,6 @@ def process_source_data(db, source, items: list) -> int:
saved_count = 0
platform_id = source.home_url
# 1. 批量计算外部 ID 并聚合要计算的文本
valid_items = []
external_ids = []
for item in items:
@@ -209,7 +199,6 @@ def process_source_data(db, source, items: list) -> int:
if not valid_items:
return 0
# 批量查重:按 external_id 判断是更新还是新增
existing_events_dict = {}
existing_articles_dict = {}
@@ -226,7 +215,6 @@ def process_source_data(db, source, items: list) -> int:
).all()
existing_articles_dict = {art.external_id: art for art in existing_articles}
# 仅对需要算向量的标题做批量 embedding,避免重复计算
texts_to_embed = []
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
for item, external_id in valid_items:
@@ -238,15 +226,12 @@ def process_source_data(db, source, items: list) -> int:
else:
texts_to_embed.append(title)
# 4. 批量执行大模型推理
embeddings_dict = generate_embeddings_batch(texts_to_embed)
# 初始化聚类器(只在热搜模式下需要,且只初始化一次)
clusterer = None
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
clusterer = UnifiedEventClusterer(db)
# 按来源类型分流:热搜/API → TrendingEvent + 聚类;RSS → NewsArticle
for index, (item, external_id) in enumerate(valid_items, 1):
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
existing_event = existing_events_dict.get(external_id)
@@ -269,14 +254,12 @@ async def fetch_and_save_trending_data():
"""
print(f"[{utcnow()}] 开始执行定时抓取任务...")
# 获取启用的信息源 - 这个只读操作用一个短连接
with SessionLocal() as db:
sources = db.query(InfoSource).filter(InfoSource.is_enabled == True).all()
if not sources:
print("没有找到启用的信息源,任务结束。")
return
# 我们把 source 的信息提前提取出来,避免在异步中长期持有 session
source_configs = [
{
"id": s.id,
@@ -287,7 +270,6 @@ async def fetch_and_save_trending_data():
for s in sources
]
# 伪装请求头,规避反爬
custom_headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
"Accept": "application/json, text/plain, */*",
@@ -304,13 +286,11 @@ async def fetch_and_save_trending_data():
url = f"{API_BASE_URL}?id={platform_id}&latest"
try:
# 1. 网络请求(可能耗时较长,不要包在 db session 里)
response = await client.get(url)
response.raise_for_status()
data_json = response.json()
items = data_json.get("items", [])
# 2. 数据库事务操作(尽量短,单独使用 session)
with SessionLocal() as db:
# 重新从短 session 中获取 source 实例,以免 detached
source = db.query(InfoSource).get(s_config["id"])
@@ -319,10 +299,8 @@ async def fetch_and_save_trending_data():
task_log = DataSyncTask(source_id=source.id, items_fetched=0)
try:
# 调用数据处理层
saved_count = process_source_data(db, source, items)
# 业务事务成功提交
task_log.items_fetched = saved_count
task_log.task_status = TaskStatus.SUCCESS
db.add(task_log)
@@ -330,10 +308,9 @@ async def fetch_and_save_trending_data():
print(f"[{source.source_name}] ({source.source_type}) 成功抓取并更新了 {saved_count} 条数据")
except Exception as e:
db.rollback()
raise e # 抛出给外层捕获记录日志
raise e
except Exception as e:
# 异常拦截与错误隔离,另起一个超短事务记录日志
with SessionLocal() as log_db:
try:
new_task_log = DataSyncTask(source_id=s_config["id"], items_fetched=0)
+3 -24
View File
@@ -1,7 +1,3 @@
"""
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
打分融合:标签/标题匹配分 + 标签相关度 + 热度 + 新鲜度加成
"""
import os
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
@@ -13,6 +9,7 @@ from sqlalchemy.orm import Session
from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
from app.services.fetcher_service import embedder_model
# AI辅助生成:deepseek-v3-2,2026年3月20日
# Semantic-match threshold: a user keyword only counts as a semantic hit when
# its vector similarity to an event tag/title reaches this value.
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
@@ -35,6 +32,7 @@ class MatchedEventResult:
semantic_hits: list[dict[str, Any]]
tags: list[str]
# AI生成结束
def _normalize_text(text: str) -> str:
"""统一小写与首尾空白,便于做稳定匹配。"""
@@ -80,7 +78,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
uncached_keywords = []
# 1. 尝试从缓存获取
for keyword in keywords:
if not keyword:
continue
@@ -89,9 +86,7 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
else:
uncached_keywords.append(keyword)
# 2. 对未命中的词进行统一的批量推理
if uncached_keywords:
# 去重,避免同一个未缓存的词被计算多次
unique_uncached = list(dict.fromkeys(uncached_keywords))
vectors = embedder_model.encode(unique_uncached, normalize_embeddings=True, show_progress_bar=False)
@@ -102,7 +97,6 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
for k in keys_to_delete:
del _EMBEDDING_CACHE[k]
# 3. 将新计算的向量存入缓存并回填结果
for keyword, vec in zip(unique_uncached, vectors):
vec_array = np.asarray(vec, dtype=np.float32)
_EMBEDDING_CACHE[keyword] = vec_array
@@ -172,7 +166,6 @@ def recommend_events_for_user(
else PREFERENCE_SEMANTIC_THRESHOLD
)
# 1. 读取用户兴趣词
preferences = (
db.query(UserTopicPreference)
.filter(UserTopicPreference.user_id == user_id)
@@ -185,7 +178,6 @@ def recommend_events_for_user(
if not preference_keywords:
return []
# 2. 读取候选事件(时间 + 热度过滤,避免全表扫描)
time_limit = utcnow() - timedelta(hours=hours)
events = (
db.query(UnifiedEvent)
@@ -213,20 +205,17 @@ def recommend_events_for_user(
.all()
)
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
event_topics: dict[int, list[tuple[str, float | None]]] = {}
for event_id, topic_keyword, relevance_score in topic_rows:
if not topic_keyword:
continue
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
# 3. 批量编码用户词与标签词,减少模型调用次数
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
# 预先建立“标准化后用户词集合”,用于精确匹配
normalized_preference_pairs = [
(word, _normalize_text(word))
for word in unique_preference_keywords
@@ -246,20 +235,15 @@ def recommend_events_for_user(
exact_hits: list[str] = []
semantic_hits: list[dict[str, Any]] = []
score = 0.0
# 对每个事件标签做精确匹配或语义匹配
for topic_keyword, topic_relevance in topic_list:
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
# 1) 精确命中(包括完全相等与包含关系)
matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
if matched_pref is not None:
exact_hits.append(topic_keyword)
# 精确命中给较高基础分,标签自身相关度作为增益
score += 45.0 + topic_relevance_score * 0.2
continue
# 2) 语义命中(未精确命中时再算)
best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)
if best_pref is not None and best_sim >= similarity_threshold:
@@ -270,10 +254,8 @@ def recommend_events_for_user(
"similarity": round(best_sim, 4),
}
)
# 语义命中分略低于精确命中,并由相似度放大
score += best_sim * 35.0 + topic_relevance_score * 0.12
# 标题也参与匹配,但权重低于结构化标签,避免长标题过度主导排序。
event_title = (event.unified_title or "").strip()
if event_title:
title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
@@ -292,15 +274,12 @@ def recommend_events_for_user(
)
score += best_sim * 24.0
# 如果精确和语义都没命中,直接跳过
if not exact_hits and not semantic_hits:
continue
# 融合事件热度和新鲜度,避免只看语义分
score += min(event.hot_score, 100) * 0.3
score += _calc_freshness_bonus(event)
# 返回标签时做去重,保证接口稳定
tags = list(dict.fromkeys([item[0] for item in topic_list]))
scored_results.append(
MatchedEventResult(
+4 -8
View File
@@ -1,8 +1,3 @@
# app/services/summary_service.py
"""
摘要服务:调用 LLM 生成统一标题、综合摘要、话题标签
定时任务:对热度达标且未摘要的事件批量处理
"""
import json
import os
from datetime import timedelta
@@ -26,12 +21,16 @@ from app.prompts.summary_prompts import (
)
from app.services.fetcher_service import embedder_model
# AI辅助生成:deepseek-v3-2,2026年3月20日
# Tunables read from the environment at import time, each with a default.
# NOTE(review): presumably the minimum hot_score an event needs before it is
# summarized — confirm against the query in generate_unified_summaries.
HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
# Falls back to HOT_SCORE_THRESHOLD when TOPIC_TAG_MIN_HOT_SCORE is not set.
TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
# NOTE(review): looks like a similarity cutoff for topic tags (default 0.82);
# the code that uses it is not visible here — confirm.
TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
# NOTE(review): presumably the maximum number of topic tags kept per event
# (default 8) — confirm against the caller.
TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
# API key handed to the AsyncOpenAI client below; empty string when unset.
AI_API_KEY = os.getenv("AI_API_KEY", "")
# AI生成结束
deepseek_client = AsyncOpenAI(
api_key=AI_API_KEY,
@@ -184,7 +183,6 @@ async def generate_unified_summaries():
"""定时任务:对热度达标且未摘要的事件刷新标题、摘要、标签"""
print(f"[{utcnow()}] Start unified summary generation task...")
# 先提取需要处理的事件 ID,尽早释放 session,不长期占用 db session
with SessionLocal() as db:
recent_threshold = utcnow() - timedelta(days=3)
events = db.query(UnifiedEvent).filter(
@@ -197,11 +195,9 @@ async def generate_unified_summaries():
print("No events require summary update in this round.")
return
# 复制出需要的信息,脱离 session
event_ids = [e.id for e in events]
event_hot_scores = {e.id: e.hot_score for e in events}
# 外层循环:针对每个 event_id 开启一个极短生命周期的 session 获取依赖数据
for event_id in event_ids:
platform_dict: dict[str, set[str]] = {}
with SessionLocal() as db: