big update

This commit is contained in:
stardrophere
2026-03-11 20:52:58 +08:00
parent 8ed819a580
commit 966bcfbba4
44 changed files with 7124 additions and 650 deletions
+454
View File
@@ -0,0 +1,454 @@
# 定时推送调度服务
# 由 APScheduler 每分钟调用,检查当前时刻是否有用户需要接收推送,
# 如匹配则生成摘要邮件并发送,同时写入 DeliveryHistory 防重复。
import logging
from logging.handlers import TimedRotatingFileHandler
from dataclasses import dataclass, field
from datetime import datetime, time as dt_time, timedelta, timezone, tzinfo
from pathlib import Path
from typing import Any
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
from sqlalchemy.orm import Session
from app.database import SessionLocal
from app.models.models import (
AppUser,
DeliveryHistory,
ExtractedTopic,
InfoSource,
TargetType,
TaskStatus,
TrendingEvent,
UnifiedEvent,
UserDeliverySchedule,
UserPushEndpoint,
UserTopicPreference,
utcnow,
)
from app.prompts.digest_email_template import build_digest_html
from app.services.matching_service import recommend_events_for_user
from app.utils.email_utils import send_html_email
logger = logging.getLogger("delivery_service")
# delivery_service 日志单独写文件,不再输出到控制台
_delivery_log_dir = Path(__file__).resolve().parents[2] / "logs"
_delivery_log_dir.mkdir(parents=True, exist_ok=True)
_delivery_log_file = _delivery_log_dir / "delivery_check.log"
if not logger.handlers:
_file_handler = TimedRotatingFileHandler(
filename=str(_delivery_log_file),
when="midnight",
interval=1,
backupCount=14,
encoding="utf-8",
)
_file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
logger.addHandler(_file_handler)
logger.setLevel(logging.INFO)
logger.propagate = False
# 推送时间窗口:实际执行时刻与设定时间的最大容差(分钟)
DELIVERY_WINDOW_MINUTES = 2
# 同一用户两次推送之间的最小间隔(分钟)
MIN_PUSH_INTERVAL_MINUTES = 30
# 单次推送最多携带的事件数
MAX_EVENTS_PER_PUSH = 12
# 默认模式热度阈值(无关键词或无匹配时使用)
DEFAULT_MODE_HOT_THRESHOLD = 3
# 默认模式查询时间窗口(小时)
DEFAULT_MODE_HOURS = 48
# 用户时区无效时的兜底时区
DEFAULT_FALLBACK_TIMEZONE = "Asia/Shanghai"
# ==========================================
# 默认热点事件容器(无关键词时使用)
# ==========================================
@dataclass
class _DefaultEventItem:
"""
无关键词订阅或关键词无匹配时的默认热点包装器,
接口与 MatchedEventResult 保持一致,方便统一传给模板。
"""
event: UnifiedEvent
match_score: float = 0.0
exact_hits: list[str] = field(default_factory=list)
semantic_hits: list[dict[str, Any]] = field(default_factory=list)
tags: list[str] = field(default_factory=list)
is_default: bool = True
# ==========================================
# 时区工具
# ==========================================
def _time_to_minutes(t: dt_time) -> int:
return t.hour * 60 + t.minute
def _is_within_window(schedule_time: dt_time, current_time: dt_time, window: int = DELIVERY_WINDOW_MINUTES) -> bool:
"""判断 schedule_time 是否在 current_time ± window 分钟范围内(跨午夜安全)。"""
s = _time_to_minutes(schedule_time)
c = _time_to_minutes(current_time)
diff = abs(s - c)
return min(diff, 1440 - diff) <= window
def _resolve_user_timezone(user_timezone: str | None) -> tzinfo:
"""解析用户时区,异常时回退到默认时区。"""
tz_name = (user_timezone or "").strip() or DEFAULT_FALLBACK_TIMEZONE
try:
return ZoneInfo(tz_name)
except ZoneInfoNotFoundError:
logger.warning(
"用户时区无效,已回退默认时区。timezone=%s fallback=%s",
tz_name, DEFAULT_FALLBACK_TIMEZONE,
)
try:
return ZoneInfo(DEFAULT_FALLBACK_TIMEZONE)
except ZoneInfoNotFoundError:
logger.warning("系统缺少时区数据库,最终回退为 UTC。建议安装 tzdata 包。")
return timezone.utc
def _user_local_time(now_utc: datetime, user_timezone: str | None) -> dt_time:
"""把 UTC 当前时刻转换为用户本地时间(仅取 HH:MM)。"""
local_dt = now_utc.astimezone(_resolve_user_timezone(user_timezone))
return local_dt.time().replace(second=0, microsecond=0)
def _ensure_aware(dt: datetime) -> datetime:
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
return dt
# ==========================================
# 数据库查询辅助
# ==========================================
def _should_skip_by_interval(db: Session, user_id: int) -> bool:
"""检查用户是否仍在 30 分钟冷却期内。"""
row = (
db.query(DeliveryHistory.created_at)
.filter(
DeliveryHistory.user_id == user_id,
DeliveryHistory.status == TaskStatus.SUCCESS,
)
.order_by(DeliveryHistory.created_at.desc())
.first()
)
if row is None:
return False
last_time = _ensure_aware(row[0])
elapsed = (utcnow() - last_time).total_seconds() / 60.0
return elapsed < MIN_PUSH_INTERVAL_MINUTES
def _get_user_email_endpoints(db: Session, user_id: int) -> list[UserPushEndpoint]:
"""获取用户已启用的邮件类型推送渠道,按优先级排序。"""
return (
db.query(UserPushEndpoint)
.filter(
UserPushEndpoint.user_id == user_id,
UserPushEndpoint.channel_type == "EMAIL",
UserPushEndpoint.is_active == True,
)
.order_by(UserPushEndpoint.priority_level.asc())
.all()
)
def _get_already_pushed_event_ids(db: Session, user_id: int) -> set[int]:
"""获取已经推送过的事件 ID 集合,避免重复轰炸。"""
rows = (
db.query(DeliveryHistory.target_id)
.filter(
DeliveryHistory.user_id == user_id,
DeliveryHistory.target_type == TargetType.EVENT,
DeliveryHistory.status == TaskStatus.SUCCESS,
)
.all()
)
return {r[0] for r in rows}
def _load_event_platforms(db: Session, event_ids: list[int]) -> dict[int, list[dict]]:
"""
批量加载事件的平台来源数据。
返回:event_id → [{source_name, headline, url, ranking, icon_url}, ...]
按排名升序排列(rank 1 最靠前)。
"""
if not event_ids:
return {}
rows = (
db.query(
TrendingEvent.unified_event_id,
TrendingEvent.current_headline,
TrendingEvent.event_url,
TrendingEvent.current_ranking,
TrendingEvent.icon_url,
InfoSource.source_name,
)
.join(InfoSource, TrendingEvent.source_id == InfoSource.id)
.filter(TrendingEvent.unified_event_id.in_(event_ids))
.order_by(
TrendingEvent.unified_event_id,
TrendingEvent.current_ranking.asc().nulls_last(),
)
.all()
)
result: dict[int, list[dict]] = {}
for event_id, headline, url, ranking, icon_url, source_name in rows:
result.setdefault(event_id, []).append({
"source_name": source_name or "未知",
"headline": headline or "",
"url": url or "",
"ranking": ranking,
"icon_url": icon_url or "",
})
return result
def _load_event_tags(db: Session, event_ids: list[int]) -> dict[int, list[str]]:
"""批量加载事件的标签,返回 event_id → [tag, ...]。"""
if not event_ids:
return {}
rows = (
db.query(ExtractedTopic.target_id, ExtractedTopic.topic_keyword)
.filter(
ExtractedTopic.target_type == TargetType.EVENT,
ExtractedTopic.target_id.in_(event_ids),
)
.all()
)
tags_map: dict[int, list[str]] = {}
for eid, kw in rows:
if kw:
tags_map.setdefault(eid, []).append(kw)
return tags_map
def _user_has_keywords(db: Session, user_id: int) -> bool:
"""判断用户是否配置了关键词订阅。"""
return (
db.query(UserTopicPreference.id)
.filter(UserTopicPreference.user_id == user_id)
.first()
) is not None
def _get_default_hot_events(
db: Session,
pushed_ids: set[int],
) -> list[_DefaultEventItem]:
"""
默认模式:获取热度 >= DEFAULT_MODE_HOT_THRESHOLD 的近期热点,
排除已推送过的,封装成与 MatchedEventResult 接口相同的对象。
"""
time_limit = utcnow() - timedelta(hours=DEFAULT_MODE_HOURS)
events = (
db.query(UnifiedEvent)
.filter(
UnifiedEvent.hot_score >= DEFAULT_MODE_HOT_THRESHOLD,
UnifiedEvent.created_at >= time_limit,
)
.order_by(UnifiedEvent.hot_score.desc())
.limit(MAX_EVENTS_PER_PUSH * 2)
.all()
)
event_ids = [e.id for e in events if e.id not in pushed_ids]
tags_map = _load_event_tags(db, event_ids)
result: list[_DefaultEventItem] = []
for ev in events:
if ev.id in pushed_ids:
continue
result.append(_DefaultEventItem(
event=ev,
tags=list(dict.fromkeys(tags_map.get(ev.id, [])))[:6],
))
if len(result) >= MAX_EVENTS_PER_PUSH:
break
return result
def _record_delivery(
db: Session,
user_id: int,
event_ids: list[int],
status: TaskStatus,
) -> None:
"""批量写入推送历史记录。"""
for eid in event_ids:
record = DeliveryHistory(
user_id=user_id,
target_type=TargetType.EVENT,
target_id=eid,
status=status,
)
db.add(record)
db.commit()
# ==========================================
# 推送准备
# ==========================================
@dataclass
class _PendingPush:
"""暂存需要发送邮件的信息,便于在 async 上下文中发送。"""
user_id: int
email_targets: list[str]
subject: str
html_body: str
event_ids: list[int]
def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedule) -> _PendingPush | None:
"""
同步准备单个用户的推送数据(DB 操作),不实际发送邮件。
推送优先级:
1. 有关键词 且 有匹配 → 发送匹配事件
2. 有关键词 但 无匹配 → 发送默认热点(热度 >= 3)
3. 无关键词 → 发送默认热点(热度 >= 3)
"""
user_id = user.id
if _should_skip_by_interval(db, user_id):
logger.info(f"用户 {user_id} 仍在 {MIN_PUSH_INTERVAL_MINUTES} 分钟冷却期内,跳过")
return None
email_endpoints = _get_user_email_endpoints(db, user_id)
if not email_endpoints:
logger.info(f"用户 {user_id} 无可用邮件渠道,跳过")
return None
pushed_ids = _get_already_pushed_event_ids(db, user_id)
# ——— 决策:匹配模式 or 默认模式 ———
items: list = []
is_default = False
has_keywords = _user_has_keywords(db, user_id)
if has_keywords:
matched = recommend_events_for_user(
db,
user_id=user_id,
min_hot=1,
hours=72,
limit=MAX_EVENTS_PER_PUSH * 2,
)
fresh_matched = [m for m in matched if m.event.id not in pushed_ids]
if fresh_matched:
items = fresh_matched[:MAX_EVENTS_PER_PUSH]
logger.info(f"用户 {user_id} 关键词匹配,推送 {len(items)} 条事件")
else:
logger.info(f"用户 {user_id} 关键词无匹配结果,切换为默认热点模式")
is_default = True
else:
logger.info(f"用户 {user_id} 未配置关键词,使用默认热点模式")
is_default = True
if is_default:
items = _get_default_hot_events(db, pushed_ids)
if not items:
logger.info(f"用户 {user_id} 默认热点无可推送内容,跳过")
return None
# 批量加载平台数据(来源名、标题、URL、排名)
event_ids = [item.event.id for item in items]
platforms_map = _load_event_platforms(db, event_ids)
time_str = schedule.delivery_time.strftime("%H:%M")
html_body = build_digest_html(
items=items,
delivery_time_str=time_str,
platforms_map=platforms_map,
is_default_push=is_default,
)
subject_suffix = "全网热点快报" if is_default else "个性化简报"
return _PendingPush(
user_id=user_id,
email_targets=[ep.channel_account for ep in email_endpoints],
subject=f"InsightRadar {subject_suffix} · {time_str}",
html_body=html_body,
event_ids=event_ids,
)
# ==========================================
# 调度主入口
# ==========================================
async def check_and_deliver() -> None:
"""
定时推送主入口,由 APScheduler 每分钟调用。
流程:
1. 获取当前 UTC 时间
2. 查询所有启用的推送计划
3. 对每个计划,按用户本地时区判断是否在推送窗口
4. 同步准备推送数据 → 异步发送邮件 → 记录结果
"""
now = datetime.now(timezone.utc)
current_utc = now.time().replace(second=0, microsecond=0)
logger.debug(f"推送调度检查 @ UTC {current_utc.strftime('%H:%M')}")
db: Session = SessionLocal()
try:
active_schedules = (
db.query(UserDeliverySchedule)
.filter(UserDeliverySchedule.is_active == True)
.all()
)
for schedule in active_schedules:
user = db.query(AppUser).filter(AppUser.id == schedule.user_id).first()
if not user:
continue
# 用户本地时间对比(核心时区修正)
user_current = _user_local_time(now, user.timezone)
if not _is_within_window(schedule.delivery_time, user_current):
continue
try:
pending = _prepare_user_push(db, user, schedule)
if pending is None:
continue
# 异步按优先级尝试各邮件渠道
sent = False
for target_email in pending.email_targets:
try:
success = await send_html_email(
to_email=target_email,
subject=pending.subject,
html_content=pending.html_body,
)
if success:
sent = True
logger.info(f"用户 {pending.user_id} 邮件发送成功 → {target_email}")
break
else:
logger.warning(f"用户 {pending.user_id} 渠道 {target_email} 发送失败,尝试下一个")
except Exception as e:
logger.error(f"用户 {pending.user_id} 发送至 {target_email} 异常: {e}")
_record_delivery(
db,
user_id=pending.user_id,
event_ids=pending.event_ids,
status=TaskStatus.SUCCESS if sent else TaskStatus.ERROR,
)
except Exception as e:
logger.error(f"推送用户 {schedule.user_id} 时异常: {e}", exc_info=True)
except Exception as e:
logger.error(f"推送调度主循环异常: {e}", exc_info=True)
finally:
db.close()
+238
View File
@@ -0,0 +1,238 @@
import os
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
import numpy as np
from sqlalchemy.orm import Session
from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
from app.services.fetcher_service import embedder_model
# 语义匹配阈值:用户关键词和事件标签向量相似度达到该值才计入语义命中
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
PREFERENCE_SEMANTIC_THRESHOLD = float(
os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
)
# 推荐列表最大返回条数
DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT = 50
PREFERENCE_RECOMMEND_MAX_LIMIT = int(
os.getenv("PREFERENCE_RECOMMEND_MAX_LIMIT", str(DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT))
)
@dataclass
class MatchedEventResult:
"""用户兴趣匹配后的事件结果。"""
event: UnifiedEvent
match_score: float
exact_hits: list[str]
semantic_hits: list[dict[str, Any]]
tags: list[str]
def _normalize_text(text: str) -> str:
"""统一小写与首尾空白,便于做稳定匹配。"""
return text.strip().casefold()
def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
"""
批量生成关键词向量,并返回原词到向量的映射。
这里要求向量已归一化,后续可直接用点积表示余弦相似度。
"""
if not keywords:
return {}
vectors = embedder_model.encode(keywords, normalize_embeddings=True)
result: dict[str, np.ndarray] = {}
for keyword, vec in zip(keywords, vectors):
result[keyword] = np.asarray(vec, dtype=np.float32)
return result
def _ensure_aware(dt: datetime) -> datetime:
"""SQLite 读出的 datetime 不带时区信息,统一补上 UTC 后才能和 utcnow() 做减法。"""
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
return dt
def _calc_freshness_bonus(event: UnifiedEvent) -> float:
"""根据事件新鲜度给一个小额加分,避免旧热点长期占据推荐位。"""
age_hours = max((utcnow() - _ensure_aware(event.created_at)).total_seconds() / 3600.0, 0.0)
if age_hours <= 6:
return 12.0
if age_hours <= 24:
return 8.0
if age_hours <= 72:
return 4.0
return 0.0
def recommend_events_for_user(
db: Session,
*,
user_id: int,
min_hot: int = 3,
hours: int = 72,
limit: int = 20,
semantic_threshold: float | None = None,
) -> list[MatchedEventResult]:
"""
用户兴趣推荐主流程:
1) 精确匹配:用户词 == EVENT 标签
2) 语义匹配:用户词向量 vs EVENT 标签向量(超过阈值)
3) 打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度
"""
final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
similarity_threshold = (
semantic_threshold
if semantic_threshold is not None
else PREFERENCE_SEMANTIC_THRESHOLD
)
# 读取用户兴趣词
preferences = (
db.query(UserTopicPreference)
.filter(UserTopicPreference.user_id == user_id)
.all()
)
if not preferences:
return []
preference_keywords = [pref.interested_keyword.strip() for pref in preferences if pref.interested_keyword.strip()]
if not preference_keywords:
return []
# 读取候选事件(先做时间和热度过滤,避免全表扫描)
time_limit = utcnow() - timedelta(hours=hours)
events = (
db.query(UnifiedEvent)
.filter(
UnifiedEvent.hot_score >= min_hot,
UnifiedEvent.created_at >= time_limit,
)
.order_by(UnifiedEvent.hot_score.desc(), UnifiedEvent.created_at.desc())
.all()
)
if not events:
return []
event_id_list = [event.id for event in events]
topic_rows = (
db.query(
ExtractedTopic.target_id,
ExtractedTopic.topic_keyword,
ExtractedTopic.relevance_score,
)
.filter(
ExtractedTopic.target_type == TargetType.EVENT,
ExtractedTopic.target_id.in_(event_id_list),
)
.all()
)
if not topic_rows:
return []
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
event_topics: dict[int, list[tuple[str, float | None]]] = {}
for event_id, topic_keyword, relevance_score in topic_rows:
if not topic_keyword:
continue
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
# 如果某事件没有标签,就不参与推荐
if not event_topics:
return []
# 批量编码用户词和标签词,避免逐条调用模型
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
# 预先建立“标准化后用户词集合”,用于精确匹配
normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
scored_results: list[MatchedEventResult] = []
for event in events:
topic_list = event_topics.get(event.id, [])
if not topic_list:
continue
exact_hits: list[str] = []
semantic_hits: list[dict[str, Any]] = []
score = 0.0
# 对事件标签逐个匹配用户兴趣
for topic_keyword, topic_relevance in topic_list:
normalized_topic = _normalize_text(topic_keyword)
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
# 1) 精确命中(包括完全相等与包含关系)
matched_exact = False
if normalized_topic in normalized_pref_set:
matched_exact = True
else:
for pref_word in normalized_pref_set:
if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
matched_exact = True
break
if matched_exact:
exact_hits.append(topic_keyword)
# 精确命中给较高基础分,标签自身相关度作为增益
score += 45.0 + topic_relevance_score * 0.2
continue
# 2) 语义命中(未精确命中时再算)
topic_vec = topic_vec_map.get(topic_keyword)
if topic_vec is None:
continue
best_pref = None
best_sim = -1.0
for pref_keyword, pref_vec in pref_vec_map.items():
sim = float(np.dot(topic_vec, pref_vec))
if sim > best_sim:
best_sim = sim
best_pref = pref_keyword
if best_pref is not None and best_sim >= similarity_threshold:
semantic_hits.append(
{
"preference_keyword": best_pref,
"topic_keyword": topic_keyword,
"similarity": round(best_sim, 4),
}
)
# 语义命中分略低于精确命中,并由相似度放大
score += best_sim * 35.0 + topic_relevance_score * 0.12
# 如果精确和语义都没命中,直接跳过
if not exact_hits and not semantic_hits:
continue
# 融合事件热度和新鲜度,避免只看语义分
score += min(event.hot_score, 100) * 0.3
score += _calc_freshness_bonus(event)
# 返回标签时做去重,保证接口稳定
tags = list(dict.fromkeys([item[0] for item in topic_list]))
scored_results.append(
MatchedEventResult(
event=event,
match_score=round(score, 2),
exact_hits=list(dict.fromkeys(exact_hits)),
semantic_hits=semantic_hits,
tags=tags,
)
)
scored_results.sort(
key=lambda item: (item.match_score, item.event.hot_score, item.event.created_at),
reverse=True,
)
return scored_results[:final_limit]
+175 -38
View File
@@ -1,104 +1,241 @@
# app/services/summary_service.py
import os
import json
import os
from datetime import timedelta
from typing import Any
import numpy as np
from openai import AsyncOpenAI
from app.database import SessionLocal
from app.models.models import UnifiedEvent, TrendingEvent, InfoSource, utcnow
from app.models.models import (
ExtractedTopic,
InfoSource,
TargetType,
TrendingEvent,
UnifiedEvent,
utcnow,
)
from app.prompts.summary_prompts import (
SUMMARY_SYSTEM_PROMPT,
SUMMARY_USER_PROMPT_TEMPLATE,
)
from app.services.fetcher_service import embedder_model
HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
AI_API_KEY = os.getenv("AI_API_KEY", '')
TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
AI_API_KEY = os.getenv("AI_API_KEY", "")
# 1. 初始化异步客户端 (全局复用)
deepseek_client = AsyncOpenAI(
api_key=AI_API_KEY,
base_url="https://api.deepseek.com"
base_url="https://api.deepseek.com",
)
async def call_llm_for_summary(platform_data_text: str) -> dict:
"""调用 DeepSeek 生成统一标题和多平台视角摘要"""
prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
platform_data_text=platform_data_text
)
"""Call LLM for unified title, summary and topic candidates."""
prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)
# await
response = await deepseek_client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
{"role": "user", "content": prompt}
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
temperature=1
temperature=1,
)
result_text = response.choices[0].message.content
return json.loads(result_text)
def _normalize_score(raw_score: Any) -> float | None:
try:
score = float(raw_score)
except (TypeError, ValueError):
return None
if score <= 1:
score *= 100
return max(0.0, min(100.0, score))
def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
"""Parse topic keywords from LLM response; support list[str] and list[object]."""
raw_topics = llm_result.get("topic_keywords") or []
parsed: list[dict[str, Any]] = []
seen: set[str] = set()
for item in raw_topics:
keyword = ""
score = None
if isinstance(item, str):
keyword = item.strip()
elif isinstance(item, dict):
raw_keyword = (
item.get("keyword")
or item.get("topic_keyword")
or item.get("name")
or item.get("topic")
or ""
)
keyword = str(raw_keyword).strip()
score = _normalize_score(item.get("relevance_score") or item.get("score"))
if not keyword:
continue
keyword = keyword[:100]
normalized_key = keyword.casefold()
if normalized_key in seen:
continue
seen.add(normalized_key)
parsed.append({"keyword": keyword, "score": score})
return parsed
def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Deduplicate semantically similar tags using embedding similarity."""
if not topic_candidates:
return []
keywords = [item["keyword"] for item in topic_candidates]
vectors = embedder_model.encode(keywords, normalize_embeddings=True)
clusters: list[dict[str, Any]] = []
for item, vector in zip(topic_candidates, vectors):
vec = np.asarray(vector, dtype=np.float32)
best_idx = -1
best_sim = -1.0
for idx, cluster in enumerate(clusters):
sim = float(np.dot(vec, cluster["vector"]))
if sim > best_sim:
best_sim = sim
best_idx = idx
if best_idx >= 0 and best_sim >= TOPIC_SIMILARITY_THRESHOLD:
cluster = clusters[best_idx]
merged = cluster["vector"] * cluster["count"] + vec
norm = float(np.linalg.norm(merged))
if norm > 0:
cluster["vector"] = merged / norm
cluster["count"] += 1
if item["score"] is not None and (
cluster["score"] is None or item["score"] > cluster["score"]
):
cluster["score"] = item["score"]
# Prefer shorter tag as canonical keyword.
if len(item["keyword"]) < len(cluster["keyword"]):
cluster["keyword"] = item["keyword"]
else:
clusters.append(
{
"keyword": item["keyword"],
"score": item["score"],
"vector": vec,
"count": 1,
}
)
if any(cluster["score"] is not None for cluster in clusters):
clusters.sort(key=lambda x: x["score"] if x["score"] is not None else -1.0, reverse=True)
result = [
{"keyword": cluster["keyword"], "score": cluster["score"]}
for cluster in clusters[:TOPIC_TAG_MAX_COUNT]
]
return result
def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
"""Replace EVENT tags for one unified event atomically within current transaction."""
db.query(ExtractedTopic).filter(
ExtractedTopic.target_type == TargetType.EVENT,
ExtractedTopic.target_id == event_id,
).delete(synchronize_session=False)
for item in normalized_topics:
db.add(
ExtractedTopic(
target_type=TargetType.EVENT,
target_id=event_id,
topic_keyword=item["keyword"],
relevance_score=item["score"],
)
)
async def generate_unified_summaries():
"""定时任务:扫描高热度事件并生成/更新摘要"""
print(f"[{utcnow()}] 开始执行 DeepSeek 摘要生成任务...")
"""Scheduled task: refresh summaries and topic tags for hot unified events."""
print(f"[{utcnow()}] Start unified summary generation task...")
with SessionLocal() as db:
recent_threshold = utcnow() - timedelta(days=3)
# 必须满足:热度达标 AND (当前热度 > 上次生成摘要时的热度) AND 近期活跃
events = db.query(UnifiedEvent).filter(
UnifiedEvent.hot_score >= HOT_SCORE_THRESHOLD,
UnifiedEvent.hot_score > UnifiedEvent.last_summarized_trends_count,
UnifiedEvent.created_at >= recent_threshold
UnifiedEvent.created_at >= recent_threshold,
).all()
if not events:
print("当前没有需要更新摘要的大事件,任务结束。")
print("No events require summary update in this round.")
return
for event in events:
# 联合查询获取该事件在各平台的子新闻
trends = db.query(TrendingEvent, InfoSource.source_name) \
.join(InfoSource, TrendingEvent.source_id == InfoSource.id) \
.filter(TrendingEvent.unified_event_id == event.id) \
trends = (
db.query(TrendingEvent, InfoSource.source_name)
.join(InfoSource, TrendingEvent.source_id == InfoSource.id)
.filter(TrendingEvent.unified_event_id == event.id)
.all()
)
if not trends:
continue
# 按平台归类标题并去重
platform_dict = {}
platform_dict: dict[str, set[str]] = {}
for trend_record, source_name in trends:
if source_name not in platform_dict:
platform_dict[source_name] = set()
platform_dict[source_name].add(trend_record.current_headline)
platform_dict.setdefault(source_name, set()).add(trend_record.current_headline)
# 组装给大模型的 Prompt 数据
prompt_lines = [f"{platform}】: {', '.join(headlines)}" for platform, headlines in platform_dict.items()]
prompt_lines = [
f"[{platform}] {', '.join(sorted(headlines))}"
for platform, headlines in platform_dict.items()
]
platform_data_text = "\n".join(prompt_lines)
try:
# 调用封装好的异步函数
llm_result = await call_llm_for_summary(platform_data_text)
if "unified_title" in llm_result:
if "unified_title" in llm_result and llm_result["unified_title"]:
event.unified_title = llm_result["unified_title"]
if "ai_comprehensive_summary" in llm_result:
if "ai_comprehensive_summary" in llm_result and llm_result["ai_comprehensive_summary"]:
event.ai_comprehensive_summary = llm_result["ai_comprehensive_summary"]
# 成功后更新水位线
# 将最后一次总结时的热搜数量,更新为当前最新的 hot_score
if event.hot_score >= TOPIC_TAG_MIN_HOT_SCORE:
topic_candidates = parse_topic_keywords(llm_result)
normalized_topics = normalize_topic_keywords(topic_candidates)
if normalized_topics:
replace_event_topics(db, event.id, normalized_topics)
event.last_summarized_trends_count = event.hot_score
print(
f"Updated event {event.id} summary"
f" (hot_score={event.hot_score})."
)
print(f"成功更新大事件 ID {event.id} 的深度摘要 (当前热度: {event.hot_score})。")
except Exception as e:
print(f"大事件 ID {event.id} 摘要生成失败: {e}")
except Exception as exc:
print(f"Event {event.id} summary generation failed: {exc}")
continue
# 提交事务
db.commit()