mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:57:51 +08:00
big update
This commit is contained in:
@@ -0,0 +1,454 @@
|
||||
# 定时推送调度服务
|
||||
# 由 APScheduler 每分钟调用,检查当前时刻是否有用户需要接收推送,
|
||||
# 如匹配则生成摘要邮件并发送,同时写入 DeliveryHistory 防重复。
|
||||
import logging
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, time as dt_time, timedelta, timezone, tzinfo
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.database import SessionLocal
|
||||
from app.models.models import (
|
||||
AppUser,
|
||||
DeliveryHistory,
|
||||
ExtractedTopic,
|
||||
InfoSource,
|
||||
TargetType,
|
||||
TaskStatus,
|
||||
TrendingEvent,
|
||||
UnifiedEvent,
|
||||
UserDeliverySchedule,
|
||||
UserPushEndpoint,
|
||||
UserTopicPreference,
|
||||
utcnow,
|
||||
)
|
||||
from app.prompts.digest_email_template import build_digest_html
|
||||
from app.services.matching_service import recommend_events_for_user
|
||||
from app.utils.email_utils import send_html_email
|
||||
|
||||
logger = logging.getLogger("delivery_service")
|
||||
|
||||
# delivery_service 日志单独写文件,不再输出到控制台
|
||||
_delivery_log_dir = Path(__file__).resolve().parents[2] / "logs"
|
||||
_delivery_log_dir.mkdir(parents=True, exist_ok=True)
|
||||
_delivery_log_file = _delivery_log_dir / "delivery_check.log"
|
||||
if not logger.handlers:
|
||||
_file_handler = TimedRotatingFileHandler(
|
||||
filename=str(_delivery_log_file),
|
||||
when="midnight",
|
||||
interval=1,
|
||||
backupCount=14,
|
||||
encoding="utf-8",
|
||||
)
|
||||
_file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
|
||||
logger.addHandler(_file_handler)
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.propagate = False
|
||||
|
||||
# 推送时间窗口:实际执行时刻与设定时间的最大容差(分钟)
|
||||
DELIVERY_WINDOW_MINUTES = 2
|
||||
# 同一用户两次推送之间的最小间隔(分钟)
|
||||
MIN_PUSH_INTERVAL_MINUTES = 30
|
||||
# 单次推送最多携带的事件数
|
||||
MAX_EVENTS_PER_PUSH = 12
|
||||
# 默认模式热度阈值(无关键词或无匹配时使用)
|
||||
DEFAULT_MODE_HOT_THRESHOLD = 3
|
||||
# 默认模式查询时间窗口(小时)
|
||||
DEFAULT_MODE_HOURS = 48
|
||||
# 用户时区无效时的兜底时区
|
||||
DEFAULT_FALLBACK_TIMEZONE = "Asia/Shanghai"
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 默认热点事件容器(无关键词时使用)
|
||||
# ==========================================
|
||||
@dataclass
|
||||
class _DefaultEventItem:
|
||||
"""
|
||||
无关键词订阅或关键词无匹配时的默认热点包装器,
|
||||
接口与 MatchedEventResult 保持一致,方便统一传给模板。
|
||||
"""
|
||||
event: UnifiedEvent
|
||||
match_score: float = 0.0
|
||||
exact_hits: list[str] = field(default_factory=list)
|
||||
semantic_hits: list[dict[str, Any]] = field(default_factory=list)
|
||||
tags: list[str] = field(default_factory=list)
|
||||
is_default: bool = True
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 时区工具
|
||||
# ==========================================
|
||||
def _time_to_minutes(t: dt_time) -> int:
|
||||
return t.hour * 60 + t.minute
|
||||
|
||||
|
||||
def _is_within_window(schedule_time: dt_time, current_time: dt_time, window: int = DELIVERY_WINDOW_MINUTES) -> bool:
|
||||
"""判断 schedule_time 是否在 current_time ± window 分钟范围内(跨午夜安全)。"""
|
||||
s = _time_to_minutes(schedule_time)
|
||||
c = _time_to_minutes(current_time)
|
||||
diff = abs(s - c)
|
||||
return min(diff, 1440 - diff) <= window
|
||||
|
||||
|
||||
def _resolve_user_timezone(user_timezone: str | None) -> tzinfo:
|
||||
"""解析用户时区,异常时回退到默认时区。"""
|
||||
tz_name = (user_timezone or "").strip() or DEFAULT_FALLBACK_TIMEZONE
|
||||
try:
|
||||
return ZoneInfo(tz_name)
|
||||
except ZoneInfoNotFoundError:
|
||||
logger.warning(
|
||||
"用户时区无效,已回退默认时区。timezone=%s fallback=%s",
|
||||
tz_name, DEFAULT_FALLBACK_TIMEZONE,
|
||||
)
|
||||
try:
|
||||
return ZoneInfo(DEFAULT_FALLBACK_TIMEZONE)
|
||||
except ZoneInfoNotFoundError:
|
||||
logger.warning("系统缺少时区数据库,最终回退为 UTC。建议安装 tzdata 包。")
|
||||
return timezone.utc
|
||||
|
||||
|
||||
def _user_local_time(now_utc: datetime, user_timezone: str | None) -> dt_time:
|
||||
"""把 UTC 当前时刻转换为用户本地时间(仅取 HH:MM)。"""
|
||||
local_dt = now_utc.astimezone(_resolve_user_timezone(user_timezone))
|
||||
return local_dt.time().replace(second=0, microsecond=0)
|
||||
|
||||
|
||||
def _ensure_aware(dt: datetime) -> datetime:
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=timezone.utc)
|
||||
return dt
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 数据库查询辅助
|
||||
# ==========================================
|
||||
def _should_skip_by_interval(db: Session, user_id: int) -> bool:
|
||||
"""检查用户是否仍在 30 分钟冷却期内。"""
|
||||
row = (
|
||||
db.query(DeliveryHistory.created_at)
|
||||
.filter(
|
||||
DeliveryHistory.user_id == user_id,
|
||||
DeliveryHistory.status == TaskStatus.SUCCESS,
|
||||
)
|
||||
.order_by(DeliveryHistory.created_at.desc())
|
||||
.first()
|
||||
)
|
||||
if row is None:
|
||||
return False
|
||||
last_time = _ensure_aware(row[0])
|
||||
elapsed = (utcnow() - last_time).total_seconds() / 60.0
|
||||
return elapsed < MIN_PUSH_INTERVAL_MINUTES
|
||||
|
||||
|
||||
def _get_user_email_endpoints(db: Session, user_id: int) -> list[UserPushEndpoint]:
|
||||
"""获取用户已启用的邮件类型推送渠道,按优先级排序。"""
|
||||
return (
|
||||
db.query(UserPushEndpoint)
|
||||
.filter(
|
||||
UserPushEndpoint.user_id == user_id,
|
||||
UserPushEndpoint.channel_type == "EMAIL",
|
||||
UserPushEndpoint.is_active == True,
|
||||
)
|
||||
.order_by(UserPushEndpoint.priority_level.asc())
|
||||
.all()
|
||||
)
|
||||
|
||||
|
||||
def _get_already_pushed_event_ids(db: Session, user_id: int) -> set[int]:
|
||||
"""获取已经推送过的事件 ID 集合,避免重复轰炸。"""
|
||||
rows = (
|
||||
db.query(DeliveryHistory.target_id)
|
||||
.filter(
|
||||
DeliveryHistory.user_id == user_id,
|
||||
DeliveryHistory.target_type == TargetType.EVENT,
|
||||
DeliveryHistory.status == TaskStatus.SUCCESS,
|
||||
)
|
||||
.all()
|
||||
)
|
||||
return {r[0] for r in rows}
|
||||
|
||||
|
||||
def _load_event_platforms(db: Session, event_ids: list[int]) -> dict[int, list[dict]]:
|
||||
"""
|
||||
批量加载事件的平台来源数据。
|
||||
返回:event_id → [{source_name, headline, url, ranking, icon_url}, ...]
|
||||
按排名升序排列(rank 1 最靠前)。
|
||||
"""
|
||||
if not event_ids:
|
||||
return {}
|
||||
|
||||
rows = (
|
||||
db.query(
|
||||
TrendingEvent.unified_event_id,
|
||||
TrendingEvent.current_headline,
|
||||
TrendingEvent.event_url,
|
||||
TrendingEvent.current_ranking,
|
||||
TrendingEvent.icon_url,
|
||||
InfoSource.source_name,
|
||||
)
|
||||
.join(InfoSource, TrendingEvent.source_id == InfoSource.id)
|
||||
.filter(TrendingEvent.unified_event_id.in_(event_ids))
|
||||
.order_by(
|
||||
TrendingEvent.unified_event_id,
|
||||
TrendingEvent.current_ranking.asc().nulls_last(),
|
||||
)
|
||||
.all()
|
||||
)
|
||||
|
||||
result: dict[int, list[dict]] = {}
|
||||
for event_id, headline, url, ranking, icon_url, source_name in rows:
|
||||
result.setdefault(event_id, []).append({
|
||||
"source_name": source_name or "未知",
|
||||
"headline": headline or "",
|
||||
"url": url or "",
|
||||
"ranking": ranking,
|
||||
"icon_url": icon_url or "",
|
||||
})
|
||||
return result
|
||||
|
||||
|
||||
def _load_event_tags(db: Session, event_ids: list[int]) -> dict[int, list[str]]:
|
||||
"""批量加载事件的标签,返回 event_id → [tag, ...]。"""
|
||||
if not event_ids:
|
||||
return {}
|
||||
rows = (
|
||||
db.query(ExtractedTopic.target_id, ExtractedTopic.topic_keyword)
|
||||
.filter(
|
||||
ExtractedTopic.target_type == TargetType.EVENT,
|
||||
ExtractedTopic.target_id.in_(event_ids),
|
||||
)
|
||||
.all()
|
||||
)
|
||||
tags_map: dict[int, list[str]] = {}
|
||||
for eid, kw in rows:
|
||||
if kw:
|
||||
tags_map.setdefault(eid, []).append(kw)
|
||||
return tags_map
|
||||
|
||||
|
||||
def _user_has_keywords(db: Session, user_id: int) -> bool:
|
||||
"""判断用户是否配置了关键词订阅。"""
|
||||
return (
|
||||
db.query(UserTopicPreference.id)
|
||||
.filter(UserTopicPreference.user_id == user_id)
|
||||
.first()
|
||||
) is not None
|
||||
|
||||
|
||||
def _get_default_hot_events(
|
||||
db: Session,
|
||||
pushed_ids: set[int],
|
||||
) -> list[_DefaultEventItem]:
|
||||
"""
|
||||
默认模式:获取热度 >= DEFAULT_MODE_HOT_THRESHOLD 的近期热点,
|
||||
排除已推送过的,封装成与 MatchedEventResult 接口相同的对象。
|
||||
"""
|
||||
time_limit = utcnow() - timedelta(hours=DEFAULT_MODE_HOURS)
|
||||
events = (
|
||||
db.query(UnifiedEvent)
|
||||
.filter(
|
||||
UnifiedEvent.hot_score >= DEFAULT_MODE_HOT_THRESHOLD,
|
||||
UnifiedEvent.created_at >= time_limit,
|
||||
)
|
||||
.order_by(UnifiedEvent.hot_score.desc())
|
||||
.limit(MAX_EVENTS_PER_PUSH * 2)
|
||||
.all()
|
||||
)
|
||||
|
||||
event_ids = [e.id for e in events if e.id not in pushed_ids]
|
||||
tags_map = _load_event_tags(db, event_ids)
|
||||
|
||||
result: list[_DefaultEventItem] = []
|
||||
for ev in events:
|
||||
if ev.id in pushed_ids:
|
||||
continue
|
||||
result.append(_DefaultEventItem(
|
||||
event=ev,
|
||||
tags=list(dict.fromkeys(tags_map.get(ev.id, [])))[:6],
|
||||
))
|
||||
if len(result) >= MAX_EVENTS_PER_PUSH:
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _record_delivery(
|
||||
db: Session,
|
||||
user_id: int,
|
||||
event_ids: list[int],
|
||||
status: TaskStatus,
|
||||
) -> None:
|
||||
"""批量写入推送历史记录。"""
|
||||
for eid in event_ids:
|
||||
record = DeliveryHistory(
|
||||
user_id=user_id,
|
||||
target_type=TargetType.EVENT,
|
||||
target_id=eid,
|
||||
status=status,
|
||||
)
|
||||
db.add(record)
|
||||
db.commit()
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 推送准备
|
||||
# ==========================================
|
||||
@dataclass
|
||||
class _PendingPush:
|
||||
"""暂存需要发送邮件的信息,便于在 async 上下文中发送。"""
|
||||
user_id: int
|
||||
email_targets: list[str]
|
||||
subject: str
|
||||
html_body: str
|
||||
event_ids: list[int]
|
||||
|
||||
|
||||
def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedule) -> _PendingPush | None:
|
||||
"""
|
||||
同步准备单个用户的推送数据(DB 操作),不实际发送邮件。
|
||||
推送优先级:
|
||||
1. 有关键词 且 有匹配 → 发送匹配事件
|
||||
2. 有关键词 但 无匹配 → 发送默认热点(热度 >= 3)
|
||||
3. 无关键词 → 发送默认热点(热度 >= 3)
|
||||
"""
|
||||
user_id = user.id
|
||||
|
||||
if _should_skip_by_interval(db, user_id):
|
||||
logger.info(f"用户 {user_id} 仍在 {MIN_PUSH_INTERVAL_MINUTES} 分钟冷却期内,跳过")
|
||||
return None
|
||||
|
||||
email_endpoints = _get_user_email_endpoints(db, user_id)
|
||||
if not email_endpoints:
|
||||
logger.info(f"用户 {user_id} 无可用邮件渠道,跳过")
|
||||
return None
|
||||
|
||||
pushed_ids = _get_already_pushed_event_ids(db, user_id)
|
||||
|
||||
# ——— 决策:匹配模式 or 默认模式 ———
|
||||
items: list = []
|
||||
is_default = False
|
||||
|
||||
has_keywords = _user_has_keywords(db, user_id)
|
||||
if has_keywords:
|
||||
matched = recommend_events_for_user(
|
||||
db,
|
||||
user_id=user_id,
|
||||
min_hot=1,
|
||||
hours=72,
|
||||
limit=MAX_EVENTS_PER_PUSH * 2,
|
||||
)
|
||||
fresh_matched = [m for m in matched if m.event.id not in pushed_ids]
|
||||
if fresh_matched:
|
||||
items = fresh_matched[:MAX_EVENTS_PER_PUSH]
|
||||
logger.info(f"用户 {user_id} 关键词匹配,推送 {len(items)} 条事件")
|
||||
else:
|
||||
logger.info(f"用户 {user_id} 关键词无匹配结果,切换为默认热点模式")
|
||||
is_default = True
|
||||
else:
|
||||
logger.info(f"用户 {user_id} 未配置关键词,使用默认热点模式")
|
||||
is_default = True
|
||||
|
||||
if is_default:
|
||||
items = _get_default_hot_events(db, pushed_ids)
|
||||
if not items:
|
||||
logger.info(f"用户 {user_id} 默认热点无可推送内容,跳过")
|
||||
return None
|
||||
|
||||
# 批量加载平台数据(来源名、标题、URL、排名)
|
||||
event_ids = [item.event.id for item in items]
|
||||
platforms_map = _load_event_platforms(db, event_ids)
|
||||
|
||||
time_str = schedule.delivery_time.strftime("%H:%M")
|
||||
html_body = build_digest_html(
|
||||
items=items,
|
||||
delivery_time_str=time_str,
|
||||
platforms_map=platforms_map,
|
||||
is_default_push=is_default,
|
||||
)
|
||||
|
||||
subject_suffix = "全网热点快报" if is_default else "个性化简报"
|
||||
return _PendingPush(
|
||||
user_id=user_id,
|
||||
email_targets=[ep.channel_account for ep in email_endpoints],
|
||||
subject=f"InsightRadar {subject_suffix} · {time_str}",
|
||||
html_body=html_body,
|
||||
event_ids=event_ids,
|
||||
)
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 调度主入口
|
||||
# ==========================================
|
||||
async def check_and_deliver() -> None:
|
||||
"""
|
||||
定时推送主入口,由 APScheduler 每分钟调用。
|
||||
流程:
|
||||
1. 获取当前 UTC 时间
|
||||
2. 查询所有启用的推送计划
|
||||
3. 对每个计划,按用户本地时区判断是否在推送窗口
|
||||
4. 同步准备推送数据 → 异步发送邮件 → 记录结果
|
||||
"""
|
||||
now = datetime.now(timezone.utc)
|
||||
current_utc = now.time().replace(second=0, microsecond=0)
|
||||
logger.debug(f"推送调度检查 @ UTC {current_utc.strftime('%H:%M')}")
|
||||
|
||||
db: Session = SessionLocal()
|
||||
try:
|
||||
active_schedules = (
|
||||
db.query(UserDeliverySchedule)
|
||||
.filter(UserDeliverySchedule.is_active == True)
|
||||
.all()
|
||||
)
|
||||
|
||||
for schedule in active_schedules:
|
||||
user = db.query(AppUser).filter(AppUser.id == schedule.user_id).first()
|
||||
if not user:
|
||||
continue
|
||||
|
||||
# 用户本地时间对比(核心时区修正)
|
||||
user_current = _user_local_time(now, user.timezone)
|
||||
if not _is_within_window(schedule.delivery_time, user_current):
|
||||
continue
|
||||
|
||||
try:
|
||||
pending = _prepare_user_push(db, user, schedule)
|
||||
if pending is None:
|
||||
continue
|
||||
|
||||
# 异步按优先级尝试各邮件渠道
|
||||
sent = False
|
||||
for target_email in pending.email_targets:
|
||||
try:
|
||||
success = await send_html_email(
|
||||
to_email=target_email,
|
||||
subject=pending.subject,
|
||||
html_content=pending.html_body,
|
||||
)
|
||||
if success:
|
||||
sent = True
|
||||
logger.info(f"用户 {pending.user_id} 邮件发送成功 → {target_email}")
|
||||
break
|
||||
else:
|
||||
logger.warning(f"用户 {pending.user_id} 渠道 {target_email} 发送失败,尝试下一个")
|
||||
except Exception as e:
|
||||
logger.error(f"用户 {pending.user_id} 发送至 {target_email} 异常: {e}")
|
||||
|
||||
_record_delivery(
|
||||
db,
|
||||
user_id=pending.user_id,
|
||||
event_ids=pending.event_ids,
|
||||
status=TaskStatus.SUCCESS if sent else TaskStatus.ERROR,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"推送用户 {schedule.user_id} 时异常: {e}", exc_info=True)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"推送调度主循环异常: {e}", exc_info=True)
|
||||
finally:
|
||||
db.close()
|
||||
@@ -0,0 +1,238 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopicPreference, utcnow
|
||||
from app.services.fetcher_service import embedder_model
|
||||
|
||||
|
||||
# 语义匹配阈值:用户关键词和事件标签向量相似度达到该值才计入语义命中
|
||||
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
|
||||
PREFERENCE_SEMANTIC_THRESHOLD = float(
|
||||
os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
|
||||
)
|
||||
# 推荐列表最大返回条数
|
||||
DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT = 50
|
||||
PREFERENCE_RECOMMEND_MAX_LIMIT = int(
|
||||
os.getenv("PREFERENCE_RECOMMEND_MAX_LIMIT", str(DEFAULT_PREFERENCE_RECOMMEND_MAX_LIMIT))
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MatchedEventResult:
|
||||
"""用户兴趣匹配后的事件结果。"""
|
||||
event: UnifiedEvent
|
||||
match_score: float
|
||||
exact_hits: list[str]
|
||||
semantic_hits: list[dict[str, Any]]
|
||||
tags: list[str]
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
"""统一小写与首尾空白,便于做稳定匹配。"""
|
||||
return text.strip().casefold()
|
||||
|
||||
|
||||
def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
||||
"""
|
||||
批量生成关键词向量,并返回原词到向量的映射。
|
||||
这里要求向量已归一化,后续可直接用点积表示余弦相似度。
|
||||
"""
|
||||
if not keywords:
|
||||
return {}
|
||||
|
||||
vectors = embedder_model.encode(keywords, normalize_embeddings=True)
|
||||
result: dict[str, np.ndarray] = {}
|
||||
for keyword, vec in zip(keywords, vectors):
|
||||
result[keyword] = np.asarray(vec, dtype=np.float32)
|
||||
return result
|
||||
|
||||
|
||||
def _ensure_aware(dt: datetime) -> datetime:
|
||||
"""SQLite 读出的 datetime 不带时区信息,统一补上 UTC 后才能和 utcnow() 做减法。"""
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=timezone.utc)
|
||||
return dt
|
||||
|
||||
|
||||
def _calc_freshness_bonus(event: UnifiedEvent) -> float:
|
||||
"""根据事件新鲜度给一个小额加分,避免旧热点长期占据推荐位。"""
|
||||
age_hours = max((utcnow() - _ensure_aware(event.created_at)).total_seconds() / 3600.0, 0.0)
|
||||
if age_hours <= 6:
|
||||
return 12.0
|
||||
if age_hours <= 24:
|
||||
return 8.0
|
||||
if age_hours <= 72:
|
||||
return 4.0
|
||||
return 0.0
|
||||
|
||||
|
||||
def recommend_events_for_user(
|
||||
db: Session,
|
||||
*,
|
||||
user_id: int,
|
||||
min_hot: int = 3,
|
||||
hours: int = 72,
|
||||
limit: int = 20,
|
||||
semantic_threshold: float | None = None,
|
||||
) -> list[MatchedEventResult]:
|
||||
"""
|
||||
用户兴趣推荐主流程:
|
||||
1) 精确匹配:用户词 == EVENT 标签
|
||||
2) 语义匹配:用户词向量 vs EVENT 标签向量(超过阈值)
|
||||
3) 打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度
|
||||
"""
|
||||
final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
|
||||
similarity_threshold = (
|
||||
semantic_threshold
|
||||
if semantic_threshold is not None
|
||||
else PREFERENCE_SEMANTIC_THRESHOLD
|
||||
)
|
||||
|
||||
# 读取用户兴趣词
|
||||
preferences = (
|
||||
db.query(UserTopicPreference)
|
||||
.filter(UserTopicPreference.user_id == user_id)
|
||||
.all()
|
||||
)
|
||||
if not preferences:
|
||||
return []
|
||||
|
||||
preference_keywords = [pref.interested_keyword.strip() for pref in preferences if pref.interested_keyword.strip()]
|
||||
if not preference_keywords:
|
||||
return []
|
||||
|
||||
# 读取候选事件(先做时间和热度过滤,避免全表扫描)
|
||||
time_limit = utcnow() - timedelta(hours=hours)
|
||||
events = (
|
||||
db.query(UnifiedEvent)
|
||||
.filter(
|
||||
UnifiedEvent.hot_score >= min_hot,
|
||||
UnifiedEvent.created_at >= time_limit,
|
||||
)
|
||||
.order_by(UnifiedEvent.hot_score.desc(), UnifiedEvent.created_at.desc())
|
||||
.all()
|
||||
)
|
||||
if not events:
|
||||
return []
|
||||
|
||||
event_id_list = [event.id for event in events]
|
||||
topic_rows = (
|
||||
db.query(
|
||||
ExtractedTopic.target_id,
|
||||
ExtractedTopic.topic_keyword,
|
||||
ExtractedTopic.relevance_score,
|
||||
)
|
||||
.filter(
|
||||
ExtractedTopic.target_type == TargetType.EVENT,
|
||||
ExtractedTopic.target_id.in_(event_id_list),
|
||||
)
|
||||
.all()
|
||||
)
|
||||
if not topic_rows:
|
||||
return []
|
||||
|
||||
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
|
||||
event_topics: dict[int, list[tuple[str, float | None]]] = {}
|
||||
for event_id, topic_keyword, relevance_score in topic_rows:
|
||||
if not topic_keyword:
|
||||
continue
|
||||
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
|
||||
|
||||
# 如果某事件没有标签,就不参与推荐
|
||||
if not event_topics:
|
||||
return []
|
||||
|
||||
# 批量编码用户词和标签词,避免逐条调用模型
|
||||
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
||||
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
||||
pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
|
||||
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
|
||||
|
||||
# 预先建立“标准化后用户词集合”,用于精确匹配
|
||||
normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
|
||||
|
||||
scored_results: list[MatchedEventResult] = []
|
||||
for event in events:
|
||||
topic_list = event_topics.get(event.id, [])
|
||||
if not topic_list:
|
||||
continue
|
||||
|
||||
exact_hits: list[str] = []
|
||||
semantic_hits: list[dict[str, Any]] = []
|
||||
score = 0.0
|
||||
|
||||
# 对事件标签逐个匹配用户兴趣
|
||||
for topic_keyword, topic_relevance in topic_list:
|
||||
normalized_topic = _normalize_text(topic_keyword)
|
||||
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
||||
|
||||
# 1) 精确命中(包括完全相等与包含关系)
|
||||
matched_exact = False
|
||||
if normalized_topic in normalized_pref_set:
|
||||
matched_exact = True
|
||||
else:
|
||||
for pref_word in normalized_pref_set:
|
||||
if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
|
||||
matched_exact = True
|
||||
break
|
||||
|
||||
if matched_exact:
|
||||
exact_hits.append(topic_keyword)
|
||||
# 精确命中给较高基础分,标签自身相关度作为增益
|
||||
score += 45.0 + topic_relevance_score * 0.2
|
||||
continue
|
||||
|
||||
# 2) 语义命中(未精确命中时再算)
|
||||
topic_vec = topic_vec_map.get(topic_keyword)
|
||||
if topic_vec is None:
|
||||
continue
|
||||
|
||||
best_pref = None
|
||||
best_sim = -1.0
|
||||
for pref_keyword, pref_vec in pref_vec_map.items():
|
||||
sim = float(np.dot(topic_vec, pref_vec))
|
||||
if sim > best_sim:
|
||||
best_sim = sim
|
||||
best_pref = pref_keyword
|
||||
|
||||
if best_pref is not None and best_sim >= similarity_threshold:
|
||||
semantic_hits.append(
|
||||
{
|
||||
"preference_keyword": best_pref,
|
||||
"topic_keyword": topic_keyword,
|
||||
"similarity": round(best_sim, 4),
|
||||
}
|
||||
)
|
||||
# 语义命中分略低于精确命中,并由相似度放大
|
||||
score += best_sim * 35.0 + topic_relevance_score * 0.12
|
||||
|
||||
# 如果精确和语义都没命中,直接跳过
|
||||
if not exact_hits and not semantic_hits:
|
||||
continue
|
||||
|
||||
# 融合事件热度和新鲜度,避免只看语义分
|
||||
score += min(event.hot_score, 100) * 0.3
|
||||
score += _calc_freshness_bonus(event)
|
||||
|
||||
# 返回标签时做去重,保证接口稳定
|
||||
tags = list(dict.fromkeys([item[0] for item in topic_list]))
|
||||
scored_results.append(
|
||||
MatchedEventResult(
|
||||
event=event,
|
||||
match_score=round(score, 2),
|
||||
exact_hits=list(dict.fromkeys(exact_hits)),
|
||||
semantic_hits=semantic_hits,
|
||||
tags=tags,
|
||||
)
|
||||
)
|
||||
|
||||
scored_results.sort(
|
||||
key=lambda item: (item.match_score, item.event.hot_score, item.event.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
return scored_results[:final_limit]
|
||||
@@ -1,104 +1,241 @@
|
||||
# app/services/summary_service.py
|
||||
import os
|
||||
import json
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from app.database import SessionLocal
|
||||
from app.models.models import UnifiedEvent, TrendingEvent, InfoSource, utcnow
|
||||
from app.models.models import (
|
||||
ExtractedTopic,
|
||||
InfoSource,
|
||||
TargetType,
|
||||
TrendingEvent,
|
||||
UnifiedEvent,
|
||||
utcnow,
|
||||
)
|
||||
from app.prompts.summary_prompts import (
|
||||
SUMMARY_SYSTEM_PROMPT,
|
||||
SUMMARY_USER_PROMPT_TEMPLATE,
|
||||
)
|
||||
from app.services.fetcher_service import embedder_model
|
||||
|
||||
HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
|
||||
AI_API_KEY = os.getenv("AI_API_KEY", '')
|
||||
TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
|
||||
TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
|
||||
TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
|
||||
AI_API_KEY = os.getenv("AI_API_KEY", "")
|
||||
|
||||
|
||||
# 1. 初始化异步客户端 (全局复用)
|
||||
deepseek_client = AsyncOpenAI(
|
||||
api_key=AI_API_KEY,
|
||||
base_url="https://api.deepseek.com"
|
||||
base_url="https://api.deepseek.com",
|
||||
)
|
||||
|
||||
|
||||
async def call_llm_for_summary(platform_data_text: str) -> dict:
|
||||
"""调用 DeepSeek 生成统一标题和多平台视角摘要"""
|
||||
prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
|
||||
platform_data_text=platform_data_text
|
||||
)
|
||||
"""Call LLM for unified title, summary and topic candidates."""
|
||||
prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)
|
||||
|
||||
# await
|
||||
response = await deepseek_client.chat.completions.create(
|
||||
model="deepseek-chat",
|
||||
messages=[
|
||||
{"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": prompt}
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
response_format={"type": "json_object"},
|
||||
temperature=1
|
||||
temperature=1,
|
||||
)
|
||||
|
||||
result_text = response.choices[0].message.content
|
||||
return json.loads(result_text)
|
||||
|
||||
|
||||
def _normalize_score(raw_score: Any) -> float | None:
|
||||
try:
|
||||
score = float(raw_score)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
if score <= 1:
|
||||
score *= 100
|
||||
|
||||
return max(0.0, min(100.0, score))
|
||||
|
||||
|
||||
def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
|
||||
"""Parse topic keywords from LLM response; support list[str] and list[object]."""
|
||||
raw_topics = llm_result.get("topic_keywords") or []
|
||||
parsed: list[dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
|
||||
for item in raw_topics:
|
||||
keyword = ""
|
||||
score = None
|
||||
|
||||
if isinstance(item, str):
|
||||
keyword = item.strip()
|
||||
elif isinstance(item, dict):
|
||||
raw_keyword = (
|
||||
item.get("keyword")
|
||||
or item.get("topic_keyword")
|
||||
or item.get("name")
|
||||
or item.get("topic")
|
||||
or ""
|
||||
)
|
||||
keyword = str(raw_keyword).strip()
|
||||
score = _normalize_score(item.get("relevance_score") or item.get("score"))
|
||||
|
||||
if not keyword:
|
||||
continue
|
||||
|
||||
keyword = keyword[:100]
|
||||
normalized_key = keyword.casefold()
|
||||
if normalized_key in seen:
|
||||
continue
|
||||
|
||||
seen.add(normalized_key)
|
||||
parsed.append({"keyword": keyword, "score": score})
|
||||
|
||||
return parsed
|
||||
|
||||
|
||||
def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
"""Deduplicate semantically similar tags using embedding similarity."""
|
||||
if not topic_candidates:
|
||||
return []
|
||||
|
||||
keywords = [item["keyword"] for item in topic_candidates]
|
||||
vectors = embedder_model.encode(keywords, normalize_embeddings=True)
|
||||
|
||||
clusters: list[dict[str, Any]] = []
|
||||
for item, vector in zip(topic_candidates, vectors):
|
||||
vec = np.asarray(vector, dtype=np.float32)
|
||||
|
||||
best_idx = -1
|
||||
best_sim = -1.0
|
||||
for idx, cluster in enumerate(clusters):
|
||||
sim = float(np.dot(vec, cluster["vector"]))
|
||||
if sim > best_sim:
|
||||
best_sim = sim
|
||||
best_idx = idx
|
||||
|
||||
if best_idx >= 0 and best_sim >= TOPIC_SIMILARITY_THRESHOLD:
|
||||
cluster = clusters[best_idx]
|
||||
merged = cluster["vector"] * cluster["count"] + vec
|
||||
norm = float(np.linalg.norm(merged))
|
||||
if norm > 0:
|
||||
cluster["vector"] = merged / norm
|
||||
|
||||
cluster["count"] += 1
|
||||
if item["score"] is not None and (
|
||||
cluster["score"] is None or item["score"] > cluster["score"]
|
||||
):
|
||||
cluster["score"] = item["score"]
|
||||
|
||||
# Prefer shorter tag as canonical keyword.
|
||||
if len(item["keyword"]) < len(cluster["keyword"]):
|
||||
cluster["keyword"] = item["keyword"]
|
||||
else:
|
||||
clusters.append(
|
||||
{
|
||||
"keyword": item["keyword"],
|
||||
"score": item["score"],
|
||||
"vector": vec,
|
||||
"count": 1,
|
||||
}
|
||||
)
|
||||
|
||||
if any(cluster["score"] is not None for cluster in clusters):
|
||||
clusters.sort(key=lambda x: x["score"] if x["score"] is not None else -1.0, reverse=True)
|
||||
|
||||
result = [
|
||||
{"keyword": cluster["keyword"], "score": cluster["score"]}
|
||||
for cluster in clusters[:TOPIC_TAG_MAX_COUNT]
|
||||
]
|
||||
return result
|
||||
|
||||
|
||||
def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
|
||||
"""Replace EVENT tags for one unified event atomically within current transaction."""
|
||||
db.query(ExtractedTopic).filter(
|
||||
ExtractedTopic.target_type == TargetType.EVENT,
|
||||
ExtractedTopic.target_id == event_id,
|
||||
).delete(synchronize_session=False)
|
||||
|
||||
for item in normalized_topics:
|
||||
db.add(
|
||||
ExtractedTopic(
|
||||
target_type=TargetType.EVENT,
|
||||
target_id=event_id,
|
||||
topic_keyword=item["keyword"],
|
||||
relevance_score=item["score"],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
async def generate_unified_summaries():
|
||||
"""定时任务:扫描高热度事件并生成/更新摘要"""
|
||||
print(f"[{utcnow()}] 开始执行 DeepSeek 摘要生成任务...")
|
||||
"""Scheduled task: refresh summaries and topic tags for hot unified events."""
|
||||
print(f"[{utcnow()}] Start unified summary generation task...")
|
||||
|
||||
with SessionLocal() as db:
|
||||
recent_threshold = utcnow() - timedelta(days=3)
|
||||
|
||||
# 必须满足:热度达标 AND (当前热度 > 上次生成摘要时的热度) AND 近期活跃
|
||||
events = db.query(UnifiedEvent).filter(
|
||||
UnifiedEvent.hot_score >= HOT_SCORE_THRESHOLD,
|
||||
UnifiedEvent.hot_score > UnifiedEvent.last_summarized_trends_count,
|
||||
UnifiedEvent.created_at >= recent_threshold
|
||||
UnifiedEvent.created_at >= recent_threshold,
|
||||
).all()
|
||||
|
||||
if not events:
|
||||
print("当前没有需要更新摘要的大事件,任务结束。")
|
||||
print("No events require summary update in this round.")
|
||||
return
|
||||
|
||||
for event in events:
|
||||
# 联合查询获取该事件在各平台的子新闻
|
||||
trends = db.query(TrendingEvent, InfoSource.source_name) \
|
||||
.join(InfoSource, TrendingEvent.source_id == InfoSource.id) \
|
||||
.filter(TrendingEvent.unified_event_id == event.id) \
|
||||
trends = (
|
||||
db.query(TrendingEvent, InfoSource.source_name)
|
||||
.join(InfoSource, TrendingEvent.source_id == InfoSource.id)
|
||||
.filter(TrendingEvent.unified_event_id == event.id)
|
||||
.all()
|
||||
)
|
||||
|
||||
if not trends:
|
||||
continue
|
||||
|
||||
# 按平台归类标题并去重
|
||||
platform_dict = {}
|
||||
platform_dict: dict[str, set[str]] = {}
|
||||
for trend_record, source_name in trends:
|
||||
if source_name not in platform_dict:
|
||||
platform_dict[source_name] = set()
|
||||
platform_dict[source_name].add(trend_record.current_headline)
|
||||
platform_dict.setdefault(source_name, set()).add(trend_record.current_headline)
|
||||
|
||||
# 组装给大模型的 Prompt 数据
|
||||
prompt_lines = [f"【{platform}】: {', '.join(headlines)}" for platform, headlines in platform_dict.items()]
|
||||
prompt_lines = [
|
||||
f"[{platform}] {', '.join(sorted(headlines))}"
|
||||
for platform, headlines in platform_dict.items()
|
||||
]
|
||||
platform_data_text = "\n".join(prompt_lines)
|
||||
|
||||
try:
|
||||
# 调用封装好的异步函数
|
||||
llm_result = await call_llm_for_summary(platform_data_text)
|
||||
|
||||
if "unified_title" in llm_result:
|
||||
if "unified_title" in llm_result and llm_result["unified_title"]:
|
||||
event.unified_title = llm_result["unified_title"]
|
||||
if "ai_comprehensive_summary" in llm_result:
|
||||
if "ai_comprehensive_summary" in llm_result and llm_result["ai_comprehensive_summary"]:
|
||||
event.ai_comprehensive_summary = llm_result["ai_comprehensive_summary"]
|
||||
|
||||
# 成功后更新水位线
|
||||
# 将最后一次总结时的热搜数量,更新为当前最新的 hot_score
|
||||
if event.hot_score >= TOPIC_TAG_MIN_HOT_SCORE:
|
||||
topic_candidates = parse_topic_keywords(llm_result)
|
||||
normalized_topics = normalize_topic_keywords(topic_candidates)
|
||||
if normalized_topics:
|
||||
replace_event_topics(db, event.id, normalized_topics)
|
||||
|
||||
event.last_summarized_trends_count = event.hot_score
|
||||
print(
|
||||
f"Updated event {event.id} summary"
|
||||
f" (hot_score={event.hot_score})."
|
||||
)
|
||||
|
||||
print(f"成功更新大事件 ID {event.id} 的深度摘要 (当前热度: {event.hot_score})。")
|
||||
|
||||
except Exception as e:
|
||||
print(f"大事件 ID {event.id} 摘要生成失败: {e}")
|
||||
except Exception as exc:
|
||||
print(f"Event {event.id} summary generation failed: {exc}")
|
||||
continue
|
||||
|
||||
# 提交事务
|
||||
db.commit()
|
||||
|
||||
Reference in New Issue
Block a user