mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:57:51 +08:00
optimize+注释
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# 定时推送调度服务
|
||||
# 由 APScheduler 每分钟调用,检查当前时刻是否有用户需要接收推送,
|
||||
# 如匹配则生成摘要邮件并发送,同时写入 DeliveryHistory 防重复。
|
||||
# 推送优先级:有关键词且匹配 → 个性化简报;无关键词或无匹配 → 默认热点快报
|
||||
import logging
|
||||
import os
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
@@ -129,7 +130,7 @@ def _ensure_aware(dt: datetime) -> datetime:
|
||||
# 数据库查询辅助
|
||||
# ==========================================
|
||||
def _should_skip_by_interval(db: Session, user_id: int) -> bool:
|
||||
"""检查用户是否仍在 30 分钟冷却期内。"""
|
||||
"""检查用户是否仍在冷却期内,避免短时间内重复推送"""
|
||||
row = (
|
||||
db.query(DeliveryHistory.created_at)
|
||||
.filter(
|
||||
@@ -330,7 +331,7 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
|
||||
|
||||
pushed_ids = _get_already_pushed_event_ids(db, user_id)
|
||||
|
||||
# ——— 决策:匹配模式 or 默认模式 ———
|
||||
# 决策:有关键词且有匹配 → 匹配模式;否则 → 默认热点模式
|
||||
items: list = []
|
||||
is_default = False
|
||||
|
||||
@@ -411,7 +412,7 @@ async def check_and_deliver() -> None:
|
||||
if not user:
|
||||
continue
|
||||
|
||||
# 用户本地时间对比(核心时区修正)
|
||||
# 将 UTC 转为用户本地时间,判断是否落在推送窗口内
|
||||
user_current = _user_local_time(now, user.timezone)
|
||||
if not _is_within_window(schedule.delivery_time, user_current):
|
||||
continue
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
# app/services/fetcher_service.py
|
||||
"""
|
||||
抓取服务:从外部 API 拉取热搜/RSS 数据,做查重、向量聚类、入库
|
||||
热搜分支:语义聚类到 UnifiedEvent;RSS 分支:写入 NewsArticle
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
from datetime import timedelta
|
||||
@@ -29,7 +33,7 @@ print("模型加载完成。")
|
||||
|
||||
|
||||
def generate_md5(text: str) -> str:
|
||||
"""生成32位MD5哈希值作为全局唯一指纹"""
|
||||
"""生成 32 位 MD5 作为 external_id,用于跨平台去重"""
|
||||
return hashlib.md5(text.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
@@ -66,6 +70,7 @@ class UnifiedEventClusterer:
|
||||
self.event_ids.append(ev.id)
|
||||
|
||||
def match_or_create(self, title: str, embedding_json: str, new_vec: np.ndarray) -> int:
|
||||
"""语义相似则归入已有事件并累加热度,否则创建新 UnifiedEvent"""
|
||||
if self.event_vectors:
|
||||
# 批量矩阵计算相似度
|
||||
sim_scores = cosine_similarity([new_vec], self.event_vectors)[0]
|
||||
@@ -104,7 +109,7 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist
|
||||
|
||||
event_to_log = None
|
||||
|
||||
# 核心逻辑:查重后再决定是否调用模型
|
||||
# 查重:已存在则可能只需更新标题/排名;不存在则需聚类并新建
|
||||
if existing_event:
|
||||
# Scenario A1: the event already exists (duplicate check hit)
|
||||
if existing_event.current_headline != title:
|
||||
@@ -204,7 +209,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
if not valid_items:
|
||||
return 0
|
||||
|
||||
# 2. 批量数据库查重
|
||||
# 批量查重:按 external_id 判断是更新还是新增
|
||||
existing_events_dict = {}
|
||||
existing_articles_dict = {}
|
||||
|
||||
@@ -221,7 +226,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
).all()
|
||||
existing_articles_dict = {art.external_id: art for art in existing_articles}
|
||||
|
||||
# 3. 筛选出需要进行大模型向量运算的文本
|
||||
# 仅对需要算向量的标题做批量 embedding,避免重复计算
|
||||
texts_to_embed = []
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
for item, external_id in valid_items:
|
||||
@@ -241,7 +246,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
clusterer = UnifiedEventClusterer(db)
|
||||
|
||||
# 5. 核心路由分流落库
|
||||
# 按来源类型分流:热搜/API → TrendingEvent + 聚类;RSS → NewsArticle
|
||||
for index, (item, external_id) in enumerate(valid_items, 1):
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
existing_event = existing_events_dict.get(external_id)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
|
||||
打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
@@ -123,7 +127,7 @@ def recommend_events_for_user(
|
||||
else PREFERENCE_SEMANTIC_THRESHOLD
|
||||
)
|
||||
|
||||
# 读取用户兴趣词
|
||||
# 1. 读取用户兴趣词
|
||||
preferences = (
|
||||
db.query(UserTopicPreference)
|
||||
.filter(UserTopicPreference.user_id == user_id)
|
||||
@@ -136,7 +140,7 @@ def recommend_events_for_user(
|
||||
if not preference_keywords:
|
||||
return []
|
||||
|
||||
# 读取候选事件(先做时间和热度过滤,避免全表扫描)
|
||||
# 2. 读取候选事件(时间 + 热度过滤,避免全表扫描)
|
||||
time_limit = utcnow() - timedelta(hours=hours)
|
||||
events = (
|
||||
db.query(UnifiedEvent)
|
||||
@@ -177,7 +181,7 @@ def recommend_events_for_user(
|
||||
if not event_topics:
|
||||
return []
|
||||
|
||||
# 批量编码用户词和标签词,避免逐条调用模型
|
||||
# 3. 批量编码用户词与标签词,减少模型调用次数
|
||||
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
||||
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
||||
pref_vec_map = _build_keyword_embedding_map(unique_preference_keywords)
|
||||
@@ -196,7 +200,7 @@ def recommend_events_for_user(
|
||||
semantic_hits: list[dict[str, Any]] = []
|
||||
score = 0.0
|
||||
|
||||
# 对事件标签逐个匹配用户兴趣
|
||||
# 对每个事件标签做精确匹配或语义匹配
|
||||
for topic_keyword, topic_relevance in topic_list:
|
||||
normalized_topic = _normalize_text(topic_keyword)
|
||||
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
# app/services/summary_service.py
|
||||
"""
|
||||
摘要服务:调用 LLM 生成统一标题、综合摘要、话题标签
|
||||
定时任务:对热度达标且未摘要的事件批量处理
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
from datetime import timedelta
|
||||
@@ -36,7 +40,7 @@ deepseek_client = AsyncOpenAI(
|
||||
|
||||
|
||||
async def call_llm_for_summary(platform_data_text: str) -> dict:
|
||||
"""Call LLM for unified title, summary and topic candidates."""
|
||||
"""调用 LLM 生成统一标题、综合摘要、话题候选词"""
|
||||
prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)
|
||||
|
||||
response = await deepseek_client.chat.completions.create(
|
||||
@@ -66,7 +70,7 @@ def _normalize_score(raw_score: Any) -> float | None:
|
||||
|
||||
|
||||
def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
|
||||
"""Parse topic keywords from LLM response; support list[str] and list[object]."""
|
||||
"""解析 LLM 返回的话题关键词,支持字符串或对象格式"""
|
||||
raw_topics = llm_result.get("topic_keywords") or []
|
||||
parsed: list[dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
@@ -103,7 +107,7 @@ def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
|
||||
|
||||
|
||||
def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
"""Deduplicate semantically similar tags using embedding similarity."""
|
||||
"""用向量相似度去重同义标签,保留最具代表性的关键词"""
|
||||
if not topic_candidates:
|
||||
return []
|
||||
|
||||
@@ -159,7 +163,7 @@ def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dic
|
||||
|
||||
|
||||
def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
|
||||
"""Replace EVENT tags for one unified event atomically within current transaction."""
|
||||
"""原子替换某事件的标签:先删旧再插新"""
|
||||
db.query(ExtractedTopic).filter(
|
||||
ExtractedTopic.target_type == TargetType.EVENT,
|
||||
ExtractedTopic.target_id == event_id,
|
||||
@@ -177,7 +181,7 @@ def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, An
|
||||
|
||||
|
||||
async def generate_unified_summaries():
|
||||
"""Scheduled task: refresh summaries and topic tags for hot unified events."""
|
||||
"""定时任务:对热度达标且未摘要的事件刷新标题、摘要、标签"""
|
||||
print(f"[{utcnow()}] Start unified summary generation task...")
|
||||
|
||||
# Collect the IDs of the events to process first, so the DB session can be released early instead of being held for a long time
|
||||
|
||||
Reference in New Issue
Block a user