mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-05 23:56:36 +08:00
optimize+注释
This commit is contained in:
@@ -1,4 +1,8 @@
|
||||
# app/services/fetcher_service.py
|
||||
"""
|
||||
抓取服务:从外部 API 拉取热搜/RSS 数据,做查重、向量聚类、入库
|
||||
热搜分支:语义聚类到 UnifiedEvent;RSS 分支:写入 NewsArticle
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
from datetime import timedelta
|
||||
@@ -29,7 +33,7 @@ print("模型加载完成。")
|
||||
|
||||
|
||||
def generate_md5(text: str) -> str:
    """Return the 32-character hexadecimal MD5 digest of *text*.

    The text is encoded as UTF-8 before hashing, so any ``str`` (including
    non-ASCII titles) yields a stable digest. According to the original
    notes, the digest is used as an ``external_id`` fingerprint for
    cross-platform de-duplication.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
@@ -66,6 +70,7 @@ class UnifiedEventClusterer:
|
||||
self.event_ids.append(ev.id)
|
||||
|
||||
def match_or_create(self, title: str, embedding_json: str, new_vec: np.ndarray) -> int:
|
||||
"""语义相似则归入已有事件并累加热度,否则创建新 UnifiedEvent"""
|
||||
if self.event_vectors:
|
||||
# 批量矩阵计算相似度
|
||||
sim_scores = cosine_similarity([new_vec], self.event_vectors)[0]
|
||||
@@ -104,7 +109,7 @@ def process_hot_trend_item(db, source, item, index: int, external_id: str, exist
|
||||
|
||||
event_to_log = None
|
||||
|
||||
# 核心逻辑:查重后再决定是否调用模型
|
||||
# 查重:已存在则可能只需更新标题/排名;不存在则需聚类并新建
|
||||
if existing_event:
|
||||
# 场景 A1:老熟人
|
||||
if existing_event.current_headline != title:
|
||||
@@ -204,7 +209,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
if not valid_items:
|
||||
return 0
|
||||
|
||||
# 2. 批量数据库查重
|
||||
# 批量查重:按 external_id 判断是更新还是新增
|
||||
existing_events_dict = {}
|
||||
existing_articles_dict = {}
|
||||
|
||||
@@ -221,7 +226,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
).all()
|
||||
existing_articles_dict = {art.external_id: art for art in existing_articles}
|
||||
|
||||
# 3. 筛选出需要进行大模型向量运算的文本
|
||||
# 仅对需要算向量的标题做批量 embedding,避免重复计算
|
||||
texts_to_embed = []
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
for item, external_id in valid_items:
|
||||
@@ -241,7 +246,7 @@ def process_source_data(db, source, items: list) -> int:
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
clusterer = UnifiedEventClusterer(db)
|
||||
|
||||
# 5. 核心路由分流落库
|
||||
# 按来源类型分流:热搜/API → TrendingEvent + 聚类;RSS → NewsArticle
|
||||
for index, (item, external_id) in enumerate(valid_items, 1):
|
||||
if source.source_type in (SourceType.HOT_TREND, SourceType.API):
|
||||
existing_event = existing_events_dict.get(external_id)
|
||||
|
||||
Reference in New Issue
Block a user