mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:00:05 +08:00
big update
This commit is contained in:
@@ -1,104 +1,241 @@
|
||||
# app/services/summary_service.py
|
||||
import os
|
||||
import json
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from app.database import SessionLocal
|
||||
from app.models.models import UnifiedEvent, TrendingEvent, InfoSource, utcnow
|
||||
from app.models.models import (
|
||||
ExtractedTopic,
|
||||
InfoSource,
|
||||
TargetType,
|
||||
TrendingEvent,
|
||||
UnifiedEvent,
|
||||
utcnow,
|
||||
)
|
||||
from app.prompts.summary_prompts import (
|
||||
SUMMARY_SYSTEM_PROMPT,
|
||||
SUMMARY_USER_PROMPT_TEMPLATE,
|
||||
)
|
||||
from app.services.fetcher_service import embedder_model
|
||||
|
||||
# --- Tunables; each one can be overridden through an environment variable ---

# Minimum hot score a unified event needs before it is summarized.
HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", "3"))

# Minimum hot score before topic tags are generated; falls back to the
# summary threshold when the variable is not set.
TOPIC_TAG_MIN_HOT_SCORE = int(
    os.getenv("TOPIC_TAG_MIN_HOT_SCORE", str(HOT_SCORE_THRESHOLD))
)

# Similarity at or above which two tags are merged into one cluster.
TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", "0.82"))

# Upper bound on the number of tags kept per event.
TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", "8"))

AI_API_KEY = os.getenv("AI_API_KEY", "")

# Async client, created once at import time and reused by every request.
deepseek_client = AsyncOpenAI(
    base_url="https://api.deepseek.com",
    api_key=AI_API_KEY,
)
|
||||
|
||||
|
||||
async def call_llm_for_summary(platform_data_text: str) -> dict:
    """Call the LLM for a unified title, summary and topic candidates.

    Args:
        platform_data_text: Text substituted into
            ``SUMMARY_USER_PROMPT_TEMPLATE`` as ``platform_data_text``.

    Returns:
        The JSON object decoded from the first choice of the reply.

    Raises:
        ValueError: If the reply carries no choices or no content, if the
            content is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass), or if it decodes to something other
            than a JSON object.
    """
    prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)

    response = await deepseek_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        temperature=1,
    )

    # Fail with an explicit message instead of an IndexError / TypeError
    # when the reply is empty.
    if not response.choices:
        raise ValueError("LLM response contained no choices")

    result_text = response.choices[0].message.content
    if not result_text:
        raise ValueError("LLM response contained no content")

    result = json.loads(result_text)
    if not isinstance(result, dict):
        # Callers use dict lookups on the result, so reject arrays/scalars here.
        raise ValueError(
            f"LLM response is not a JSON object (got {type(result).__name__})"
        )
    return result
|
||||
|
||||
|
||||
def _normalize_score(raw_score: Any) -> float | None:
|
||||
try:
|
||||
score = float(raw_score)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
if score <= 1:
|
||||
score *= 100
|
||||
|
||||
return max(0.0, min(100.0, score))
|
||||
|
||||
|
||||
def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
    """Extract topic keywords from an LLM response.

    ``llm_result["topic_keywords"]`` may be a list of strings, a list of
    objects, or a single string/object. For objects the keyword is read from
    the first non-empty of ``keyword``, ``topic_keyword``, ``name``,
    ``topic``; the score from ``relevance_score`` and then ``score``.

    Args:
        llm_result: Decoded LLM response.

    Returns:
        A list of ``{"keyword": str, "score": float | None}`` dicts in input
        order. Keywords are stripped, capped at 100 characters and
        de-duplicated case-insensitively (first occurrence wins). Entries
        without a usable keyword are dropped.
    """
    raw_topics = llm_result.get("topic_keywords") or []
    if isinstance(raw_topics, (str, dict)):
        # A lone tag instead of a list: wrap it so it is not iterated
        # character by character (str) or key by key (dict).
        raw_topics = [raw_topics]
    elif not isinstance(raw_topics, (list, tuple)):
        return []

    parsed: list[dict[str, Any]] = []
    seen: set[str] = set()

    for item in raw_topics:
        score = None

        if isinstance(item, str):
            keyword = item.strip()
        elif isinstance(item, dict):
            raw_keyword = (
                item.get("keyword")
                or item.get("topic_keyword")
                or item.get("name")
                or item.get("topic")
                or ""
            )
            keyword = str(raw_keyword).strip()
            # Take the first key that yields a usable score. Checking the
            # normalized value (not truthiness) keeps a legitimate 0 score.
            for score_key in ("relevance_score", "score"):
                score = _normalize_score(item.get(score_key))
                if score is not None:
                    break
        else:
            continue

        if not keyword:
            continue

        keyword = keyword[:100]
        normalized_key = keyword.casefold()
        if normalized_key in seen:
            continue

        seen.add(normalized_key)
        parsed.append({"keyword": keyword, "score": score})

    return parsed
|
||||
|
||||
|
||||
def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Merge semantically similar tags using embedding similarity.

    Candidates are processed in order. Each one joins the existing cluster it
    is most similar to when that similarity reaches
    ``TOPIC_SIMILARITY_THRESHOLD``; otherwise it starts a new cluster. A
    cluster keeps the highest score and the shortest keyword seen so far.

    Args:
        topic_candidates: Dicts with ``"keyword"`` and ``"score"`` entries.

    Returns:
        At most ``TOPIC_TAG_MAX_COUNT`` ``{"keyword", "score"}`` dicts,
        sorted by score descending when at least one cluster has a score.
    """
    if not topic_candidates:
        return []

    embeddings = embedder_model.encode(
        [candidate["keyword"] for candidate in topic_candidates],
        normalize_embeddings=True,
    )

    groups: list[dict[str, Any]] = []
    for candidate, embedding in zip(topic_candidates, embeddings):
        current = np.asarray(embedding, dtype=np.float32)

        # Locate the most similar existing group; the strict ">" keeps the
        # earliest group on ties.
        nearest = None
        nearest_sim = -1.0
        for group in groups:
            similarity = float(np.dot(current, group["vector"]))
            if similarity > nearest_sim:
                nearest_sim = similarity
                nearest = group

        joins_existing = nearest is not None and nearest_sim >= TOPIC_SIMILARITY_THRESHOLD
        if not joins_existing:
            groups.append(
                {
                    "keyword": candidate["keyword"],
                    "score": candidate["score"],
                    "vector": current,
                    "count": 1,
                }
            )
            continue

        # Fold the new vector into the group's direction and re-normalize.
        summed = nearest["vector"] * nearest["count"] + current
        length = float(np.linalg.norm(summed))
        if length > 0:
            nearest["vector"] = summed / length
        nearest["count"] += 1

        incoming_score = candidate["score"]
        if incoming_score is not None:
            if nearest["score"] is None or incoming_score > nearest["score"]:
                nearest["score"] = incoming_score

        # The shorter tag becomes the canonical keyword.
        if len(candidate["keyword"]) < len(nearest["keyword"]):
            nearest["keyword"] = candidate["keyword"]

    def _rank(group: dict[str, Any]) -> float:
        # Unscored groups sort below every scored one.
        return group["score"] if group["score"] is not None else -1.0

    if any(group["score"] is not None for group in groups):
        groups = sorted(groups, key=_rank, reverse=True)

    return [
        {"keyword": group["keyword"], "score": group["score"]}
        for group in groups[:TOPIC_TAG_MAX_COUNT]
    ]
|
||||
|
||||
|
||||
def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
    """Swap the EVENT-type tags of one unified event for a new set.

    Existing ``ExtractedTopic`` rows for the event are deleted and one row per
    entry of ``normalized_topics`` is added to the session. Nothing is
    committed here; the caller's transaction decides whether it persists.

    Args:
        db: Database session the delete and inserts are issued on.
        event_id: Id of the unified event whose tags are replaced.
        normalized_topics: Dicts with ``"keyword"`` and ``"score"`` entries.
    """
    existing_tags = db.query(ExtractedTopic).filter(
        ExtractedTopic.target_type == TargetType.EVENT,
        ExtractedTopic.target_id == event_id,
    )
    existing_tags.delete(synchronize_session=False)

    db.add_all(
        ExtractedTopic(
            target_type=TargetType.EVENT,
            target_id=event_id,
            topic_keyword=topic["keyword"],
            relevance_score=topic["score"],
        )
        for topic in normalized_topics
    )
|
||||
|
||||
|
||||
async def generate_unified_summaries():
    """Scheduled task: refresh summaries and topic tags for hot unified events.

    Selects unified events whose hot score is at least
    ``HOT_SCORE_THRESHOLD``, exceeds ``last_summarized_trends_count`` and
    which were created within the last 3 days. For each one, the headlines of
    its trending records are grouped by source name and sent to the LLM; the
    title, summary and (for sufficiently hot events) topic tags are updated
    and ``last_summarized_trends_count`` is set to the current hot score.

    Each event is committed on its own. If processing an event fails, its
    pending changes are rolled back, the error is printed, and the loop moves
    on, so a failed event never leaves half-applied changes behind.
    """
    print(f"[{utcnow()}] Start unified summary generation task...")

    with SessionLocal() as db:
        recent_threshold = utcnow() - timedelta(days=3)

        # Must hold: hot enough AND hotter than at the last summary AND recent.
        events = db.query(UnifiedEvent).filter(
            UnifiedEvent.hot_score >= HOT_SCORE_THRESHOLD,
            UnifiedEvent.hot_score > UnifiedEvent.last_summarized_trends_count,
            UnifiedEvent.created_at >= recent_threshold,
        ).all()

        if not events:
            print("No events require summary update in this round.")
            return

        for event in events:
            # Read the id once so the log lines below do not need to touch
            # the ORM instance after commit()/rollback().
            event_id = event.id

            # Fetch this event's per-platform records together with the
            # source name.
            trends = (
                db.query(TrendingEvent, InfoSource.source_name)
                .join(InfoSource, TrendingEvent.source_id == InfoSource.id)
                .filter(TrendingEvent.unified_event_id == event_id)
                .all()
            )
            if not trends:
                continue

            # Group headlines by platform; the set removes duplicates.
            # Falsy headlines (None / "") are skipped because they cannot be
            # sorted or joined with the others.
            platform_dict: dict[str, set[str]] = {}
            for trend_record, source_name in trends:
                headline = trend_record.current_headline
                if headline:
                    platform_dict.setdefault(source_name, set()).add(headline)
            if not platform_dict:
                continue

            # Assemble the prompt payload; sorting makes the text
            # deterministic.
            prompt_lines = [
                f"[{platform}] {', '.join(sorted(headlines))}"
                for platform, headlines in platform_dict.items()
            ]
            platform_data_text = "\n".join(prompt_lines)

            try:
                llm_result = await call_llm_for_summary(platform_data_text)

                if llm_result.get("unified_title"):
                    event.unified_title = llm_result["unified_title"]
                if llm_result.get("ai_comprehensive_summary"):
                    event.ai_comprehensive_summary = llm_result["ai_comprehensive_summary"]

                hot_score = event.hot_score
                if hot_score >= TOPIC_TAG_MIN_HOT_SCORE:
                    topic_candidates = parse_topic_keywords(llm_result)
                    normalized_topics = normalize_topic_keywords(topic_candidates)
                    if normalized_topics:
                        replace_event_topics(db, event_id, normalized_topics)

                # Advance the watermark so the event is only picked up again
                # once its hot score grows further.
                event.last_summarized_trends_count = hot_score

                # Persist this event independently of the others.
                db.commit()
            except Exception as exc:
                # Discard this event's partial changes (e.g. title set, old
                # tags deleted) so they are not committed with later events.
                db.rollback()
                print(f"Event {event_id} summary generation failed: {exc}")
                continue

            print(f"Updated event {event_id} summary (hot_score={hot_score}).")
|
||||
|
||||
Reference in New Issue
Block a user