big update

2026-06-06 00:00:05 +08:00 · 2026-03-11 20:52:58 +08:00
parent 8ed819a580
commit 966bcfbba4
44 changed files with 7124 additions and 650 deletions
@@ -1,104 +1,241 @@
 # app/services/summary_service.py
-import os
 import json
+import os
 from datetime import timedelta
+from typing import Any
+
+import numpy as np
 from openai import AsyncOpenAI

 from app.database import SessionLocal
-from app.models.models import UnifiedEvent, TrendingEvent, InfoSource, utcnow
+from app.models.models import (
+    ExtractedTopic,
+    InfoSource,
+    TargetType,
+    TrendingEvent,
+    UnifiedEvent,
+    utcnow,
+)
 from app.prompts.summary_prompts import (
    SUMMARY_SYSTEM_PROMPT,
    SUMMARY_USER_PROMPT_TEMPLATE,
 )
+from app.services.fetcher_service import embedder_model

+# Minimum hot_score a unified event needs before it is selected for summarization.
 HOT_SCORE_THRESHOLD = int(os.getenv("HOT_SCORE_THRESHOLD", 3))
-AI_API_KEY = os.getenv("AI_API_KEY", '')
+# Minimum hot_score before topic tags are extracted; defaults to HOT_SCORE_THRESHOLD.
+TOPIC_TAG_MIN_HOT_SCORE = int(os.getenv("TOPIC_TAG_MIN_HOT_SCORE", HOT_SCORE_THRESHOLD))
+# Similarity (dot product of normalized embeddings) at or above which two tags are merged.
+TOPIC_SIMILARITY_THRESHOLD = float(os.getenv("TOPIC_SIMILARITY_THRESHOLD", 0.82))
+# Upper bound on the number of tags kept per event after deduplication.
+TOPIC_TAG_MAX_COUNT = int(os.getenv("TOPIC_TAG_MAX_COUNT", 8))
+# API key handed to the client below; an empty string is used when the variable is unset.
+AI_API_KEY = os.getenv("AI_API_KEY", "")
+

-# 1. 初始化异步客户端 (全局复用)
+# Module-level async client, created once at import time and reused by every call.
 deepseek_client = AsyncOpenAI(
     api_key=AI_API_KEY,
-    base_url="https://api.deepseek.com"
+    base_url="https://api.deepseek.com",
 )


 async def call_llm_for_summary(platform_data_text: str) -> dict:
-    """调用 DeepSeek 生成统一标题和多平台视角摘要"""
-    prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
-        platform_data_text=platform_data_text
-    )
+    """Ask the chat model for a unified title, summary and topic candidates.
+
+    Args:
+        platform_data_text: Text substituted into SUMMARY_USER_PROMPT_TEMPLATE
+            under the ``platform_data_text`` placeholder.
+
+    Returns:
+        The model reply parsed with ``json.loads``.
+
+    Raises:
+        json.JSONDecodeError: If the reply text is not valid JSON.
+    """
+    prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(platform_data_text=platform_data_text)

-    # await
     response = await deepseek_client.chat.completions.create(
         model="deepseek-chat",
         messages=[
             {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
-            {"role": "user", "content": prompt}
+            {"role": "user", "content": prompt},
         ],
+        # Request a JSON object so the reply can be passed to json.loads below.
         response_format={"type": "json_object"},
-        temperature=1
+        temperature=1,
     )

+    # NOTE(review): json.loads may return any JSON type (not only a dict) and
+    # raises TypeError when given None — confirm callers tolerate both cases.
     result_text = response.choices[0].message.content
     return json.loads(result_text)


+def _normalize_score(raw_score: Any) -> float | None:
+    """Convert a raw relevance score to a float on the 0-100 scale.
+
+    Values less than or equal to 1 are treated as fractions and multiplied by
+    100; the result is then clamped to the closed range [0, 100].
+
+    Args:
+        raw_score: Anything ``float()`` accepts (number or numeric string).
+
+    Returns:
+        The normalized score, or ``None`` when the value cannot be converted
+        or is not a finite number (NaN, +inf, -inf).
+    """
+    import math  # local import: the module does not import math at file level
+
+    try:
+        score = float(raw_score)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: integers too large to be represented as a float.
+        return None
+
+    # NaN would otherwise clamp to 100.0 because min(100.0, nan) returns 100.0,
+    # and an infinite score carries no usable ranking information either.
+    if not math.isfinite(score):
+        return None
+
+    if score <= 1:
+        score *= 100
+
+    return max(0.0, min(100.0, score))
+
+
+def parse_topic_keywords(llm_result: dict) -> list[dict[str, Any]]:
+    """Extract topic keywords from the LLM response.
+
+    ``topic_keywords`` may be a list of plain strings or a list of objects.
+    For objects the keyword is read from the first non-empty of ``keyword``,
+    ``topic_keyword``, ``name`` or ``topic``; the score is read from
+    ``relevance_score``, falling back to ``score`` when it is missing or empty.
+
+    Args:
+        llm_result: Parsed JSON reply from the model.
+
+    Returns:
+        A list of ``{"keyword": str, "score": float | None}`` entries in
+        first-seen order, trimmed to 100 characters and deduplicated
+        case-insensitively. Returns an empty list when the response or its
+        ``topic_keywords`` field does not have the expected shape.
+    """
+    if not isinstance(llm_result, dict):
+        return []
+
+    raw_topics = llm_result.get("topic_keywords")
+    # A bare string would be iterated character by character and a dict by its
+    # keys, so only real sequences of items are accepted.
+    if not isinstance(raw_topics, (list, tuple)):
+        return []
+
+    parsed: list[dict[str, Any]] = []
+    seen: set[str] = set()
+
+    for item in raw_topics:
+        keyword = ""
+        score = None
+
+        if isinstance(item, str):
+            keyword = item.strip()
+        elif isinstance(item, dict):
+            raw_keyword = (
+                item.get("keyword")
+                or item.get("topic_keyword")
+                or item.get("name")
+                or item.get("topic")
+                or ""
+            )
+            keyword = str(raw_keyword).strip()
+
+            # Test for "missing" explicitly: chaining with `or` would discard
+            # a legitimate relevance score of 0.
+            raw_score = item.get("relevance_score")
+            if raw_score is None or raw_score == "":
+                raw_score = item.get("score")
+            if raw_score is not None:
+                score = _normalize_score(raw_score)
+
+        if not keyword:
+            continue
+
+        keyword = keyword[:100]
+        normalized_key = keyword.casefold()
+        if normalized_key in seen:
+            continue
+
+        seen.add(normalized_key)
+        parsed.append({"keyword": keyword, "score": score})
+
+    return parsed
+
+
+def normalize_topic_keywords(topic_candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Collapse near-duplicate topic tags by embedding similarity.
+
+    Candidates are processed in order. Each one joins the most similar existing
+    cluster when the similarity reaches TOPIC_SIMILARITY_THRESHOLD, otherwise
+    it opens a new cluster. A cluster keeps its shortest keyword and its
+    highest score. At most TOPIC_TAG_MAX_COUNT clusters are returned, ordered
+    by score (descending) when at least one score is present.
+    """
+    if not topic_candidates:
+        return []
+
+    embeddings = embedder_model.encode(
+        [candidate["keyword"] for candidate in topic_candidates],
+        normalize_embeddings=True,
+    )
+
+    clusters: list[dict[str, Any]] = []
+    for candidate, embedding in zip(topic_candidates, embeddings):
+        vec = np.asarray(embedding, dtype=np.float32)
+
+        # Locate the closest cluster; the strict comparison keeps the earliest
+        # cluster on ties.
+        best_idx, best_sim = -1, -1.0
+        for idx, existing in enumerate(clusters):
+            sim = float(np.dot(vec, existing["vector"]))
+            if sim > best_sim:
+                best_idx, best_sim = idx, sim
+
+        if best_idx < 0 or not (best_sim >= TOPIC_SIMILARITY_THRESHOLD):
+            clusters.append(
+                {
+                    "keyword": candidate["keyword"],
+                    "score": candidate["score"],
+                    "vector": vec,
+                    "count": 1,
+                }
+            )
+            continue
+
+        target = clusters[best_idx]
+
+        # Fold the new vector into the cluster direction, weighting the current
+        # direction by its member count, then renormalize.
+        summed = target["vector"] * target["count"] + vec
+        length = float(np.linalg.norm(summed))
+        if length > 0:
+            target["vector"] = summed / length
+        target["count"] += 1
+
+        new_score = candidate["score"]
+        if new_score is not None:
+            if target["score"] is None or new_score > target["score"]:
+                target["score"] = new_score
+
+        # The shorter tag becomes the canonical keyword.
+        if len(candidate["keyword"]) < len(target["keyword"]):
+            target["keyword"] = candidate["keyword"]
+
+    def _rank(cluster: dict[str, Any]) -> float:
+        # Clusters without a score sort after every scored cluster.
+        return -1.0 if cluster["score"] is None else cluster["score"]
+
+    if any(cluster["score"] is not None for cluster in clusters):
+        clusters = sorted(clusters, key=_rank, reverse=True)
+
+    return [
+        {"keyword": cluster["keyword"], "score": cluster["score"]}
+        for cluster in clusters[:TOPIC_TAG_MAX_COUNT]
+    ]
+
+
+def replace_event_topics(db, event_id: int, normalized_topics: list[dict[str, Any]]) -> None:
+    """Swap the EVENT-scoped topic rows of one unified event for a new set.
+
+    The existing rows are bulk-deleted and the replacements are added on the
+    same session. Nothing is committed here, so the caller's transaction
+    decides whether the swap is persisted.
+    """
+    stale_rows = db.query(ExtractedTopic).filter(
+        ExtractedTopic.target_type == TargetType.EVENT,
+        ExtractedTopic.target_id == event_id,
+    )
+    stale_rows.delete(synchronize_session=False)
+
+    for topic in normalized_topics:
+        row = ExtractedTopic(
+            target_type=TargetType.EVENT,
+            target_id=event_id,
+            topic_keyword=topic["keyword"],
+            relevance_score=topic["score"],
+        )
+        db.add(row)
+
+
 async def generate_unified_summaries():
-    """定时任务：扫描高热度事件并生成/更新摘要"""
-    print(f"[{utcnow()}] 开始执行 DeepSeek 摘要生成任务...")
+    """Scheduled task: refresh summaries and topic tags for hot unified events.
+
+    For each selected event, the headlines of its trending records are grouped
+    by source name and sent to the LLM. The returned title, summary and topic
+    tags are written back to the event, and all changes are committed once
+    after the loop. A failure on one event is printed and the loop moves on.
+    """
+    print(f"[{utcnow()}] Start unified summary generation task...")

     with SessionLocal() as db:
         recent_threshold = utcnow() - timedelta(days=3)

-        # 必须满足：热度达标 AND (当前热度 > 上次生成摘要时的热度) AND 近期活跃
+        # Select events that reach the hot-score threshold, whose hot_score
+        # exceeds the value recorded at the last summary, and that were created
+        # within the last 3 days.
         events = db.query(UnifiedEvent).filter(
             UnifiedEvent.hot_score >= HOT_SCORE_THRESHOLD,
             UnifiedEvent.hot_score > UnifiedEvent.last_summarized_trends_count,
-            UnifiedEvent.created_at >= recent_threshold
+            UnifiedEvent.created_at >= recent_threshold,
         ).all()

         if not events:
-            print("当前没有需要更新摘要的大事件，任务结束。")
+            print("No events require summary update in this round.")
             return

         for event in events:
-            # 联合查询获取该事件在各平台的子新闻
-            trends = db.query(TrendingEvent, InfoSource.source_name) \
-                .join(InfoSource, TrendingEvent.source_id == InfoSource.id) \
-                .filter(TrendingEvent.unified_event_id == event.id) \
+            # Fetch the event's trending records together with their source name.
+            trends = (
+                db.query(TrendingEvent, InfoSource.source_name)
+                .join(InfoSource, TrendingEvent.source_id == InfoSource.id)
+                .filter(TrendingEvent.unified_event_id == event.id)
                 .all()
+            )

             if not trends:
                 continue

-            # 按平台归类标题并去重
-            platform_dict = {}
+            # Group headlines by source name; the set drops duplicate headlines.
+            platform_dict: dict[str, set[str]] = {}
             for trend_record, source_name in trends:
-                if source_name not in platform_dict:
-                    platform_dict[source_name] = set()
-                platform_dict[source_name].add(trend_record.current_headline)
+                platform_dict.setdefault(source_name, set()).add(trend_record.current_headline)

-            # 组装给大模型的 Prompt 数据
-            prompt_lines = [f"【{platform}】: {', '.join(headlines)}" for platform, headlines in platform_dict.items()]
+            # One prompt line per source; headlines are sorted so the text does
+            # not depend on set iteration order.
+            prompt_lines = [
+                f"[{platform}] {', '.join(sorted(headlines))}"
+                for platform, headlines in platform_dict.items()
+            ]
             platform_data_text = "\n".join(prompt_lines)

             try:
-                # 调用封装好的异步函数
                 llm_result = await call_llm_for_summary(platform_data_text)

-                if "unified_title" in llm_result:
+                # Empty or missing fields leave the stored values untouched.
+                if "unified_title" in llm_result and llm_result["unified_title"]:
                     event.unified_title = llm_result["unified_title"]
-                if "ai_comprehensive_summary" in llm_result:
+                if "ai_comprehensive_summary" in llm_result and llm_result["ai_comprehensive_summary"]:
                     event.ai_comprehensive_summary = llm_result["ai_comprehensive_summary"]

-                # 成功后更新水位线
-                # 将最后一次总结时的热搜数量，更新为当前最新的 hot_score
+                # Existing tags are replaced only when the reply yields at least
+                # one normalized topic.
+                # NOTE(review): normalize_topic_keywords is a synchronous call
+                # inside this coroutine; if the embedding step is slow it may
+                # block the event loop — confirm.
+                if event.hot_score >= TOPIC_TAG_MIN_HOT_SCORE:
+                    topic_candidates = parse_topic_keywords(llm_result)
+                    normalized_topics = normalize_topic_keywords(topic_candidates)
+                    if normalized_topics:
+                        replace_event_topics(db, event.id, normalized_topics)
+
+                # Record the hot_score this summary was built from, so the
+                # event is selected again only after its hot_score grows.
                 event.last_summarized_trends_count = event.hot_score
+                print(
+                    f"Updated event {event.id} summary"
+                    f" (hot_score={event.hot_score})."
+                )

-                print(f"成功更新大事件 ID {event.id} 的深度摘要 (当前热度: {event.hot_score})。")
-
-            except Exception as e:
-                print(f"大事件 ID {event.id} 摘要生成失败: {e}")
+            # NOTE(review): changes made to this event before the failure stay
+            # pending on the session and are still written by the commit below.
+            except Exception as exc:
+                print(f"Event {event.id} summary generation failed: {exc}")
                 continue

-        # 提交事务
+        # Single commit for every event processed in this run.
         db.commit()