mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:00:05 +08:00
算法与视觉优化
This commit is contained in:
@@ -26,9 +26,9 @@ SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.72))
|
|||||||
API_BASE_URL = os.getenv("API_BASE_URL", "https://newsnow.busiyi.world/api/s")
|
API_BASE_URL = os.getenv("API_BASE_URL", "https://newsnow.busiyi.world/api/s")
|
||||||
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "")
|
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "")
|
||||||
|
|
||||||
print("正在加载向量模型...")
|
print("正在加载 BAAI/bge-m3 向量模型...")
|
||||||
# 全局单例
|
# 全局单例
|
||||||
embedder_model = SentenceTransformer(EMBEDDING_MODEL_PATH, local_files_only=True)
|
# Global singleton embedding model.
# Do not hard-code device="cuda": that raises at import time on hosts without
# a usable CUDA runtime. An optional EMBEDDING_DEVICE env var (e.g. "cuda",
# "cpu", "mps") overrides the choice; when it is unset or empty we pass None,
# which lets sentence-transformers pick an available accelerator and fall back
# to CPU otherwise.
embedder_model = SentenceTransformer(
    EMBEDDING_MODEL_PATH,
    local_files_only=True,
    device=os.getenv("EMBEDDING_DEVICE") or None,
)
|
||||||
print("模型加载完成。")
|
print("模型加载完成。")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
|
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
|
||||||
打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
打分融合:标签/标题匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -14,7 +14,7 @@ from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopi
|
|||||||
from app.services.fetcher_service import embedder_model
|
from app.services.fetcher_service import embedder_model
|
||||||
|
|
||||||
|
|
||||||
# 语义匹配阈值:用户关键词和事件标签向量相似度达到该值才计入语义命中
|
# 语义匹配阈值:用户关键词和事件标签/标题向量相似度达到该值才计入语义命中
|
||||||
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
|
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
|
||||||
PREFERENCE_SEMANTIC_THRESHOLD = float(
|
PREFERENCE_SEMANTIC_THRESHOLD = float(
|
||||||
os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
|
os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
|
||||||
@@ -41,6 +41,31 @@ def _normalize_text(text: str) -> str:
|
|||||||
return text.strip().casefold()
|
return text.strip().casefold()
|
||||||
|
|
||||||
|
|
||||||
|
def _find_exact_preference_match(
    target_text: str,
    normalized_preferences: list[tuple[str, str]],
) -> str | None:
    """
    Look for a user preference keyword that "exactly" hits the target text.

    The target is normalized with ``_normalize_text`` and compared against the
    already-normalized half of each ``(raw, normalized)`` preference pair.
    A pair counts as a hit when the two normalized strings are equal, or when
    either one is a substring of the other.

    Returns the raw keyword of the first hitting pair (in list order), or
    ``None`` when the normalized target is empty or no pair hits. Pairs whose
    normalized form is empty are ignored.
    """
    needle = _normalize_text(target_text)
    if not needle:
        return None

    # Equality is a special case of substring containment, so the two
    # containment checks below also cover the "identical strings" rule.
    return next(
        (
            original
            for original, candidate in normalized_preferences
            if candidate and (candidate in needle or needle in candidate)
        ),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
_EMBEDDING_CACHE: dict[str, np.ndarray] = {}
|
_EMBEDDING_CACHE: dict[str, np.ndarray] = {}
|
||||||
MAX_CACHE_SIZE = 10000
|
MAX_CACHE_SIZE = 10000
|
||||||
|
|
||||||
@@ -86,6 +111,26 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _find_best_semantic_match(
|
||||||
|
target_text: str,
|
||||||
|
target_vec_map: dict[str, np.ndarray],
|
||||||
|
pref_vec_map: dict[str, np.ndarray],
|
||||||
|
) -> tuple[str | None, float]:
|
||||||
|
"""返回与目标文本最接近的兴趣词及其余弦相似度。"""
|
||||||
|
target_vec = target_vec_map.get(target_text)
|
||||||
|
if target_vec is None:
|
||||||
|
return None, -1.0
|
||||||
|
|
||||||
|
best_pref = None
|
||||||
|
best_sim = -1.0
|
||||||
|
for pref_keyword, pref_vec in pref_vec_map.items():
|
||||||
|
sim = float(np.dot(target_vec, pref_vec))
|
||||||
|
if sim > best_sim:
|
||||||
|
best_sim = sim
|
||||||
|
best_pref = pref_keyword
|
||||||
|
return best_pref, best_sim
|
||||||
|
|
||||||
|
|
||||||
def _ensure_aware(dt: datetime) -> datetime:
|
def _ensure_aware(dt: datetime) -> datetime:
|
||||||
"""SQLite 读出的 datetime 不带时区信息,统一补上 UTC 后才能和 utcnow() 做减法。"""
|
"""SQLite 读出的 datetime 不带时区信息,统一补上 UTC 后才能和 utcnow() 做减法。"""
|
||||||
if dt.tzinfo is None:
|
if dt.tzinfo is None:
|
||||||
@@ -116,8 +161,8 @@ def recommend_events_for_user(
|
|||||||
) -> list[MatchedEventResult]:
|
) -> list[MatchedEventResult]:
|
||||||
"""
|
"""
|
||||||
用户兴趣推荐主流程:
|
用户兴趣推荐主流程:
|
||||||
1) 精确匹配:用户词 == EVENT 标签
|
1) 精确匹配:用户词 vs EVENT 标签/标题
|
||||||
2) 语义匹配:用户词向量 vs EVENT 标签向量(超过阈值)
|
2) 语义匹配:用户词向量 vs EVENT 标签/标题向量(超过阈值)
|
||||||
3) 打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度
|
3) 打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度
|
||||||
"""
|
"""
|
||||||
final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
|
final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
|
||||||
@@ -167,8 +212,6 @@ def recommend_events_for_user(
|
|||||||
)
|
)
|
||||||
.all()
|
.all()
|
||||||
)
|
)
|
||||||
if not topic_rows:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
|
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
|
||||||
event_topics: dict[int, list[tuple[str, float | None]]] = {}
|
event_topics: dict[int, list[tuple[str, float | None]]] = {}
|
||||||
@@ -177,10 +220,6 @@ def recommend_events_for_user(
|
|||||||
continue
|
continue
|
||||||
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
|
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
|
||||||
|
|
||||||
# 如果某事件没有标签,就不参与推荐
|
|
||||||
if not event_topics:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# 3. 批量编码用户词与标签词,减少模型调用次数
|
# 3. 批量编码用户词与标签词,减少模型调用次数
|
||||||
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
||||||
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
||||||
@@ -188,13 +227,21 @@ def recommend_events_for_user(
|
|||||||
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
|
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
|
||||||
|
|
||||||
# 预先建立“标准化后用户词集合”,用于精确匹配
|
# 预先建立“标准化后用户词集合”,用于精确匹配
|
||||||
normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
|
normalized_preference_pairs = [
|
||||||
|
(word, _normalize_text(word))
|
||||||
|
for word in unique_preference_keywords
|
||||||
|
if _normalize_text(word)
|
||||||
|
]
|
||||||
|
unique_event_titles = list(
|
||||||
|
dict.fromkeys(
|
||||||
|
[event.unified_title.strip() for event in events if event.unified_title and event.unified_title.strip()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
title_vec_map = _build_keyword_embedding_map(unique_event_titles)
|
||||||
|
|
||||||
scored_results: list[MatchedEventResult] = []
|
scored_results: list[MatchedEventResult] = []
|
||||||
for event in events:
|
for event in events:
|
||||||
topic_list = event_topics.get(event.id, [])
|
topic_list = event_topics.get(event.id, [])
|
||||||
if not topic_list:
|
|
||||||
continue
|
|
||||||
|
|
||||||
exact_hits: list[str] = []
|
exact_hits: list[str] = []
|
||||||
semantic_hits: list[dict[str, Any]] = []
|
semantic_hits: list[dict[str, Any]] = []
|
||||||
@@ -202,37 +249,18 @@ def recommend_events_for_user(
|
|||||||
|
|
||||||
# 对每个事件标签做精确匹配或语义匹配
|
# 对每个事件标签做精确匹配或语义匹配
|
||||||
for topic_keyword, topic_relevance in topic_list:
|
for topic_keyword, topic_relevance in topic_list:
|
||||||
normalized_topic = _normalize_text(topic_keyword)
|
|
||||||
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
||||||
|
|
||||||
# 1) 精确命中(包括完全相等与包含关系)
|
# 1) 精确命中(包括完全相等与包含关系)
|
||||||
matched_exact = False
|
matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
|
||||||
if normalized_topic in normalized_pref_set:
|
if matched_pref is not None:
|
||||||
matched_exact = True
|
|
||||||
else:
|
|
||||||
for pref_word in normalized_pref_set:
|
|
||||||
if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
|
|
||||||
matched_exact = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if matched_exact:
|
|
||||||
exact_hits.append(topic_keyword)
|
exact_hits.append(topic_keyword)
|
||||||
# 精确命中给较高基础分,标签自身相关度作为增益
|
# 精确命中给较高基础分,标签自身相关度作为增益
|
||||||
score += 45.0 + topic_relevance_score * 0.2
|
score += 45.0 + topic_relevance_score * 0.2
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 2) 语义命中(未精确命中时再算)
|
# 2) 语义命中(未精确命中时再算)
|
||||||
topic_vec = topic_vec_map.get(topic_keyword)
|
best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)
|
||||||
if topic_vec is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
best_pref = None
|
|
||||||
best_sim = -1.0
|
|
||||||
for pref_keyword, pref_vec in pref_vec_map.items():
|
|
||||||
sim = float(np.dot(topic_vec, pref_vec))
|
|
||||||
if sim > best_sim:
|
|
||||||
best_sim = sim
|
|
||||||
best_pref = pref_keyword
|
|
||||||
|
|
||||||
if best_pref is not None and best_sim >= similarity_threshold:
|
if best_pref is not None and best_sim >= similarity_threshold:
|
||||||
semantic_hits.append(
|
semantic_hits.append(
|
||||||
@@ -245,6 +273,25 @@ def recommend_events_for_user(
|
|||||||
# 语义命中分略低于精确命中,并由相似度放大
|
# 语义命中分略低于精确命中,并由相似度放大
|
||||||
score += best_sim * 35.0 + topic_relevance_score * 0.12
|
score += best_sim * 35.0 + topic_relevance_score * 0.12
|
||||||
|
|
||||||
|
# 标题也参与匹配,但权重低于结构化标签,避免长标题过度主导排序。
|
||||||
|
event_title = (event.unified_title or "").strip()
|
||||||
|
if event_title:
|
||||||
|
title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
|
||||||
|
if title_exact_pref is not None:
|
||||||
|
exact_hits.append(f"标题:{title_exact_pref}")
|
||||||
|
score += 30.0
|
||||||
|
else:
|
||||||
|
best_pref, best_sim = _find_best_semantic_match(event_title, title_vec_map, pref_vec_map)
|
||||||
|
if best_pref is not None and best_sim >= similarity_threshold:
|
||||||
|
semantic_hits.append(
|
||||||
|
{
|
||||||
|
"preference_keyword": best_pref,
|
||||||
|
"topic_keyword": f"标题:{best_pref}",
|
||||||
|
"similarity": round(best_sim, 4),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
score += best_sim * 24.0
|
||||||
|
|
||||||
# 如果精确和语义都没命中,直接跳过
|
# 如果精确和语义都没命中,直接跳过
|
||||||
if not exact_hits and not semantic_hits:
|
if not exact_hits and not semantic_hits:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -111,6 +111,14 @@ function getRankingChartOptions(history: number[], platformColor: string) {
|
|||||||
height: 56,
|
height: 56,
|
||||||
sparkline: { enabled: true },
|
sparkline: { enabled: true },
|
||||||
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
||||||
|
events: {
|
||||||
|
mounted: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
},
|
||||||
|
updated: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
}
|
||||||
|
}
|
||||||
},
|
},
|
||||||
stroke: { curve: 'smooth' as const, width: 2 },
|
stroke: { curve: 'smooth' as const, width: 2 },
|
||||||
fill: {
|
fill: {
|
||||||
|
|||||||
@@ -182,6 +182,14 @@ function getRankingChartOptions(history: number[], platformColor: string) {
|
|||||||
height: 56,
|
height: 56,
|
||||||
sparkline: { enabled: true },
|
sparkline: { enabled: true },
|
||||||
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
||||||
|
events: {
|
||||||
|
mounted: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
},
|
||||||
|
updated: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
}
|
||||||
|
}
|
||||||
},
|
},
|
||||||
stroke: { curve: 'smooth' as const, width: 2 },
|
stroke: { curve: 'smooth' as const, width: 2 },
|
||||||
fill: {
|
fill: {
|
||||||
|
|||||||
@@ -72,6 +72,12 @@ const chartOptions = ref<ApexOptions>({
|
|||||||
},
|
},
|
||||||
// 点击图表数据点:切换选中时间,再次点击则取消筛选
|
// 点击图表数据点:切换选中时间,再次点击则取消筛选
|
||||||
events: {
|
events: {
|
||||||
|
mounted: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
},
|
||||||
|
updated: (chartContext: any) => {
|
||||||
|
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||||
|
},
|
||||||
markerClick: function(event: unknown, chartContext: unknown, { dataPointIndex }: never) {
|
markerClick: function(event: unknown, chartContext: unknown, { dataPointIndex }: never) {
|
||||||
if (searchResult.value && searchResult.value.timeline[dataPointIndex]) {
|
if (searchResult.value && searchResult.value.timeline[dataPointIndex]) {
|
||||||
const clickedTime = searchResult.value.timeline[dataPointIndex].time_label
|
const clickedTime = searchResult.value.timeline[dataPointIndex].time_label
|
||||||
@@ -585,7 +591,12 @@ async function handleSearch() {
|
|||||||
|
|
||||||
.chart-container {
|
.chart-container {
|
||||||
margin-top: 16px;
|
margin-top: 16px;
|
||||||
margin-left: -10px; /* 视觉上抵消 apexcharts 的默认左侧留白。 */
|
margin-left: -10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.chart-container :deep(svg),
|
||||||
|
.chart-container :deep(canvas) {
|
||||||
|
outline: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
.events-section {
|
.events-section {
|
||||||
@@ -595,7 +606,6 @@ async function handleSearch() {
|
|||||||
.events-grid {
|
.events-grid {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
/* 与 DashboardView 保持一致,列表按纵向堆叠展示。 */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.loading-state {
|
.loading-state {
|
||||||
|
|||||||
@@ -156,7 +156,7 @@ onMounted(async () => {
|
|||||||
v-model="newKeyword"
|
v-model="newKeyword"
|
||||||
type="text"
|
type="text"
|
||||||
class="keyword-input"
|
class="keyword-input"
|
||||||
placeholder="输入关键词,如「直升机」「科比」「佐巴扬」..."
|
placeholder="输入关键词,如「篮球」「科比」「科技」..."
|
||||||
maxlength="100"
|
maxlength="100"
|
||||||
@keydown="onInputKeydown"
|
@keydown="onInputKeydown"
|
||||||
/>
|
/>
|
||||||
|
|||||||
Reference in New Issue
Block a user