mirror of
https://github.com/stardrophere/InsightRadar.git
synced 2026-06-06 00:57:51 +08:00
Compare commits
22 Commits
v0.1.0
...
6af713b67a
| Author | SHA1 | Date | |
|---|---|---|---|
| 6af713b67a | |||
| 6992b58208 | |||
| 1604decd3c | |||
| 98971588ae | |||
| 531844f33c | |||
| 76f00db86d | |||
| 761fad17bc | |||
| 0cab5c1cda | |||
| 9574b02d8a | |||
| c48c2b9143 | |||
| cdad76cd3b | |||
| d3e59bc7f3 | |||
| 61b6357418 | |||
| 943770b2bc | |||
| f4d9b2075c | |||
| e3541f8d43 | |||
| 6ddedd76d7 | |||
| ca36f3813a | |||
| 2cd9137f91 | |||
| 3fe122cb80 | |||
| 97c97b7bae | |||
| 7c01b5c265 |
@@ -0,0 +1,18 @@
|
||||
# 前端
|
||||
frontend/dist
|
||||
frontend/node_modules
|
||||
|
||||
# 后端
|
||||
backend/.venv
|
||||
backend/.git
|
||||
backend/__pycache__
|
||||
backend/*.pyc
|
||||
backend/*.pyo
|
||||
backend/*.pyd
|
||||
backend/.pytest_cache
|
||||
backend/.mypy_cache
|
||||
backend/.cache
|
||||
backend/.env
|
||||
backend/*.log
|
||||
backend/dist
|
||||
backend/build
|
||||
@@ -1,35 +0,0 @@
|
||||
name: Docker Image CI
|
||||
|
||||
on:
|
||||
push:
|
||||
tags: ['v*']
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Logout Docker (avoid wrong credentials)
|
||||
run: docker logout || true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Install the latest version of uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: uv lock
|
||||
working-directory: backend
|
||||
run: uv lock
|
||||
|
||||
- name: Build Docker Image (with BuildKit)
|
||||
working-directory: backend
|
||||
run: |
|
||||
docker build \
|
||||
--progress=plain \
|
||||
-t insightradar-backend:${{ github.ref_name }} \
|
||||
-t insightradar-backend:latest \
|
||||
.
|
||||
+5
-3
@@ -37,13 +37,12 @@ MANIFEST
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# uv
|
||||
*.lock
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
**/logs/*
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
@@ -190,3 +189,6 @@ cython_debug/
|
||||
|
||||
**/data/*
|
||||
**/docker/*
|
||||
backend/app/static/*
|
||||
|
||||
test*.*
|
||||
@@ -1,2 +1,70 @@
|
||||
# InsightRadar
|
||||
An AI-powered trend monitoring and news intelligence platform
|
||||
# 聚势智见 — 基于语义聚类与大模型的热点资讯聚合平台
|
||||
|
||||
一个智能热点监测与个性化分发平台,通过语义聚类与大模型技术,将分散在微博、知乎、抖音、百度等平台的热点资讯自动归并为统一事件,生成AI摘要与标签,并支持个性化订阅与定时推送。
|
||||
|
||||
## 核心特性
|
||||
|
||||
- **跨平台热点聚合**:基于Embedding语义相似度计算,自动识别不同平台的同一事件
|
||||
- **AI智能摘要**:调用大模型生成统一标题、综合摘要与标准化标签
|
||||
- **个性化推荐**:支持关键词订阅、语义匹配与多因子排序
|
||||
- **舆情分析工具**:提供热度趋势追踪、标题修改监控、时间线分析
|
||||
- **定时简报推送**:自定义推送时间与接收邮箱,生成个性化AI简报
|
||||
|
||||
## 快速部署
|
||||
|
||||
### 方式一:Docker部署(推荐)
|
||||
|
||||
**环境要求**
|
||||
- Linux系统(推荐Ubuntu 22.04 LTS / Debian 12)
|
||||
- Docker ≥ 20.10.0,Docker Compose v2
|
||||
- 内存 ≥ 512MB(建议1GB以上)
|
||||
|
||||
**部署步骤**
|
||||
|
||||
```bash
|
||||
# 1. 构建镜像
|
||||
docker build -t insightradar:latest .
|
||||
|
||||
# 2. 配置目录(参考docker/readme.txt)
|
||||
mkdir -p ./data ./logs
|
||||
|
||||
# 3. 启动服务
|
||||
cd docker
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### 方式二:源码部署
|
||||
|
||||
**环境要求**
|
||||
|
||||
- Python ≥ 3.11,uv包管理器
|
||||
- Node.js ≥ 22
|
||||
- 内存 ≥ 512MB
|
||||
|
||||
**后端部署**
|
||||
|
||||
``` bash
|
||||
# 复制
|
||||
cd backend
|
||||
uv sync
|
||||
uv run main.py
|
||||
```
|
||||
|
||||
**前端部署**
|
||||
|
||||
```bash
|
||||
# 复制
|
||||
cd frontend
|
||||
npm install
|
||||
npm run build
|
||||
# 将dist/目录内容复制到 backend/app/static/
|
||||
```
|
||||
|
||||
**配置说明**
|
||||
|
||||
- 复制 .env.example 为 .env 并填写配置
|
||||
- 将Embedding模型(Qwen3-Embedding-4B)放入 backend/data/ 目录
|
||||
|
||||
### 访问应用
|
||||
|
||||
部署完成后,通过 http://<服务器IP>:<配置端口> 访问Web界面。
|
||||
@@ -1,13 +0,0 @@
|
||||
.venv
|
||||
.git
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.pytest_cache
|
||||
.mypy_cache
|
||||
.cache
|
||||
.env
|
||||
*.log
|
||||
dist
|
||||
build
|
||||
@@ -69,7 +69,7 @@ def _normalize_email(email: str) -> str:
|
||||
def _build_verification_email(code: str, purpose_text: str, expire_minutes: int) -> str:
|
||||
return f"""
|
||||
<div style="font-family: Arial, sans-serif; line-height: 1.6; color: #222;">
|
||||
<h2 style="margin-bottom: 12px;">InsightRadar 邮箱验证</h2>
|
||||
<h2 style="margin-bottom: 12px;">聚势智见邮箱验证</h2>
|
||||
<p>您的{purpose_text}验证码是:</p>
|
||||
<p style="font-size: 28px; font-weight: bold; letter-spacing: 4px; color: #0b57d0;">{code}</p>
|
||||
<p>该验证码在 {expire_minutes} 分钟内有效。请勿泄露给他人。</p>
|
||||
@@ -203,7 +203,7 @@ async def send_register_code(
|
||||
|
||||
await send_html_email(
|
||||
to_email=email,
|
||||
subject=f"【{code}】InsightRadar 注册验证码",
|
||||
subject=f"【{code}】聚势智见 注册验证码",
|
||||
html_content=_build_verification_email(
|
||||
code, "注册", REGISTER_CODE_EXPIRE_MINUTES
|
||||
),
|
||||
@@ -241,7 +241,7 @@ async def send_login_code(
|
||||
|
||||
await send_html_email(
|
||||
to_email=email,
|
||||
subject=f"【{code}】InsightRadar 登录验证码",
|
||||
subject=f"【{code}】聚势智见 登录验证码",
|
||||
html_content=_build_verification_email(
|
||||
code, "登录", LOGIN_CODE_EXPIRE_MINUTES
|
||||
),
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
"""
|
||||
信息源 CRUD:对 InfoSource 的增删改查,供 API 与爬虫使用
|
||||
"""
|
||||
from sqlite3 import IntegrityError
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List, Optional
|
||||
|
||||
@@ -22,10 +24,20 @@ def get_multi(db: Session, skip: int = 0, limit: int = 100) -> List[InfoSource]:
|
||||
def create(db: Session, obj_in: InfoSourceCreate) -> InfoSource:
    """Create an info source, or return the stored one with the same name.

    Creation is idempotent on ``source_name``: if a row with that name is
    already stored, that row is returned and nothing is written.

    Args:
        db: Caller-owned session. It is committed (or rolled back) here but
            never closed, so the caller decides when its lifetime ends.
        obj_in: Validated payload describing the source to create.

    Returns:
        The newly persisted source, or the existing row when one with the
        same ``source_name`` is already stored.

    Raises:
        sqlalchemy.exc.IntegrityError: If the insert is rejected and no row
            with the same name can be found afterwards (i.e. some other
            constraint was violated).
    """
    # SQLAlchemy wraps driver-level errors in its own exception hierarchy, so
    # commit() raises sqlalchemy.exc.IntegrityError; the DBAPI class
    # sqlite3.IntegrityError would never match here.
    from sqlalchemy.exc import IntegrityError as SAIntegrityError

    def _stored_with_same_name() -> Optional[InfoSource]:
        # Look up the persisted row that shares the requested name, if any.
        return (
            db.query(InfoSource)
            .filter(InfoSource.source_name == obj_in.source_name)
            .first()
        )

    existing = _stored_with_same_name()
    if existing is not None:
        return existing

    db_obj = InfoSource(**obj_in.model_dump())
    db.add(db_obj)
    try:
        db.commit()
    except SAIntegrityError:
        # Most likely another writer inserted the same name between the
        # lookup above and this commit; hand back that row instead of a
        # transient, never-saved object.
        db.rollback()
        existing = _stored_with_same_name()
        if existing is None:
            raise
        return existing

    db.refresh(db_obj)
    return db_obj
|
||||
|
||||
|
||||
def update(db: Session, db_obj: InfoSource, obj_in: InfoSourceUpdate) -> InfoSource:
|
||||
|
||||
+35
-39
@@ -1,46 +1,42 @@
|
||||
import requests
|
||||
import json
|
||||
|
||||
# 请将此处的 URL 替换为您实际的 API 基础域名
|
||||
api_url = "http://10.252.130.135:8000/api/v1/sources/"
|
||||
from app.database import SessionLocal
|
||||
from app.crud.crud_source import create
|
||||
from app.models.models import SourceType
|
||||
from app.schemas.source_schema import InfoSourceCreate
|
||||
|
||||
# 请求头
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
# "Authorization": "Bearer YOUR_TOKEN" # 如果接口需要鉴权,请取消注释并填入 Token
|
||||
}
|
||||
|
||||
# 解析后的数据源列表
|
||||
sources_data = [
|
||||
{"name": "今日头条", "url": "toutiao"},
|
||||
{"name": "百度热搜", "url": "baidu"},
|
||||
{"name": "华尔街见闻", "url": "wallstreetcn-hot"},
|
||||
{"name": "澎湃新闻", "url": "thepaper"},
|
||||
{"name": "bilibili 热搜", "url": "bilibili-hot-search"},
|
||||
{"name": "财联社热门", "url": "cls-hot"},
|
||||
{"name": "凤凰网", "url": "ifeng"},
|
||||
{"name": "贴吧", "url": "tieba"},
|
||||
{"name": "微博", "url": "weibo"},
|
||||
{"name": "抖音", "url": "douyin"},
|
||||
{"name": "知乎", "url": "zhihu"}
|
||||
]
|
||||
def init():
    """Seed the built-in hot-trend info sources.

    Each source is created through ``create`` inside its own session. A
    failure for one source is printed and does not stop the remaining ones.
    """
    # Built-in platforms: display name plus the short identifier stored as
    # ``home_url`` (presumably the crawler's source id -- TODO confirm).
    sources_data = [
        {"name": "今日头条", "url": "toutiao"},
        {"name": "百度热搜", "url": "baidu"},
        {"name": "华尔街见闻", "url": "wallstreetcn-hot"},
        {"name": "澎湃新闻", "url": "thepaper"},
        {"name": "bilibili 热搜", "url": "bilibili-hot-search"},
        {"name": "财联社热门", "url": "cls-hot"},
        {"name": "凤凰网", "url": "ifeng"},
        {"name": "贴吧", "url": "tieba"},
        {"name": "微博", "url": "weibo"},
        {"name": "抖音", "url": "douyin"},
        {"name": "知乎", "url": "zhihu"},
    ]

    for entry in sources_data:
        name = entry["name"]
        try:
            with SessionLocal() as db:
                payload = InfoSourceCreate(
                    source_name=name,
                    source_type=SourceType.HOT_TREND,
                    home_url=entry["url"],
                    is_enabled=True,
                )
                create(db, payload)
                print(f"创建订阅源{name}")
        except Exception as exc:
            # Best-effort seeding: report the failure and move on.
            print(f"⚠️ 请求异常: {name} - 错误: {exc}")
|
||||
|
||||
+27
-2
@@ -1,8 +1,11 @@
|
||||
# app/main.py
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
||||
import httpx
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, HTTPException, Request, staticfiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@@ -21,11 +24,11 @@ from app.services.summary_service import generate_unified_summaries
|
||||
from app.services.delivery_service import check_and_deliver
|
||||
from app.database import engine
|
||||
from app.models.models import Base
|
||||
from app.initialize import init
|
||||
|
||||
# 路由总线
|
||||
from app.api.router import api_router
|
||||
|
||||
load_dotenv()
|
||||
CRAWL_INTERVAL = int(os.getenv("CRAWL_INTERVAL_MINUTES", 10))
|
||||
SUMMARY_INTERVAL = int(os.getenv("SUMMARY_INTERVAL_MINUTES", 30))
|
||||
|
||||
@@ -42,6 +45,10 @@ async def lifespan(app: FastAPI):
|
||||
Base.metadata.create_all(bind=engine)
|
||||
logging.info("数据库表初始化完成!")
|
||||
|
||||
logging.info("初始化订阅源")
|
||||
init()
|
||||
logging.info("订阅源初始化完毕")
|
||||
|
||||
# 2. 配置并启动定时任务
|
||||
scheduler.add_job(
|
||||
fetch_and_save_trending_data,
|
||||
@@ -106,6 +113,24 @@ app.add_middleware(
|
||||
# 版本控制
|
||||
app.include_router(api_router, prefix="/api/v1")
|
||||
|
||||
# 只需要保留API的优先匹配,catch_all可以简化成这样
|
||||
@app.get("/api/{full_path:path}")
async def api_not_found(full_path: str):
    """Answer unknown GET paths under ``/api/`` with a JSON 404.

    Args:
        full_path: Remainder of the request path after ``/api/``; unused.

    Returns:
        A JSON body ``{"detail": "API Not Found"}`` with status code 404.
    """
    # A bare dict would be serialized with HTTP 200, telling clients that a
    # non-existent endpoint succeeded; send an explicit 404 instead.
    return JSONResponse({"detail": "API Not Found"}, status_code=404)
|
||||
|
||||
# Directory holding the built frontend (the contents of the frontend ``dist``
# folder). Anchored on this module's location rather than the process working
# directory, so importing the app does not depend on where it is launched from.
_STATIC_DIR = Path(__file__).resolve().parent / "static"

staticPath = staticfiles.StaticFiles(directory=_STATIC_DIR, html=True)

# Serve the frontend from the site root.
# NOTE(review): routes registered after this root mount may be shadowed by
# it -- confirm whether the "/" health-check route below is still reachable.
app.mount("/", staticPath, name="static")

# Entry page of the single-page frontend, read once at import time.
INDEX_HTML = (_STATIC_DIR / "index.html").read_text(encoding="utf-8")
|
||||
|
||||
@app.exception_handler(404)
async def not_found_handler(request: Request, exc: HTTPException):
    """Turn 404s into a JSON error for API paths or the frontend entry page.

    Args:
        request: The request that produced the 404.
        exc: The 404 exception being handled; unused.

    Returns:
        A JSON 404 response when the path starts with ``/api/``; otherwise
        the cached ``index.html`` so the frontend can handle the route.
    """
    is_api_request = request.url.path.startswith("/api/")
    if not is_api_request:
        # Non-API path: hand the entry page to the frontend.
        return HTMLResponse(INDEX_HTML)
    return JSONResponse({"detail": "Not Found"}, status_code=404)
|
||||
|
||||
# 健康检查
|
||||
@app.get("/", tags=["健康检查"])
|
||||
|
||||
@@ -94,6 +94,10 @@ class InfoSource(Base):
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow)
|
||||
|
||||
__table_args__ = (
|
||||
UniqueConstraint("source_name", name="uix_source_name"),
|
||||
)
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 模块二:AI 语义聚类中枢 (大事件池)
|
||||
@@ -176,8 +180,7 @@ class NewsArticle(Base):
|
||||
|
||||
id: Mapped[int] = mapped_column(BigIntType, primary_key=True, autoincrement=True)
|
||||
source_id: Mapped[int] = mapped_column(ForeignKey("info_sources.id"), comment="所属信息源ID")
|
||||
unified_event_id: Mapped[Optional[int]] = mapped_column(ForeignKey("unified_events.id"),
|
||||
comment="深度文章也可归入大事件分析")
|
||||
unified_event_id: Mapped[Optional[int]] = mapped_column(ForeignKey("unified_events.id"), comment="深度文章也可归入大事件分析")
|
||||
|
||||
external_id: Mapped[str] = mapped_column(String(32), comment="RSS原文<guid>生成的MD5防重指纹")
|
||||
title_embedding: Mapped[Optional[str]] = mapped_column(Text, comment="新闻标题/摘要的语义向量")
|
||||
@@ -214,8 +217,7 @@ class HeadlineRevision(Base):
|
||||
previous_headline: Mapped[str] = mapped_column(String(255), comment="修改前的旧标题")
|
||||
revised_headline: Mapped[str] = mapped_column(String(255), comment="修改后的新标题")
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow,
|
||||
comment="系统发现被修改的时间")
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, comment="系统发现被修改的时间")
|
||||
|
||||
|
||||
class RankingLog(Base):
|
||||
@@ -235,8 +237,7 @@ class RankingLog(Base):
|
||||
# 当时它在第几名
|
||||
ranking_position: Mapped[int] = mapped_column(Integer, comment="当时抓取时的排名名次")
|
||||
# 爬虫看到它的那一瞬间的时间
|
||||
observed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow,
|
||||
comment="观察到该名次的准确时间")
|
||||
observed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, comment="观察到该名次的准确时间")
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
|
||||
|
||||
@@ -307,13 +308,11 @@ class AppUser(Base):
|
||||
|
||||
nickname: Mapped[Optional[str]] = mapped_column(String(100), comment="用户展示昵称")
|
||||
avatar_url: Mapped[Optional[str]] = mapped_column(String(500), comment="用户头像地址")
|
||||
gender: Mapped[GenderType] = mapped_column(Enum(GenderType), default=GenderType.UNKNOWN,
|
||||
comment="用户性别(用于AI调整行文语气)")
|
||||
gender: Mapped[GenderType] = mapped_column(Enum(GenderType), default=GenderType.UNKNOWN, comment="用户性别(用于AI调整行文语气)")
|
||||
|
||||
# 极其强大:一个万能收纳箱!前端未来想加任何诸如“夜间模式”、“字体变大”的开关,
|
||||
# 全部丢进这个 JSON 字段即可,从此免去手动修改后端表结构的麻烦。
|
||||
metadata_: Mapped[Optional[Any]] = mapped_column("metadata", JSON,
|
||||
comment="JSON扩展字段: 存放灵活多变的前端用户偏好设置")
|
||||
metadata_: Mapped[Optional[Any]] = mapped_column("metadata", JSON, comment="JSON扩展字段: 存放灵活多变的前端用户偏好设置")
|
||||
|
||||
# 时区对于定时推送系统极其重要!保证纽约的用户和北京的用户都能在早晨8点收到新闻。
|
||||
timezone: Mapped[str] = mapped_column(String(50), default="Asia/Shanghai", comment="用户所在地时区")
|
||||
@@ -401,8 +400,7 @@ class DeliveryHistory(Base):
|
||||
# 记录这次推送是彻底成功了,还是由于渠道网络问题失败了
|
||||
status: Mapped[TaskStatus] = mapped_column(Enum(TaskStatus), comment="最终推送结果状态")
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow,
|
||||
comment="记录或实际推送的准确时间")
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, comment="记录或实际推送的准确时间")
|
||||
|
||||
|
||||
# ==========================================
|
||||
|
||||
@@ -86,7 +86,7 @@ body{{margin:0;padding:0;background:#0d1117;color:#e6edf3;font-family:-apple-sys
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>InsightRadar · 热点快报</h1>
|
||||
<h1>聚势智见 · 热点快报</h1>
|
||||
<p>{delivery_time} · 为你精选了 {event_count} 条事件</p>
|
||||
<span class="mode-badge {mode_badge_class}">{mode_label}</span>
|
||||
</div>
|
||||
@@ -94,8 +94,8 @@ body{{margin:0;padding:0;background:#0d1117;color:#e6edf3;font-family:-apple-sys
|
||||
{event_cards_html}
|
||||
|
||||
<div class="footer">
|
||||
<p>此邮件由 InsightRadar 自动推送。</p>
|
||||
<p>如需调整推送设置,请登录 <a href="{app_url}">InsightRadar 控制台</a></p>
|
||||
<p>此邮件由 聚势智见自动推送。</p>
|
||||
<p>如需调整推送设置,请登录 <a href="{app_url}">聚势智见 控制台</a></p>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
|
||||
@@ -377,7 +377,7 @@ def _prepare_user_push(db: Session, user: AppUser, schedule: UserDeliverySchedul
|
||||
return _PendingPush(
|
||||
user_id=user_id,
|
||||
email_targets=[ep.channel_account for ep in email_endpoints],
|
||||
subject=f"InsightRadar {subject_suffix} · {time_str}",
|
||||
subject=f"聚势智见 {subject_suffix} · {time_str}",
|
||||
html_body=html_body,
|
||||
event_ids=event_ids,
|
||||
)
|
||||
|
||||
@@ -26,7 +26,7 @@ SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.72))
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "https://newsnow.busiyi.world/api/s")
|
||||
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "")
|
||||
|
||||
print("正在加载 BAAI/bge-m3 向量模型...")
|
||||
print("正在加载模型...")
|
||||
# 全局单例
|
||||
embedder_model = SentenceTransformer(EMBEDDING_MODEL_PATH, local_files_only=True)
|
||||
print("模型加载完成。")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
匹配服务:根据用户兴趣关键词(精确 + 语义)推荐事件
|
||||
打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
||||
打分融合:标签/标题匹配分 + 标签相关度 + 热度 + 新鲜度加成
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
@@ -14,7 +14,7 @@ from app.models.models import ExtractedTopic, TargetType, UnifiedEvent, UserTopi
|
||||
from app.services.fetcher_service import embedder_model
|
||||
|
||||
|
||||
# 语义匹配阈值:用户关键词和事件标签向量相似度达到该值才计入语义命中
|
||||
# 语义匹配阈值:用户关键词和事件标签/标题向量相似度达到该值才计入语义命中
|
||||
DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD = 0.78
|
||||
PREFERENCE_SEMANTIC_THRESHOLD = float(
|
||||
os.getenv("PREFERENCE_SEMANTIC_THRESHOLD", str(DEFAULT_PREFERENCE_SEMANTIC_THRESHOLD))
|
||||
@@ -41,6 +41,31 @@ def _normalize_text(text: str) -> str:
|
||||
return text.strip().casefold()
|
||||
|
||||
|
||||
def _find_exact_preference_match(
    target_text: str,
    normalized_preferences: list[tuple[str, str]],
) -> str | None:
    """Return the first preference keyword that literally matches the target.

    A preference matches when, after normalization, it equals the target or
    one of the two strings contains the other.

    Args:
        target_text: Raw text to test (an event tag or title).
        normalized_preferences: ``(raw, normalized)`` keyword pairs, scanned
            in order; pairs whose normalized form is empty are ignored.

    Returns:
        The raw form of the first matching keyword, or ``None`` when the
        target normalizes to an empty string or nothing matches.
    """
    needle = _normalize_text(target_text)
    if not needle:
        return None

    return next(
        (
            raw
            for raw, norm in normalized_preferences
            # Equality needs no separate test: a string contains itself.
            if norm and (norm in needle or needle in norm)
        ),
        None,
    )
|
||||
|
||||
|
||||
_EMBEDDING_CACHE: dict[str, np.ndarray] = {}
|
||||
MAX_CACHE_SIZE = 10000
|
||||
|
||||
@@ -86,6 +111,26 @@ def _build_keyword_embedding_map(keywords: list[str]) -> dict[str, np.ndarray]:
|
||||
return result
|
||||
|
||||
|
||||
def _find_best_semantic_match(
|
||||
target_text: str,
|
||||
target_vec_map: dict[str, np.ndarray],
|
||||
pref_vec_map: dict[str, np.ndarray],
|
||||
) -> tuple[str | None, float]:
|
||||
"""返回与目标文本最接近的兴趣词及其余弦相似度。"""
|
||||
target_vec = target_vec_map.get(target_text)
|
||||
if target_vec is None:
|
||||
return None, -1.0
|
||||
|
||||
best_pref = None
|
||||
best_sim = -1.0
|
||||
for pref_keyword, pref_vec in pref_vec_map.items():
|
||||
sim = float(np.dot(target_vec, pref_vec))
|
||||
if sim > best_sim:
|
||||
best_sim = sim
|
||||
best_pref = pref_keyword
|
||||
return best_pref, best_sim
|
||||
|
||||
|
||||
def _ensure_aware(dt: datetime) -> datetime:
|
||||
"""SQLite 读出的 datetime 不带时区信息,统一补上 UTC 后才能和 utcnow() 做减法。"""
|
||||
if dt.tzinfo is None:
|
||||
@@ -116,8 +161,8 @@ def recommend_events_for_user(
|
||||
) -> list[MatchedEventResult]:
|
||||
"""
|
||||
用户兴趣推荐主流程:
|
||||
1) 精确匹配:用户词 == EVENT 标签
|
||||
2) 语义匹配:用户词向量 vs EVENT 标签向量(超过阈值)
|
||||
1) 精确匹配:用户词 vs EVENT 标签/标题
|
||||
2) 语义匹配:用户词向量 vs EVENT 标签/标题向量(超过阈值)
|
||||
3) 打分融合:匹配分 + 标签相关度 + 热度 + 新鲜度
|
||||
"""
|
||||
final_limit = max(1, min(limit, PREFERENCE_RECOMMEND_MAX_LIMIT))
|
||||
@@ -167,8 +212,6 @@ def recommend_events_for_user(
|
||||
)
|
||||
.all()
|
||||
)
|
||||
if not topic_rows:
|
||||
return []
|
||||
|
||||
# 组织事件标签映射:event_id -> [(tag, relevance_score), ...]
|
||||
event_topics: dict[int, list[tuple[str, float | None]]] = {}
|
||||
@@ -177,10 +220,6 @@ def recommend_events_for_user(
|
||||
continue
|
||||
event_topics.setdefault(event_id, []).append((topic_keyword, relevance_score))
|
||||
|
||||
# 如果某事件没有标签,就不参与推荐
|
||||
if not event_topics:
|
||||
return []
|
||||
|
||||
# 3. 批量编码用户词与标签词,减少模型调用次数
|
||||
unique_preference_keywords = list(dict.fromkeys(preference_keywords))
|
||||
unique_topic_keywords = list(dict.fromkeys([row[1] for row in topic_rows if row[1]]))
|
||||
@@ -188,13 +227,21 @@ def recommend_events_for_user(
|
||||
topic_vec_map = _build_keyword_embedding_map(unique_topic_keywords)
|
||||
|
||||
# 预先建立“标准化后用户词集合”,用于精确匹配
|
||||
normalized_pref_set = {_normalize_text(word) for word in unique_preference_keywords}
|
||||
normalized_preference_pairs = [
|
||||
(word, _normalize_text(word))
|
||||
for word in unique_preference_keywords
|
||||
if _normalize_text(word)
|
||||
]
|
||||
unique_event_titles = list(
|
||||
dict.fromkeys(
|
||||
[event.unified_title.strip() for event in events if event.unified_title and event.unified_title.strip()]
|
||||
)
|
||||
)
|
||||
title_vec_map = _build_keyword_embedding_map(unique_event_titles)
|
||||
|
||||
scored_results: list[MatchedEventResult] = []
|
||||
for event in events:
|
||||
topic_list = event_topics.get(event.id, [])
|
||||
if not topic_list:
|
||||
continue
|
||||
|
||||
exact_hits: list[str] = []
|
||||
semantic_hits: list[dict[str, Any]] = []
|
||||
@@ -202,37 +249,18 @@ def recommend_events_for_user(
|
||||
|
||||
# 对每个事件标签做精确匹配或语义匹配
|
||||
for topic_keyword, topic_relevance in topic_list:
|
||||
normalized_topic = _normalize_text(topic_keyword)
|
||||
topic_relevance_score = float(topic_relevance) if topic_relevance is not None else 50.0
|
||||
|
||||
# 1) 精确命中(包括完全相等与包含关系)
|
||||
matched_exact = False
|
||||
if normalized_topic in normalized_pref_set:
|
||||
matched_exact = True
|
||||
else:
|
||||
for pref_word in normalized_pref_set:
|
||||
if pref_word and (pref_word in normalized_topic or normalized_topic in pref_word):
|
||||
matched_exact = True
|
||||
break
|
||||
|
||||
if matched_exact:
|
||||
matched_pref = _find_exact_preference_match(topic_keyword, normalized_preference_pairs)
|
||||
if matched_pref is not None:
|
||||
exact_hits.append(topic_keyword)
|
||||
# 精确命中给较高基础分,标签自身相关度作为增益
|
||||
score += 45.0 + topic_relevance_score * 0.2
|
||||
continue
|
||||
|
||||
# 2) 语义命中(未精确命中时再算)
|
||||
topic_vec = topic_vec_map.get(topic_keyword)
|
||||
if topic_vec is None:
|
||||
continue
|
||||
|
||||
best_pref = None
|
||||
best_sim = -1.0
|
||||
for pref_keyword, pref_vec in pref_vec_map.items():
|
||||
sim = float(np.dot(topic_vec, pref_vec))
|
||||
if sim > best_sim:
|
||||
best_sim = sim
|
||||
best_pref = pref_keyword
|
||||
best_pref, best_sim = _find_best_semantic_match(topic_keyword, topic_vec_map, pref_vec_map)
|
||||
|
||||
if best_pref is not None and best_sim >= similarity_threshold:
|
||||
semantic_hits.append(
|
||||
@@ -245,6 +273,25 @@ def recommend_events_for_user(
|
||||
# 语义命中分略低于精确命中,并由相似度放大
|
||||
score += best_sim * 35.0 + topic_relevance_score * 0.12
|
||||
|
||||
# 标题也参与匹配,但权重低于结构化标签,避免长标题过度主导排序。
|
||||
event_title = (event.unified_title or "").strip()
|
||||
if event_title:
|
||||
title_exact_pref = _find_exact_preference_match(event_title, normalized_preference_pairs)
|
||||
if title_exact_pref is not None:
|
||||
exact_hits.append(f"标题:{title_exact_pref}")
|
||||
score += 30.0
|
||||
else:
|
||||
best_pref, best_sim = _find_best_semantic_match(event_title, title_vec_map, pref_vec_map)
|
||||
if best_pref is not None and best_sim >= similarity_threshold:
|
||||
semantic_hits.append(
|
||||
{
|
||||
"preference_keyword": best_pref,
|
||||
"topic_keyword": f"标题:{best_pref}",
|
||||
"similarity": round(best_sim, 4),
|
||||
}
|
||||
)
|
||||
score += best_sim * 24.0
|
||||
|
||||
# 如果精确和语义都没命中,直接跳过
|
||||
if not exact_hits and not semantic_hits:
|
||||
continue
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
FROM python:3.11-slim AS builder
|
||||
|
||||
WORKDIR /insightradar
|
||||
|
||||
COPY pyproject.toml uv.lock ./
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
pip install --no-cache-dir uv && \
|
||||
uv sync --frozen --no-dev
|
||||
|
||||
COPY app app
|
||||
COPY main.py main.py
|
||||
|
||||
#-----------------------------------------------
|
||||
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /insightradar
|
||||
|
||||
# 👇 复制虚拟环境
|
||||
COPY --from=builder /insightradar/.venv /insightradar/.venv
|
||||
|
||||
COPY app app
|
||||
COPY main.py main.py
|
||||
|
||||
# 👇 关键:用 venv 里的 python
|
||||
ENV PATH="/insightradar/.venv/bin:$PATH"
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
CMD ["python","main.py"]
|
||||
+7
-1
@@ -1,12 +1,18 @@
|
||||
# run.py
|
||||
import uvicorn
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
if __name__ == "__main__":
    # Load settings from a local .env file before reading the port.
    load_dotenv()
    PORT = int(os.getenv("PORT", 8000))

    # Start the ASGI server with a single worker on all interfaces.
    server_options = {
        "app": "app.main:app",
        "host": "0.0.0.0",
        "port": PORT,
        "workers": 1,
    }
    uvicorn.run(**server_options)
|
||||
|
||||
+12
-3
@@ -49,7 +49,6 @@ dependencies = [
|
||||
"safetensors==0.7.0",
|
||||
"scikit-learn==1.8.0",
|
||||
"scipy==1.17.1",
|
||||
"sentence-transformers==5.2.3",
|
||||
"shellingham==1.5.4",
|
||||
"sniffio==1.3.1",
|
||||
"sqlalchemy==2.0.48",
|
||||
@@ -57,8 +56,6 @@ dependencies = [
|
||||
"sympy==1.14.0",
|
||||
"threadpoolctl==3.6.0",
|
||||
"tokenizers==0.22.2",
|
||||
"torch==2.10.0",
|
||||
"torchvision==0.25.0",
|
||||
"tqdm==4.67.3",
|
||||
"transformers==5.3.0",
|
||||
"typer==0.24.1",
|
||||
@@ -68,4 +65,16 @@ dependencies = [
|
||||
"tzlocal==5.3.1",
|
||||
"urllib3==2.6.3",
|
||||
"uvicorn==0.41.0",
|
||||
"torch==2.11.0+cpu",
|
||||
"torchvision==0.26.0+cpu",
|
||||
"torchaudio==2.11.0+cpu",
|
||||
"sentence-transformers>=5.3.0",
|
||||
]
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
default = false
|
||||
|
||||
[tool.uv]
|
||||
index-strategy = "unsafe-best-match"
|
||||
|
||||
Generated
+1720
File diff suppressed because it is too large
Load Diff
+50
@@ -0,0 +1,50 @@
|
||||
# ---------- 阶段1:前端编译(Node打包静态产物) ----------
|
||||
FROM node:22-alpine AS frontend-builder
|
||||
|
||||
WORKDIR /frontend
|
||||
|
||||
# 复制前端依赖,利用Docker缓存优化
|
||||
COPY frontend/package*.json ./
|
||||
RUN npm install --registry=https://registry.npmmirror.com
|
||||
|
||||
# 复制前端代码,编译出静态产物
|
||||
COPY frontend/ .
|
||||
RUN npm run build
|
||||
|
||||
# ---------- 阶段2:后端依赖构建(uv构建虚拟环境) ----------
|
||||
FROM python:3.11-slim AS backend-builder
|
||||
|
||||
WORKDIR /backend
|
||||
|
||||
# 安装uv,同步Python依赖
|
||||
COPY backend/pyproject.toml backend/uv.lock ./
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
pip install --no-cache-dir uv && \
|
||||
uv sync --frozen --no-dev --index https://pypi.tuna.tsinghua.edu.cn/simple/
|
||||
|
||||
# 复制后端代码
|
||||
COPY backend/app ./app
|
||||
COPY backend/main.py ./
|
||||
|
||||
# ---------- 阶段3:最终运行镜像(仅Python+Uvicorn,托管前端静态) ----------
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 复制构建好的后端虚拟环境
|
||||
COPY --from=backend-builder /backend/.venv /app/.venv
|
||||
COPY --from=backend-builder /backend/app /app/app
|
||||
COPY --from=backend-builder /backend/main.py /app/main.py
|
||||
|
||||
# 复制前端编译好的静态产物,放到后端能访问的目录
|
||||
# 这里我们把静态文件放到 /app/app/static 目录
|
||||
COPY --from=frontend-builder /frontend/dist /app/app/static
|
||||
|
||||
# 把venv加入PATH
|
||||
ENV PATH="/app/.venv/bin:$PATH"
|
||||
|
||||
# 暴露Uvicorn端口
|
||||
EXPOSE 8000
|
||||
|
||||
# 直接启动Uvicorn,由Uvicorn配合后端框架托管静态文件
|
||||
CMD ["python3", "main.py"]
|
||||
+1
-1
@@ -4,7 +4,7 @@
|
||||
<meta charset="UTF-8">
|
||||
<link rel="icon" href="/favicon.svg">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>InsightRadar - 全网热点监控中枢</title>
|
||||
<title>聚势智见 - 基于语义聚类与大模型的热点资讯聚合平台</title>
|
||||
<!-- Font Awesome 图标库 -->
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">
|
||||
</head>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,4 @@
|
||||
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Noto+Sans+SC:wght@400;500;600;700&display=swap');
|
||||
@import url(./font.css);
|
||||
|
||||
/* =========================================
|
||||
1. 现代 SaaS 风格高级主题变量
|
||||
|
||||
@@ -111,6 +111,14 @@ function getRankingChartOptions(history: number[], platformColor: string) {
|
||||
height: 56,
|
||||
sparkline: { enabled: true },
|
||||
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
||||
events: {
|
||||
mounted: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
},
|
||||
updated: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
}
|
||||
}
|
||||
},
|
||||
stroke: { curve: 'smooth' as const, width: 2 },
|
||||
fill: {
|
||||
|
||||
@@ -57,7 +57,7 @@ function toggleSidebar() {
|
||||
<!-- Logo -->
|
||||
<div class="sidebar-logo">
|
||||
<BrandLogo />
|
||||
<span class="logo-text">InsightRadar<span class="logo-dot">.AI</span></span>
|
||||
<span class="logo-text">聚势智见<span class="logo-dot">.AI</span></span>
|
||||
</div>
|
||||
|
||||
<!-- 导航菜单 -->
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<!-- 关于页(占位) -->
|
||||
<template>
|
||||
<div class="about">
|
||||
<h1>关于 InsightRadar</h1>
|
||||
<h1>关于 聚势智见</h1>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
|
||||
@@ -182,6 +182,14 @@ function getRankingChartOptions(history: number[], platformColor: string) {
|
||||
height: 56,
|
||||
sparkline: { enabled: true },
|
||||
animations: { enabled: true, easing: 'easeinout' as const, speed: 400 },
|
||||
events: {
|
||||
mounted: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
},
|
||||
updated: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
}
|
||||
}
|
||||
},
|
||||
stroke: { curve: 'smooth' as const, width: 2 },
|
||||
fill: {
|
||||
@@ -838,10 +846,10 @@ watch(() => route.query.event, (newId) => {
|
||||
<i class="fa-regular fa-clock"></i>
|
||||
最后同步: {{ lastSyncText }}
|
||||
</span>
|
||||
<span v-if="stats.error_tasks_today > 0" class="error-count">
|
||||
<!-- <span v-if="stats.error_tasks_today > 0" class="error-count">
|
||||
<i class="fa-solid fa-triangle-exclamation"></i>
|
||||
{{ stats.error_tasks_today }} 个异常
|
||||
</span>
|
||||
</span> -->
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
@@ -31,7 +31,7 @@ async function handleLogout() {
|
||||
<div class="nav-brand">
|
||||
<div class="logo">
|
||||
<BrandLogo />
|
||||
InsightRadar
|
||||
聚势智见
|
||||
</div>
|
||||
</div>
|
||||
<div class="nav-actions">
|
||||
|
||||
@@ -150,7 +150,7 @@ onUnmounted(() => {
|
||||
<div class="brand-content">
|
||||
<div class="logo">
|
||||
<BrandLogo />
|
||||
InsightRadar
|
||||
聚势智见
|
||||
</div>
|
||||
<h1 class="brand-title">洞察全网热点<br />让信息更聚焦</h1>
|
||||
<p class="brand-desc">
|
||||
@@ -192,7 +192,7 @@ onUnmounted(() => {
|
||||
<div class="form-container">
|
||||
<div class="form-header">
|
||||
<h2>欢迎回来</h2>
|
||||
<p>登录后继续查看 InsightRadar 实时动态</p>
|
||||
<p>登录后继续查看 聚势智见 实时动态</p>
|
||||
</div>
|
||||
|
||||
<div class="login-mode-tabs">
|
||||
|
||||
@@ -131,7 +131,7 @@ onUnmounted(() => {
|
||||
<div class="brand-content">
|
||||
<div class="logo">
|
||||
<BrandLogo />
|
||||
InsightRadar
|
||||
聚势智见
|
||||
</div>
|
||||
<h1 class="brand-title">开启智能<br />分析之旅。</h1>
|
||||
<p class="brand-desc">
|
||||
|
||||
@@ -72,6 +72,12 @@ const chartOptions = ref<ApexOptions>({
|
||||
},
|
||||
// 点击图表数据点:切换选中时间,再次点击则取消筛选
|
||||
events: {
|
||||
mounted: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
},
|
||||
updated: (chartContext: any) => {
|
||||
chartContext.el?.querySelector('.apexcharts-svg > title')?.remove()
|
||||
},
|
||||
markerClick: function(event: unknown, chartContext: unknown, { dataPointIndex }: never) {
|
||||
if (searchResult.value && searchResult.value.timeline[dataPointIndex]) {
|
||||
const clickedTime = searchResult.value.timeline[dataPointIndex].time_label
|
||||
@@ -585,7 +591,12 @@ async function handleSearch() {
|
||||
|
||||
.chart-container {
|
||||
margin-top: 16px;
|
||||
margin-left: -10px; /* 视觉上抵消 apexcharts 的默认左侧留白。 */
|
||||
margin-left: -10px;
|
||||
}
|
||||
|
||||
.chart-container :deep(svg),
|
||||
.chart-container :deep(canvas) {
|
||||
outline: none;
|
||||
}
|
||||
|
||||
.events-section {
|
||||
@@ -595,7 +606,6 @@ async function handleSearch() {
|
||||
.events-grid {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
/* 与 DashboardView 保持一致,列表按纵向堆叠展示。 */
|
||||
}
|
||||
|
||||
.loading-state {
|
||||
|
||||
@@ -156,7 +156,7 @@ onMounted(async () => {
|
||||
v-model="newKeyword"
|
||||
type="text"
|
||||
class="keyword-input"
|
||||
placeholder="输入关键词,如「直升机」「科比」「佐巴扬」..."
|
||||
placeholder="输入关键词,如「篮球」「科比」「科技」..."
|
||||
maxlength="100"
|
||||
@keydown="onInputKeydown"
|
||||
/>
|
||||
|
||||
Reference in New Issue
Block a user