# API router: standalone script-extraction tool (upload or URL -> Whisper transcription -> optional GLM rewrite).
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
||
from typing import Optional, Any, cast
|
||
import asyncio
|
||
import shutil
|
||
import os
|
||
import time
|
||
from pathlib import Path
|
||
from loguru import logger
|
||
import traceback
|
||
import re
|
||
import json
|
||
import requests
|
||
from urllib.parse import unquote
|
||
|
||
from app.services.whisper_service import whisper_service
|
||
from app.services.glm_service import glm_service
|
||
from app.core.response import success_response
|
||
|
||
router = APIRouter()
|
||
|
||
@router.post("/extract-script")
async def extract_script_tool(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    rewrite: bool = Form(True)
):
    """
    Standalone script-extraction tool.

    Accepts an uploaded video/audio file OR a video link (Douyin/Bilibili
    share text is supported — the first URL is extracted from the text),
    converts the media to 16 kHz mono WAV, transcribes it with Whisper,
    and optionally rewrites the transcript with the GLM service.

    Raises:
        HTTPException(400): missing/invalid input, download failure, or
            the downloaded file is an HTML page / corrupt media.
        HTTPException(500): any other extraction failure.
    """
    if not file and not url:
        raise HTTPException(400, "必须提供文件或视频链接")

    temp_path = None
    audio_path = None  # intermediate WAV; must be cleaned up too
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        # 1. Obtain the source media (upload or download).
        loop = asyncio.get_event_loop()

        if file:
            filename = file.filename
            if not filename:
                raise HTTPException(400, "文件名无效")
            safe_filename = Path(filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            # Blocking file I/O runs in the thread pool; the context
            # manager guarantees the destination handle is closed
            # (previously it was opened inline and leaked).
            def _save_upload():
                with open(temp_path, "wb") as out:
                    shutil.copyfileobj(file.file, out)

            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            if not url:
                raise HTTPException(400, "必须提供视频链接")
            url_value: str = url

            # Pull the first URL out of share text (Douyin/Bilibili
            # share messages embed the link in surrounding prose).
            url_match = re.search(r'https?://[^\s]+', url_value)
            if url_match:
                extracted_url = url_match.group(0)
                logger.info(f"Extracted URL from text: {extracted_url}")
                url_value = extracted_url

            logger.info(f"Tool downloading URL: {url_value}")

            def _download_yt_dlp():
                # Blocking yt-dlp download; executed in the thread pool.
                import yt_dlp
                logger.info("Attempting download with yt-dlp...")

                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
                    'quiet': True,
                    'no_warnings': True,
                    'http_headers': {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                        'Referer': 'https://www.douyin.com/',
                    }
                }

                # Pass options via the constructor (the canonical API)
                # instead of mutating ydl.params after construction.
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(url_value, download=True) or {}
                    if 'requested_downloads' in info:
                        downloaded_file = info['requested_downloads'][0]['filepath']
                    else:
                        ext = info.get('ext', 'mp4')
                        video_id = info.get('id')  # renamed: don't shadow builtin `id`
                        downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")
                    return Path(downloaded_file)

            # Try yt-dlp first; fall back to platform-specific scrapers.
            try:
                temp_path = await loop.run_in_executor(None, _download_yt_dlp)
                logger.info(f"yt-dlp downloaded to: {temp_path}")
            except Exception as e:
                logger.warning(f"yt-dlp download failed: {e}. Trying manual Douyin fallback...")

                if "douyin" in url_value:
                    manual_path = await download_douyin_manual(url_value, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Douyin fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                elif "bilibili" in url_value:
                    manual_path = await download_bilibili_manual(url_value, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Bilibili fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                else:
                    raise HTTPException(400, f"视频下载失败: {str(e)}")

        if not temp_path or not temp_path.exists():
            raise HTTPException(400, "文件获取失败")

        # 1.5 Normalize: force-convert to 16 kHz mono WAV for Whisper.
        import subprocess
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"

        def _convert_audio():
            # Blocking ffmpeg call; executed in the thread pool.
            try:
                convert_cmd = [
                    'ffmpeg',
                    '-i', str(temp_path),
                    '-vn',                    # drop video stream
                    '-acodec', 'pcm_s16le',
                    '-ar', '16000',           # Whisper's recommended sample rate
                    '-ac', '1',               # mono
                    '-y',                     # overwrite output
                    str(audio_path)
                ]
                # Capture stderr so failures can be diagnosed below.
                subprocess.run(convert_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                return True
            except subprocess.CalledProcessError as e:
                error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
                logger.error(f"FFmpeg check/convert failed: {error_log}")
                # Distinguish "downloaded an HTML error page" from a
                # genuinely corrupt/unsupported media file.
                head = b""
                try:
                    with open(temp_path, 'rb') as f:
                        head = f.read(100)
                except OSError:
                    # Best-effort sniff only; a read failure just means
                    # we can't tell, so fall through to CONVERT_FAILED.
                    pass
                if b'<!DOCTYPE html' in head or b'<html' in head:
                    raise ValueError("HTML_DETECTED")
                raise ValueError("CONVERT_FAILED")

        try:
            await loop.run_in_executor(None, _convert_audio)
            logger.info(f"Converted to WAV: {audio_path}")
            target_path = audio_path
        except ValueError as ve:
            if str(ve) == "HTML_DETECTED":
                raise HTTPException(400, "下载的文件是网页而非视频,请重试或手动上传。")
            else:
                raise HTTPException(400, "下载的文件已损坏或格式无法识别。")

        # 2. Transcribe with Whisper.
        script = await whisper_service.transcribe(str(target_path))

        # 3. Optional AI rewrite with GLM.
        rewritten = None
        if rewrite:
            if script and len(script.strip()) > 0:
                logger.info("Rewriting script...")
                rewritten = await glm_service.rewrite_script(script)
            else:
                logger.warning("No script extracted, skipping rewrite")

        return success_response({
            "original_script": script,
            "rewritten_script": rewritten
        })

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Tool extract failed: {e}")
        logger.error(traceback.format_exc())

        # Friendly error message
        msg = str(e)
        if "Fresh cookies" in msg:
            msg = "下载失败:目标平台开启了反爬验证,请过段时间重试或直接上传视频文件。"

        raise HTTPException(500, f"提取失败: {msg}")
    finally:
        # Clean up ALL temp artifacts — the source media AND the
        # intermediate WAV (the WAV was previously leaked).
        for cleanup_path in (temp_path, audio_path):
            if cleanup_path and cleanup_path.exists():
                try:
                    os.remove(cleanup_path)
                    logger.info(f"Cleaned up temp file: {cleanup_path}")
                except Exception as e:
                    logger.warning(f"Failed to cleanup temp file {cleanup_path}: {e}")
|
||
|
||
|
||
async def download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download a Douyin video (fallback when yt-dlp fails).

    Ported from SuperIPAgent/douyinDownloader: resolves share links to
    the canonical video URL, then requests a specific user-profile page
    with the video's ``modal_id`` and a configured cookie
    (env ``DOUYIN_COOKIE``) to bypass anti-crawling, parses the embedded
    RENDER_DATA (or SSR_HYDRATED_DATA) JSON for a direct stream URL,
    and streams the video to ``temp_dir``.

    Returns the downloaded file path, or None on any failure — the
    caller treats None as "fallback did not work".
    """
    import httpx

    logger.info(f"[SuperIPAgent] Starting download for: {url}")

    try:
        # 1. Resolve short links / redirects to get the modal id.
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }

        async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
            resp = await client.get(url, headers=headers)
            final_url = str(resp.url)

        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)

        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None

        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # 2. Build the request URL (copied from SuperIPAgent): a specific
        #    user's profile page + modal_id, which works with the cookie.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"

        # 3. Cookie comes from the DOUYIN_COOKIE environment setting.
        from app.core.config import settings
        if not settings.DOUYIN_COOKIE:
            logger.warning("[SuperIPAgent] DOUYIN_COOKIE 未配置,视频下载可能失败")

        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": settings.DOUYIN_COOKIE,
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

        logger.info(f"[SuperIPAgent] Requesting page with Cookie...")

        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(target_url, headers=headers_with_cookie)

        # 4. Parse the embedded JSON blob. RENDER_DATA is the classic
        #    container; newer pages may use SSR_HYDRATED_DATA instead.
        content_match = re.findall(r'<script id="RENDER_DATA" type="application/json">(.*?)</script>', response.text)
        if not content_match:
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(r'<script id="SSR_HYDRATED_DATA" type="application/json">(.*?)</script>', response.text)

        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        # The blob is URL-encoded JSON.
        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            # Narrowed from a bare except: only a decode failure is expected here.
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        # 5. Extract the stream URL. Expected path:
        #    app -> videoDetail -> video -> bitRateList -> playAddr -> src
        video_url = None
        try:
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None

        # Scheme-relative URLs need an explicit scheme.
        if video_url.startswith("//"):
            video_url = "https:" + video_url

        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        # 6. Stream the video to disk (the CDN requires the Referer header).
        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }

        async with httpx.AsyncClient(timeout=60.0) as client:
            async with client.stream("GET", video_url, headers=download_headers) as dl_resp:
                if dl_resp.status_code == 200:
                    with open(temp_path, 'wb') as f:
                        async for chunk in dl_resp.aiter_bytes(chunk_size=8192):
                            f.write(chunk)

                    logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
                    return temp_path
                else:
                    logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
                    return None

    except Exception as e:
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None
|
||
|
||
async def download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download Bilibili audio (fallback when yt-dlp fails,
    Playwright version).

    Bilibili typically serves separate audio/video DASH streams; since
    only the transcript is needed, this extracts just the audio stream:
    it loads the page in headless Chromium, reads ``window.__playinfo__``
    for the DASH audio baseUrl, and downloads it via the browser
    context's request API (which carries the required Referer).

    Returns the downloaded .m4s path, or None on any failure — the
    caller treats None as "fallback did not work".
    """
    from playwright.async_api import async_playwright

    logger.info(f"[Playwright] Starting Bilibili download for: {url}")

    playwright = None
    browser = None
    try:
        playwright = await async_playwright().start()
        # Requires chromium to be installed: `playwright install chromium`.
        browser = await playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])

        # Desktop UA: Bilibili's desktop page exposes __playinfo__.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)

        # Wait for the <video> element to trigger stream loading; a
        # timeout here is non-fatal because __playinfo__ may already
        # be present in the page's initial payload.
        try:
            await page.wait_for_selector('video', timeout=15000)
        except Exception:
            logger.warning("[Playwright] Video selector timeout")

        # 1. window.__playinfo__ holds the DASH stream manifest.
        playinfo = await page.evaluate("window.__playinfo__")

        audio_url = None

        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash = playinfo["data"]["dash"]
            if "audio" in dash and dash["audio"]:
                audio_url = dash["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        # 2. No interception fallback: blob-src videos can't be fetched
        #    without request interception, so give up here.
        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        # Download the audio stream (Bilibili DASH segments are .m4s).
        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"

        try:
            # Use the browser context's request API so the download
            # shares the page's session; Referer is mandatory.
            api_request = context.request
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/"
            }

            logger.info(f"[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)

            if response.status == 200:
                body = await response.body()
                with open(temp_path, 'wb') as f:
                    f.write(body)

                logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
                return temp_path
            else:
                logger.error(f"[Playwright] API Request failed: {response.status}")
                return None

        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None

    except Exception as e:
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always release browser resources, even on failure paths.
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()
|