399 lines
20 KiB
Python
399 lines
20 KiB
Python
import asyncio
import json
import os
import re
import shutil
import time
import traceback
from pathlib import Path
from typing import Optional
from urllib.parse import unquote

import requests
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from loguru import logger

from app.services.glm_service import glm_service
from app.services.whisper_service import whisper_service
|
||
|
||
# Router for the standalone script-extraction tool endpoints.
router = APIRouter()
|
||
|
||
@router.post("/extract-script")
async def extract_script_tool(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    rewrite: bool = Form(True)
):
    """
    Standalone script-extraction tool.

    Accepts an uploaded video/audio file OR a video link, transcribes the
    speech to text (Whisper), and optionally rewrites the transcript with
    the GLM service.

    Args:
        file: Uploaded media file (mutually optional with ``url``).
        url: Video link, or share text containing one (Douyin/Bilibili etc.).
        rewrite: When True, run the AI rewrite step on the transcript.

    Returns:
        dict with ``success``, ``original_script`` and ``rewritten_script``.

    Raises:
        HTTPException(400): missing input, download failure, or a
            corrupt/HTML (anti-bot) download.
        HTTPException(500): any unexpected processing error.
    """
    if not file and not url:
        raise HTTPException(400, "必须提供文件或视频链接")

    temp_path = None
    audio_path = None  # converted WAV; tracked so finally can clean it up too
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        # 1. Acquire the media file (upload or download).
        loop = asyncio.get_running_loop()

        if file:
            safe_filename = Path(file.filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            # Blocking file I/O goes to the thread pool. The context manager
            # guarantees the destination handle is closed (the original
            # open() inside a lambda leaked the file descriptor).
            def _save_upload():
                with open(temp_path, "wb") as dest:
                    shutil.copyfileobj(file.file, dest)

            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            # Pull the first URL out of free-form share text
            # (Douyin/Bilibili share blurbs embed the link in prose).
            url_match = re.search(r'https?://[^\s]+', url)
            if url_match:
                extracted_url = url_match.group(0)
                logger.info(f"Extracted URL from text: {extracted_url}")
                url = extracted_url

            logger.info(f"Tool downloading URL: {url}")

            # Blocking yt-dlp download, wrapped to run in the executor.
            def _download_yt_dlp():
                import yt_dlp
                logger.info("Attempting download with yt-dlp...")

                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
                    'quiet': True,
                    'no_warnings': True,
                    'http_headers': {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                        'Referer': 'https://www.douyin.com/',
                    }
                }

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                    if 'requested_downloads' in info:
                        downloaded_file = info['requested_downloads'][0]['filepath']
                    else:
                        # Reconstruct the output path from the outtmpl pattern.
                        ext = info.get('ext', 'mp4')
                        video_id = info.get('id')  # renamed: do not shadow builtin `id`
                        downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")

                return Path(downloaded_file)

            # Try yt-dlp first (in the executor); fall back to the manual
            # platform-specific downloaders on failure.
            try:
                temp_path = await loop.run_in_executor(None, _download_yt_dlp)
                logger.info(f"yt-dlp downloaded to: {temp_path}")
            except Exception as e:
                logger.warning(f"yt-dlp download failed: {e}. Trying manual Douyin fallback...")

                if "douyin" in url:
                    manual_path = await download_douyin_manual(url, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Douyin fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                elif "bilibili" in url:
                    manual_path = await download_bilibili_manual(url, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Bilibili fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                else:
                    raise HTTPException(400, f"视频下载失败: {str(e)}")

        if not temp_path or not temp_path.exists():
            raise HTTPException(400, "文件获取失败")

        # 1.5 Safety conversion: force into 16 kHz mono WAV for Whisper.
        import subprocess
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"

        def _convert_audio():
            try:
                convert_cmd = [
                    'ffmpeg',
                    '-i', str(temp_path),
                    '-vn',                    # drop any video stream
                    '-acodec', 'pcm_s16le',
                    '-ar', '16000',           # Whisper's recommended sample rate
                    '-ac', '1',               # mono
                    '-y',                     # overwrite output
                    str(audio_path)
                ]
                # Capture stderr so failures can be logged.
                subprocess.run(convert_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                return True
            except subprocess.CalledProcessError as e:
                error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
                logger.error(f"FFmpeg check/convert failed: {error_log}")
                # Anti-bot pages often download as HTML; sniff the head so we
                # can return a clearer error than a generic convert failure.
                head = b""
                try:
                    with open(temp_path, 'rb') as f:
                        head = f.read(100)
                except OSError:
                    # Narrowed from a bare except: only I/O errors are expected.
                    pass
                if b'<!DOCTYPE html' in head or b'<html' in head:
                    raise ValueError("HTML_DETECTED")
                raise ValueError("CONVERT_FAILED")

        # Run the conversion in the executor; map sentinel errors to 400s.
        try:
            await loop.run_in_executor(None, _convert_audio)
            logger.info(f"Converted to WAV: {audio_path}")
            target_path = audio_path
        except ValueError as ve:
            if str(ve) == "HTML_DETECTED":
                raise HTTPException(400, "下载的文件是网页而非视频,请重试或手动上传。")
            else:
                raise HTTPException(400, "下载的文件已损坏或格式无法识别。")

        # 2. Transcribe with Whisper.
        script = await whisper_service.transcribe(str(target_path))

        # 3. Optional AI rewrite with GLM.
        rewritten = None
        if rewrite:
            if script and len(script.strip()) > 0:
                logger.info("Rewriting script...")
                rewritten = await glm_service.rewrite_script(script)
            else:
                logger.warning("No script extracted, skipping rewrite")

        return {
            "success": True,
            "original_script": script,
            "rewritten_script": rewritten
        }

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Tool extract failed: {e}")
        logger.error(traceback.format_exc())

        # Friendly error message
        msg = str(e)
        if "Fresh cookies" in msg:
            msg = "下载失败:目标平台开启了反爬验证,请过段时间重试或直接上传视频文件。"

        raise HTTPException(500, f"提取失败: {msg}")
    finally:
        # Clean up BOTH temp artifacts: the downloaded/uploaded source and
        # the converted WAV (the original leaked the WAV file).
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except Exception as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")
|
||
|
||
|
||
async def download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download a Douyin video (fallback ported from
    SuperIPAgent/douyinDownloader).

    Bypasses anti-scraping by requesting a specific user-profile URL with a
    hard-coded cookie, parsing the embedded RENDER_DATA JSON for the raw
    stream URL, then downloading it with a Douyin Referer header.

    Args:
        url: Douyin video URL or short link (redirects are followed).
        temp_dir: Directory to write the downloaded file into.
        timestamp: Unix timestamp used to make the output filename unique.

    Returns:
        Path to the downloaded ``.mp4``, or None on any failure (this
        function never raises; errors are logged and swallowed).
    """
    logger.info(f"[SuperIPAgent] Starting download for: {url}")

    try:
        # 1. Extract the modal (video) id, following short-link redirects.
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }

        # Resolve short links / redirects to the canonical video URL.
        resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        final_url = resp.url
        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)

        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None

        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # 2. Build the specific request URL (copied from SuperIPAgent):
        #    a particular user's profile page plus the modal_id parameter,
        #    which works in combination with the cookie below.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"

        # 3. Hard-coded cookie (copied from SuperIPAgent).
        #    NOTE(review): session cookies like this expire — when downloads
        #    start returning no RENDER_DATA, this is the first thing to check.
        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": "douyin.com; device_web_cpu_core=10; device_web_memory_size=8; __ac_nonce=06760391f00b9b51264ae; __ac_signature=_02B4Z6wo00f019a5ceAAAIDAhEZR-X3jjWfWmXVAAJLXd4; ttwid=1%7C7MTKBSMsP4eOv9h5NAh8p0E-NYIud09ftNmB0mjLpWc%7C1734359327%7C8794abeabbd47447e1f56e5abc726be089f2a0344d6343b5f75f23e7b0f0028f; UIFID_TEMP=0de8750d2b188f4235dbfd208e44abbb976428f0720eb983255afefa45d39c0c6532e1d4768dd8587bf919f866ff1396912bcb2af71efee56a14a2a9f37b74010d0a0413795262f6d4afe02a032ac7ab; s_v_web_id=verify_m4r4ribr_c7krmY1z_WoeI_43po_ATpO_I4o8U1bex2D7; hevc_supported=true; home_can_add_dy_2_desktop=%220%22; dy_swidth=2560; dy_sheight=1440; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A2560%2C%5C%22screen_height%5C%22%3A1440%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A10%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; strategyABtestKey=%221734359328.577%22; csrf_session_id=2f53aed9aa6974e83aa9a1014180c3a4; fpk1=U2FsdGVkX1/IpBh0qdmlKAVhGyYHgur4/VtL9AReZoeSxadXn4juKvsakahRGqjxOPytHWspYoBogyhS/V6QSw==; fpk2=0845b309c7b9b957afd9ecf775a4c21f; passport_csrf_token=d80e0c5b2fa2328219856be5ba7e671e; passport_csrf_token_default=d80e0c5b2fa2328219856be5ba7e671e; odin_tt=3c891091d2eb0f4718c1d5645bc4a0017032d4d5aa989decb729e9da2ad570918cbe5e9133dc6b145fa8c758de98efe32ff1f81aa0d611e838cc73ab08ef7d3f6adf66ab4d10e8372ddd628f94f16b8e; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.5%7D; bd_ticket_guard_client_web_domain=2; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%7D; UIFID=0de8750d2b188f4235dbfd208e44abbb976428f0720eb983255afefa45d39c0c6532e1d4768dd8587bf919f866ff139655a3c2b735923234f371c699560c657923fd3d6c5b63ab7bb9b83423b6cb4787e2ce66a7fbc4ecb24c8570f520fe6de068bbb95115023c0c6c1b6ee31b49fb7e3996fb8349f43a3fd8b7a61cd9e18e8fe65eb6a7c13de4c0960d84e344b644725db3eb2fa6b7caf821de1b50527979f2; is_dash_user=1; biz_trace_id=b57a241f; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCTEo2R0lDalVoWW1XcHpGOFdrN0Vrc0dXcCtaUzNKY1g4NGNGY2k0TTl1TEowNjdUb21mbFU5aDdvWVBGamhNRWNRQWtKdnN1MnM3RmpTWnlJQXpHMjA9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; download_guide=%221%2F20241216%2F0%22; sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e5827292771273f273d33323131333c3036313632342778; bit_env=RiOY4jzzpxZoVCl6zdVSVhVRjdwHRTxqcqWdqMBZLPGjMdB4Tax1kAELHNTVAAh72KuhumewE4Lq6f0-VJ2UpJrkrhSxoPw9LUb3zQrq1OSwbeSPHkRlRgRQvO89sItdGUyq1oFr0XyRCnMYG87KSeWyc4x0czGR0o50hTDoDLG5rJVoRcdQOLvjiAegsqyytKF59sPX_QM9qffK2SqYsg0hCggURc_AI6kguDDE5DvG0bnyz1utw4z1eEnIoLrkGDqzqBZj4dOAr0BVU6ofbsS-pOQ2u2PM1dLP9FlBVBlVaqYVgHJeSLsR5k76BRTddUjTb4zEilVIEwAMJWGN4I1BxVt6fC9B5tBQpuT0lj3n3eKXCKXZsd8FrEs5_pbfDsxV-e_WMiXI2ff4qxiTC0U73sfo9OpicKICtZjdq8qsHxJuu6wVR36zvXeL2Wch5C6MzprNvkivv0l8nbh2mSgy1nabZr3dmU6NcR-Bg3Q3xTWUlR9aAUmpopC-cNuXjgLpT-Lw1AYGilSUnCvosth1Gfypq-b0MpgmdSDgTrQ%3D; gulu_source_res=eyJwX2luIjoiMDhjOGQ3ZTJiODQyNjZkZWI5Y2VkMGJiODNlNmY1ZWY0ZjMyNTE2ZmYyZjAzNDMzZjI0OWU1Y2Q1NTczNTk5NyJ9; passport_auth_mix_state=hp9bc3dgb1tm5wd8p82zawus27g0e3ue; IsDouyinActive=false",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

        logger.info(f"[SuperIPAgent] Requesting page with Cookie...")
        # NOTE(review): the original comment claimed verify=False is required
        # in some environments, but the call below does NOT pass verify=False
        # — confirm whether TLS verification actually needs disabling.
        response = requests.get(target_url, headers=headers_with_cookie, timeout=10)

        # 4. Parse RENDER_DATA (URL-encoded JSON embedded in the page).
        content_match = re.findall(r'<script id="RENDER_DATA" type="application/json">(.*?)</script>', response.text)
        if not content_match:
            # Page structure may have changed; fall back to the newer
            # SSR_HYDRATED_DATA blob if it is present.
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(r'<script id="SSR_HYDRATED_DATA" type="application/json">(.*?)</script>', response.text)

        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except:
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        # 5. Extract the raw video stream URL from the parsed payload.
        video_url = None
        try:
            # Usual path: app -> videoDetail -> video -> bitRateList -> playAddr -> src
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None

        # Protocol-relative URL -> absolute https URL.
        if video_url.startswith("//"):
            video_url = "https:" + video_url

        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        # 6. Download the stream (the Douyin Referer header is required).
        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }

        dl_resp = requests.get(video_url, headers=download_headers, stream=True, timeout=60)
        if dl_resp.status_code == 200:
            with open(temp_path, 'wb') as f:
                for chunk in dl_resp.iter_content(chunk_size=1024):
                    f.write(chunk)

            logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
            return temp_path
        else:
            logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
            return None

    except Exception as e:
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None
|
||
|
||
async def download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download a Bilibili stream as a fallback (Playwright version).

    Bilibili usually serves audio and video as separate DASH streams; since
    only the transcript is needed downstream, just the audio stream is
    fetched.

    Returns the path to the saved ``.m4s`` audio file, or None on any
    failure (errors are logged, never raised).
    """
    from playwright.async_api import async_playwright

    logger.info(f"[Playwright] Starting Bilibili download for: {url}")

    pw = None
    chromium = None
    try:
        pw = await async_playwright().start()
        # Chromium must be installed beforehand: `playwright install chromium`.
        chromium = await pw.chromium.launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])

        # A desktop user agent is fine here; the mobile site is trickier.
        ctx = await chromium.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        tab = await ctx.new_page()

        # Rather than intercepting .m4s responses, read the page's initial
        # playback state — it is much simpler.
        logger.info("[Playwright] Navigating to Bilibili...")
        await tab.goto(url, timeout=45000)

        # Waiting on the <video> element nudges the player into loading;
        # a timeout here is non-fatal.
        try:
            await tab.wait_for_selector('video', timeout=15000)
        except:
            logger.warning("[Playwright] Video selector timeout")

        # 1. window.__playinfo__ carries the DASH stream manifest.
        playinfo = await tab.evaluate("window.__playinfo__")

        audio_url = None
        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash_streams = playinfo["data"]["dash"]
            if "audio" in dash_streams and dash_streams["audio"]:
                audio_url = dash_streams["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        # 2. Without __playinfo__ the src is typically a blob: URL, which
        # cannot be fetched without response interception — give up instead.
        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        # Fetch the audio stream through the browser context so cookies and
        # anti-hotlink headers are honored.
        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"  # usually m4s
        try:
            fetcher = ctx.request
            req_headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/"
            }

            logger.info(f"[Playwright] Downloading audio stream...")
            response = await fetcher.get(audio_url, headers=req_headers)

            if response.status != 200:
                logger.error(f"[Playwright] API Request failed: {response.status}")
                return None

            body = await response.body()
            with open(temp_path, 'wb') as f:
                f.write(body)

            logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
            return temp_path

        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None

    except Exception as e:
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always tear down the browser and the Playwright driver.
        if chromium:
            await chromium.close()
        if pw:
            await pw.stop()
|