356 lines
13 KiB
Python
356 lines
13 KiB
Python
import asyncio
|
||
import os
|
||
import re
|
||
import json
|
||
import time
|
||
import shutil
|
||
import subprocess
|
||
import traceback
|
||
from pathlib import Path
|
||
from typing import Optional, Any
|
||
from urllib.parse import unquote
|
||
|
||
import httpx
|
||
from loguru import logger
|
||
|
||
from app.services.whisper_service import whisper_service
|
||
from app.services.glm_service import glm_service
|
||
|
||
|
||
async def extract_script(file=None, url: Optional[str] = None, rewrite: bool = True) -> dict:
    """Extract a script from an uploaded file or a video URL.

    Pipeline: fetch/save media -> convert to 16 kHz mono WAV (FFmpeg)
    -> Whisper transcription -> optional GLM rewrite.

    Args:
        file: Uploaded file object (e.g. FastAPI UploadFile) exposing
            ``filename`` and a file-like ``file`` attribute. Mutually
            optional with ``url``; at least one must be given.
        url: Video page URL, or share text containing one.
        rewrite: When True and a non-empty transcript was produced, also
            run the GLM rewriting step.

    Returns:
        dict with ``original_script`` and ``rewritten_script`` (the
        latter is None when rewriting is skipped).

    Raises:
        ValueError: when neither input is given, the file/download could
            not be obtained, or the media cannot be converted.
    """
    if not file and not url:
        raise ValueError("必须提供文件或视频链接")

    temp_path = None
    audio_path = None
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':
            # Windows has no /tmp; use a fixed scratch directory instead.
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        loop = asyncio.get_running_loop()

        # 1. Obtain the media: save the upload, or download from the URL.
        if file:
            filename = file.filename
            if not filename:
                raise ValueError("文件名无效")
            safe_filename = Path(filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            def _save_upload() -> None:
                # Context manager closes the destination deterministically
                # (the original opened it inside a lambda and leaked the
                # handle).
                with open(temp_path, "wb") as dst:
                    shutil.copyfileobj(file.file, dst)

            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            temp_path = await _download_video(url, temp_dir, timestamp)

        if not temp_path or not temp_path.exists():
            raise ValueError("文件获取失败")

        # 1.5 Safety conversion: force 16 kHz mono WAV so Whisper gets a
        # predictable input regardless of the source container.
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"
        try:
            await loop.run_in_executor(None, lambda: _convert_to_wav(temp_path, audio_path))
            logger.info(f"Converted to WAV: {audio_path}")
        except ValueError as ve:
            # _convert_to_wav signals its failure mode via the message.
            if str(ve) == "HTML_DETECTED":
                raise ValueError("下载的文件是网页而非视频,请重试或手动上传。")
            else:
                raise ValueError("下载的文件已损坏或格式无法识别。")

        # 2. Transcribe with Whisper.
        script = await whisper_service.transcribe(str(audio_path))

        # 3. Optionally rewrite with GLM.
        rewritten = None
        if rewrite and script and len(script.strip()) > 0:
            logger.info("Rewriting script...")
            rewritten = await glm_service.rewrite_script(script)

        return {
            "original_script": script,
            "rewritten_script": rewritten
        }

    finally:
        # Remove both the source media and the converted WAV (the
        # original left the WAV behind in the temp directory).
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except OSError as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")
||
def _convert_to_wav(input_path: Path, output_path: Path) -> None:
    """Convert ``input_path`` to a 16 kHz mono PCM WAV at ``output_path``.

    Runs FFmpeg as a subprocess; the conversion doubles as a validity
    check on the downloaded media.

    Raises:
        ValueError: ``"HTML_DETECTED"`` when the input is actually an HTML
            page (a failed download), ``"CONVERT_FAILED"`` for any other
            FFmpeg failure.
    """
    # Command construction cannot raise CalledProcessError, so keep it
    # outside the try block.
    convert_cmd = [
        'ffmpeg',
        '-i', str(input_path),
        '-vn',                     # drop any video stream
        '-acodec', 'pcm_s16le',    # 16-bit PCM
        '-ar', '16000',            # 16 kHz sample rate (Whisper-friendly)
        '-ac', '1',                # mono
        '-y',                      # overwrite output if it exists
        str(output_path)
    ]
    try:
        subprocess.run(convert_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
        logger.error(f"FFmpeg check/convert failed: {error_log}")
        # A common failure mode is the "video" really being an HTML error
        # page from the download step; sniff the file head to detect it.
        head = b""
        try:
            with open(input_path, 'rb') as f:
                head = f.read(100)
        except OSError:
            # Best effort only; fall through with an empty head.
            pass
        if b'<!DOCTYPE html' in head or b'<html' in head:
            raise ValueError("HTML_DETECTED")
        raise ValueError("CONVERT_FAILED")
||
async def _download_video(url: str, temp_dir: Path, timestamp: int) -> Path:
    """Download a video into ``temp_dir`` (yt-dlp first, manual fallbacks after).

    ``url`` may be a whole share text; the first http(s) URL found in it
    is used.

    Returns:
        Path to the downloaded media file.

    Raises:
        ValueError: when every download strategy fails.
    """
    url_value = url
    # Share texts often wrap the link in extra words; pull out the URL.
    url_match = re.search(r'https?://[^\s]+', url_value)
    if url_match:
        extracted_url = url_match.group(0)
        logger.info(f"Extracted URL from text: {extracted_url}")
        url_value = extracted_url

    logger.info(f"Tool downloading URL: {url_value}")
    loop = asyncio.get_running_loop()

    # Primary strategy: yt-dlp (blocking, so run in a worker thread).
    try:
        temp_path = await loop.run_in_executor(None, lambda: _download_yt_dlp(url_value, temp_dir, timestamp))
        logger.info(f"yt-dlp downloaded to: {temp_path}")
        return temp_path
    except Exception as e:
        logger.warning(f"yt-dlp download failed: {e}. Trying manual fallback...")

        # Site-specific manual fallbacks (dispatch table replaces the
        # original duplicated if/elif branches; behavior is unchanged).
        fallbacks = {
            "douyin": _download_douyin_manual,
            "bilibili": _download_bilibili_manual,
        }
        for site, fallback in fallbacks.items():
            if site in url_value:
                manual_path = await fallback(url_value, temp_dir, timestamp)
                if manual_path:
                    return manual_path
                raise ValueError(f"视频下载失败。yt-dlp 报错: {str(e)}")
        raise ValueError(f"视频下载失败: {str(e)}")
||
def _download_yt_dlp(url_value: str, temp_dir: Path, timestamp: int) -> Path:
    """Download via yt-dlp (blocking; must run in an executor).

    Returns:
        Path to the downloaded media file.

    Raises:
        ValueError: when yt-dlp returns no extraction info.
    """
    import yt_dlp
    logger.info("Attempting download with yt-dlp...")

    ydl_opts = {
        'format': 'bestaudio/best',  # audio is enough for transcription
        'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
        'quiet': True,
        'no_warnings': True,
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
        }
    }

    # Pass options through the documented constructor instead of mutating
    # ``ydl.params`` after construction (the original's Any-cast hack
    # applied options after YoutubeDL had already initialized).
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url_value, download=True)
        if not info:
            # extract_info may return None on failure; the original would
            # have crashed with a TypeError on the ``in`` test below.
            raise ValueError("yt-dlp returned no info")
        if 'requested_downloads' in info:
            downloaded_file = info['requested_downloads'][0]['filepath']
        else:
            # Reconstruct the path from the output template fields.
            ext = info.get('ext', 'mp4')
            video_id = info.get('id')  # renamed: ``id`` shadowed the builtin
            downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")

    return Path(downloaded_file)
||
async def _download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download a Douyin video (fallback when yt-dlp fails).

    Resolves the share link's redirects, scrapes the page's embedded
    RENDER_DATA / SSR_HYDRATED_DATA JSON for a playable address, then
    streams the file to disk.

    Returns:
        The downloaded ``.mp4`` path, or None on any failure (best effort;
        never raises to the caller).
    """
    logger.info(f"[SuperIPAgent] Starting download for: {url}")

    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }

        # Follow the short-link redirects to reach the canonical video URL.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
            resp = await client.get(url, headers=headers)
            final_url = str(resp.url)

        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)

        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None

        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # NOTE(review): the user-profile path segment here is hard-coded;
        # the page is only used as a host for the modal_id video overlay,
        # but this looks fragile — confirm it works for arbitrary videos.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"

        from app.core.config import settings
        if not settings.DOUYIN_COOKIE:
            logger.warning("[SuperIPAgent] DOUYIN_COOKIE 未配置,视频下载可能失败")

        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": settings.DOUYIN_COOKIE,
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

        logger.info(f"[SuperIPAgent] Requesting page with Cookie...")

        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(target_url, headers=headers_with_cookie)

        # The video metadata lives in an embedded JSON <script> blob;
        # newer pages use SSR_HYDRATED_DATA instead of RENDER_DATA.
        content_match = re.findall(r'<script id="RENDER_DATA" type="application/json">(.*?)</script>', response.text)
        if not content_match:
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(r'<script id="SSR_HYDRATED_DATA" type="application/json">(.*?)</script>', response.text)

        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        # The blob is URL-encoded JSON.
        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except json.JSONDecodeError:  # was a bare except; narrowed
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        video_url = None
        try:
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                # Prefer the bit-rate list; fall back to the raw play address.
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None

        if video_url.startswith("//"):
            # Protocol-relative URL; force https.
            video_url = "https:" + video_url

        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }

        # Stream to disk in chunks so large videos are not held in memory.
        async with httpx.AsyncClient(timeout=60.0) as client:
            async with client.stream("GET", video_url, headers=download_headers) as dl_resp:
                if dl_resp.status_code == 200:
                    with open(temp_path, 'wb') as f:
                        async for chunk in dl_resp.aiter_bytes(chunk_size=8192):
                            f.write(chunk)

                    logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
                    return temp_path
                else:
                    logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
                    return None

    except Exception as e:
        # Best-effort fallback: log and let the caller raise its own error.
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None
||
async def _download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download the audio stream of a Bilibili video (Playwright fallback).

    Loads the page in headless Chromium, reads ``window.__playinfo__`` for
    a DASH audio URL, and downloads it via the browser context's request
    API (which carries the session cookies set during page load).

    Returns:
        The downloaded ``.m4s`` path, or None on any failure (best effort;
        never raises to the caller).
    """
    from playwright.async_api import async_playwright

    logger.info(f"[Playwright] Starting Bilibili download for: {url}")

    playwright = None
    browser = None
    try:
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])

        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)

        # Best effort: wait for the player, but proceed on timeout since
        # __playinfo__ may already be present without a <video> element.
        try:
            await page.wait_for_selector('video', timeout=15000)
        except Exception:  # was a bare except; narrowed, behavior kept
            logger.warning("[Playwright] Video selector timeout")

        playinfo = await page.evaluate("window.__playinfo__")

        audio_url = None

        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash = playinfo["data"]["dash"]
            if "audio" in dash and dash["audio"]:
                # First entry of the DASH audio list is the default stream.
                audio_url = dash["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"

        try:
            # Reuse the browser context's request API so the cookies from
            # the page load are sent with the download request.
            api_request = context.request
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/"
            }

            logger.info(f"[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)

            if response.status == 200:
                body = await response.body()
                with open(temp_path, 'wb') as f:
                    f.write(body)

                logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
                return temp_path
            else:
                logger.error(f"[Playwright] API Request failed: {response.status}")
                return None

        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None

    except Exception as e:
        # Best-effort fallback: log and let the caller raise its own error.
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always release the browser resources, even on failure.
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()