import asyncio
import os
import re
import json
import time
import shutil
import subprocess
import traceback
from pathlib import Path
from typing import Optional, Any
from urllib.parse import unquote

import httpx
from loguru import logger

from app.services.whisper_service import whisper_service
from app.services.glm_service import glm_service


async def extract_script(file=None, url: Optional[str] = None, rewrite: bool = True) -> dict:
    """Extract a transcript from an uploaded file or a video URL.

    Pipeline: fetch/save media -> force-convert to 16 kHz mono WAV ->
    Whisper transcription -> optional GLM rewrite.

    Args:
        file: Upload object exposing ``.filename`` and a file-like ``.file``
            (looks like FastAPI ``UploadFile`` — TODO confirm at the caller).
        url: Video page URL; may arrive embedded in share text.
        rewrite: When True and a non-empty transcript was produced, run the
            GLM rewrite step.

    Returns:
        dict with ``original_script`` and ``rewritten_script`` (``None`` when
        rewriting is skipped or disabled).

    Raises:
        ValueError: No input given, download/convert failed, or the
            downloaded payload was an HTML page instead of a video.
    """
    if not file and not url:
        raise ValueError("必须提供文件或视频链接")

    temp_path: Optional[Path] = None
    audio_path: Optional[Path] = None
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':
            # Windows has no /tmp; use a fixed local scratch directory.
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)
        loop = asyncio.get_running_loop()

        # 1. Obtain the media file (upload takes precedence over URL).
        if file:
            filename = file.filename
            if not filename:
                raise ValueError("文件名无效")
            safe_filename = Path(filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            def _save_upload() -> None:
                # Context manager guarantees the destination handle is
                # closed (the original opened it and never closed it).
                with open(temp_path, "wb") as dst:
                    shutil.copyfileobj(file.file, dst)

            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            temp_path = await _download_video(url, temp_dir, timestamp)

        if not temp_path or not temp_path.exists():
            raise ValueError("文件获取失败")

        # 1.5 Defensive conversion: force 16 kHz mono WAV so Whisper gets a
        # known-good input regardless of the source container/codec.
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"
        try:
            await loop.run_in_executor(None, lambda: _convert_to_wav(temp_path, audio_path))
            logger.info(f"Converted to WAV: {audio_path}")
        except ValueError as ve:
            # _convert_to_wav signals a downloaded HTML error page with the
            # sentinel message "HTML_DETECTED"; map both cases to
            # user-facing errors.
            if str(ve) == "HTML_DETECTED":
                raise ValueError("下载的文件是网页而非视频,请重试或手动上传。")
            raise ValueError("下载的文件已损坏或格式无法识别。")

        # 2. Transcribe with Whisper.
        script = await whisper_service.transcribe(str(audio_path))

        # 3. Optional AI rewrite with GLM.
        rewritten = None
        if rewrite and script and len(script.strip()) > 0:
            logger.info("Rewriting script...")
            rewritten = await glm_service.rewrite_script(script)

        return {
            "original_script": script,
            "rewritten_script": rewritten,
        }
    finally:
        # Best-effort cleanup of BOTH temp artifacts — the original removed
        # only the source file and leaked the intermediate WAV.
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except OSError as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")


def _convert_to_wav(input_path: Path, output_path: Path) -> None:
    """Convert *input_path* to a 16 kHz mono PCM WAV via ffmpeg (blocking).

    Raises:
        ValueError: With message "HTML_DETECTED" when the input looks like an
            HTML page (a failed download), otherwise with the ffmpeg error
            log. The caller relies on the exact sentinel string.
    """
    convert_cmd = [
        'ffmpeg', '-i', str(input_path),
        '-vn',                    # drop any video stream
        '-acodec', 'pcm_s16le',   # 16-bit PCM
        '-ar', '16000',           # 16 kHz sample rate
        '-ac', '1',               # mono
        '-y', str(output_path),
    ]
    try:
        subprocess.run(convert_cmd, check=True,
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
        logger.error(f"FFmpeg check/convert failed: {error_log}")
        # Sniff the first bytes: a "video" that is actually an HTML error
        # page is a common failure mode of the URL download path.
        head = b""
        try:
            with open(input_path, 'rb') as f:
                head = f.read(100)
        except OSError:
            pass
        # NOTE(review): the original detection tail was garbled in this
        # chunk; the caller distinguishes exactly the "HTML_DETECTED"
        # message, so that contract is preserved here — confirm the sniff
        # markers against the original implementation.
        lowered = head.lower()
        if b'<html' in lowered or b'<!doctype' in lowered:
            raise ValueError("HTML_DETECTED") from e
        raise ValueError(error_log) from e


async def _download_video(url: str, temp_dir: Path, timestamp: int) -> Path:
    """Download a video: yt-dlp first, then site-specific manual fallbacks.

    Raises:
        ValueError: When every applicable download strategy fails.
    """
    url_value = url
    # Share links often arrive embedded in text; pull out the first URL.
    url_match = re.search(r'https?://[^\s]+', url_value)
    if url_match:
        extracted_url = url_match.group(0)
        logger.info(f"Extracted URL from text: {extracted_url}")
        url_value = extracted_url
    logger.info(f"Tool downloading URL: {url_value}")
    loop = asyncio.get_running_loop()

    try:
        temp_path = await loop.run_in_executor(
            None, lambda: _download_yt_dlp(url_value, temp_dir, timestamp))
        logger.info(f"yt-dlp downloaded to: {temp_path}")
        return temp_path
    except Exception as e:
        logger.warning(f"yt-dlp download failed: {e}. Trying manual fallback...")
        if "douyin" in url_value:
            manual_path = await _download_douyin_manual(url_value, temp_dir, timestamp)
            if manual_path:
                return manual_path
            raise ValueError(f"视频下载失败。yt-dlp 报错: {str(e)}")
        if "bilibili" in url_value:
            manual_path = await _download_bilibili_manual(url_value, temp_dir, timestamp)
            if manual_path:
                return manual_path
            raise ValueError(f"视频下载失败。yt-dlp 报错: {str(e)}")
        raise ValueError(f"视频下载失败: {str(e)}")


def _download_yt_dlp(url_value: str, temp_dir: Path, timestamp: int) -> Path:
    """Download with yt-dlp (blocking — run in a thread-pool executor)."""
    import yt_dlp  # local import: heavy, only needed on this code path
    logger.info("Attempting download with yt-dlp...")
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
        'quiet': True,
        'no_warnings': True,
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
        },
    }
    # Pass options through the constructor instead of mutating .params after
    # the fact — several options are consumed at construction time.
    with yt_dlp.YoutubeDL(ydl_opts) as ydl_raw:
        ydl: Any = ydl_raw  # yt-dlp ships no type stubs
        info = ydl.extract_info(url_value, download=True)
    if 'requested_downloads' in info:
        downloaded_file = info['requested_downloads'][0]['filepath']
    else:
        # Reconstruct the output path from the outtmpl components.
        ext = info.get('ext', 'mp4')
        video_id = info.get('id')  # renamed: original shadowed builtin `id`
        downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")
    return Path(downloaded_file)


async def _download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download a Douyin video (fallback when yt-dlp fails).

    Returns the downloaded file path, or ``None`` on any failure (this
    function never raises — the caller turns ``None`` into a ValueError).
    """
    logger.info(f"[SuperIPAgent] Starting download for: {url}")
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }
        # Follow the short-link redirect to reach the canonical /video/<id> URL.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
            resp = await client.get(url, headers=headers)
            final_url = str(resp.url)
        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)
        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None
        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # Request via a user-page URL with modal_id — this route renders the
        # video detail data server-side when a valid cookie is supplied.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"
        from app.core.config import settings
        if not settings.DOUYIN_COOKIE:
            logger.warning("[SuperIPAgent] DOUYIN_COOKIE 未配置,视频下载可能失败")
        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": settings.DOUYIN_COOKIE,
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }
        logger.info(f"[SuperIPAgent] Requesting page with Cookie...")
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(target_url, headers=headers_with_cookie)

        # NOTE(review): both patterns below were garbled in this chunk of
        # the file (the regex literals were emptied); RENDER_DATA is the
        # URL-encoded JSON payload Douyin embeds in a <script> tag — confirm
        # the exact tag attributes against a live page.
        content_match = re.findall(
            r'<script id="RENDER_DATA" type="application/json">(.*?)</script>',
            response.text)
        if not content_match:
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(
                    r'<script[^>]*>window\._SSR_HYDRATED_DATA\s*=\s*(.*?)</script>',
                    response.text)
        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        # Walk the hydrated state for a playable address; prefer the
        # bit-rate list, fall back to the plain playAddr.
        video_url = None
        try:
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None
        if video_url.startswith("//"):
            video_url = "https:" + video_url  # protocol-relative URL
        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }
        # Stream to disk in chunks so large videos don't sit in memory.
        async with httpx.AsyncClient(timeout=60.0) as client:
            async with client.stream("GET", video_url, headers=download_headers) as dl_resp:
                if dl_resp.status_code == 200:
                    with open(temp_path, 'wb') as f:
                        async for chunk in dl_resp.aiter_bytes(chunk_size=8192):
                            f.write(chunk)
                    logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
                    return temp_path
                logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
                return None
    except Exception as e:
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None


async def _download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually fetch a Bilibili audio stream via Playwright (fallback).

    Loads the page in headless Chromium, reads ``window.__playinfo__`` for
    the DASH audio URL, and downloads it with the page's request context
    (which carries the session cookies). Returns ``None`` on any failure.
    """
    from playwright.async_api import async_playwright
    logger.info(f"[Playwright] Starting Bilibili download for: {url}")
    playwright = None
    browser = None
    try:
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(
            headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)
        try:
            # Wait for the player so __playinfo__ is populated; a timeout is
            # non-fatal — the global may already be set.
            await page.wait_for_selector('video', timeout=15000)
        except Exception:
            logger.warning("[Playwright] Video selector timeout")

        playinfo = await page.evaluate("window.__playinfo__")
        audio_url = None
        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash = playinfo["data"]["dash"]
            if "audio" in dash and dash["audio"]:
                # First entry is the highest-quality audio rendition.
                audio_url = dash["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")
        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"
        try:
            # Use the browser context's request API so the download carries
            # the same cookies/session as the page load.
            api_request = context.request
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/"
            }
            logger.info(f"[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)
            if response.status == 200:
                body = await response.body()
                with open(temp_path, 'wb') as f:
                    f.write(body)
                logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
                return temp_path
            logger.error(f"[Playwright] API Request failed: {response.status}")
            return None
        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None
    except Exception as e:
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always release the browser resources, even on failure paths.
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()