from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from typing import Optional
import asyncio  # FIX: was used (loop = asyncio.get_event_loop()) but never imported
import json
import os
import re
import shutil
import subprocess
import time
import traceback
from pathlib import Path
from urllib.parse import unquote

import requests
from loguru import logger

from app.services.whisper_service import whisper_service
from app.services.glm_service import glm_service

router = APIRouter()


@router.post("/extract-script")
async def extract_script_tool(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    rewrite: bool = Form(True)
):
    """
    Standalone script-extraction tool.

    Accepts an uploaded video/audio file OR a video link (raw share text from
    Douyin/Bilibili is accepted — the first URL is extracted from it), converts
    the media to 16 kHz mono WAV with ffmpeg, transcribes it, and optionally
    rewrites the transcript with the GLM service.

    Args:
        file: uploaded media file (mutually optional with ``url``).
        url: video page URL or share text containing one.
        rewrite: when True and a transcript was produced, run the AI rewrite.

    Returns:
        {"success": True, "original_script": str, "rewritten_script": str}

    Raises:
        HTTPException: 400 for bad input / download / conversion failures,
        500 for unexpected errors.
    """
    if not file and not url:
        raise HTTPException(400, "必须提供文件或视频链接")

    temp_path: Optional[Path] = None
    audio_path: Optional[Path] = None
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':  # Windows has no /tmp
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        loop = asyncio.get_event_loop()

        # 1. Obtain the media file (upload or URL download).
        if file:
            safe_filename = Path(file.filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            def _save_upload():
                # Context manager so the handle is always closed
                # (the original leaked an un-closed open(temp_path, "wb")).
                with open(temp_path, "wb") as out:
                    shutil.copyfileobj(file.file, out)

            # File I/O runs in the thread pool to keep the event loop free.
            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            # Pull the first URL out of share text (Douyin/Bilibili copy blurbs).
            url_match = re.search(r'https?://[^\s]+', url)
            if url_match:
                extracted_url = url_match.group(0)
                logger.info(f"Extracted URL from text: {extracted_url}")
                url = extracted_url

            logger.info(f"Tool downloading URL: {url}")

            def _download_yt_dlp() -> Path:
                """Blocking yt-dlp download; returns the downloaded file path."""
                import yt_dlp
                logger.info("Attempting download with yt-dlp...")
                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
                    'quiet': True,
                    'no_warnings': True,
                    'http_headers': {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                        'Referer': 'https://www.douyin.com/',
                    },
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                if 'requested_downloads' in info:
                    downloaded_file = info['requested_downloads'][0]['filepath']
                else:
                    # Reconstruct the outtmpl path when yt-dlp does not report it.
                    ext = info.get('ext', 'mp4')
                    video_id = info.get('id')  # renamed: `id` shadowed the builtin
                    downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")
                return Path(downloaded_file)

            # Try yt-dlp first (run in executor); fall back to manual scrapers.
            try:
                temp_path = await loop.run_in_executor(None, _download_yt_dlp)
                logger.info(f"yt-dlp downloaded to: {temp_path}")
            except Exception as e:
                logger.warning(f"yt-dlp download failed: {e}. Trying manual Douyin fallback...")
                if "douyin" in url:
                    manual_path = await download_douyin_manual(url, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Douyin fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                elif "bilibili" in url:
                    manual_path = await download_bilibili_manual(url, temp_dir, timestamp)
                    if manual_path:
                        temp_path = manual_path
                        logger.info(f"Manual Bilibili fallback successful: {temp_path}")
                    else:
                        raise HTTPException(400, f"视频下载失败。yt-dlp 报错: {str(e)}")
                else:
                    raise HTTPException(400, f"视频下载失败: {str(e)}")

        if not temp_path or not temp_path.exists():
            raise HTTPException(400, "文件获取失败")

        # 1.5 Normalize to 16 kHz mono PCM WAV (Whisper's recommended input).
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"

        def _convert_audio() -> bool:
            """Blocking ffmpeg conversion; True on success, False on failure."""
            try:
                convert_cmd = [
                    'ffmpeg', '-i', str(temp_path),
                    '-vn',                    # drop the video stream
                    '-acodec', 'pcm_s16le',
                    '-ar', '16000',           # Whisper-recommended sample rate
                    '-ac', '1',               # mono
                    '-y',                     # overwrite
                    str(audio_path),
                ]
                # Capture stderr so a failure can be logged below.
                subprocess.run(convert_cmd, check=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                return True
            except subprocess.CalledProcessError as e:
                error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
                logger.error(f"FFmpeg check/convert failed: {error_log}")
                # Anti-bot pages sometimes get saved as the "video": sniff for HTML.
                # NOTE(review): this check was truncated in the garbled source;
                # reconstructed from the surviving `head = f.read(100)` logic.
                head = b""
                try:
                    with open(temp_path, 'rb') as f:
                        head = f.read(100)
                except Exception:
                    pass
                if b'<html' in head.lower() or b'<!doctype' in head.lower():
                    logger.error("Downloaded file looks like an HTML page, not media")
                return False

        if not await loop.run_in_executor(None, _convert_audio):
            raise HTTPException(400, "音频转换失败,文件可能不是有效的视频/音频")

        # 2. Transcribe.
        # NOTE(review): the transcription call was lost in the garbled source.
        # Reconstructed assuming whisper_service exposes an async transcribe()
        # returning the recognized text — verify against
        # app/services/whisper_service.py before merging.
        script = await whisper_service.transcribe(str(audio_path))

        # 3. Optional AI rewrite. Default to the raw transcript so `rewritten`
        # is always defined (the original could reach the return with it unset).
        rewritten = script
        if rewrite and script and len(script) > 0:
            logger.info("Rewriting script...")
            rewritten = await glm_service.rewrite_script(script)
        else:
            logger.warning("No script extracted, skipping rewrite")

        return {
            "success": True,
            "original_script": script,
            "rewritten_script": rewritten,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Tool extract failed: {e}")
        logger.error(traceback.format_exc())
        # Friendly error message for the common anti-scraping failure.
        msg = str(e)
        if "Fresh cookies" in msg:
            msg = "下载失败:目标平台开启了反爬验证,请过段时间重试或直接上传视频文件。"
        raise HTTPException(500, f"提取失败: {msg}")
    finally:
        # Clean up BOTH the source media and the converted WAV
        # (the original leaked audio_path).
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except Exception as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")


async def download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download a Douyin video (fallback when yt-dlp fails).

    Ported from SuperIPAgent/douyinDownloader: resolves the share link to the
    final video URL, loads a specific user-profile page with a hard-coded
    cookie to slip past the anti-bot wall, then pulls the stream URL out of
    the embedded RENDER_DATA JSON.

    Returns:
        Path of the downloaded .mp4, or None on any failure (best-effort).
    """
    logger.info(f"[SuperIPAgent] Starting download for: {url}")
    try:
        # 1. Follow redirects (short share links) and extract the numeric modal id.
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }
        resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        final_url = resp.url
        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)
        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None
        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # 2. Build the request URL (copied from SuperIPAgent): a specific user's
        # profile page + modal_id parameter, paired with the cookie below.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"

        # 3. Hard-coded cookie (copied from SuperIPAgent; session values will
        # eventually expire and need refreshing).
        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": "douyin.com; device_web_cpu_core=10; device_web_memory_size=8; __ac_nonce=06760391f00b9b51264ae; __ac_signature=_02B4Z6wo00f019a5ceAAAIDAhEZR-X3jjWfWmXVAAJLXd4; ttwid=1%7C7MTKBSMsP4eOv9h5NAh8p0E-NYIud09ftNmB0mjLpWc%7C1734359327%7C8794abeabbd47447e1f56e5abc726be089f2a0344d6343b5f75f23e7b0f0028f; UIFID_TEMP=0de8750d2b188f4235dbfd208e44abbb976428f0720eb983255afefa45d39c0c6532e1d4768dd8587bf919f866ff1396912bcb2af71efee56a14a2a9f37b74010d0a0413795262f6d4afe02a032ac7ab; s_v_web_id=verify_m4r4ribr_c7krmY1z_WoeI_43po_ATpO_I4o8U1bex2D7; hevc_supported=true; home_can_add_dy_2_desktop=%220%22; dy_swidth=2560; dy_sheight=1440; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A2560%2C%5C%22screen_height%5C%22%3A1440%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A10%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; strategyABtestKey=%221734359328.577%22; csrf_session_id=2f53aed9aa6974e83aa9a1014180c3a4; fpk1=U2FsdGVkX1/IpBh0qdmlKAVhGyYHgur4/VtL9AReZoeSxadXn4juKvsakahRGqjxOPytHWspYoBogyhS/V6QSw==; fpk2=0845b309c7b9b957afd9ecf775a4c21f; passport_csrf_token=d80e0c5b2fa2328219856be5ba7e671e; passport_csrf_token_default=d80e0c5b2fa2328219856be5ba7e671e; odin_tt=3c891091d2eb0f4718c1d5645bc4a0017032d4d5aa989decb729e9da2ad570918cbe5e9133dc6b145fa8c758de98efe32ff1f81aa0d611e838cc73ab08ef7d3f6adf66ab4d10e8372ddd628f94f16b8e; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.5%7D; bd_ticket_guard_client_web_domain=2; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%7D; UIFID=0de8750d2b188f4235dbfd208e44abbb976428f0720eb983255afefa45d39c0c6532e1d4768dd8587bf919f866ff139655a3c2b735923234f371c699560c657923fd3d6c5b63ab7bb9b83423b6cb4787e2ce66a7fbc4ecb24c8570f520fe6de068bbb95115023c0c6c1b6ee31b49fb7e3996fb8349f43a3fd8b7a61cd9e18e8fe65eb6a7c13de4c0960d84e344b644725db3eb2fa6b7caf821de1b50527979f2; is_dash_user=1; biz_trace_id=b57a241f; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCTEo2R0lDalVoWW1XcHpGOFdrN0Vrc0dXcCtaUzNKY1g4NGNGY2k0TTl1TEowNjdUb21mbFU5aDdvWVBGamhNRWNRQWtKdnN1MnM3RmpTWnlJQXpHMjA9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; download_guide=%221%2F20241216%2F0%22; sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e5827292771273f273d33323131333c3036313632342778; bit_env=RiOY4jzzpxZoVCl6zdVSVhVRjdwHRTxqcqWdqMBZLPGjMdB4Tax1kAELHNTVAAh72KuhumewE4Lq6f0-VJ2UpJrkrhSxoPw9LUb3zQrq1OSwbeSPHkRlRgRQvO89sItdGUyq1oFr0XyRCnMYG87KSeWyc4x0czGR0o50hTDoDLG5rJVoRcdQOLvjiAegsqyytKF59sPX_QM9qffK2SqYsg0hCggURc_AI6kguDDE5DvG0bnyz1utw4z1eEnIoLrkGDqzqBZj4dOAr0BVU6ofbsS-pOQ2u2PM1dLP9FlBVBlVaqYVgHJeSLsR5k76BRTddUjTb4zEilVIEwAMJWGN4I1BxVt6fC9B5tBQpuT0lj3n3eKXCKXZsd8FrEs5_pbfDsxV-e_WMiXI2ff4qxiTC0U73sfo9OpicKICtZjdq8qsHxJuu6wVR36zvXeL2Wch5C6MzprNvkivv0l8nbh2mSgy1nabZr3dmU6NcR-Bg3Q3xTWUlR9aAUmpopC-cNuXjgLpT-Lw1AYGilSUnCvosth1Gfypq-b0MpgmdSDgTrQ%3D; gulu_source_res=eyJwX2luIjoiMDhjOGQ3ZTJiODQyNjZkZWI5Y2VkMGJiODNlNmY1ZWY0ZjMyNTE2ZmYyZjAzNDMzZjI0OWU1Y2Q1NTczNTk5NyJ9; passport_auth_mix_state=hp9bc3dgb1tm5wd8p82zawus27g0e3ue; IsDouyinActive=false",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

        logger.info("[SuperIPAgent] Requesting page with Cookie...")
        # NOTE(review): a stray comment in the original claimed verify=False is
        # required in some environments, but the call never passed it — left
        # as-is to preserve behavior; confirm whether TLS verification must be
        # disabled in the deployment environment.
        response = requests.get(target_url, headers=headers_with_cookie, timeout=10)

        # 4. Parse RENDER_DATA (URL-encoded JSON embedded in a <script> tag).
        # NOTE(review): the regex patterns were stripped out of the garbled
        # source; reconstructed from the standard Douyin page structure — verify
        # against SuperIPAgent/douyinDownloader.
        content_match = re.findall(
            r'<script id="RENDER_DATA" type="application/json">(.*?)</script>',
            response.text)
        if not content_match:
            # Page structure may have changed; try the SSR hydration payload.
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(
                    r'<script>window\._SSR_HYDRATED_DATA\s*=\s*(.*?)</script>',
                    response.text)
        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        # 5. Dig the playable stream URL out of the payload.
        # Usual path: app -> videoDetail -> video -> bitRateList -> playAddr -> src
        video_url = None
        try:
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None
        if video_url.startswith("//"):  # protocol-relative URL
            video_url = "https:" + video_url
        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        # 6. Stream-download with the Referer header Douyin's CDN requires.
        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }
        dl_resp = requests.get(video_url, headers=download_headers, stream=True, timeout=60)
        if dl_resp.status_code == 200:
            with open(temp_path, 'wb') as f:
                # 64 KiB chunks (original used 1 KiB — needlessly chatty).
                for chunk in dl_resp.iter_content(chunk_size=65536):
                    f.write(chunk)
            logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
            return temp_path
        logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
        return None
    except Exception as e:
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None


async def download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """
    Manually download a Bilibili stream (Playwright fallback).

    Bilibili serves DASH with separate audio/video streams; since only the
    transcript is needed, just the audio stream advertised in
    ``window.__playinfo__`` is fetched.

    Requires chromium: ``playwright install chromium``.

    Returns:
        Path of the downloaded .m4s audio, or None on any failure.
    """
    from playwright.async_api import async_playwright

    logger.info(f"[Playwright] Starting Bilibili download for: {url}")
    playwright = None
    browser = None
    try:
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(
            headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        # Desktop UA: the mobile site exposes a different (trickier) structure.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)

        # Waiting for the <video> element triggers stream loading; a timeout
        # here is not fatal because __playinfo__ may already be populated.
        try:
            await page.wait_for_selector('video', timeout=15000)
        except Exception:
            logger.warning("[Playwright] Video selector timeout")

        # 1. Extract the DASH manifest from window.__playinfo__.
        playinfo = await page.evaluate("window.__playinfo__")
        audio_url = None
        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash = playinfo["data"]["dash"]
            if "audio" in dash and dash["audio"]:
                audio_url = dash["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        # 2. No network-interception fallback: blob: video sources cannot be
        # fetched directly, so give up if __playinfo__ had no audio entry.
        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        # Download via the browser context's request API so cookies/session
        # carry over; Referer is mandatory for Bilibili's CDN.
        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"
        try:
            api_request = context.request
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/",
            }
            logger.info("[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)
            if response.status == 200:
                body = await response.body()
                with open(temp_path, 'wb') as f:
                    f.write(body)
                logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
                return temp_path
            logger.error(f"[Playwright] API Request failed: {response.status}")
            return None
        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None
    except Exception as e:
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always release the browser/playwright handles.
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()