356 lines
13 KiB
Python
356 lines
13 KiB
Python
import asyncio
|
||
import os
|
||
import re
|
||
import json
|
||
import time
|
||
import shutil
|
||
import subprocess
|
||
import traceback
|
||
from pathlib import Path
|
||
from typing import Optional, Any
|
||
from urllib.parse import unquote
|
||
|
||
import httpx
|
||
from loguru import logger
|
||
|
||
from app.services.whisper_service import whisper_service
|
||
from app.services.glm_service import glm_service
|
||
|
||
|
||
async def extract_script(file=None, url: Optional[str] = None, rewrite: bool = True) -> dict:
    """Extract a script from an uploaded file or a video URL.

    Pipeline: fetch/save media -> convert to 16 kHz mono WAV (FFmpeg)
    -> Whisper transcription -> optional GLM rewrite.

    Args:
        file: Uploaded file object (e.g. FastAPI UploadFile) exposing
            ``filename`` and a file-like ``file`` attribute. Mutually
            optional with ``url``; at least one must be given.
        url: Video page URL, or share text containing one.
        rewrite: When True and a non-empty transcript was produced, also
            run the GLM rewriting step.

    Returns:
        dict with ``original_script`` and ``rewritten_script`` (the
        latter is None when rewriting is skipped).

    Raises:
        ValueError: when neither input is given, the file/download could
            not be obtained, or the media cannot be converted.
    """
    if not file and not url:
        raise ValueError("必须提供文件或视频链接")

    temp_path = None
    audio_path = None
    try:
        timestamp = int(time.time())
        temp_dir = Path("/tmp")
        if os.name == 'nt':
            # Windows has no /tmp; use a fixed scratch directory instead.
            temp_dir = Path("d:/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        loop = asyncio.get_running_loop()

        # 1. Obtain the media: save the upload, or download from the URL.
        if file:
            filename = file.filename
            if not filename:
                raise ValueError("文件名无效")
            safe_filename = Path(filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            def _save_upload() -> None:
                # Context manager closes the destination deterministically
                # (the original opened it inside a lambda and leaked the
                # handle).
                with open(temp_path, "wb") as dst:
                    shutil.copyfileobj(file.file, dst)

            await loop.run_in_executor(None, _save_upload)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            temp_path = await _download_video(url, temp_dir, timestamp)

        if not temp_path or not temp_path.exists():
            raise ValueError("文件获取失败")

        # 1.5 Safety conversion: force 16 kHz mono WAV so Whisper gets a
        # predictable input regardless of the source container.
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"
        try:
            await loop.run_in_executor(None, lambda: _convert_to_wav(temp_path, audio_path))
            logger.info(f"Converted to WAV: {audio_path}")
        except ValueError as ve:
            # _convert_to_wav signals its failure mode via the message.
            if str(ve) == "HTML_DETECTED":
                raise ValueError("下载的文件是网页而非视频,请重试或手动上传。")
            else:
                raise ValueError("下载的文件已损坏或格式无法识别。")

        # 2. Transcribe with Whisper.
        script = await whisper_service.transcribe(str(audio_path))

        # 3. Optionally rewrite with GLM.
        rewritten = None
        if rewrite and script and len(script.strip()) > 0:
            logger.info("Rewriting script...")
            rewritten = await glm_service.rewrite_script(script)

        return {
            "original_script": script,
            "rewritten_script": rewritten
        }

    finally:
        # Remove both the source media and the converted WAV (the
        # original left the WAV behind in the temp directory).
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except OSError as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")
||
def _convert_to_wav(input_path: Path, output_path: Path) -> None:
    """Convert ``input_path`` to a 16 kHz mono PCM WAV at ``output_path``.

    Runs FFmpeg as a subprocess; the conversion doubles as a validity
    check on the downloaded media.

    Raises:
        ValueError: ``"HTML_DETECTED"`` when the input is actually an HTML
            page (a failed download), ``"CONVERT_FAILED"`` for any other
            FFmpeg failure.
    """
    # Command construction cannot raise CalledProcessError, so keep it
    # outside the try block.
    convert_cmd = [
        'ffmpeg',
        '-i', str(input_path),
        '-vn',                     # drop any video stream
        '-acodec', 'pcm_s16le',    # 16-bit PCM
        '-ar', '16000',            # 16 kHz sample rate (Whisper-friendly)
        '-ac', '1',                # mono
        '-y',                      # overwrite output if it exists
        str(output_path)
    ]
    try:
        subprocess.run(convert_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
        logger.error(f"FFmpeg check/convert failed: {error_log}")
        # A common failure mode is the "video" really being an HTML error
        # page from the download step; sniff the file head to detect it.
        head = b""
        try:
            with open(input_path, 'rb') as f:
                head = f.read(100)
        except OSError:
            # Best effort only; fall through with an empty head.
            pass
        if b'<!DOCTYPE html' in head or b'<html' in head:
            raise ValueError("HTML_DETECTED")
        raise ValueError("CONVERT_FAILED")
||
async def _download_video(url: str, temp_dir: Path, timestamp: int) -> Path:
    """Download a video into ``temp_dir`` (yt-dlp first, manual fallbacks after).

    ``url`` may be a whole share text; the first http(s) URL found in it
    is used.

    Returns:
        Path to the downloaded media file.

    Raises:
        ValueError: when every download strategy fails.
    """
    url_value = url
    # Share texts often wrap the link in extra words; pull out the URL.
    url_match = re.search(r'https?://[^\s]+', url_value)
    if url_match:
        extracted_url = url_match.group(0)
        logger.info(f"Extracted URL from text: {extracted_url}")
        url_value = extracted_url

    logger.info(f"Tool downloading URL: {url_value}")
    loop = asyncio.get_running_loop()

    # Primary strategy: yt-dlp (blocking, so run in a worker thread).
    try:
        temp_path = await loop.run_in_executor(None, lambda: _download_yt_dlp(url_value, temp_dir, timestamp))
        logger.info(f"yt-dlp downloaded to: {temp_path}")
        return temp_path
    except Exception as e:
        logger.warning(f"yt-dlp download failed: {e}. Trying manual fallback...")

        # Site-specific manual fallbacks (dispatch table replaces the
        # original duplicated if/elif branches; behavior is unchanged).
        fallbacks = {
            "douyin": _download_douyin_manual,
            "bilibili": _download_bilibili_manual,
        }
        for site, fallback in fallbacks.items():
            if site in url_value:
                manual_path = await fallback(url_value, temp_dir, timestamp)
                if manual_path:
                    return manual_path
                raise ValueError(f"视频下载失败。yt-dlp 报错: {str(e)}")
        raise ValueError(f"视频下载失败: {str(e)}")
||
def _download_yt_dlp(url_value: str, temp_dir: Path, timestamp: int) -> Path:
    """Download via yt-dlp (blocking; must run in an executor).

    Returns:
        Path to the downloaded media file.

    Raises:
        ValueError: when yt-dlp returns no extraction info.
    """
    import yt_dlp
    logger.info("Attempting download with yt-dlp...")

    ydl_opts = {
        'format': 'bestaudio/best',  # audio is enough for transcription
        'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
        'quiet': True,
        'no_warnings': True,
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
        }
    }

    # Pass options through the documented constructor instead of mutating
    # ``ydl.params`` after construction (the original's Any-cast hack
    # applied options after YoutubeDL had already initialized).
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url_value, download=True)
        if not info:
            # extract_info may return None on failure; the original would
            # have crashed with a TypeError on the ``in`` test below.
            raise ValueError("yt-dlp returned no info")
        if 'requested_downloads' in info:
            downloaded_file = info['requested_downloads'][0]['filepath']
        else:
            # Reconstruct the path from the output template fields.
            ext = info.get('ext', 'mp4')
            video_id = info.get('id')  # renamed: ``id`` shadowed the builtin
            downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{video_id}.{ext}")

    return Path(downloaded_file)
||
async def _download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download a Douyin video (fallback when yt-dlp fails).

    Resolves the share link's redirects, scrapes the page's embedded
    RENDER_DATA / SSR_HYDRATED_DATA JSON for a playable address, then
    streams the file to disk.

    Returns:
        The downloaded ``.mp4`` path, or None on any failure (best effort;
        never raises to the caller).
    """
    logger.info(f"[SuperIPAgent] Starting download for: {url}")

    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        }

        # Follow the short-link redirects to reach the canonical video URL.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
            resp = await client.get(url, headers=headers)
            final_url = str(resp.url)

        logger.info(f"[SuperIPAgent] Final URL: {final_url}")

        modal_id = None
        match = re.search(r'/video/(\d+)', final_url)
        if match:
            modal_id = match.group(1)

        if not modal_id:
            logger.error("[SuperIPAgent] Could not extract modal_id")
            return None

        logger.info(f"[SuperIPAgent] Extracted modal_id: {modal_id}")

        # NOTE(review): the user-profile path segment here is hard-coded;
        # the page is only used as a host for the modal_id video overlay,
        # but this looks fragile — confirm it works for arbitrary videos.
        target_url = f"https://www.douyin.com/user/MS4wLjABAAAAN_s_hups7LD0N4qnrM3o2gI0vuG3pozNaEolz2_py3cHTTrpVr1Z4dukFD9SOlwY?from_tab_name=main&modal_id={modal_id}"

        from app.core.config import settings
        if not settings.DOUYIN_COOKIE:
            logger.warning("[SuperIPAgent] DOUYIN_COOKIE 未配置,视频下载可能失败")

        headers_with_cookie = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "cookie": settings.DOUYIN_COOKIE,
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

        logger.info(f"[SuperIPAgent] Requesting page with Cookie...")

        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(target_url, headers=headers_with_cookie)

        # The video metadata lives in an embedded JSON <script> blob;
        # newer pages use SSR_HYDRATED_DATA instead of RENDER_DATA.
        content_match = re.findall(r'<script id="RENDER_DATA" type="application/json">(.*?)</script>', response.text)
        if not content_match:
            if "SSR_HYDRATED_DATA" in response.text:
                content_match = re.findall(r'<script id="SSR_HYDRATED_DATA" type="application/json">(.*?)</script>', response.text)

        if not content_match:
            logger.error(f"[SuperIPAgent] Could not find RENDER_DATA in page (len={len(response.text)})")
            return None

        # The blob is URL-encoded JSON.
        content = unquote(content_match[0])
        try:
            data = json.loads(content)
        except json.JSONDecodeError:  # was a bare except; narrowed
            logger.error("[SuperIPAgent] JSON decode failed")
            return None

        video_url = None
        try:
            if "app" in data and "videoDetail" in data["app"]:
                info = data["app"]["videoDetail"]["video"]
                # Prefer the bit-rate list; fall back to the raw play address.
                if "bitRateList" in info and info["bitRateList"]:
                    video_url = info["bitRateList"][0]["playAddr"][0]["src"]
                elif "playAddr" in info and info["playAddr"]:
                    video_url = info["playAddr"][0]["src"]
        except Exception as e:
            logger.error(f"[SuperIPAgent] Path extraction failed: {e}")

        if not video_url:
            logger.error("[SuperIPAgent] No video_url found")
            return None

        if video_url.startswith("//"):
            # Protocol-relative URL; force https.
            video_url = "https:" + video_url

        logger.info(f"[SuperIPAgent] Found video URL: {video_url[:50]}...")

        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            'Referer': 'https://www.douyin.com/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }

        # Stream to disk in chunks so large videos are not held in memory.
        async with httpx.AsyncClient(timeout=60.0) as client:
            async with client.stream("GET", video_url, headers=download_headers) as dl_resp:
                if dl_resp.status_code == 200:
                    with open(temp_path, 'wb') as f:
                        async for chunk in dl_resp.aiter_bytes(chunk_size=8192):
                            f.write(chunk)

                    logger.info(f"[SuperIPAgent] Downloaded successfully: {temp_path}")
                    return temp_path
                else:
                    logger.error(f"[SuperIPAgent] Download failed: {dl_resp.status_code}")
                    return None

    except Exception as e:
        # Best-effort fallback: log and let the caller raise its own error.
        logger.error(f"[SuperIPAgent] Logic failed: {e}")
        return None
||
async def _download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download the audio stream of a Bilibili video (Playwright fallback).

    Loads the page in headless Chromium, reads ``window.__playinfo__`` for
    a DASH audio URL, and downloads it via the browser context's request
    API (which carries the session cookies set during page load).

    Returns:
        The downloaded ``.m4s`` path, or None on any failure (best effort;
        never raises to the caller).
    """
    from playwright.async_api import async_playwright

    logger.info(f"[Playwright] Starting Bilibili download for: {url}")

    playwright = None
    browser = None
    try:
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])

        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)

        # Best effort: wait for the player, but proceed on timeout since
        # __playinfo__ may already be present without a <video> element.
        try:
            await page.wait_for_selector('video', timeout=15000)
        except Exception:  # was a bare except; narrowed, behavior kept
            logger.warning("[Playwright] Video selector timeout")

        playinfo = await page.evaluate("window.__playinfo__")

        audio_url = None

        if playinfo and "data" in playinfo and "dash" in playinfo["data"]:
            dash = playinfo["data"]["dash"]
            if "audio" in dash and dash["audio"]:
                # First entry of the DASH audio list is the default stream.
                audio_url = dash["audio"][0]["baseUrl"]
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"

        try:
            # Reuse the browser context's request API so the cookies from
            # the page load are sent with the download request.
            api_request = context.request
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Referer": "https://www.bilibili.com/"
            }

            logger.info(f"[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)

            if response.status == 200:
                body = await response.body()
                with open(temp_path, 'wb') as f:
                    f.write(body)

                logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
                return temp_path
            else:
                logger.error(f"[Playwright] API Request failed: {response.status}")
                return None

        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None

    except Exception as e:
        # Best-effort fallback: log and let the caller raise its own error.
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Always release the browser resources, even on failure.
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()