# ViGent2/backend/app/modules/tools/service.py

import asyncio
import os
import re
import json
import time
import shutil
import subprocess
import traceback
from pathlib import Path
from typing import Optional, Any
from urllib.parse import unquote

import httpx
from loguru import logger

from app.services.whisper_service import whisper_service
from app.services.glm_service import glm_service


async def extract_script(file=None, url: Optional[str] = None, rewrite: bool = True, custom_prompt: Optional[str] = None) -> dict:
    """
    Extract a script: uploaded file or video link -> Whisper transcription -> (optional) GLM rewrite.

    Args:
        file: Upload object exposing ``filename`` and a binary ``file`` stream
            (presumably a FastAPI ``UploadFile`` — TODO confirm against the caller).
        url: Video link, or free text containing one. Only used when ``file`` is falsy.
        rewrite: When true and the transcript is not blank, also request a GLM rewrite.
        custom_prompt: Optional prompt forwarded to ``glm_service.rewrite_script``.

    Returns:
        ``{"original_script": <transcript>, "rewritten_script": <rewrite or None>}``.

    Raises:
        ValueError: If neither ``file`` nor ``url`` is given, the upload has no
            file name, the media could not be obtained, or FFmpeg could not
            convert it to WAV.
    """
    if not file and not url:
        raise ValueError("必须提供文件或视频链接")

    temp_path = None
    audio_path = None
    try:
        # Nanosecond resolution keeps temp file names unique when several
        # requests arrive within the same second; otherwise one request could
        # overwrite (or clean up) another request's files.
        timestamp = time.time_ns()
        temp_dir = Path("d:/tmp") if os.name == 'nt' else Path("/tmp")
        temp_dir.mkdir(parents=True, exist_ok=True)

        loop = asyncio.get_running_loop()

        # 1. Obtain the media: save the upload or download the link
        if file:
            filename = file.filename
            if not filename:
                raise ValueError("文件名无效")
            # Keep only the final path component so the name cannot escape temp_dir
            safe_filename = Path(filename).name.replace(" ", "_")
            temp_path = temp_dir / f"tool_extract_{timestamp}_{safe_filename}"

            def _save_upload(src, dst: Path) -> None:
                # The context manager guarantees the destination handle is closed
                with open(dst, "wb") as out:
                    shutil.copyfileobj(src, out)

            await loop.run_in_executor(None, _save_upload, file.file, temp_path)
            logger.info(f"Tool processing upload file: {temp_path}")
        else:
            temp_path = await _download_video(url, temp_dir, timestamp)

        if not temp_path or not temp_path.exists():
            raise ValueError("文件获取失败")

        # 1.5 Safety conversion: always normalise to 16 kHz WAV before transcription
        audio_path = temp_dir / f"extract_audio_{timestamp}.wav"
        try:
            await loop.run_in_executor(None, _convert_to_wav, temp_path, audio_path)
            logger.info(f"Converted to WAV: {audio_path}")
        except ValueError as ve:
            if str(ve) == "HTML_DETECTED":
                raise ValueError("下载的文件是网页而非视频，请重试或手动上传。") from ve
            raise ValueError("下载的文件已损坏或格式无法识别。") from ve

        # 2. Transcribe (Whisper)
        script = await whisper_service.transcribe(str(audio_path))

        # 3. AI rewrite (GLM), skipped for empty / whitespace-only transcripts
        rewritten = None
        if rewrite and script and script.strip():
            logger.info("Rewriting script...")
            rewritten = await glm_service.rewrite_script(script, custom_prompt)

        return {
            "original_script": script,
            "rewritten_script": rewritten
        }

    finally:
        # Best-effort cleanup of both the source media and the intermediate WAV
        for leftover in (temp_path, audio_path):
            if leftover and leftover.exists():
                try:
                    os.remove(leftover)
                    logger.info(f"Cleaned up temp file: {leftover}")
                except OSError as e:
                    logger.warning(f"Failed to cleanup temp file {leftover}: {e}")


def _convert_to_wav(input_path: Path, output_path: Path) -> None:
    """Convert ``input_path`` to a 16 kHz mono 16-bit PCM WAV using FFmpeg.

    Blocking call; run it in a worker thread when called from async code.

    Args:
        input_path: Source media file.
        output_path: Destination WAV path (overwritten if present, ``-y``).

    Raises:
        ValueError: ``"HTML_DETECTED"`` when FFmpeg fails and the input begins
            like an HTML document; ``"CONVERT_FAILED"`` for any other FFmpeg
            failure. Callers compare these message strings.
    """
    convert_cmd = [
        'ffmpeg',
        '-i', str(input_path),
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        '-y',
        str(output_path)
    ]
    try:
        subprocess.run(
            convert_cmd,
            check=True,
            # Do not let ffmpeg inherit (and read from) the server's stdin
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        error_log = e.stderr.decode('utf-8', errors='ignore') if e.stderr else str(e)
        logger.error(f"FFmpeg check/convert failed: {error_log}")

        # Sniff the start of the file to tell "got a web page" apart from
        # "got broken media"; an unreadable file is treated as the latter.
        head = b""
        try:
            with open(input_path, 'rb') as f:
                head = f.read(512)
        except OSError as read_err:
            logger.warning(f"Could not inspect failed input {input_path}: {read_err}")

        # HTML tags are case-insensitive (`<!doctype html>` is very common)
        head_lower = head.lower()
        if b'<!doctype html' in head_lower or b'<html' in head_lower:
            raise ValueError("HTML_DETECTED") from e
        raise ValueError("CONVERT_FAILED") from e


async def _download_video(url: str, temp_dir: Path, timestamp: int) -> Path:
    """Download a video: try yt-dlp first, then a site-specific manual fallback.

    Args:
        url: A URL, or free text containing one (the first ``http(s)://`` run
            of non-whitespace characters is used).
        temp_dir: Directory the download is written to.
        timestamp: Integer embedded in the temp file name.

    Returns:
        Path of the downloaded file.

    Raises:
        ValueError: If yt-dlp fails and no fallback exists for the URL, or the
            fallback also fails. The yt-dlp error is chained as the cause.
    """
    url_value = url
    url_match = re.search(r'https?://[^\s]+', url_value)
    if url_match:
        extracted_url = url_match.group(0)
        logger.info(f"Extracted URL from text: {extracted_url}")
        url_value = extracted_url

    logger.info(f"Tool downloading URL: {url_value}")
    loop = asyncio.get_running_loop()

    # Try yt-dlp first; it blocks, so it runs in the default thread pool
    try:
        temp_path = await loop.run_in_executor(None, _download_yt_dlp, url_value, temp_dir, timestamp)
    except Exception as e:
        logger.warning(f"yt-dlp download failed: {e}. Trying manual fallback...")

        # Pick the site-specific fallback by substring match on the URL
        if "douyin" in url_value:
            fallback = _download_douyin_manual
        elif "bilibili" in url_value:
            fallback = _download_bilibili_manual
        else:
            raise ValueError(f"视频下载失败: {str(e)}") from e

        manual_path = await fallback(url_value, temp_dir, timestamp)
        if manual_path:
            return manual_path
        raise ValueError(f"视频下载失败。yt-dlp 报错: {str(e)}") from e
    else:
        logger.info(f"yt-dlp downloaded to: {temp_path}")
        return temp_path


def _download_yt_dlp(url_value: str, temp_dir: Path, timestamp: int) -> Path:
    """Download ``url_value`` with yt-dlp (blocking; run it in a thread pool).

    Args:
        url_value: URL handed to yt-dlp.
        temp_dir: Directory the output template points at.
        timestamp: Integer embedded in the output file name.

    Returns:
        Path yt-dlp reported for the download, or the path reconstructed from
        the output template when yt-dlp reports none.

    Raises:
        ValueError: If yt-dlp returns no info dict.
        Exception: Whatever yt-dlp raises on download failure.
    """
    import yt_dlp
    logger.info("Attempting download with yt-dlp...")

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(temp_dir / f"tool_download_{timestamp}_%(id)s.%(ext)s"),
        'quiet': True,
        'no_warnings': True,
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
        }
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url_value, download=True)

    if not info:
        raise ValueError("yt-dlp 未返回视频信息")

    # Prefer the path yt-dlp reports; tolerate a missing key, an empty list,
    # or an entry without 'filepath' instead of raising IndexError/KeyError.
    downloads = info.get('requested_downloads') or []
    downloaded_file = downloads[0].get('filepath') if downloads else None

    if not downloaded_file:
        # Rebuild the name from the same pattern used in 'outtmpl'
        ext = info.get('ext', 'mp4')
        vid_id = info.get('id')
        downloaded_file = str(temp_dir / f"tool_download_{timestamp}_{vid_id}.{ext}")

    return Path(downloaded_file)


async def _download_douyin_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download a Douyin video (fallback) via the mobile share page.

    Steps: follow the link to find the numeric video id, request a fresh
    ``ttwid`` cookie, load the mobile share page, pull the first ``play_addr``
    URL out of the page source and stream it to disk.

    Args:
        url: Douyin link (short links are followed through redirects).
        temp_dir: Directory the video is written to.
        timestamp: Integer embedded in the output file name.

    Returns:
        Path of the downloaded ``.mp4``, or ``None`` on any failure (failures
        are logged, never raised).
    """
    logger.info(f"[douyin-fallback] Starting download for: {url}")

    mobile_ua = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15"

    try:
        # 1. Resolve the (short) link and extract the video id
        async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
            resp = await client.get(url, headers={"user-agent": mobile_ua})
            final_url = str(resp.url)

        logger.info(f"[douyin-fallback] Final URL: {final_url}")

        id_match = re.search(r'/video/(\d+)', final_url)
        if not id_match:
            logger.error("[douyin-fallback] Could not extract video_id")
            return None
        video_id = id_match.group(1)

        logger.info(f"[douyin-fallback] Extracted video_id: {video_id}")

        # 2. Obtain a fresh ttwid cookie (best effort; continue without it)
        ttwid = ""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                ttwid_resp = await client.post(
                    "https://ttwid.bytedance.com/ttwid/union/register/",
                    json={
                        "region": "cn", "aid": 6383, "needFid": False,
                        "service": "www.douyin.com",
                        "migrate_info": {"ticket": "", "source": "node"},
                        "cbUrlProtocol": "https", "union": True,
                    }
                )
                ttwid = ttwid_resp.cookies.get("ttwid", "")
                logger.info(f"[douyin-fallback] Got fresh ttwid (len={len(ttwid)})")
        except Exception as e:
            logger.warning(f"[douyin-fallback] Failed to get ttwid: {e}")

        # 3. Load the mobile share page that embeds the play address
        page_headers = {"user-agent": mobile_ua}
        if ttwid:
            # Only send a cookie header when there is a cookie to send
            page_headers["cookie"] = f"ttwid={ttwid}"

        async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client:
            page_resp = await client.get(
                f"https://m.douyin.com/share/video/{video_id}",
                headers=page_headers,
            )

        page_text = page_resp.text
        logger.info(f"[douyin-fallback] Mobile page length: {len(page_text)}")

        # 4. Extract play_addr
        addr_match = re.search(
            r'"play_addr":\{"uri":"([^"]+)","url_list":\["([^"]+)"',
            page_text,
        )
        if not addr_match:
            logger.error("[douyin-fallback] Could not find play_addr in mobile page")
            return None

        raw_url = addr_match.group(2)
        try:
            # The URL sits inside a JSON string literal, so decode every
            # escape (\u002F, \u0026, ...) rather than just the slash.
            video_url = json.loads(f'"{raw_url}"')
        except ValueError:
            video_url = raw_url.replace(r"\u002F", "/")
        if video_url.startswith("//"):
            video_url = "https:" + video_url

        logger.info(f"[douyin-fallback] Found video URL: {video_url[:80]}...")

        # 5. Stream the video to disk
        temp_path = temp_dir / f"douyin_manual_{timestamp}.mp4"
        download_headers = {
            "Referer": "https://www.douyin.com/",
            "User-Agent": mobile_ua,
        }

        async with httpx.AsyncClient(timeout=120.0, follow_redirects=True) as client:
            async with client.stream("GET", video_url, headers=download_headers) as dl_resp:
                if dl_resp.status_code != 200:
                    logger.error(f"[douyin-fallback] Download failed: {dl_resp.status_code}")
                    return None

                completed = False
                try:
                    with open(temp_path, "wb") as f:
                        async for chunk in dl_resp.aiter_bytes(chunk_size=8192):
                            f.write(chunk)
                    completed = True
                finally:
                    if not completed:
                        # Never leave a truncated file behind
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass

        logger.info(f"[douyin-fallback] Downloaded successfully: {temp_path}")
        return temp_path

    except Exception as e:
        logger.error(f"[douyin-fallback] Logic failed: {e}")
        return None


async def _download_bilibili_manual(url: str, temp_dir: Path, timestamp: int) -> Optional[Path]:
    """Manually download a Bilibili audio stream (Playwright fallback).

    Opens the page in headless Chromium, reads ``window.__playinfo__`` and
    downloads the first DASH audio stream through the browser context's
    request API.

    Args:
        url: Bilibili video page URL.
        temp_dir: Directory the audio is written to.
        timestamp: Integer embedded in the output file name.

    Returns:
        Path of the downloaded ``.m4s`` file, or ``None`` on any failure
        (including Playwright not being installed). Failures are logged,
        never raised.
    """
    logger.info(f"[Playwright] Starting Bilibili download for: {url}")

    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    playwright = None
    browser = None
    try:
        # Imported inside the try so a missing Playwright install is reported
        # as a failed fallback (None) instead of escaping as ImportError.
        from playwright.async_api import async_playwright

        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])

        context = await browser.new_context(user_agent=user_agent)
        page = await context.new_page()

        logger.info("[Playwright] Navigating to Bilibili...")
        await page.goto(url, timeout=45000)

        try:
            await page.wait_for_selector('video', timeout=15000)
        except Exception:
            # `except Exception` (not bare) so task cancellation still propagates;
            # a missing <video> element is tolerated because __playinfo__ may
            # be available anyway.
            logger.warning("[Playwright] Video selector timeout")

        playinfo = await page.evaluate("window.__playinfo__")

        # Walk data -> dash -> audio defensively; any level may be absent
        audio_url = None
        data = playinfo.get("data") if isinstance(playinfo, dict) else None
        dash = data.get("dash") if isinstance(data, dict) else None
        audio_streams = dash.get("audio") if isinstance(dash, dict) else None
        if audio_streams:
            first_stream = audio_streams[0]
            # NOTE(review): 'base_url' is assumed to be a snake_case alias of
            # 'baseUrl' in the play info payload — confirm.
            audio_url = first_stream.get("baseUrl") or first_stream.get("base_url")
            if audio_url:
                logger.info(f"[Playwright] Found audio stream in __playinfo__: {audio_url[:50]}...")

        if not audio_url:
            logger.warning("[Playwright] Could not find audio in __playinfo__")
            return None

        temp_path = temp_dir / f"bilibili_audio_{timestamp}.m4s"

        try:
            api_request = context.request
            headers = {
                "User-Agent": user_agent,
                "Referer": "https://www.bilibili.com/"
            }

            logger.info("[Playwright] Downloading audio stream...")
            response = await api_request.get(audio_url, headers=headers)

            if response.status != 200:
                logger.error(f"[Playwright] API Request failed: {response.status}")
                return None

            body = await response.body()
            with open(temp_path, 'wb') as f:
                f.write(body)

            logger.info(f"[Playwright] Downloaded successfully: {temp_path}")
            return temp_path

        except Exception as e:
            logger.error(f"[Playwright] Download logic error: {e}")
            return None

    except Exception as e:
        logger.error(f"[Playwright] Bilibili download failed: {e}")
        return None
    finally:
        # Shut down independently so a failing browser.close() cannot skip
        # playwright.stop() or turn a finished download into an exception.
        if browser:
            try:
                await browser.close()
            except Exception as e:
                logger.warning(f"[Playwright] Failed to close browser: {e}")
        if playwright:
            try:
                await playwright.stop()
            except Exception as e:
                logger.warning(f"[Playwright] Failed to stop playwright: {e}")