# ViGent2/backend/app/modules/videos/workflow.py

from typing import Optional, Any
from pathlib import Path
import time
import traceback
import httpx
from loguru import logger

from app.core.config import settings
from app.services.tts_service import TTSService
from app.services.video_service import VideoService
from app.services.lipsync_service import LipSyncService
from app.services.voice_clone_service import voice_clone_service
from app.services.assets_service import (
    get_style,
    get_default_style,
    resolve_bgm_path,
    prepare_style_for_remotion,
)
from app.services.storage import storage_service
from app.services.whisper_service import whisper_service
from app.services.remotion_service import remotion_service

from .schemas import GenerateRequest
from .task_store import task_store


# Lazily created LipSyncService instance shared by this module
# (populated on first call to _get_lipsync_service).
_lipsync_service: Optional[LipSyncService] = None
# Cached result of the last LipSync health check; None means "never checked".
_lipsync_ready: Optional[bool] = None
# Clock reading (seconds) taken at the last health check; used to expire the cache.
_lipsync_last_check: float = 0


def _get_lipsync_service() -> LipSyncService:
    """Return the module-wide LipSyncService, creating it on first use.

    The instance is stored in the module global ``_lipsync_service`` so that
    later calls reuse the same object instead of initializing a new one.
    """
    global _lipsync_service
    if _lipsync_service is not None:
        return _lipsync_service
    _lipsync_service = LipSyncService()
    return _lipsync_service


async def _check_lipsync_ready(force: bool = False) -> bool:
    """Report whether the LipSync service is ready, with a 5-minute cache.

    Args:
        force: When True, ignore any cached result and query the service.

    Returns:
        True if the service's health report has a truthy ``"ready"`` entry,
        False otherwise (including when the key is absent).
    """
    global _lipsync_ready, _lipsync_last_check

    # Use a monotonic clock for the cache age so that wall-clock adjustments
    # (NTP corrections, manual changes) cannot stretch or shrink the TTL.
    now = time.monotonic()
    cache_is_fresh = (
        _lipsync_ready is not None
        and (now - _lipsync_last_check) < 300
    )
    if cache_is_fresh and not force:
        return bool(_lipsync_ready)

    health = await _get_lipsync_service().check_health()
    # Normalize to a real bool so the cached value matches its annotation.
    _lipsync_ready = bool(health.get("ready", False))
    _lipsync_last_check = now
    print(f"[LipSync] Health check: ready={_lipsync_ready}")
    return _lipsync_ready


async def _download_material(path_or_url: str, temp_path: Path):
    """Fetch a material into ``temp_path`` from an HTTP(S) URL or a local path.

    Args:
        path_or_url: Either an ``http://`` / ``https://`` URL, or a filesystem
            path. Relative paths are resolved against
            ``settings.BASE_DIR.parent``.
        temp_path: Destination file; overwritten if it already exists.

    Raises:
        httpx.HTTPStatusError: The server answered with an error status.
        FileNotFoundError: The local source path does not exist.
    """
    # Match the full scheme (case-insensitively) rather than a bare "http"
    # prefix, so a relative local path such as "http_assets/a.mp4" is not
    # mistaken for a URL.
    if path_or_url.lower().startswith(("http://", "https://")):
        # NOTE(review): all httpx timeouts are disabled here, so a stalled
        # server keeps the task waiting indefinitely — confirm this is intended.
        timeout = httpx.Timeout(None)
        async with httpx.AsyncClient(timeout=timeout) as client:
            async with client.stream("GET", path_or_url) as resp:
                resp.raise_for_status()
                # Stream chunk by chunk so the whole body is never held in memory.
                with open(temp_path, "wb") as f:
                    async for chunk in resp.aiter_bytes():
                        f.write(chunk)
        return

    src = Path(path_or_url)
    if not src.is_absolute():
        src = settings.BASE_DIR.parent / path_or_url

    if not src.exists():
        raise FileNotFoundError(f"Material not found: {path_or_url}")

    import shutil
    shutil.copy(src, temp_path)


def _update_task(task_id: str, **updates: Any) -> None:
    """Apply the given keyword fields to the task identified by ``task_id``.

    The keyword arguments are forwarded as a single dict to
    ``task_store.update``.
    """
    changed_fields = updates
    task_store.update(task_id, changed_fields)


async def process_video_generation(task_id: str, req: GenerateRequest, user_id: str):
    """Run the whole video-generation pipeline for one task.

    Stages, in order: fetch the source material, synthesize speech (voice
    clone or the default TTS), lip-sync the video (or copy it unchanged when
    the lip-sync service reports not ready), optionally align subtitles,
    optionally mix background music, compose/render the final video, upload
    it, and record the result through ``_update_task``.

    Errors do not propagate: any ``Exception`` marks the task as ``failed``
    (message plus traceback stored on the task) and is logged. Temporary files
    registered in ``temp_files`` are removed whether the run succeeds or not.

    Args:
        task_id: Identifier used for task-store updates and temp file names.
        req: Generation parameters (material, text, voice, subtitle/title/BGM
            options).
        user_id: Used as the folder prefix of the uploaded output path.
    """
    import shutil

    temp_files = []
    try:
        start_time = time.time()
        _update_task(task_id, status="processing", progress=5, message="正在下载素材...")

        temp_dir = settings.UPLOAD_DIR / "temp"
        temp_dir.mkdir(parents=True, exist_ok=True)

        input_material_path = temp_dir / f"{task_id}_input.mp4"
        temp_files.append(input_material_path)

        await _download_material(req.material_path, input_material_path)

        # ---- Speech synthesis ------------------------------------------
        _update_task(task_id, message="正在生成语音...", progress=10)

        audio_path = temp_dir / f"{task_id}_audio.wav"
        temp_files.append(audio_path)

        # Time the TTS stage on its own; measuring from start_time would
        # wrongly include the material download above.
        tts_start = time.time()
        if req.tts_mode == "voiceclone":
            if not req.ref_audio_id or not req.ref_text:
                raise ValueError("声音克隆模式需要提供参考音频和参考文字")

            _update_task(task_id, message="正在下载参考音频...")

            ref_audio_local = temp_dir / f"{task_id}_ref.wav"
            temp_files.append(ref_audio_local)

            ref_audio_url = await storage_service.get_signed_url(
                bucket="ref-audios",
                path=req.ref_audio_id
            )
            await _download_material(ref_audio_url, ref_audio_local)

            _update_task(task_id, message="正在克隆声音 (Qwen3-TTS)...")
            await voice_clone_service.generate_audio(
                text=req.text,
                ref_audio_path=str(ref_audio_local),
                ref_text=req.ref_text,
                output_path=str(audio_path),
                language="Chinese"
            )
        else:
            _update_task(task_id, message="正在生成语音 (EdgeTTS)...")
            tts = TTSService()
            await tts.generate_audio(req.text, req.voice, str(audio_path))

        tts_time = time.time() - tts_start
        print(f"[Pipeline] TTS completed in {tts_time:.1f}s")
        _update_task(task_id, progress=25)

        # ---- Lip sync --------------------------------------------------
        _update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)

        lipsync = _get_lipsync_service()
        lipsync_video_path = temp_dir / f"{task_id}_lipsync.mp4"
        temp_files.append(lipsync_video_path)

        lipsync_start = time.time()
        is_ready = await _check_lipsync_ready()

        if is_ready:
            print("[LipSync] Starting LatentSync inference...")
            _update_task(task_id, progress=35, message="正在运行 LatentSync 推理...")
            await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path))
        else:
            # Degrade gracefully: continue with the unmodified source video.
            print("[LipSync] LatentSync not ready, copying original video")
            _update_task(task_id, message="唇形同步不可用，使用原始视频...")
            shutil.copy(str(input_material_path), lipsync_video_path)

        lipsync_time = time.time() - lipsync_start
        print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
        _update_task(task_id, progress=80)

        # ---- Subtitles (best effort) -----------------------------------
        captions_path = None
        if req.enable_subtitles:
            _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)

            captions_path = temp_dir / f"{task_id}_captions.json"
            temp_files.append(captions_path)

            try:
                await whisper_service.align(
                    audio_path=str(audio_path),
                    text=req.text,
                    output_path=str(captions_path)
                )
                print("[Pipeline] Whisper alignment completed")
            except Exception as e:
                logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
                captions_path = None

        # Subtitles are only usable when alignment actually produced a file.
        # Everything downstream keys off this flag instead of the raw request
        # option, so a failed alignment really does skip subtitles.
        has_captions = captions_path is not None and captions_path.exists()

        _update_task(task_id, progress=85)

        # ---- Background music (best effort) ----------------------------
        video = VideoService()
        final_audio_path = audio_path
        if req.bgm_id:
            _update_task(task_id, message="正在合成背景音乐...", progress=86)

            bgm_path = resolve_bgm_path(req.bgm_id)
            if bgm_path:
                mix_output_path = temp_dir / f"{task_id}_audio_mix.wav"
                temp_files.append(mix_output_path)
                volume = req.bgm_volume if req.bgm_volume is not None else 0.2
                # Clamp to the [0.0, 1.0] range.
                volume = max(0.0, min(float(volume), 1.0))
                try:
                    video.mix_audio(
                        voice_path=str(audio_path),
                        bgm_path=str(bgm_path),
                        output_path=str(mix_output_path),
                        bgm_volume=volume
                    )
                    final_audio_path = mix_output_path
                except Exception as e:
                    logger.warning(f"BGM mix failed, fallback to voice only: {e}")
            else:
                logger.warning(f"BGM not found: {req.bgm_id}")

        # ---- Styles ----------------------------------------------------
        # The overlay renderer is only needed when there is something to draw.
        use_remotion = has_captions or bool(req.title)

        subtitle_style = None
        title_style = None
        if has_captions:
            subtitle_style = get_style("subtitle", req.subtitle_style_id) or get_default_style("subtitle")
            if req.subtitle_font_size:
                # Copy before overriding: the style object may be shared by the
                # assets service, and mutating it would leak this request's
                # font size into later requests.
                subtitle_style = dict(subtitle_style or {})
                subtitle_style["font_size"] = int(req.subtitle_font_size)
        if req.title:
            title_style = get_style("title", req.title_style_id) or get_default_style("title")
            if req.title_font_size:
                title_style = dict(title_style or {})
                title_style["font_size"] = int(req.title_font_size)

        if use_remotion:
            # NOTE(review): prepare_style_for_remotion is given temp_dir and a
            # file-name prefix; if it writes files there they are not tracked in
            # temp_files — confirm whether they need cleanup.
            subtitle_style = prepare_style_for_remotion(
                subtitle_style,
                temp_dir,
                f"{task_id}_subtitle_font"
            )
            title_style = prepare_style_for_remotion(
                title_style,
                temp_dir,
                f"{task_id}_title_font"
            )

        # ---- Final composition -----------------------------------------
        final_output_local_path = temp_dir / f"{task_id}_output.mp4"
        temp_files.append(final_output_local_path)

        if use_remotion:
            _update_task(task_id, message="正在合成视频 (Remotion)...", progress=87)

            composed_video_path = temp_dir / f"{task_id}_composed.mp4"
            temp_files.append(composed_video_path)

            await video.compose(str(lipsync_video_path), str(final_audio_path), str(composed_video_path))

            rendered = False
            remotion_health = await remotion_service.check_health()
            if remotion_health.get("ready"):
                try:
                    def on_remotion_progress(percent):
                        # Map the renderer's percentage onto the 87..95 band.
                        mapped = 87 + int(percent * 0.08)
                        _update_task(task_id, progress=mapped)

                    await remotion_service.render(
                        video_path=str(composed_video_path),
                        output_path=str(final_output_local_path),
                        captions_path=str(captions_path) if has_captions else None,
                        title=req.title,
                        title_duration=3.0,
                        fps=25,
                        enable_subtitles=has_captions,
                        subtitle_style=subtitle_style,
                        title_style=title_style,
                        on_progress=on_remotion_progress
                    )
                    rendered = True
                    print("[Pipeline] Remotion render completed")
                except Exception as e:
                    logger.warning(f"Remotion render failed, using FFmpeg fallback: {e}")
            else:
                logger.warning(f"Remotion not ready: {remotion_health.get('error')}, using FFmpeg")

            if not rendered:
                # Fall back to the composed video without overlays.
                shutil.copy(str(composed_video_path), final_output_local_path)
        else:
            _update_task(task_id, message="正在合成最终视频...", progress=90)

            await video.compose(str(lipsync_video_path), str(final_audio_path), str(final_output_local_path))

        total_time = time.time() - start_time

        # ---- Upload ----------------------------------------------------
        _update_task(task_id, message="正在上传结果...", progress=95)

        storage_path = f"{user_id}/{task_id}_output.mp4"
        # Read and close the file before awaiting the upload so the handle is
        # not held open for the duration of the network call.
        file_data = final_output_local_path.read_bytes()
        await storage_service.upload_file(
            bucket=storage_service.BUCKET_OUTPUTS,
            path=storage_path,
            file_data=file_data,
            content_type="video/mp4"
        )

        signed_url = await storage_service.get_signed_url(
            bucket=storage_service.BUCKET_OUTPUTS,
            path=storage_path
        )

        print(f"[Pipeline] Total generation time: {total_time:.1f}s")

        _update_task(
            task_id,
            status="completed",
            progress=100,
            message=f"生成完成！耗时 {total_time:.0f} 秒",
            output=storage_path,
            download_url=signed_url,
        )

    except Exception as e:
        # Log first (with traceback) so the failure is recorded even if the
        # task-store update below raises.
        logger.exception(f"Generate video failed: {e}")
        _update_task(
            task_id,
            status="failed",
            message=f"错误: {str(e)}",
            error=traceback.format_exc(),
        )
    finally:
        # Best-effort cleanup; a file that cannot be removed must not mask the
        # task's outcome.
        for temp_file in temp_files:
            try:
                if temp_file.exists():
                    temp_file.unlink()
            except Exception as e:
                print(f"Error cleaning up {temp_file}: {e}")


async def get_lipsync_health():
    """Return the LipSync service's health report.

    Queries the shared service instance directly; the cached readiness flag
    used by ``_check_lipsync_ready`` is neither read nor updated.
    """
    return await _get_lipsync_service().check_health()


async def get_voiceclone_health():
    """Return the result of the voice-clone service's health check."""
    health = await voice_clone_service.check_health()
    return health