Files
ViGent2/models/MuseTalk/scripts/server.py
Kevin Wong 0e3502c6f0 更新
2026-02-27 16:11:34 +08:00

573 lines
20 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
MuseTalk v1.5 常驻推理服务 (优化版 v2)
- 端口: 8011
- GPU: 从 backend/.env 读取 MUSETALK_GPU_ID (默认 0)
- 架构: FastAPI + lifespan (与 LatentSync server.py 同模式)
优化项 (vs v1):
1. cv2.VideoCapture 直读帧 (跳过 ffmpeg→PNG→imread)
2. 人脸检测降频 (每 N 帧检测, 中间插值 bbox)
3. BiSeNet mask 缓存 (每 N 帧更新, 中间复用)
4. cv2.VideoWriter 直写视频 (跳过逐帧 PNG 写盘)
5. batch_size 8→32
6. 每阶段计时
"""
import os
import sys
import math
import copy
import time
import glob
import shutil
import tempfile
import subprocess
from pathlib import Path
# --- 自动加载 GPU 配置 (必须在 torch 导入前) ---
def load_gpu_config(env_path=None):
    """尝试从后端 .env 文件读取 MUSETALK_GPU_ID

    Must be called BEFORE importing torch so CUDA_VISIBLE_DEVICES takes effect.

    Args:
        env_path: optional explicit path to the .env file; when None the
            default location ``../../../backend/.env`` (relative to this
            script) is used.
    """
    try:
        if env_path is None:
            current_dir = Path(__file__).resolve().parent
            env_path = current_dir.parent.parent.parent / "backend" / ".env"
        else:
            env_path = Path(env_path)
        target_gpu = "0"  # default: GPU 0
        if env_path.exists():
            print(f"📖 读取配置文件: {env_path}")
            with open(env_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line.startswith("MUSETALK_GPU_ID="):
                        # maxsplit=1 keeps everything after the FIRST '=' so a
                        # value containing '=' is not truncated; then strip an
                        # inline '#' comment.
                        val = line.split("=", 1)[1].strip().split("#")[0].strip()
                        if val:
                            target_gpu = val
                            print(f"⚙️ 发现配置 MUSETALK_GPU_ID={target_gpu}")
                        break
        # Respect an externally-set CUDA_VISIBLE_DEVICES; only set it ourselves
        # when the caller did not.
        if "CUDA_VISIBLE_DEVICES" not in os.environ:
            os.environ["CUDA_VISIBLE_DEVICES"] = target_gpu
            print(f"✅ 已自动设置: CUDA_VISIBLE_DEVICES={target_gpu}")
        else:
            print(f" 检测到外部 CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']},跳过自动配置")
    except Exception as e:
        # Best-effort: never block startup on a config read failure.
        print(f"⚠️ 读取 GPU 配置失败: {e},将使用默认设置")
load_gpu_config()

# --- 性能优化: 限制 CPU 线程数 ---
# Cap the CPU thread pools before torch/numpy are imported so the limits
# are honored by their thread-pool initialization.
for _thread_env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "TORCH_NUM_THREADS"):
    os.environ[_thread_env] = "8"
print("⚙️ 已限制 PyTorch CPU 线程数为 8防止系统卡顿")
import cv2
import torch
import pickle
import numpy as np
from tqdm import tqdm
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
from transformers import WhisperModel
# 添加项目根目录到 sys.path (MuseTalk 根目录)
musetalk_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(musetalk_root))
from musetalk.utils.blending import get_image, get_image_blending, get_image_prepare_material
from musetalk.utils.face_parsing import FaceParsing
from musetalk.utils.audio_processor import AudioProcessor
from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
# --- 从 .env 读取额外配置 ---
def load_env_config(env_path=None):
    """读取 MuseTalk 相关环境变量

    Args:
        env_path: optional explicit path to the .env file; when None the
            default location ``backend/.env`` next to the project root is used.

    Returns:
        dict with keys ``batch_size`` (int), ``version`` (str) and
        ``use_float16`` (bool); defaults are kept for missing/empty entries
        or on any read/parse failure.
    """
    config = {
        "batch_size": 32,
        "version": "v15",
        "use_float16": True,
    }
    try:
        if env_path is None:
            env_path = musetalk_root.parent.parent / "backend" / ".env"
        else:
            env_path = Path(env_path)
        if env_path.exists():
            with open(env_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    # maxsplit=1 so values containing '=' are not truncated;
                    # split("#") strips inline comments.
                    if line.startswith("MUSETALK_BATCH_SIZE="):
                        val = line.split("=", 1)[1].strip().split("#")[0].strip()
                        if val:
                            config["batch_size"] = int(val)
                    elif line.startswith("MUSETALK_VERSION="):
                        val = line.split("=", 1)[1].strip().split("#")[0].strip()
                        if val:
                            config["version"] = val
                    elif line.startswith("MUSETALK_USE_FLOAT16="):
                        val = line.split("=", 1)[1].strip().split("#")[0].strip().lower()
                        # An empty value keeps the default (True) instead of
                        # silently forcing False.
                        if val:
                            config["use_float16"] = val in ("true", "1", "yes")
    except Exception as e:
        print(f"⚠️ 读取额外配置失败: {e}")
    return config
env_config = load_env_config()  # read once at import; shared module-wide
# Global model cache; populated by lifespan() at startup, cleared at shutdown.
models = {}
# ===================== tuning parameters =====================
DETECT_EVERY = 5  # face-detection subsampling: run the detector every N frames
BLEND_CACHE_EVERY = 5  # BiSeNet mask cache: refresh the parsing mask every N frames
# ====================================================
def run_ffmpeg(cmd):
    """执行 FFmpeg 命令

    Args:
        cmd: full shell command string.

    Returns:
        True when the command exits with status 0, otherwise prints
        diagnostics and returns False.

    NOTE(review): runs with ``shell=True`` on an interpolated command string;
    callers must ensure any embedded paths are shell-safe (quote or avoid
    spaces/metacharacters) — verify call sites.
    """
    print(f"Executing: {cmd}")
    try:
        subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error executing ffmpeg: {cmd}")
        print(f"Return code: {e.returncode}")
        # stderr can be very long; show only the first 500 chars.
        print(f"Stderr: {e.stderr[:500]}")
        return False
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load every model exactly once at startup.

    Loads the VAE / UNet / positional encoder via load_all_model, the Whisper
    audio encoder and the BiSeNet face parser, then publishes everything in
    the module-level ``models`` dict.  On shutdown the dict is cleared and
    the CUDA cache released.
    """
    print("⏳ 正在加载 MuseTalk v1.5 模型...")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    version = env_config["version"]
    use_float16 = env_config["use_float16"]
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"🖥️ 正在使用 GPU: {gpu_name}")
    else:
        print("⚠️ 警告: 未检测到 GPU将使用 CPU 进行推理 (速度极慢)")
    # Select weight files according to the model version (v15 vs legacy).
    models_dir = musetalk_root / "models"
    if version == "v15":
        unet_model_path = str(models_dir / "musetalkV15" / "unet.pth")
        unet_config = str(models_dir / "musetalk" / "config.json")
    else:
        unet_model_path = str(models_dir / "musetalk" / "pytorch_model.bin")
        unet_config = str(models_dir / "musetalk" / "musetalk.json")
    # load_all_model loads the VAE with relative paths, so temporarily chdir
    # into the MuseTalk root for the whole loading phase.
    original_cwd = os.getcwd()
    os.chdir(str(musetalk_root))
    vae, unet, pe = load_all_model(
        unet_model_path=unet_model_path,
        vae_type="sd-vae",
        unet_config=unet_config,
        device=device,
    )
    # Optional fp16 cast (GPU only) before moving modules to the device.
    if use_float16 and torch.cuda.is_available():
        print("⚡ 使用 float16 半精度加速")
        pe = pe.half()
        vae.vae = vae.vae.half()
        unet.model = unet.model.half()
    pe = pe.to(device)
    vae.vae = vae.vae.to(device)
    unet.model = unet.model.to(device)
    # Whisper audio encoder, cast to the UNet's dtype; inference only.
    whisper_dir = str(models_dir / "whisper")
    audio_processor = AudioProcessor(feature_extractor_path=whisper_dir)
    weight_dtype = unet.model.dtype
    whisper = WhisperModel.from_pretrained(whisper_dir)
    whisper = whisper.to(device=device, dtype=weight_dtype).eval()
    whisper.requires_grad_(False)
    # BiSeNet-based FaceParsing; v15 widens the cheek margins.
    if version == "v15":
        fp = FaceParsing(left_cheek_width=90, right_cheek_width=90)
    else:
        fp = FaceParsing()
    # Restore the working directory before serving requests.
    os.chdir(original_cwd)
    models["vae"] = vae
    models["unet"] = unet
    models["pe"] = pe
    models["whisper"] = whisper
    models["audio_processor"] = audio_processor
    models["fp"] = fp
    models["device"] = device
    models["weight_dtype"] = weight_dtype
    models["version"] = version
    # Single diffusion timestep 0 (MuseTalk runs the UNet once per frame).
    models["timesteps"] = torch.tensor([0], device=device)
    print("✅ MuseTalk v1.5 模型加载完成,服务就绪!")
    print(f"⚙️ 优化参数: batch_size={env_config['batch_size']}, "
          f"detect_every={DETECT_EVERY}, blend_cache_every={BLEND_CACHE_EVERY}")
    yield
    # Shutdown: drop all model references and free GPU memory.
    models.clear()
    torch.cuda.empty_cache()
app = FastAPI(lifespan=lifespan)


class LipSyncRequest(BaseModel):
    """Request payload for POST /lipsync."""
    video_path: str  # path of the source (template) video or single image
    audio_path: str  # path of the driving audio file
    video_out_path: str  # path where the result video is written
    batch_size: int = 32  # UNet batch size; NOTE(review): duplicates env_config default — confirm intended
@app.get("/health")
def health_check():
    """Liveness probe: reports whether the UNet has been loaded yet."""
    model_ready = "unet" in models
    return {"status": "ok", "model_loaded": model_ready}
@app.post("/lipsync")
async def generate_lipsync(req: LipSyncRequest):
    """Validate the request, run lip-sync inference, and report timing.

    Returns the dict produced by _run_inference; maps any inference failure
    to HTTP 500 (after printing the traceback), missing inputs to 404, and
    an unloaded model to 503.
    """
    if "unet" not in models:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if not os.path.exists(req.video_path):
        raise HTTPException(status_code=404, detail=f"Video not found: {req.video_path}")
    if not os.path.exists(req.audio_path):
        raise HTTPException(status_code=404, detail=f"Audio not found: {req.audio_path}")
    print(f"🎬 收到任务: {Path(req.video_path).name} -> {Path(req.video_out_path).name}")
    started_at = time.time()
    try:
        result = _run_inference(req)
        wall = time.time() - started_at
        print(f"✅ 推理完成,耗时 {wall:.1f}s ({wall/60:.1f}min)")
        return result
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
# =====================================================================
# 降频人脸检测: 每 N 帧检测一次, 中间帧线性插值 bbox
# =====================================================================
def _detect_faces_subsampled(frames, detect_every=5):
    """
    Subsampled face detection:
    - run DWPose + FaceAlignment only every ``detect_every`` frames
    - linearly interpolate bbox coordinates for the frames in between
    - for talking-head footage (face nearly static) the interpolation error
      is negligible

    Returns a list with one bbox tuple (x1, y1, x2, y2) per input frame;
    frames where detection failed get ``coord_placeholder``.
    """
    from mmpose.apis import inference_topdown
    from mmpose.structures import merge_data_samples
    import musetalk.utils.preprocessing as _prep
    n = len(frames)
    if n == 0:
        return []
    # Frames that actually get a detector pass; always include the last frame
    # so every in-between frame has a bracketing sampled pair to interpolate.
    sampled_indices = list(range(0, n, detect_every))
    if sampled_indices[-1] != n - 1:
        sampled_indices.append(n - 1)
    print(f" 检测 {len(sampled_indices)}/{n} 帧 (每{detect_every}帧)")
    # Run detection on the sampled frames only.
    detected = {}
    for idx in tqdm(sampled_indices, desc="人脸检测"):
        frame = frames[idx]
        try:
            results = inference_topdown(_prep.model, frame)
            results = merge_data_samples(results)
            keypoints = results.pred_instances.keypoints
            # Slice 23:91 — assumes DWPose whole-body keypoint layout where
            # these indices are the face landmarks; TODO confirm upstream.
            face_land_mark = keypoints[0][23:91].astype(np.int32)
            bbox_list = _prep.fa.get_detections_for_batch(np.array([frame]))
            if bbox_list[0] is None:
                detected[idx] = coord_placeholder
                continue
            # Build the crop box: top edge mirrored around landmark 29 by the
            # distance to the lowest landmark, clamped to the image top.
            half_face_coord = face_land_mark[29].copy()
            half_face_dist = np.max(face_land_mark[:, 1]) - half_face_coord[1]
            upper_bond = max(0, half_face_coord[1] - half_face_dist)
            f_landmark = (
                int(np.min(face_land_mark[:, 0])),
                int(upper_bond),
                int(np.max(face_land_mark[:, 0])),
                int(np.max(face_land_mark[:, 1])),
            )
            x1, y1, x2, y2 = f_landmark
            # Degenerate landmark box -> fall back to the detector bbox.
            if y2 - y1 <= 0 or x2 - x1 <= 0 or x1 < 0:
                detected[idx] = bbox_list[0] if bbox_list[0] is not None else coord_placeholder
            else:
                detected[idx] = f_landmark
        except Exception as e:
            print(f"⚠️ 帧 {idx} 检测失败: {e}")
            detected[idx] = coord_placeholder
    # Fill all frames: copy sampled results, interpolate the rest.
    coord_list = [None] * n
    for idx in sampled_indices:
        coord_list[idx] = detected[idx]
    for i in range(n):
        if coord_list[i] is not None:
            continue
        # Nearest sampled frames before/after i (always exist, since both
        # index 0 and n-1 are sampled).
        prev_idx = max(j for j in sampled_indices if j < i)
        next_idx = min(j for j in sampled_indices if j > i)
        prev_bbox = detected[prev_idx]
        next_bbox = detected[next_idx]
        if prev_bbox == coord_placeholder and next_bbox == coord_placeholder:
            coord_list[i] = coord_placeholder
        elif prev_bbox == coord_placeholder:
            coord_list[i] = next_bbox
        elif next_bbox == coord_placeholder:
            coord_list[i] = prev_bbox
        else:
            # Linear interpolation of each of the four bbox coordinates.
            alpha = (i - prev_idx) / (next_idx - prev_idx)
            coord_list[i] = tuple(
                int(a * (1 - alpha) + b * alpha)
                for a, b in zip(prev_bbox, next_bbox)
            )
    return coord_list
# =====================================================================
# 核心推理 (优化版)
# =====================================================================
@torch.no_grad()
def _run_inference(req: LipSyncRequest) -> dict:
    """
    Optimized inference pipeline:
    1. cv2.VideoCapture reads frames directly (skips ffmpeg→PNG→imread)
    2. subsampled face detection (every N frames, interpolated in between)
    3. BiSeNet mask cache (refreshed every N frames)
    4. cv2.VideoWriter writes directly (skips per-frame PNGs)
    5. per-phase timing

    Returns {"status": "success", "output_path": ...}; raises RuntimeError /
    ValueError on failure (mapped to HTTP 500 by the endpoint).
    """
    vae = models["vae"]
    unet = models["unet"]
    pe = models["pe"]
    whisper = models["whisper"]
    audio_processor = models["audio_processor"]
    fp = models["fp"]
    device = models["device"]
    weight_dtype = models["weight_dtype"]
    version = models["version"]
    timesteps = models["timesteps"]
    # Per-request batch size wins; falls back to the .env-configured value
    # when the request carries a falsy batch_size.
    batch_size = req.batch_size or env_config["batch_size"]
    video_path = req.video_path
    audio_path = req.audio_path
    output_vid_path = req.video_out_path
    os.makedirs(os.path.dirname(output_vid_path), exist_ok=True)
    t_total = time.time()
    timings = {}
    # ===== Phase 1: read video frames (cv2.VideoCapture, skip ffmpeg→PNG) =====
    t0 = time.time()
    if get_file_type(video_path) == "video":
        cap = cv2.VideoCapture(video_path)
        # `or 25.0` guards against CAP_PROP_FPS returning 0/None.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
        cap.release()
    elif get_file_type(video_path) == "image":
        frames = [cv2.imread(video_path)]
        fps = 25.0
    else:
        raise ValueError(f"不支持的文件类型: {video_path}")
    timings["1_read"] = time.time() - t0
    print(f"📹 读取 {len(frames)} 帧, FPS={fps} [{timings['1_read']:.1f}s]")
    if not frames:
        raise RuntimeError("视频帧为空")
    # ===== Phase 2: Whisper audio features =====
    t0 = time.time()
    whisper_input_features, librosa_length = audio_processor.get_audio_feature(audio_path)
    whisper_chunks = audio_processor.get_whisper_chunk(
        whisper_input_features, device, weight_dtype, whisper, librosa_length,
        fps=fps,
        audio_padding_length_left=2,
        audio_padding_length_right=2,
    )
    timings["2_whisper"] = time.time() - t0
    print(f"🎵 Whisper 特征 [{timings['2_whisper']:.1f}s]")
    # ===== Phase 3: face detection (subsampled) =====
    t0 = time.time()
    coord_list = _detect_faces_subsampled(frames, detect_every=DETECT_EVERY)
    timings["3_face"] = time.time() - t0
    print(f"🔍 人脸检测 [{timings['3_face']:.1f}s]")
    # ===== Phase 4: VAE latent encoding =====
    t0 = time.time()
    input_latent_list = []
    extra_margin = 10  # v15: extend the crop below the chin by 10 px
    for bbox, frame in zip(coord_list, frames):
        # NOTE(review): placeholder frames are skipped here, so latent indices
        # shift relative to coord_list when any frame has no face — Phase 6
        # indexes both cycles with the same i; confirm inputs always contain a
        # detectable face (this mirrors upstream MuseTalk behavior).
        if bbox == coord_placeholder:
            continue
        x1, y1, x2, y2 = bbox
        if version == "v15":
            y2 = min(y2 + extra_margin, frame.shape[0])
        crop_frame = frame[y1:y2, x1:x2]
        crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
        latents = vae.get_latents_for_unet(crop_frame)
        input_latent_list.append(latents)
    timings["4_vae"] = time.time() - t0
    print(f"🧠 VAE 编码 [{timings['4_vae']:.1f}s]")
    # ===== Phase 5: batched UNet inference =====
    t0 = time.time()
    # Ping-pong cycles (forward + reversed) so audio longer than the video
    # loops smoothly; these are reference copies, not data copies.
    frame_list_cycle = frames + frames[::-1]
    coord_list_cycle = coord_list + coord_list[::-1]
    input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
    video_num = len(whisper_chunks)
    gen = datagen(
        whisper_chunks=whisper_chunks,
        vae_encode_latents=input_latent_list_cycle,
        batch_size=batch_size,
        delay_frame=0,
        device=device,
    )
    res_frame_list = []
    total_batches = int(np.ceil(float(video_num) / batch_size))
    print(f"🚀 推理: {video_num} 帧, batch={batch_size}, {total_batches}")
    for i, (whisper_batch, latent_batch) in enumerate(tqdm(gen, total=total_batches)):
        audio_feature_batch = pe(whisper_batch)
        latent_batch = latent_batch.to(dtype=unet.model.dtype)
        pred_latents = unet.model(
            latent_batch, timesteps,
            encoder_hidden_states=audio_feature_batch
        ).sample
        recon = vae.decode_latents(pred_latents)
        for res_frame in recon:
            res_frame_list.append(res_frame)
    timings["5_unet"] = time.time() - t0
    print(f"✅ UNet 推理: {len(res_frame_list)} 帧 [{timings['5_unet']:.1f}s]")
    # ===== Phase 6: compositing (cached BiSeNet mask + cv2.VideoWriter) =====
    t0 = time.time()
    h, w = frames[0].shape[:2]
    temp_raw_path = output_vid_path + ".raw.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(temp_raw_path, fourcc, fps, (w, h))
    if not writer.isOpened():
        raise RuntimeError(f"cv2.VideoWriter 打开失败: {temp_raw_path}")
    cached_mask = None
    cached_crop_box = None
    blend_mode = "jaw" if version == "v15" else "raw"
    for i in tqdm(range(len(res_frame_list)), desc="合成"):
        res_frame = res_frame_list[i]
        bbox = coord_list_cycle[i % len(coord_list_cycle)]
        ori_frame = frame_list_cycle[i % len(frame_list_cycle)].copy()
        x1, y1, x2, y2 = bbox
        if version == "v15":
            y2 = min(y2 + extra_margin, ori_frame.shape[0])
        adjusted_bbox = (x1, y1, x2, y2)
        try:
            res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
        except Exception:
            # Degenerate bbox (e.g. placeholder) — emit the original frame.
            writer.write(ori_frame)
            continue
        # Refresh the BiSeNet face-parsing mask every N frames; reuse the
        # cached mask in between.
        if i % BLEND_CACHE_EVERY == 0 or cached_mask is None:
            try:
                cached_mask, cached_crop_box = get_image_prepare_material(
                    ori_frame, adjusted_bbox, mode=blend_mode, fp=fp)
            except Exception:
                # prepare failed — fall back to the full (slow) path.
                combine_frame = get_image(
                    ori_frame, res_frame, list(adjusted_bbox),
                    mode=blend_mode, fp=fp)
                writer.write(combine_frame)
                continue
        try:
            combine_frame = get_image_blending(
                ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
        except Exception:
            # blending failed — fall back to the full (slow) path.
            combine_frame = get_image(
                ori_frame, res_frame, list(adjusted_bbox),
                mode=blend_mode, fp=fp)
        writer.write(combine_frame)
    writer.release()
    timings["6_blend"] = time.time() - t0
    print(f"🎨 合成 [{timings['6_blend']:.1f}s]")
    # ===== Phase 7: FFmpeg re-encode to H.264 + mux audio =====
    t0 = time.time()
    # NOTE(review): paths are interpolated unquoted into a shell command —
    # paths containing spaces or shell metacharacters will break; consider
    # shlex.quote. Verify what paths callers can supply.
    cmd = (
        f"ffmpeg -y -v warning -i {temp_raw_path} -i {audio_path} "
        f"-c:v libx264 -crf 18 -pix_fmt yuv420p "
        f"-c:a copy -shortest {output_vid_path}"
    )
    if not run_ffmpeg(cmd):
        raise RuntimeError("FFmpeg 重编码+音频合并失败")
    # Remove the intermediate mp4v file.
    if os.path.exists(temp_raw_path):
        os.unlink(temp_raw_path)
    timings["7_encode"] = time.time() - t0
    print(f"🔊 编码+音频 [{timings['7_encode']:.1f}s]")
    # ===== summary =====
    total_time = time.time() - t_total
    print(f"\n⏱️ 总耗时: {total_time:.1f}s ({total_time/60:.1f}min)")
    for k, v in timings.items():
        pct = v / total_time * 100
        print(f" {k}: {v:.1f}s ({pct:.0f}%)")
    if not os.path.exists(output_vid_path):
        raise RuntimeError("输出文件未生成")
    return {"status": "success", "output_path": output_vid_path}
if __name__ == "__main__":
    import uvicorn
    # Listen on all interfaces; port 8011 matches the module docstring.
    uvicorn.run(app, host="0.0.0.0", port=8011)