更新

2026-02-28 09:16:41 +08:00
parent 0e3502c6f0
commit 29c67f629d
6 changed files with 160 additions and 52 deletions
--- a/models/MuseTalk/scripts/server.py
+++ b/models/MuseTalk/scripts/server.py
@@ -77,7 +77,7 @@ from transformers import WhisperModel
 musetalk_root = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(musetalk_root))

-from musetalk.utils.blending import get_image, get_image_blending, get_image_prepare_material
+from musetalk.utils.blending import get_image, get_image_blending, get_image_blending_fast, get_image_prepare_material
 from musetalk.utils.face_parsing import FaceParsing
 from musetalk.utils.audio_processor import AudioProcessor
 from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
@@ -124,13 +124,15 @@ BLEND_CACHE_EVERY = 5   # BiSeNet mask 缓存: 每 N 帧更新一次


 def run_ffmpeg(cmd):
-    """执行 FFmpeg 命令"""
-    print(f"Executing: {cmd}")
+    """执行 FFmpeg 命令（接受列表或字符串）"""
+    if isinstance(cmd, str):
+        cmd = cmd.split()
+    print(f"Executing: {' '.join(cmd)}")
    try:
-        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
-        print(f"Error executing ffmpeg: {cmd}")
+        print(f"Error executing ffmpeg: {' '.join(cmd)}")
        print(f"Return code: {e.returncode}")
        print(f"Stderr: {e.stderr[:500]}")
        return False
@@ -477,73 +479,93 @@ def _run_inference(req: LipSyncRequest) -> dict:
    timings["5_unet"] = time.time() - t0
    print(f"✅ UNet 推理: {len(res_frame_list)} 帧 [{timings['5_unet']:.1f}s]")

-    # ===== Phase 6: 合成 (缓存 BiSeNet mask + cv2.VideoWriter) =====
+    # ===== Phase 6: 合成 (FFmpeg pipe 直写 H.264 + 纯 numpy blending) =====
    t0 = time.time()

    h, w = frames[0].shape[:2]
    temp_raw_path = output_vid_path + ".raw.mp4"

-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    writer = cv2.VideoWriter(temp_raw_path, fourcc, fps, (w, h))
-
-    if not writer.isOpened():
-        raise RuntimeError(f"cv2.VideoWriter 打开失败: {temp_raw_path}")
+    # FFmpeg pipe: rawvideo stdin → NVENC GPU 硬件编码 H.264
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-v", "warning",
+        "-f", "rawvideo", "-pix_fmt", "bgr24",
+        "-s", f"{w}x{h}", "-r", str(fps),
+        "-i", "pipe:0",
+        "-c:v", "h264_nvenc", "-preset", "p4", "-cq", "20",
+        "-pix_fmt", "yuv420p",
+        temp_raw_path
+    ]
+    proc = subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE,
+                            stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)

    cached_mask = None
    cached_crop_box = None
    blend_mode = "jaw" if version == "v15" else "raw"

-    for i in tqdm(range(len(res_frame_list)), desc="合成"):
-        res_frame = res_frame_list[i]
-        bbox = coord_list_cycle[i % len(coord_list_cycle)]
-        ori_frame = frame_list_cycle[i % len(frame_list_cycle)].copy()
+    try:
+        for i in tqdm(range(len(res_frame_list)), desc="合成"):
+            res_frame = res_frame_list[i]
+            bbox = coord_list_cycle[i % len(coord_list_cycle)]
+            ori_frame = frame_list_cycle[i % len(frame_list_cycle)].copy()

-        x1, y1, x2, y2 = bbox
-        if version == "v15":
-            y2 = min(y2 + extra_margin, ori_frame.shape[0])
-        adjusted_bbox = (x1, y1, x2, y2)
+            x1, y1, x2, y2 = bbox
+            if version == "v15":
+                y2 = min(y2 + extra_margin, ori_frame.shape[0])
+            adjusted_bbox = (x1, y1, x2, y2)

-        try:
-            res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
-        except Exception:
-            writer.write(ori_frame)
-            continue
-
-        # 每 N 帧更新 BiSeNet 人脸解析 mask, 其余帧复用缓存
-        if i % BLEND_CACHE_EVERY == 0 or cached_mask is None:
            try:
-                cached_mask, cached_crop_box = get_image_prepare_material(
-                    ori_frame, adjusted_bbox, mode=blend_mode, fp=fp)
+                res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
            except Exception:
-                # 如果 prepare 失败, 用完整方式
-                combine_frame = get_image(
-                    ori_frame, res_frame, list(adjusted_bbox),
-                    mode=blend_mode, fp=fp)
-                writer.write(combine_frame)
+                proc.stdin.write(ori_frame.tobytes())
                continue

-        try:
-            combine_frame = get_image_blending(
-                ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
-        except Exception:
-            # blending 失败时 fallback 到完整方式
-            combine_frame = get_image(
-                ori_frame, res_frame, list(adjusted_bbox),
-                mode=blend_mode, fp=fp)
+            # 每 N 帧更新 BiSeNet 人脸解析 mask, 其余帧复用缓存
+            if i % BLEND_CACHE_EVERY == 0 or cached_mask is None:
+                try:
+                    cached_mask, cached_crop_box = get_image_prepare_material(
+                        ori_frame, adjusted_bbox, mode=blend_mode, fp=fp)
+                except Exception:
+                    # 如果 prepare 失败, 用完整方式
+                    combine_frame = get_image(
+                        ori_frame, res_frame, list(adjusted_bbox),
+                        mode=blend_mode, fp=fp)
+                    proc.stdin.write(combine_frame.tobytes())
+                    continue

-        writer.write(combine_frame)
+            try:
+                combine_frame = get_image_blending_fast(
+                    ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
+            except Exception:
+                # blending_fast 失败时 fallback 到 PIL 方式
+                try:
+                    combine_frame = get_image_blending(
+                        ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
+                except Exception:
+                    combine_frame = get_image(
+                        ori_frame, res_frame, list(adjusted_bbox),
+                        mode=blend_mode, fp=fp)
+
+            proc.stdin.write(combine_frame.tobytes())
+    finally:
+        proc.stdin.close()
+
+    proc.wait()
+    stderr_out = proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else ""
+    proc.stderr.close()
+    if proc.returncode != 0:
+        raise RuntimeError(f"FFmpeg pipe 编码失败 (rc={proc.returncode}): {stderr_out[:500]}")

-    writer.release()
    timings["6_blend"] = time.time() - t0
    print(f"🎨 合成 [{timings['6_blend']:.1f}s]")

-    # ===== Phase 7: FFmpeg 重编码 H.264 + 合并音频 =====
+    # ===== Phase 7: 音频合并 (-c:v copy, 不重编码视频) =====
    t0 = time.time()
-    cmd = (
-        f"ffmpeg -y -v warning -i {temp_raw_path} -i {audio_path} "
-        f"-c:v libx264 -crf 18 -pix_fmt yuv420p "
-        f"-c:a copy -shortest {output_vid_path}"
-    )
+    cmd = [
+        "ffmpeg", "-y", "-v", "warning",
+        "-i", temp_raw_path, "-i", audio_path,
+        "-c:v", "copy", "-c:a", "copy", "-shortest",
+        output_vid_path
+    ]
    if not run_ffmpeg(cmd):
        raise RuntimeError("FFmpeg 重编码+音频合并失败")