From 29c67f629dfd432730a2db8397231350a6f9d4f5 Mon Sep 17 00:00:00 2001
From: Kevin Wong <lamnickdavid@gmail.com>
Date: Sat, 28 Feb 2026 09:16:41 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Docs/DEPLOY_MANUAL.md                      |   2 +-
 Docs/DevLogs/Day28.md                      |  60 ++++++++++
 Docs/SUBTITLE_DEPLOY.md                    |   1 +
 README.md                                  |   2 +-
 models/MuseTalk/musetalk/utils/blending.py |  25 +++++
 models/MuseTalk/scripts/server.py          | 122 ++++++++++++---------
 6 files changed, 160 insertions(+), 52 deletions(-)

diff --git a/Docs/DEPLOY_MANUAL.md b/Docs/DEPLOY_MANUAL.md
index fab1b86..330fb80 100644
--- a/Docs/DEPLOY_MANUAL.md
+++ b/Docs/DEPLOY_MANUAL.md
@@ -97,7 +97,7 @@ python -m scripts.server  # 测试能否启动，Ctrl+C 退出
 
 ### 3b. MuseTalk 1.5 (长视频唇形同步, GPU0)
 
-> MuseTalk 是单步潜空间修复模型（非扩散模型），推理速度接近实时，适合 >=120s 的长视频。与 CosyVoice 共享 GPU0，fp16 推理约需 4-8GB 显存。
+> MuseTalk 是单步潜空间修复模型（非扩散模型），推理速度接近实时，适合 >=120s 的长视频。与 CosyVoice 共享 GPU0，fp16 推理约需 4-8GB 显存。合成阶段使用 NVENC GPU 硬编码（h264_nvenc）+ 纯 numpy blending，避免双重编码和 PIL 转换开销；因此部署环境的 FFmpeg 必须支持 `h264_nvenc` 编码器（可用 `ffmpeg -encoders | grep nvenc` 确认）。
 
 请参考详细的独立部署指南：
 **[MuseTalk 部署指南](MUSETALK_DEPLOY.md)**
diff --git a/Docs/DevLogs/Day28.md b/Docs/DevLogs/Day28.md
index 585c44b..9917e69 100644
--- a/Docs/DevLogs/Day28.md
+++ b/Docs/DevLogs/Day28.md
@@ -201,3 +201,63 @@ const materialPosterUrl = useVideoFrameCapture(
 | TensorRT (DiT 模块) | +20-30% | 需编译 .plan 引擎 |
 | torch.compile() | +10-20% | 一行代码，但首次编译慢 |
 | vLLM (LLM 模块) | +10-15% | 额外依赖 |
+
+---
+
+## MuseTalk 合成阶段性能优化
+
+### 概述
+
+MuseTalk v2 优化后总耗时从 1799s 降到 819s（2.2x），但合成阶段（Phase 6）仍占 462.2s (56.4%)，是最大单一瓶颈。本次优化从两个方向入手：用纯 numpy blending 替代 PIL 转换；用 FFmpeg pipe + NVENC GPU 硬编码替代双重编码。
+
+### 1. 纯 numpy blending 替代 PIL（blending.py）
+
+- **问题**: `get_image_blending` 每帧做 3 次 numpy↔PIL 转换 + BGR↔RGB 通道翻转，纯粹浪费
+- **方案**: 新增 `get_image_blending_fast()` 函数
+  - 全程保持 BGR numpy 数组，不做 PIL 转换和通道翻转
+  - mask 混合用 numpy 向量化广播 `mask * (1/255)` 替代 `PIL.paste with mask`
+  - 原 `get_image_blending` 保留作为 fallback
+- **降级链**: `blending_fast` → `blending`（PIL）→ `get_image`（完整重算）
+
+### 2. FFmpeg pipe + NVENC 硬编码替代双重编码（server.py）
+
+**优化前（双重编码）**:
+```
+Phase 6: 逐帧 → cv2.VideoWriter (mp4v CPU 软编码) → temp_raw.mp4
+Phase 7: FFmpeg 读 temp_raw.mp4 → H.264 CPU 重编码 + 合并音频 → output.mp4
+```
+
+**优化后（单次 GPU 编码）**:
+```
+Phase 6: 逐帧 → FFmpeg stdin pipe (rawvideo → h264_nvenc GPU 编码) → temp_raw.mp4
+Phase 7: FFmpeg 只做音频合并 (-c:v copy -c:a copy) → output.mp4  （秒级）
+```
+
+- NVENC 参数: `-c:v h264_nvenc -preset p4 -cq 20 -pix_fmt yuv420p`
+- RTX 3090 NVENC 专用芯片编码，不占 CUDA 核心，编码速度 >500fps
+
+### 3. FFmpeg 进程资源管理加固
+
+- `try/finally` 包裹写帧循环，确保异常时 `proc.stdin.close()` 执行
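+- `proc.wait()` 后读 stderr 再关闭；注意：stderr 为 PIPE 时先 `wait()` 后读取，若 FFmpeg 输出超过管道缓冲区反而会死锁，目前依赖 `-v warning` 输出量小，更稳妥的做法是改用 `proc.communicate()`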
+- stderr decode 加 `errors="ignore"` 防止非 UTF-8 崩溃
+
+### 4. `run_ffmpeg` 安全改进
+
+- 去掉 `shell=True`，改用列表传参，避免路径特殊字符导致命令注入
+- Phase 7 FFmpeg 命令从字符串拼接改为列表传参
+
+### 调优过程
+
+| 版本 | Phase 6 | Phase 7 | 总计 | 结论 |
+|------|---------|---------|------|------|
+| Day27 基线 | 462s | 38s | 819s | — |
+| v1: libx264 -preset medium | 548s | 0.3s | 854s | CPU 编码背压，反而更慢 |
+| v2: h264_nvenc（当前） | 待测 | 待测 | 待测 | NVENC 零背压，预估 Phase 6 < 200s |
+
+### 修改文件
+
+| 文件 | 改动 |
+|------|------|
+| `models/MuseTalk/musetalk/utils/blending.py` | 新增 `get_image_blending_fast()` 纯 numpy 函数 |
+| `models/MuseTalk/scripts/server.py` | Phase 6: FFmpeg pipe + NVENC + blending_fast；Phase 7: -c:v copy；`run_ffmpeg` 去掉 shell=True |
diff --git a/Docs/SUBTITLE_DEPLOY.md b/Docs/SUBTITLE_DEPLOY.md
index ff167d4..afa3d8a 100644
--- a/Docs/SUBTITLE_DEPLOY.md
+++ b/Docs/SUBTITLE_DEPLOY.md
@@ -294,3 +294,4 @@ WhisperService(device="cuda:0")  # 或 "cuda:1"
 | 2026-01-30 | 1.0.1 | 字幕高亮样式与标题动画优化，视觉表现更清晰 |
 | 2026-02-25 | 1.2.0 | 字幕时间戳从线性插值改为 Whisper 节奏映射，修复长视频字幕漂移 |
 | 2026-02-27 | 1.3.0 | 架构图更新 MuseTalk 混合路由；Remotion 并发渲染从 8 提升到 16；GPU 分配说明更新 |
+| 2026-02-28 | 1.3.1 | MuseTalk 合成阶段优化：纯 numpy blending + FFmpeg pipe NVENC GPU 硬编码替代双重编码 |
diff --git a/README.md b/README.md
index ed1e49f..b367cbb 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@
 - 💳 **付费会员** - 支付宝电脑网站支付自动开通会员，到期自动停用并引导续费，管理员手动激活并存。
 - 🔐 **认证与隔离** - 基于 Supabase 的用户隔离，支持手机号注册/登录、密码管理。
 - 🛡️ **服务守护** - 内置 Watchdog 看门狗机制，自动监控并重启僵死服务，确保 7x24h 稳定运行。
-- 🚀 **性能优化** - 视频预压缩、模型常驻服务（近实时加载）、双 GPU 流水线并发、MuseTalk 人脸检测降频 + BiSeNet 缓存、Remotion 16 并发渲染。
+- 🚀 **性能优化** - 视频预压缩、模型常驻服务（近实时加载）、双 GPU 流水线并发、MuseTalk 人脸检测降频 + BiSeNet 缓存 + NVENC GPU 硬编码、Remotion 16 并发渲染。
 
 ---
 
diff --git a/models/MuseTalk/musetalk/utils/blending.py b/models/MuseTalk/musetalk/utils/blending.py
index fa3effc..d7fcabf 100644
--- a/models/MuseTalk/musetalk/utils/blending.py
+++ b/models/MuseTalk/musetalk/utils/blending.py
@@ -109,6 +109,31 @@ def get_image_blending(image, face, face_box, mask_array, crop_box):
     return body[:,:,::-1]
 
 
+def get_image_blending_fast(image, face, face_box, mask_array, crop_box):
+    """Blend `face` into a copy of `image` using NumPy only (no PIL conversion, no channel flip); intended as a faster stand-in for get_image_blending.
+    face_box is (x, y, x1, y1) and crop_box is (x_s, y_s, x_e, y_e); mask_array is indexed as 2-D and scaled by 1/255 into blend weights. Inputs are expected to be BGR uint8 arrays (not checked here).
+    """
+    x, y, x1, y1 = face_box
+    x_s, y_s, x_e, y_e = crop_box  # NOTE(review): NumPy slicing does not pad; assumes crop_box lies inside the image — confirm against callers
+
+    result = image.copy()  # work on a copy so the caller's frame is left untouched
+
+    # 1. Paste the generated face into its position inside the crop region
+    crop_region = result[y_s:y_e, x_s:x_e].copy()  # detached copy, so orig_region below still reads unmodified pixels
+    fy, fx = y - y_s, x - x_s  # face offset relative to the crop origin
+    fh, fw = y1 - y, x1 - x  # face height / width taken from face_box
+    crop_region[fy:fy+fh, fx:fx+fw] = face  # presumably face is already resized to (fh, fw) — verify against caller
+
+    # 2. Mask-weighted alpha blend via NumPy broadcasting
+    mask_f = mask_array[:, :, np.newaxis].astype(np.float32) * (1.0 / 255.0)  # trailing axis added so the mask broadcasts over channels
+    orig_region = result[y_s:y_e, x_s:x_e].astype(np.float32)
+    new_region = crop_region.astype(np.float32)
+    blended = orig_region * (1.0 - mask_f) + new_region * mask_f
+    result[y_s:y_e, x_s:x_e] = blended.astype(np.uint8)  # float -> uint8 cast truncates the fractional part (no rounding)
+
+    return result
+
+
 def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand=1.5, fp=None, mode="raw"):
     body = Image.fromarray(image[:,:,::-1])
 
diff --git a/models/MuseTalk/scripts/server.py b/models/MuseTalk/scripts/server.py
index c5f999e..e7cc8e3 100644
--- a/models/MuseTalk/scripts/server.py
+++ b/models/MuseTalk/scripts/server.py
@@ -77,7 +77,7 @@ from transformers import WhisperModel
 musetalk_root = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(musetalk_root))
 
-from musetalk.utils.blending import get_image, get_image_blending, get_image_prepare_material
+from musetalk.utils.blending import get_image, get_image_blending, get_image_blending_fast, get_image_prepare_material
 from musetalk.utils.face_parsing import FaceParsing
 from musetalk.utils.audio_processor import AudioProcessor
 from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
@@ -124,13 +124,15 @@ BLEND_CACHE_EVERY = 5   # BiSeNet mask 缓存: 每 N 帧更新一次
 
 
 def run_ffmpeg(cmd):
-    """执行 FFmpeg 命令"""
-    print(f"Executing: {cmd}")
+    """执行 FFmpeg 命令（接受列表或字符串）"""
+    if isinstance(cmd, str):
+        cmd = cmd.split()
+    print(f"Executing: {' '.join(cmd)}")
     try:
-        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
         return True
     except subprocess.CalledProcessError as e:
-        print(f"Error executing ffmpeg: {cmd}")
+        print(f"Error executing ffmpeg: {' '.join(cmd)}")
         print(f"Return code: {e.returncode}")
         print(f"Stderr: {e.stderr[:500]}")
         return False
@@ -477,73 +479,93 @@ def _run_inference(req: LipSyncRequest) -> dict:
     timings["5_unet"] = time.time() - t0
     print(f"✅ UNet 推理: {len(res_frame_list)} 帧 [{timings['5_unet']:.1f}s]")
 
-    # ===== Phase 6: 合成 (缓存 BiSeNet mask + cv2.VideoWriter) =====
+    # ===== Phase 6: 合成 (FFmpeg pipe 直写 H.264 + 纯 numpy blending) =====
     t0 = time.time()
 
     h, w = frames[0].shape[:2]
     temp_raw_path = output_vid_path + ".raw.mp4"
 
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    writer = cv2.VideoWriter(temp_raw_path, fourcc, fps, (w, h))
-
-    if not writer.isOpened():
-        raise RuntimeError(f"cv2.VideoWriter 打开失败: {temp_raw_path}")
+    # FFmpeg pipe: rawvideo stdin → NVENC GPU 硬件编码 H.264
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-v", "warning",
+        "-f", "rawvideo", "-pix_fmt", "bgr24",
+        "-s", f"{w}x{h}", "-r", str(fps),
+        "-i", "pipe:0",
+        "-c:v", "h264_nvenc", "-preset", "p4", "-cq", "20",
+        "-pix_fmt", "yuv420p",
+        temp_raw_path
+    ]
+    proc = subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE,
+                            stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
 
     cached_mask = None
     cached_crop_box = None
     blend_mode = "jaw" if version == "v15" else "raw"
 
-    for i in tqdm(range(len(res_frame_list)), desc="合成"):
-        res_frame = res_frame_list[i]
-        bbox = coord_list_cycle[i % len(coord_list_cycle)]
-        ori_frame = frame_list_cycle[i % len(frame_list_cycle)].copy()
+    try:
+        for i in tqdm(range(len(res_frame_list)), desc="合成"):
+            res_frame = res_frame_list[i]
+            bbox = coord_list_cycle[i % len(coord_list_cycle)]
+            ori_frame = frame_list_cycle[i % len(frame_list_cycle)].copy()
 
-        x1, y1, x2, y2 = bbox
-        if version == "v15":
-            y2 = min(y2 + extra_margin, ori_frame.shape[0])
-        adjusted_bbox = (x1, y1, x2, y2)
+            x1, y1, x2, y2 = bbox
+            if version == "v15":
+                y2 = min(y2 + extra_margin, ori_frame.shape[0])
+            adjusted_bbox = (x1, y1, x2, y2)
 
-        try:
-            res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
-        except Exception:
-            writer.write(ori_frame)
-            continue
-
-        # 每 N 帧更新 BiSeNet 人脸解析 mask, 其余帧复用缓存
-        if i % BLEND_CACHE_EVERY == 0 or cached_mask is None:
             try:
-                cached_mask, cached_crop_box = get_image_prepare_material(
-                    ori_frame, adjusted_bbox, mode=blend_mode, fp=fp)
+                res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
             except Exception:
-                # 如果 prepare 失败, 用完整方式
-                combine_frame = get_image(
-                    ori_frame, res_frame, list(adjusted_bbox),
-                    mode=blend_mode, fp=fp)
-                writer.write(combine_frame)
+                proc.stdin.write(ori_frame.tobytes())
                 continue
 
-        try:
-            combine_frame = get_image_blending(
-                ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
-        except Exception:
-            # blending 失败时 fallback 到完整方式
-            combine_frame = get_image(
-                ori_frame, res_frame, list(adjusted_bbox),
-                mode=blend_mode, fp=fp)
+            # 每 N 帧更新 BiSeNet 人脸解析 mask, 其余帧复用缓存
+            if i % BLEND_CACHE_EVERY == 0 or cached_mask is None:
+                try:
+                    cached_mask, cached_crop_box = get_image_prepare_material(
+                        ori_frame, adjusted_bbox, mode=blend_mode, fp=fp)
+                except Exception:
+                    # 如果 prepare 失败, 用完整方式
+                    combine_frame = get_image(
+                        ori_frame, res_frame, list(adjusted_bbox),
+                        mode=blend_mode, fp=fp)
+                    proc.stdin.write(combine_frame.tobytes())
+                    continue
 
-        writer.write(combine_frame)
+            try:
+                combine_frame = get_image_blending_fast(
+                    ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
+            except Exception:
+                # blending_fast 失败时 fallback 到 PIL 方式
+                try:
+                    combine_frame = get_image_blending(
+                        ori_frame, res_frame, adjusted_bbox, cached_mask, cached_crop_box)
+                except Exception:
+                    combine_frame = get_image(
+                        ori_frame, res_frame, list(adjusted_bbox),
+                        mode=blend_mode, fp=fp)
+
+            proc.stdin.write(combine_frame.tobytes())
+    finally:
+        proc.stdin.close()
+
+    proc.wait()
+    stderr_out = proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else ""
+    proc.stderr.close()
+    if proc.returncode != 0:
+        raise RuntimeError(f"FFmpeg pipe 编码失败 (rc={proc.returncode}): {stderr_out[:500]}")
 
-    writer.release()
     timings["6_blend"] = time.time() - t0
     print(f"🎨 合成 [{timings['6_blend']:.1f}s]")
 
-    # ===== Phase 7: FFmpeg 重编码 H.264 + 合并音频 =====
+    # ===== Phase 7: 音频合并 (-c:v copy, 不重编码视频) =====
     t0 = time.time()
-    cmd = (
-        f"ffmpeg -y -v warning -i {temp_raw_path} -i {audio_path} "
-        f"-c:v libx264 -crf 18 -pix_fmt yuv420p "
-        f"-c:a copy -shortest {output_vid_path}"
-    )
+    cmd = [
+        "ffmpeg", "-y", "-v", "warning",
+        "-i", temp_raw_path, "-i", audio_path,
+        "-c:v", "copy", "-c:a", "copy", "-shortest",
+        output_vid_path
+    ]
     if not run_ffmpeg(cmd):
         raise RuntimeError("FFmpeg 重编码+音频合并失败")