From 0e3502c6f0a772bb4125022ae50e95a92f3c05a0 Mon Sep 17 00:00:00 2001 From: Kevin Wong Date: Fri, 27 Feb 2026 16:11:34 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Docs/BACKEND_DEV.md | 8 + Docs/BACKEND_README.md | 16 +- Docs/COSYVOICE3_DEPLOY.md | 1 + Docs/DEPLOY_MANUAL.md | 71 +- Docs/DevLogs/Day27.md | 145 + Docs/DevLogs/Day28.md | 203 + Docs/FRONTEND_README.md | 6 +- Docs/MUSETALK_DEPLOY.md | 252 + Docs/SUBTITLE_DEPLOY.md | 10 +- Docs/task_complete.md | 13 +- README.md | 15 +- backend/.env.example | 26 +- backend/app/core/config.py | 17 +- backend/app/modules/ai/router.py | 23 + backend/app/modules/tools/service.py | 8 +- backend/app/modules/videos/workflow.py | 156 +- backend/app/services/lipsync_service.py | 92 +- backend/app/services/video_service.py | 426 +- .../features/home/model/useHomeController.ts | 21 +- .../home/model/useVideoFrameCapture.ts | 94 + .../features/home/ui/FloatingStylePreview.tsx | 8 +- frontend/src/features/home/ui/HomePage.tsx | 121 +- .../src/features/home/ui/RewriteModal.tsx | 213 + .../src/features/home/ui/ScriptEditor.tsx | 40 +- .../home/ui/ScriptExtractionModal.tsx | 105 +- .../features/home/ui/TitleSubtitlePanel.tsx | 5 +- .../script-extraction/useScriptExtraction.ts | 36 +- models/CosyVoice/cosyvoice_server.py | 2 +- models/MuseTalk/LICENSE | 159 + models/MuseTalk/README.md | 556 + models/MuseTalk/app.py | 570 + .../MuseTalk/configs/inference/realtime.yaml | 10 + models/MuseTalk/configs/inference/test.yaml | 10 + models/MuseTalk/configs/training/gpu.yaml | 21 + .../MuseTalk/configs/training/preprocess.yaml | 31 + models/MuseTalk/configs/training/stage1.yaml | 89 + models/MuseTalk/configs/training/stage2.yaml | 89 + models/MuseTalk/configs/training/syncnet.yaml | 19 + models/MuseTalk/download_weights.bat | 41 + models/MuseTalk/download_weights.sh | 51 + models/MuseTalk/entrypoint.sh | 9 + models/MuseTalk/inference.sh | 72 + 
models/MuseTalk/musetalk/data/audio.py | 168 + models/MuseTalk/musetalk/data/dataset.py | 610 + .../MuseTalk/musetalk/data/sample_method.py | 233 + models/MuseTalk/musetalk/loss/basic_loss.py | 81 + models/MuseTalk/musetalk/loss/conv.py | 44 + .../MuseTalk/musetalk/loss/discriminator.py | 145 + models/MuseTalk/musetalk/loss/resnet.py | 152 + models/MuseTalk/musetalk/loss/syncnet.py | 95 + models/MuseTalk/musetalk/loss/vgg_face.py | 237 + models/MuseTalk/musetalk/models/syncnet.py | 240 + models/MuseTalk/musetalk/models/unet.py | 51 + models/MuseTalk/musetalk/models/vae.py | 148 + models/MuseTalk/musetalk/utils/__init__.py | 5 + .../musetalk/utils/audio_processor.py | 113 + models/MuseTalk/musetalk/utils/audio_utils.py | 17 + models/MuseTalk/musetalk/utils/blending.py | 136 + .../musetalk/utils/dwpose/default_runtime.py | 54 + ...8xb32-270e_coco-ubody-wholebody-384x288.py | 257 + .../musetalk/utils/face_detection/README.md | 1 + .../musetalk/utils/face_detection/__init__.py | 7 + .../musetalk/utils/face_detection/api.py | 240 + .../face_detection/detection/__init__.py | 1 + .../utils/face_detection/detection/core.py | 130 + .../face_detection/detection/sfd/__init__.py | 1 + .../face_detection/detection/sfd/bbox.py | 129 + .../face_detection/detection/sfd/detect.py | 114 + .../face_detection/detection/sfd/net_s3fd.py | 129 + .../detection/sfd/sfd_detector.py | 59 + .../musetalk/utils/face_detection/models.py | 261 + .../musetalk/utils/face_detection/utils.py | 313 + .../musetalk/utils/face_parsing/__init__.py | 117 + .../musetalk/utils/face_parsing/model.py | 283 + .../musetalk/utils/face_parsing/resnet.py | 109 + .../MuseTalk/musetalk/utils/preprocessing.py | 155 + .../MuseTalk/musetalk/utils/training_utils.py | 337 + models/MuseTalk/musetalk/utils/utils.py | 319 + .../musetalk/whisper/audio2feature.py | 128 + .../musetalk/whisper/whisper/__init__.py | 116 + .../musetalk/whisper/whisper/__main__.py | 4 + .../whisper/whisper/assets/gpt2/merges.txt | 50001 
++++++++++++++++ .../assets/gpt2/special_tokens_map.json | 1 + .../whisper/assets/gpt2/tokenizer_config.json | 1 + .../whisper/whisper/assets/gpt2/vocab.json | 1 + .../whisper/whisper/assets/mel_filters.npz | Bin 0 -> 2048 bytes .../assets/multilingual/added_tokens.json | 1 + .../whisper/assets/multilingual/merges.txt | 50000 +++++++++++++++ .../multilingual/special_tokens_map.json | 1 + .../assets/multilingual/tokenizer_config.json | 1 + .../whisper/assets/multilingual/vocab.json | 1 + .../musetalk/whisper/whisper/audio.py | 125 + .../musetalk/whisper/whisper/decoding.py | 729 + .../musetalk/whisper/whisper/model.py | 290 + .../whisper/whisper/normalizers/__init__.py | 2 + .../whisper/whisper/normalizers/basic.py | 71 + .../whisper/whisper/normalizers/english.json | 1742 + .../whisper/whisper/normalizers/english.py | 543 + .../musetalk/whisper/whisper/tokenizer.py | 331 + .../musetalk/whisper/whisper/transcribe.py | 207 + .../musetalk/whisper/whisper/utils.py | 87 + models/MuseTalk/musetalk_api.py | 157 + models/MuseTalk/requirements.txt | 20 + models/MuseTalk/scripts/__init__.py | 1 + models/MuseTalk/scripts/inference.py | 314 + models/MuseTalk/scripts/preprocess.py | 334 + models/MuseTalk/scripts/realtime_inference.py | 409 + models/MuseTalk/scripts/server.py | 572 + models/MuseTalk/test_ffmpeg.py | 33 + models/MuseTalk/train.py | 580 + models/MuseTalk/train.sh | 34 + remotion/render.ts | 8 +- run_musetalk.sh | 17 + 113 files changed, 115723 insertions(+), 490 deletions(-) create mode 100644 Docs/DevLogs/Day28.md create mode 100644 Docs/MUSETALK_DEPLOY.md create mode 100644 frontend/src/features/home/model/useVideoFrameCapture.ts create mode 100644 frontend/src/features/home/ui/RewriteModal.tsx create mode 100644 models/MuseTalk/LICENSE create mode 100644 models/MuseTalk/README.md create mode 100644 models/MuseTalk/app.py create mode 100644 models/MuseTalk/configs/inference/realtime.yaml create mode 100644 models/MuseTalk/configs/inference/test.yaml create mode 
100644 models/MuseTalk/configs/training/gpu.yaml create mode 100644 models/MuseTalk/configs/training/preprocess.yaml create mode 100644 models/MuseTalk/configs/training/stage1.yaml create mode 100644 models/MuseTalk/configs/training/stage2.yaml create mode 100644 models/MuseTalk/configs/training/syncnet.yaml create mode 100644 models/MuseTalk/download_weights.bat create mode 100644 models/MuseTalk/download_weights.sh create mode 100644 models/MuseTalk/entrypoint.sh create mode 100644 models/MuseTalk/inference.sh create mode 100644 models/MuseTalk/musetalk/data/audio.py create mode 100644 models/MuseTalk/musetalk/data/dataset.py create mode 100644 models/MuseTalk/musetalk/data/sample_method.py create mode 100644 models/MuseTalk/musetalk/loss/basic_loss.py create mode 100644 models/MuseTalk/musetalk/loss/conv.py create mode 100644 models/MuseTalk/musetalk/loss/discriminator.py create mode 100644 models/MuseTalk/musetalk/loss/resnet.py create mode 100644 models/MuseTalk/musetalk/loss/syncnet.py create mode 100644 models/MuseTalk/musetalk/loss/vgg_face.py create mode 100644 models/MuseTalk/musetalk/models/syncnet.py create mode 100644 models/MuseTalk/musetalk/models/unet.py create mode 100644 models/MuseTalk/musetalk/models/vae.py create mode 100644 models/MuseTalk/musetalk/utils/__init__.py create mode 100644 models/MuseTalk/musetalk/utils/audio_processor.py create mode 100644 models/MuseTalk/musetalk/utils/audio_utils.py create mode 100644 models/MuseTalk/musetalk/utils/blending.py create mode 100644 models/MuseTalk/musetalk/utils/dwpose/default_runtime.py create mode 100644 models/MuseTalk/musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/README.md create mode 100644 models/MuseTalk/musetalk/utils/face_detection/__init__.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/api.py create mode 100644 
models/MuseTalk/musetalk/utils/face_detection/detection/__init__.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/core.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/sfd/__init__.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/sfd/bbox.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/sfd/detect.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/sfd/net_s3fd.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/detection/sfd/sfd_detector.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/models.py create mode 100644 models/MuseTalk/musetalk/utils/face_detection/utils.py create mode 100644 models/MuseTalk/musetalk/utils/face_parsing/__init__.py create mode 100644 models/MuseTalk/musetalk/utils/face_parsing/model.py create mode 100644 models/MuseTalk/musetalk/utils/face_parsing/resnet.py create mode 100644 models/MuseTalk/musetalk/utils/preprocessing.py create mode 100644 models/MuseTalk/musetalk/utils/training_utils.py create mode 100644 models/MuseTalk/musetalk/utils/utils.py create mode 100644 models/MuseTalk/musetalk/whisper/audio2feature.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/__init__.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/__main__.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/gpt2/merges.txt create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/gpt2/vocab.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/mel_filters.npz create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/multilingual/added_tokens.json create mode 100644 
models/MuseTalk/musetalk/whisper/whisper/assets/multilingual/merges.txt create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/assets/multilingual/vocab.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/audio.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/decoding.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/model.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/normalizers/__init__.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/normalizers/basic.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/normalizers/english.json create mode 100644 models/MuseTalk/musetalk/whisper/whisper/normalizers/english.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/tokenizer.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/transcribe.py create mode 100644 models/MuseTalk/musetalk/whisper/whisper/utils.py create mode 100644 models/MuseTalk/musetalk_api.py create mode 100644 models/MuseTalk/requirements.txt create mode 100644 models/MuseTalk/scripts/__init__.py create mode 100644 models/MuseTalk/scripts/inference.py create mode 100644 models/MuseTalk/scripts/preprocess.py create mode 100644 models/MuseTalk/scripts/realtime_inference.py create mode 100644 models/MuseTalk/scripts/server.py create mode 100644 models/MuseTalk/test_ffmpeg.py create mode 100644 models/MuseTalk/train.py create mode 100644 models/MuseTalk/train.sh create mode 100644 run_musetalk.sh diff --git a/Docs/BACKEND_DEV.md b/Docs/BACKEND_DEV.md index 51f92ed..cd797c1 100644 --- a/Docs/BACKEND_DEV.md +++ b/Docs/BACKEND_DEV.md @@ -156,6 +156,14 @@ backend/user_data/{user_uuid}/cookies/ - `LATENTSYNC_*` - `CORS_ORIGINS` (CORS 白名单,默认 *) +### MuseTalk / 混合唇形同步 +- `MUSETALK_GPU_ID` (GPU 编号,默认 0) +- 
`MUSETALK_API_URL` (常驻服务地址,默认 http://localhost:8011) +- `MUSETALK_BATCH_SIZE` (推理批大小,默认 32) +- `MUSETALK_VERSION` (v15) +- `MUSETALK_USE_FLOAT16` (半精度,默认 true) +- `LIPSYNC_DURATION_THRESHOLD` (秒,>=此值用 MuseTalk,默认 120) + ### 微信视频号 - `WEIXIN_HEADLESS_MODE` (headful/headless-new) - `WEIXIN_CHROME_PATH` / `WEIXIN_BROWSER_CHANNEL` diff --git a/Docs/BACKEND_README.md b/Docs/BACKEND_README.md index 56efce1..2964d53 100644 --- a/Docs/BACKEND_README.md +++ b/Docs/BACKEND_README.md @@ -101,7 +101,7 @@ backend/ * `POST /api/tools/extract-script`: 从视频链接提取文案 10. **健康检查** - * `GET /api/lipsync/health`: LatentSync 服务健康状态 + * `GET /api/lipsync/health`: 唇形同步服务健康状态(含 LatentSync + MuseTalk + 混合路由阈值) * `GET /api/voiceclone/health`: CosyVoice 3.0 服务健康状态 11. **支付 (Payment)** @@ -202,6 +202,12 @@ GLM_API_KEY=your_glm_api_key # LatentSync 配置 LATENTSYNC_GPU_ID=1 + +# MuseTalk 配置 (长视频唇形同步) +MUSETALK_GPU_ID=0 +MUSETALK_API_URL=http://localhost:8011 +MUSETALK_BATCH_SIZE=32 +LIPSYNC_DURATION_THRESHOLD=120 ``` ### 4. 启动服务 @@ -224,6 +230,14 @@ uvicorn app.main:app --host 0.0.0.0 --port 8006 --reload 3. **重要**: 如果模型占用 GPU,请务必使用 `asyncio.Lock` 进行并发控制,防止 OOM。 4. 
在 `app/modules/` 下创建对应模块,添加 router/service/schemas,并在 `main.py` 注册路由。 +### 唇形同步混合路由 + +`lipsync_service.py` 实现了 LatentSync + MuseTalk 混合路由: +- 短视频 (<`LIPSYNC_DURATION_THRESHOLD`s) → LatentSync 1.6 (GPU1, 端口 8007) +- 长视频 (>=阈值) → MuseTalk 1.5 (GPU0, 端口 8011) +- MuseTalk 不可用时自动回退到 LatentSync +- 路由逻辑对 workflow 完全透明 + ### 添加定时任务 目前推荐使用 **APScheduler** 或 **Crontab** 来管理定时任务。 diff --git a/Docs/COSYVOICE3_DEPLOY.md b/Docs/COSYVOICE3_DEPLOY.md index d309d30..4318a53 100644 --- a/Docs/COSYVOICE3_DEPLOY.md +++ b/Docs/COSYVOICE3_DEPLOY.md @@ -7,6 +7,7 @@ | 模型 | Fun-CosyVoice3-0.5B-2512 (0.5B 参数) | | 端口 | 8010 | | GPU | 0 (CUDA_VISIBLE_DEVICES=0) | +| 推理精度 | FP16 (自动混合精度) | | PM2 名称 | vigent2-cosyvoice (id=15) | | Conda 环境 | cosyvoice (Python 3.10) | | 启动脚本 | `run_cosyvoice.sh` | diff --git a/Docs/DEPLOY_MANUAL.md b/Docs/DEPLOY_MANUAL.md index 5be1b76..fab1b86 100644 --- a/Docs/DEPLOY_MANUAL.md +++ b/Docs/DEPLOY_MANUAL.md @@ -7,8 +7,8 @@ | 服务器 | Dell PowerEdge R730 | | CPU | 2× Intel Xeon E5-2680 v4 (56 线程) | | 内存 | 192GB DDR4 | -| GPU 0 | NVIDIA RTX 3090 24GB | -| GPU 1 | NVIDIA RTX 3090 24GB (用于 LatentSync) | +| GPU 0 | NVIDIA RTX 3090 24GB (MuseTalk + CosyVoice) | +| GPU 1 | NVIDIA RTX 3090 24GB (LatentSync) | | 部署路径 | `/home/rongye/ProgramFiles/ViGent2` | --- @@ -72,7 +72,9 @@ cd /home/rongye/ProgramFiles/ViGent2 --- -## 步骤 3: 部署 AI 模型 (LatentSync 1.6) +## 步骤 3: 部署 AI 模型 + +### 3a. LatentSync 1.6 (短视频唇形同步, GPU1) > ⚠️ **重要**:LatentSync 需要独立的 Conda 环境和 **~18GB VRAM**。请**不要**直接安装在后端环境中。 @@ -93,6 +95,26 @@ conda activate latentsync python -m scripts.server # 测试能否启动,Ctrl+C 退出 ``` +### 3b. MuseTalk 1.5 (长视频唇形同步, GPU0) + +> MuseTalk 是单步潜空间修复模型(非扩散模型),推理速度接近实时,适合 >=120s 的长视频。与 CosyVoice 共享 GPU0,fp16 推理约需 4-8GB 显存。 + +请参考详细的独立部署指南: +**[MuseTalk 部署指南](MUSETALK_DEPLOY.md)** + +简要步骤: +1. 创建独立的 `musetalk` Conda 环境 (Python 3.10 + PyTorch 2.0.1 + CUDA 11.8) +2. 安装 mmcv/mmdet/mmpose 等依赖 +3. 下载模型权重 (`download_weights.sh`) +4. 
创建必要的软链接 (`musetalk/config.json`, `musetalk/musetalkV15`) + +**验证 MuseTalk 部署**: +```bash +cd /home/rongye/ProgramFiles/ViGent2/models/MuseTalk +/home/rongye/ProgramFiles/miniconda3/envs/musetalk/bin/python scripts/server.py +# 另一个终端: curl http://localhost:8011/health +``` + --- ## 步骤 4: 安装后端依赖 @@ -189,7 +211,7 @@ cp .env.example .env | `SUPABASE_PUBLIC_URL` | `https://api.hbyrkj.top` | Supabase API 公网地址 (前端访问) | | `LATENTSYNC_GPU_ID` | 1 | GPU 选择 (0 或 1) | | `LATENTSYNC_USE_SERVER` | false | 设为 true 以启用常驻服务加速 | -| `LATENTSYNC_INFERENCE_STEPS` | 20 | 推理步数 (20-50) | +| `LATENTSYNC_INFERENCE_STEPS` | 16 | 推理步数 (16-50) | | `LATENTSYNC_GUIDANCE_SCALE` | 1.5 | 引导系数 (1.0-3.0) | | `DEBUG` | true | 生产环境改为 false | | `REDIS_URL` | `redis://localhost:6379/0` | 任务状态存储(不可用时回退内存) | @@ -212,7 +234,12 @@ cp .env.example .env | `DOUYIN_RECORD_VIDEO` | false | 录制浏览器操作视频 | | `DOUYIN_KEEP_SUCCESS_VIDEO` | false | 成功后保留录屏 | | `CORS_ORIGINS` | `*` | CORS 允许源 (生产环境建议白名单) | -| `DOUYIN_COOKIE` | 空 | 抖音视频下载 Cookie (文案提取功能) | +| `MUSETALK_GPU_ID` | 0 | MuseTalk GPU 编号 | +| `MUSETALK_API_URL` | `http://localhost:8011` | MuseTalk 常驻服务地址 | +| `MUSETALK_BATCH_SIZE` | 32 | MuseTalk 推理批大小 | +| `MUSETALK_VERSION` | v15 | MuseTalk 模型版本 | +| `MUSETALK_USE_FLOAT16` | true | MuseTalk 半精度加速 | +| `LIPSYNC_DURATION_THRESHOLD` | 120 | 秒,>=此值用 MuseTalk,<此值用 LatentSync | | `ALIPAY_APP_ID` | 空 | 支付宝应用 APPID | | `ALIPAY_PRIVATE_KEY_PATH` | 空 | 应用私钥 PEM 文件路径 | | `ALIPAY_PUBLIC_KEY_PATH` | 空 | 支付宝公钥 PEM 文件路径 | @@ -271,6 +298,13 @@ cd /home/rongye/ProgramFiles/ViGent2/models/LatentSync conda activate latentsync python -m scripts.server ``` + +### 启动 MuseTalk (终端 4, 长视频唇形同步) + +```bash +cd /home/rongye/ProgramFiles/ViGent2/models/MuseTalk +/home/rongye/ProgramFiles/miniconda3/envs/musetalk/bin/python scripts/server.py +``` ### 验证 @@ -364,7 +398,27 @@ pm2 save curl http://localhost:8010/health ``` -### 5. 启动服务看门狗 (Watchdog) +### 5. 
启动 MuseTalk 长视频唇形同步服务 + +> 长视频 (>=120s) 自动路由到 MuseTalk。MuseTalk 不可用时自动回退 LatentSync。 +> 详细部署步骤见 [MuseTalk 部署指南](MUSETALK_DEPLOY.md)。 + +1. 启动脚本位于项目根目录: `run_musetalk.sh` + +2. 使用 pm2 启动: +```bash +cd /home/rongye/ProgramFiles/ViGent2 +pm2 start ./run_musetalk.sh --name vigent2-musetalk +pm2 save +``` + +3. 验证服务: +```bash +curl http://localhost:8011/health +# {"status":"ok","model_loaded":true} +``` + +### 6. 启动服务看门狗 (Watchdog) > 🛡️ **推荐**:监控 CosyVoice 和 LatentSync 服务健康状态,卡死时自动重启。 @@ -381,6 +435,8 @@ pm2 save pm2 startup ``` +> **提示**: 完整的 PM2 进程列表应包含 5-6 个服务: vigent2-backend, vigent2-frontend, vigent2-latentsync, vigent2-cosyvoice, vigent2-musetalk, vigent2-watchdog。 + ### pm2 常用命令 ```bash @@ -388,6 +444,7 @@ pm2 status # 查看所有服务状态 pm2 logs # 查看所有日志 pm2 logs vigent2-backend # 查看后端日志 pm2 logs vigent2-cosyvoice # 查看 CosyVoice 日志 +pm2 logs vigent2-musetalk # 查看 MuseTalk 日志 pm2 restart all # 重启所有服务 pm2 stop vigent2-latentsync # 停止 LatentSync 服务 pm2 delete all # 删除所有服务 @@ -527,6 +584,7 @@ sudo lsof -i :8006 sudo lsof -i :3002 sudo lsof -i :8007 sudo lsof -i :8010 # CosyVoice +sudo lsof -i :8011 # MuseTalk ``` ### 查看日志 @@ -537,6 +595,7 @@ pm2 logs vigent2-backend pm2 logs vigent2-frontend pm2 logs vigent2-latentsync pm2 logs vigent2-cosyvoice +pm2 logs vigent2-musetalk ``` ### SSH 连接卡顿 / 系统响应慢 diff --git a/Docs/DevLogs/Day27.md b/Docs/DevLogs/Day27.md index c965c66..c6ed2bd 100644 --- a/Docs/DevLogs/Day27.md +++ b/Docs/DevLogs/Day27.md @@ -84,3 +84,148 @@ - `npm run build`(前端)— 零报错 - 描边:标题/副标题/字幕使用 CSS 原生描边,无重影、无虚胖 - 样式选择:前端下拉可加载全部 12 个标题 + 8 个字幕样式 + +--- + +## 视频生成流水线性能优化 + +### 概述 + +针对视频生成流水线进行全面性能优化,涵盖 FFmpeg 编码参数、LatentSync 推理参数、多素材并行化、以及后处理阶段并行化。预估 15s 单素材视频从 ~280s 降至 ~190s (32%),30s 双素材从 ~400s 降至 ~240s (40%)。 + +**服务器配置**: 2x RTX 3090 (24GB), 2x Xeon E5-2680 v4 (56核), 192GB RAM + +### 第一阶段:FFmpeg 编码优化 + +**最终合成 preset `slow` → `medium`** +- 合成阶段从 ~50s 降到 ~25s,质量几乎无变化 + +**中间文件 CRF 18 → 23** +- 
中间产物(trim、prepare_segment、concat、loop、normalize_orientation)不是最终输出,不需要高质量编码 +- 每个中间步骤快 3-8 秒 + +**最终合成 CRF 18 → 20** +- 15 秒口播视频 CRF 18 vs 20 肉眼无法区分 + +### 第二阶段:LatentSync 推理参数调优 + +**inference_steps 20 → 16** +- 推理时间线性减少 20%(~180s → ~144s) + +**guidance_scale 2.0 → 1.5** +- classifier-free guidance 权重降低,每步计算量微降(5-10%) + +> ⚠️ 两项需重启 LatentSync 服务后测试唇形质量,确认可接受再保留。如质量不佳可回退 .env 参数。 + +### 第三阶段:多素材流水线并行化 + +**素材下载 + 归一化并行** +- 串行 `for` 循环改为 `asyncio.gather()`,`normalize_orientation` 通过 `run_in_executor` 在线程池执行 +- N 个素材从串行 N×5s → ~5s + +**片段预处理并行** +- 逐个 `prepare_segment` 改为 `asyncio.gather()` + `run_in_executor` +- 2 素材 ~90s → ~50s;4 素材 ~180s → ~60s + +### 第四阶段:流水线交叠 + +**Whisper 字幕对齐 与 BGM 混音 并行** +- 两者互不依赖(都只依赖 audio_path),用 `asyncio.gather()` 并行执行 +- 单素材模式下 Whisper 从 LatentSync 之后的串行步骤移至与 BGM 并行 +- 不开 BGM 或不开字幕时行为不变,只有同时启用时才并行 + +### 修改文件 + +| 文件 | 改动 | +|------|------| +| `backend/app/services/video_service.py` | compose: preset slow→medium, CRF 18→20; normalize_orientation/prepare_segment/concat: CRF 18→23 | +| `backend/app/services/lipsync_service.py` | _loop_video_to_duration: CRF 18→23 | +| `backend/.env` | LATENTSYNC_INFERENCE_STEPS=16, LATENTSYNC_GUIDANCE_SCALE=1.5 | +| `backend/app/modules/videos/workflow.py` | import asyncio; 素材下载/归一化并行; 片段预处理并行; Whisper+BGM 并行 | + +### 回退方案 + +- FFmpeg 参数:如画质不满意,将最终 CRF 改回 18、preset 改回 slow +- LatentSync:如唇形质量下降,将 .env 中 `INFERENCE_STEPS` 改回 20、`GUIDANCE_SCALE` 改回 2.0 +- 并行化:纯架构优化,无质量影响,无需回退 + +--- + +## MuseTalk + LatentSync 混合唇形同步方案 + +### 概述 + +LatentSync 1.6 质量高但推理极慢(~78% 总时长),长视频(>=2min)耗时 20-60 分钟不可接受。MuseTalk 1.5 是单步潜空间修复(非扩散模型),逐帧推理速度接近实时(30fps+ on V100),适合长视频。混合方案按音频时长自动路由:短视频用 LatentSync 保质量,长视频用 MuseTalk 保速度。 + +### 架构 + +- **路由阈值**: `LIPSYNC_DURATION_THRESHOLD` (默认 120s) +- **短视频 (<120s)**: LatentSync 1.6 (GPU1, 端口 8007) +- **长视频 (>=120s)**: MuseTalk 1.5 (GPU0, 端口 8011) +- **回退**: MuseTalk 不可用时自动 fallback 到 LatentSync + +### 改动文件 + +| 文件 | 改动 | +|------|------| +| `models/MuseTalk/` | 从 Temp/MuseTalk 复制代码 + 
下载权重 | +| `models/MuseTalk/scripts/server.py` | 新建 FastAPI 常驻服务 (端口 8011, GPU0) | +| `backend/app/core/config.py` | 新增 MUSETALK_* 和 LIPSYNC_DURATION_THRESHOLD | +| `backend/.env` | 新增对应环境变量 | +| `backend/app/services/lipsync_service.py` | 新增 `_call_musetalk_server()` + 混合路由逻辑 + 扩展 `check_health()` | + +--- + +## MuseTalk 推理性能优化 (server.py v2) + +### 概述 + +MuseTalk 首次长视频测试 (136s, 3404 帧) 耗时 1799s (~30 分钟),分析发现瓶颈集中在人脸检测 (28%)、BiSeNet 合成 (22%)、I/O (17%),而非 UNet 推理本身 (17%)。通过 6 项优化预估降至 8-10 分钟 (~3x 加速)。 + +### 性能瓶颈分析 (优化前, 1799s) + +| 阶段 | 耗时 | 占比 | 瓶颈原因 | +|------|------|------|---------| +| DWPose + 人脸检测 | ~510s | 28% | `batch_size_fa=1`, 每帧跑 2 个 NN, 完全串行 | +| 合成 + BiSeNet 人脸解析 | ~400s | 22% | 每帧都跑 BiSeNet + PNG 写盘 | +| UNet 推理 | ~300s | 17% | batch_size=8 太小 | +| I/O (PNG 读写 + FFmpeg) | ~300s | 17% | PNG 压缩慢, ffmpeg→PNG→imread 链路 | +| VAE 编码 | ~100s | 6% | 逐帧编码, 未批处理 | + +### 6 项优化 + +| # | 优化项 | 详情 | +|---|--------|------| +| 1 | **batch_size 8→32** | `.env` 修改, RTX 3090 显存充裕 | +| 2 | **cv2.VideoCapture 直读帧** | 跳过 ffmpeg→PNG→imread 链路, 省去 3404 次 PNG 编解码 | +| 3 | **人脸检测降频 (每5帧)** | 每 5 帧运行 DWPose + FaceAlignment, 中间帧线性插值 bbox | +| 4 | **BiSeNet mask 缓存 (每5帧)** | 每 5 帧运行 `get_image_prepare_material`, 中间帧用 `get_image_blending` 复用缓存 mask | +| 5 | **cv2.VideoWriter 直写** | 跳过逐帧 PNG 写盘 + ffmpeg 重编码, 用 VideoWriter 直写 mp4 | +| 6 | **每阶段计时** | 7 个阶段精确计时, 方便后续进一步调优 | + +### 修改文件 + +| 文件 | 改动 | +|------|------| +| `models/MuseTalk/scripts/server.py` | 完全重写 `_run_inference()`, 新增 `_detect_faces_subsampled()` | +| `backend/.env` | `MUSETALK_BATCH_SIZE` 8→32 | + +--- + +## Remotion 并发渲染优化 + +### 概述 + +Remotion 渲染在 56 核服务器上默认只用 8 并发 (`min(8, cores/2)`),改为 16 并发,预估从 ~5 分钟降到 ~2-3 分钟。 + +### 改动 + +- `remotion/render.ts`: `renderMedia()` 新增 `concurrency` 参数 (默认 16), 支持 `--concurrency` CLI 参数覆盖 +- `remotion/dist/render.js`: 重新编译 + +### 修改文件 + +| 文件 | 改动 | +|------|------| +| `remotion/render.ts` | `RenderOptions` 新增 `concurrency` 字段, `renderMedia()` 传入 `concurrency` | +| 
`remotion/dist/render.js` | TypeScript 重新编译 | diff --git a/Docs/DevLogs/Day28.md b/Docs/DevLogs/Day28.md new file mode 100644 index 0000000..585c44b --- /dev/null +++ b/Docs/DevLogs/Day28.md @@ -0,0 +1,203 @@ +## CosyVoice FP16 加速 + 文档更新 + AI改写界面重构 + 标题字幕面板重排与视频帧预览 (Day 28) + +### 概述 + +CosyVoice 3.0 声音克隆服务开启 FP16 半精度推理,预估提速 30-40%。同步更新 4 个项目文档。重构 AI 改写文案界面(RewriteModal 两步流程 + ScriptExtractionModal 逻辑抽取)。前端将"标题与字幕"面板从第二步移至第四步(素材编辑之后),样式预览窗口背景从紫粉渐变改为视频片头帧截图,实现所见即所得。 + +--- + +## ✅ 改动内容 + +### 1. CosyVoice FP16 半精度加速 + +- **问题**: CosyVoice 3.0 以 FP32 全精度运行,RTF (Real-Time Factor) 约 0.9-1.35x,生成 2 分钟音频需要约 2 分钟 +- **根因**: `AutoModel()` 初始化时未传入 `fp16=True`,LLM 推理和 Flow Matching (DiT) 均在 FP32 下运行 +- **修复**: 一行改动开启 FP16 自动混合精度 + +```python +# 旧: _model = AutoModel(model_dir=str(MODEL_DIR)) +# 新: +_model = AutoModel(model_dir=str(MODEL_DIR), fp16=True) +``` + +- **生效机制**: `CosyVoice3Model` 在 `llm_job()` 和 `token2wav()` 中通过 `torch.cuda.amp.autocast(self.fp16)` 自动将计算转为 FP16 +- **预期效果**: + - 推理速度提升 30-40% + - 显存占用降低 ~30% + - 语音质量基本无损(0.5B 模型 FP16 精度充足) +- **验证**: 服务重启后自检通过,健康检查 `ready: true` + +### 2. 
文档全面更新 (4 个文件) + +补充 Day 27 新增的 MuseTalk 混合唇形同步方案、性能优化、Remotion 并发渲染等内容到所有相关文档。 + +#### README.md +- 项目描述更新为 "LatentSync 1.6 + MuseTalk 1.5 混合唇形同步" +- 唇形同步功能描述改为混合方案(短视频 LatentSync,长视频 MuseTalk) +- 技术栈表新增 MuseTalk 1.5 +- 项目结构新增 `models/MuseTalk/` +- 服务架构表新增 MuseTalk (端口 8011) +- 文档中心新增 MuseTalk 部署指南链接 +- 性能优化描述新增降频检测 + Remotion 16 并发 + +#### DEPLOY_MANUAL.md +- GPU 分配说明更新 (GPU0=MuseTalk+CosyVoice, GPU1=LatentSync) +- 步骤 3 拆分为 3a (LatentSync) + 3b (MuseTalk) +- 环境变量表新增 6 个 MuseTalk 相关变量,移除过时的 `DOUYIN_COOKIE` +- LatentSync 推理步数默认值 20→16 +- 测试运行新增 MuseTalk 启动终端 +- PM2 管理新增 MuseTalk 服务(第 5 项) +- 端口检查、日志查看命令新增 8011/vigent2-musetalk + +#### SUBTITLE_DEPLOY.md +- 技术架构图更新为 LatentSync/MuseTalk 混合路由 +- 新增唇形同步路由说明 +- Remotion 配置表新增 `concurrency` 参数 (默认 16) +- GPU 分配说明更新 +- 更新日志新增 v1.3.0 条目 + +#### BACKEND_README.md +- 健康检查接口描述更新为含 LatentSync + MuseTalk + 混合路由阈值 +- 环境变量配置新增 MuseTalk 相关变量 +- 服务集成指南新增"唇形同步混合路由"章节 + +--- + +### 3. AI 改写文案界面重构 + +#### RewriteModal 重构 + +将 AI 改写弹窗改为两步式流程,提升交互体验: + +**第一步 — 配置与触发**: +- 自定义提示词输入(可选),自动持久化到 localStorage +- "开始改写"按钮触发 `/api/ai/rewrite` 请求 + +**第二步 — 结果对比与选择**: +- 上方:AI 改写结果 + "使用此结果"按钮(紫粉渐变色,醒目) +- 下方:原文对比 + "保留原文"按钮(灰色低调) +- 底部:可"重新改写"(重回第一步,保留自定义提示词) +- ESC 快捷键关闭 + +#### ScriptExtractionModal 逻辑抽取 + +将文案提取模态框的全部业务逻辑抽取到独立 hook `useScriptExtraction`: + +- **useScriptExtraction.ts** (新建): 管理 URL/文件双模式输入、拖拽上传、提取请求、步骤状态机 (config → processing → result)、剪贴板复制 +- **ScriptExtractionModal.tsx**: 纯展示组件,消费 hook 返回值,新增 ESC/Enter 快捷键 + +#### ScriptEditor 工具栏调整 + +- 按钮组右对齐 (`justify-end`),统一高度 `h-7` 和圆角 +- "历史文案"按钮用灰色 (bg-gray-600) 区分辅助功能 +- "文案提取助手"用紫色 (bg-purple-600) 表示主功能 +- "AI多语言"用绿渐变 (emerald-teal),"AI生成标题标签"用蓝渐变 (blue-cyan) +- "AI智能改写"和"保存文案"移至文本框下方状态栏 + +--- + +### 4. 标题字幕面板重排 + 视频帧背景预览 + +#### 面板顺序重排 + +将 `<TitleSubtitlePanel />` 从第二步移至第四步(素材编辑之后),使用户在设置标题字幕样式时已经完成了素材选择和时间轴编排。 + +新顺序: +``` +一、文案提取与编辑(不变) +二、配音(原三) +三、素材编辑(原四) +四、标题与字幕(原二)→ 移到素材编辑之后 +``` + +#### 新建 useVideoFrameCapture hook + +从视频 URL 截取 0.1s 处帧画面,返回 JPEG data URL: + +- 创建 `