From 3129d45b2540d478a370489ad65549f4a4d6a4a6 Mon Sep 17 00:00:00 2001 From: Kevin Wong Date: Mon, 9 Feb 2026 14:47:19 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Docs/BACKEND_README.md | 2 + Docs/DevLogs/Day21.md | 132 ++++++++ Docs/DevLogs/Day22.md | 221 +++++++++++++ Docs/task_complete.md | 7 +- backend/app/modules/ai/router.py | 27 ++ backend/app/modules/videos/schemas.py | 4 +- backend/app/modules/videos/workflow.py | 296 +++++++++++++++-- backend/app/services/glm_service.py | 44 +++ backend/app/services/video_service.py | 106 ++++++ backend/app/services/voice_clone_service.py | 2 +- backend/app/services/whisper_service.py | 39 ++- frontend/package-lock.json | 56 ++++ frontend/package.json | 3 + .../features/home/model/useHomeController.ts | 201 +++++++++--- .../features/home/model/useHomePersistence.ts | 52 +-- .../src/features/home/model/useMaterials.ts | 88 +++-- .../features/home/ui/GenerateActionBar.tsx | 77 +++-- frontend/src/features/home/ui/HomePage.tsx | 25 +- .../src/features/home/ui/MaterialSelector.tsx | 306 +++++++++++++----- .../src/features/home/ui/ScriptEditor.tsx | 98 +++++- .../features/home/ui/TitleSubtitlePanel.tsx | 24 +- frontend/src/shared/types/material.ts | 7 + models/Qwen3-TTS/qwen_tts_server.py | 6 +- 23 files changed, 1529 insertions(+), 294 deletions(-) create mode 100644 Docs/DevLogs/Day22.md create mode 100644 frontend/src/shared/types/material.ts diff --git a/Docs/BACKEND_README.md b/Docs/BACKEND_README.md index 9889472..7c4b697 100644 --- a/Docs/BACKEND_README.md +++ b/Docs/BACKEND_README.md @@ -108,6 +108,8 @@ backend/ `POST /api/videos/generate` 支持以下可选字段: +- `material_path`: 视频素材路径(单素材模式) +- `material_paths`: 多素材路径数组(多机位模式,≥2 个素材时按句子自动切换) - `tts_mode`: TTS 模式 (`edgetts` / `voiceclone`) - `voice`: EdgeTTS 音色 ID(edgetts 模式) - `ref_audio_id` / `ref_text`: 参考音频 ID 与文本(voiceclone 模式) diff --git a/Docs/DevLogs/Day21.md 
b/Docs/DevLogs/Day21.md index 558cfe4..50f7285 100644 --- a/Docs/DevLogs/Day21.md +++ b/Docs/DevLogs/Day21.md @@ -315,3 +315,135 @@ npm run build && pm2 restart vigent2-frontend # 刷脸验证UI pm2 restart vigent2-backend npm run build && pm2 restart vigent2-frontend ``` + +--- + +## 🎬 多素材视频生成(多机位效果) + +### 概述 +支持用户上传多个不同角度的自拍视频,生成视频时按句子自动切换素材,最终效果类似多机位拍摄。单素材时走原有流程,无额外开销。 + +### 核心架构 + +#### 流水线变更 +``` +【单素材(不变)】 +text → TTS → audio → LatentSync(1个素材+完整audio) → Whisper字幕 → Remotion → 成片 + +【多素材(新增)】 +text → TTS → audio → Whisper字幕(提前) → 按素材数量均分时长(对齐字边界) + → 对每段: 切分audio + LatentSync(素材[i]+音频片段[i]) + → FFmpeg拼接所有片段 → Remotion(完整字幕时间戳) → 成片 +``` + +#### 素材切换逻辑(均分方案) +1. Whisper 对完整音频转录,得到字级别时间戳 +2. 按素材数量**均分音频总时长**(`total_duration / N`) +3. 每个分割点对齐到最近的 Whisper 字边界,避免在字中间切分 +4. 首段 start 扩展为 0.0,末段 end 扩展为音频结尾,确保完整覆盖 + +> **设计决策**:最初方案基于原始文案标点分句,但用户文案往往不含句号(只有逗号),导致只产生 1 段。改为均分方案后不依赖文案标点,对任何输入都能正确切分。 + +--- + +### 一、后端改动 + +#### 1. `backend/app/modules/videos/schemas.py` +- 新增 `material_paths: Optional[List[str]]` 字段 +- 保留 `material_path: str` 向后兼容 + +#### 2. `backend/app/modules/videos/workflow.py`(核心改动) + +**新增函数**: +- `_split_equal(segments, material_paths)`: 按素材数量均分音频时长,对齐到最近的 Whisper 字边界 + +**修改 `process_video_generation()`**: +- `is_multi = len(material_paths) > 1` 判断走多素材/单素材分支 +- 多素材分支:Whisper 提前 → 均分切分 → 音频切分 → 逐段 LatentSync → FFmpeg 拼接 + +#### 3. `backend/app/services/video_service.py` +- 新增 `concat_videos()`: FFmpeg concat demuxer (`-c copy`) 拼接视频片段 +- 新增 `split_audio()`: FFmpeg 按时间范围切分音频 (`-ss` + `-t` + `-c copy`) + +#### 4. `backend/scripts/watchdog.py` +- 健康检查阈值从 3 次提高到 5 次(容忍期 2.5 分钟) +- 新增重启后 120 秒冷却期,避免模型加载期间被误判为故障 +- 启动时给所有服务 60 秒初始冷却期 + +--- + +### 二、前端改动 + +#### 1. 新增依赖 +```bash +npm install @dnd-kit/core @dnd-kit/sortable @dnd-kit/utilities +``` + +#### 2. 
`frontend/src/features/home/model/useMaterials.ts` +- `selectedMaterial: string` → `selectedMaterials: string[]`(多选) +- 新增 `toggleMaterial(id)`: 切换选中/取消(至少保留1个) +- 新增 `reorderMaterials(activeId, overId)`: 拖拽排序 +- 上传格式扩展:新增 `.mkv/.webm/.flv/.wmv/.m4v/.ts/.mts` + +#### 3. `frontend/src/features/home/ui/MaterialSelector.tsx`(重写) +- 素材列表每行增加复选框 + 序号徽标(①②③) +- 选中 ≥2 个时显示拖拽排序区(@dnd-kit `SortableContext`) +- 每个排序项:拖拽把手 + 序号 + 素材名 + 移除按钮 +- HTML input accept 改为 `video/*` + +#### 4. `frontend/src/features/home/model/useHomeController.ts` +- 多素材 payload:`material_paths` 数组 + `material_path` 向后兼容 +- `enable_subtitles` 硬编码为 `true`(移除开关) +- 验证:至少选中 1 个素材 + +#### 5. `frontend/src/features/home/model/useHomePersistence.ts` +- 素材持久化改为 JSON 数组,向后兼容旧格式(单字符串) +- 移除 `enableSubtitles` 持久化 + +#### 6. `frontend/src/features/home/ui/TitleSubtitlePanel.tsx` +- 移除"逐字高亮字幕"开关,字幕样式区始终显示 + +#### 7. `frontend/src/features/home/ui/HomePage.tsx` +- 更新 props 传递(`selectedMaterials`, `toggleMaterial`, `reorderMaterials`) + +--- + +### 三、Bug 修复记录 + +#### BUG-1: 多素材只使用第一个视频(基于标点的分句方案失败) +- **现象**: 选了 2 个素材但生成的视频只使用第 1 个,日志显示 `Multi-material: 1 segments, 2 materials`。 +- **根因 v1**: 最初通过正则 `[。!?!?]` 在 Whisper 输出中分句,但 Whisper 不输出标点。 +- **修复 v1**: 改为用原始文案标点分句——但用户文案往往只含逗号(,),无句末标点(。!?),仍退化为 1 段。 +- **最终修复**: 彻底放弃基于标点的分句方案,改为 `_split_equal()` **按素材数量均分音频时长**,对齐到最近的 Whisper 字边界。不依赖任何标点符号,对所有文案均有效。 + +#### BUG-2: 口型对不上(音频时间偏移) +- **根因**: `split_audio` 用 Whisper 的 start/end 时间(如 0.11~7.21)切分音频,但 `compose()` 用完整原始音频(0.0~结尾)合成,导致时间偏移。 +- **修复**: 强制首段 start=0.0,末段 end=音频实际时长,确保切分音频完整覆盖。 + +#### BUG-3: min_segment_sec 过度合并导致退化(已随方案切换移除) +- **根因**: 旧方案中 2 个句子第 2 句不足 3 秒时,最短时长检查合并为 1 段,多素材退化为单素材。 +- **状态**: 均分方案不存在此问题,相关代码已移除。 + +--- + +### 涉及文件汇总 + +| 文件 | 变更类型 | 说明 | +|------|----------|------| +| `backend/app/modules/videos/schemas.py` | 修改 | 新增 material_paths 字段 | +| `backend/app/modules/videos/workflow.py` | 修改 | 多素材流水线核心逻辑 + 3个 Bug 修复 | +| `backend/app/services/video_service.py` | 修改 | 新增 concat_videos / 
split_audio | +| `backend/scripts/watchdog.py` | 修改 | 阈值优化 + 冷却期机制 | +| `frontend/package.json` | 修改 | 新增 @dnd-kit 依赖 | +| `frontend/src/features/home/model/useMaterials.ts` | 修改 | 多选 + 排序状态管理 | +| `frontend/src/features/home/ui/MaterialSelector.tsx` | 重写 | 多选复选框 + 拖拽排序 UI | +| `frontend/src/features/home/model/useHomeController.ts` | 修改 | 多素材 payload + 移除字幕开关 | +| `frontend/src/features/home/model/useHomePersistence.ts` | 修改 | JSON 数组持久化 | +| `frontend/src/features/home/ui/TitleSubtitlePanel.tsx` | 修改 | 移除字幕开关 | +| `frontend/src/features/home/ui/HomePage.tsx` | 修改 | 更新 props 传递 | + +### 重启要求 +```bash +pm2 restart vigent2-backend +npm run build && pm2 restart vigent2-frontend +``` diff --git a/Docs/DevLogs/Day22.md b/Docs/DevLogs/Day22.md new file mode 100644 index 0000000..8bf33b6 --- /dev/null +++ b/Docs/DevLogs/Day22.md @@ -0,0 +1,221 @@ +## 🔧 多素材生成优化与健壮性加固 (Day 22) + +### 概述 +对 Day 21 实现的多素材视频生成(多机位)功能进行全面审查,修复 6 个高优先级 Bug、完成 8 项体验优化,并将多素材流水线从"逐段 LatentSync"重构为"先拼接再推理"架构,推理次数从 N 次降为 1 次。 + +--- + +### 一、后端高优 Bug 修复 + +#### 1. `_split_equal()` 素材数 > 字符数边界溢出 +- **问题**: 5 个素材但只有 2 个 Whisper 字符时,边界索引重复,部分素材被跳过 +- **修复**: 加入 `n = min(n, len(all_chars))` 上限保护 +- **文件**: `backend/app/modules/videos/workflow.py` + +#### 2. 多素材 LatentSync 单段失败无 fallback +- **问题**: 单素材模式下 LatentSync 失败会 fallback 到原始素材,但多素材模式直接抛异常,整个任务失败 +- **修复**: 多素材循环中加 try-except,失败时 fallback 到原始素材片段 +- **文件**: `backend/app/modules/videos/workflow.py` + +#### 3. `num_segments == 0` 时 ZeroDivisionError +- **问题**: 所有 assignments 被跳过后 `i / num_segments` 触发除零 +- **修复**: 循环前加 `if num_segments == 0` 检查并抛出明确错误 +- **文件**: `backend/app/modules/videos/workflow.py` + +#### 4. `split_audio` 未校验 duration > 0 +- **问题**: `end <= start` 时 FFmpeg 行为异常 +- **修复**: 加入 `if duration <= 0: raise ValueError(...)` +- **文件**: `backend/app/services/video_service.py` + +#### 5. 
Whisper 失败时按时长均分兜底 +- **问题**: Whisper 失败后直接退化为单素材,其他素材被浪费 +- **修复**: 按 `audio_duration / len(material_paths)` 均分,不依赖字符对齐 +- **文件**: `backend/app/modules/videos/workflow.py` + +#### 6. `concat_videos` 空列表未检查 +- **问题**: 传入空 `video_paths` 时 FFmpeg 报错 +- **修复**: 加入 `if not video_paths: raise ValueError(...)` +- **文件**: `backend/app/services/video_service.py` + +--- + +### 二、前端优化 + +#### 1. payload 构建非空断言修复 +- `m!.path` → `m?.path` + `.filter(Boolean)`,防止素材被删后 crash +- **文件**: `frontend/src/features/home/model/useHomeController.ts` + +#### 2. 生成按钮展示后端进度消息 +- 新增 `message` prop,生成中显示如"(正在处理片段 2/3...)" +- **文件**: `frontend/src/features/home/ui/GenerateActionBar.tsx`, `HomePage.tsx` + +#### 3. 新上传素材自动选中 +- 上传成功后对比前后素材列表,新增的 ID 自动追加到 `selectedMaterials` +- **文件**: `frontend/src/features/home/model/useMaterials.ts` + +#### 4. Material 接口统一 +- 三处 `interface Material` 重复定义提取到 `shared/types/material.ts` +- **文件**: `frontend/src/shared/types/material.ts` (新建), `useMaterials.ts`, `useHomeController.ts`, `MaterialSelector.tsx` + +#### 5. 拖拽排序修复 +- 移除 `DragOverlay`(`backdrop-blur` 创建新 containing block 导致定位错乱) +- 改为 `useSortable` 原生拖拽 + `CSS.Translate`,拖拽中元素高亮加阴影 +- **文件**: `frontend/src/features/home/ui/MaterialSelector.tsx` + +#### 6. 素材选择上限 4 个 +- `toggleMaterial` 新增 `MAX_MATERIALS = 4` 限制 +- UI 选满后未选中项变半透明禁用,提示文字改为"可多选,最多4个" +- **文件**: `useMaterials.ts`, `MaterialSelector.tsx` + +#### 7. 移动端排序区域响应式 +- 素材列表 `max-h-64` → `max-h-48 sm:max-h-64` +- **文件**: `MaterialSelector.tsx` + +#### 8. 
多素材耗时提示 +- 选中 ≥2 素材时生成按钮下方显示"多素材模式 (N 个机位),生成耗时较长" +- **文件**: `GenerateActionBar.tsx`, `HomePage.tsx` + +--- + +### 三、核心架构重构:先拼接再推理 + +#### V1 (Day 21): 逐段 LatentSync +``` +素材A → LatentSync(素材A, 音频片段1) → lipsync_A +素材B → LatentSync(素材B, 音频片段2) → lipsync_B +FFmpeg concat(lipsync_A, lipsync_B) → 最终视频 +``` +- 缺点:N 个素材 = N 次 LatentSync 推理(每次 ~30s) + +#### V2 (Day 22): 先拼接再推理 +``` +素材A → prepare_segment(裁剪到3.67s) → prepared_A +素材B → prepare_segment(裁剪到4.00s) → prepared_B +FFmpeg concat(prepared_A, prepared_B) → concat_video (7.67s) +LatentSync(concat_video, 完整音频) → 最终视频 +``` +- 优点:只需 **1 次** LatentSync 推理,时间从 N×30s 降为 1×30s + +#### 新增 `prepare_segment()` 方法 +```python +def prepare_segment(self, video_path, target_duration, output_path, target_resolution=None): + # 素材时长 > 目标: 裁剪 (-t) + # 素材时长 < 目标: 循环 (-stream_loop) + 裁剪 + # 分辨率一致: -c copy 无损 (不重编码) + # 分辨率不一致: scale + pad 统一到第一个素材分辨率 +``` + +#### 分辨率处理策略 +- 新增 `get_resolution()` 方法检测各素材分辨率 +- 所有素材分辨率相同时:`-c copy` 无损裁剪(保持原画质) +- 分辨率不一致时:统一到第一个素材的分辨率,`force_original_aspect_ratio=decrease` + `pad` 居中 +- LatentSync 只处理嘴部 512×512 区域,输出保持原分辨率 + +#### 时间对齐验证 + +| 环节 | 时间基准 | 对齐关系 | +|------|---------|---------| +| TTS 音频 | 原始时长 (7.67s) | 基准 | +| Whisper 字幕 | 基于 TTS 音频 | 时间戳对齐音频 | +| 均分切分 | assignments 总时长 = 音频时长 | 首段 start=0, 末段 end=audio_duration | +| prepare 各段 | `-t seg_dur` 精确截断 | 总和 ≈ 音频时长 | +| LatentSync | concat_video + 完整音频 | 内部 0.5s 容差 | +| compose | lipsync_video + 音频/BGM | `-shortest` 保证同步 | +| Remotion | 基于 captions_path 渲染字幕 | 时间戳对齐音频 | + +--- + +### 涉及文件汇总 + +| 文件 | 变更类型 | 说明 | +|------|----------|------| +| `backend/app/modules/videos/workflow.py` | 修改 | 6 个 Bug 修复 + 流水线重构(先拼接再推理)| +| `backend/app/services/video_service.py` | 修改 | 新增 `prepare_segment()`、`get_resolution()`,`split_audio` 校验,`concat_videos` 空列表检查 | +| `frontend/src/shared/types/material.ts` | 新建 | 统一 Material 接口 | +| `frontend/src/features/home/model/useMaterials.ts` | 修改 | 上传自动选中、素材上限 4 个 | +| 
`frontend/src/features/home/model/useHomeController.ts` | 修改 | payload 非空断言修复、Material 接口引用 | +| `frontend/src/features/home/ui/MaterialSelector.tsx` | 修改 | 拖拽修复、上限 4 个 UI、移动端响应式 | +| `frontend/src/features/home/ui/GenerateActionBar.tsx` | 修改 | 进度消息展示、多素材耗时提示 | +| `frontend/src/features/home/ui/HomePage.tsx` | 修改 | 传递 message、materialCount prop | + +--- + +### 四、AI 多语言翻译 + +#### 功能 +在文案编辑区新增「AI多语言」按钮,支持将中文口播文案一键翻译为 9 种语言,并可随时还原原文。 + +#### 支持语言 +英语 English、日语 日本語、韩语 한국어、法语 Français、德语 Deutsch、西班牙语 Español、俄语 Русский、意大利语 Italiano、葡萄牙语 Português + +#### 实现 + +##### 后端 +- **`backend/app/services/glm_service.py`** — 新增 `translate_text()` 方法,调用智谱 GLM API(temperature=0.3),prompt 要求只返回译文、保持语气风格 +- **`backend/app/modules/ai/router.py`** — 新增 `POST /api/ai/translate` 接口,接收 `{text, target_lang}`,返回 `{translated_text}` + +##### 前端 +- **`frontend/src/features/home/ui/ScriptEditor.tsx`** — 新增 `LANGUAGES` 列表(9 种语言)、语言下拉菜单(点击外部自动关闭)、翻译中 loading 状态、「还原原文」按钮(翻译过后出现在菜单顶部) +- **`frontend/src/features/home/model/useHomeController.ts`** — 新增 `handleTranslate`(调用翻译 API、首次翻译保存原文)、`originalText` 状态、`handleRestoreOriginal`(恢复原文) + +#### 涉及文件 + +| 文件 | 变更 | 说明 | +|------|------|------| +| `backend/app/services/glm_service.py` | 修改 | 新增 `translate_text()` 方法 | +| `backend/app/modules/ai/router.py` | 修改 | 新增 `/api/ai/translate` 接口 | +| `frontend/src/features/home/ui/ScriptEditor.tsx` | 修改 | 语言菜单 UI、翻译 loading、还原原文按钮 | +| `frontend/src/features/home/model/useHomeController.ts` | 修改 | `handleTranslate`、`originalText`、`handleRestoreOriginal` | + +--- + +### 五、TTS 多语言支持 + +#### 背景 +翻译功能实现后,用户可将中文文案翻译为其他语言。但翻译后生成视频时 TTS 仍只支持中文: +- **EdgeTTS**:声音列表只有 5 个 `zh-CN-*` 中文声音 +- **声音克隆 (Qwen3-TTS)**:`language` 参数硬编码为 `"Chinese"` + +#### 实现方案 + +##### 1. 
前端:语言感知的声音列表 +- `VOICES` 从扁平数组扩展为 `Record`,覆盖 10 种语言(zh-CN / en-US / ja-JP / ko-KR / fr-FR / de-DE / es-ES / ru-RU / it-IT / pt-BR),每种语言 2 个声音(男/女) +- 新增 `LANG_TO_LOCALE` 映射:翻译目标语言名 → EdgeTTS locale(如 `"English" → "en-US"`) +- 新增 `textLang` 状态,跟踪当前文案语言,默认 `"zh-CN"` + +##### 2. 翻译时自动切换声音 +- `handleTranslate` 成功后:根据目标语言设置 `textLang`,EdgeTTS 模式下自动切换 `voice` 为目标语言的默认声音 +- `handleRestoreOriginal` 还原时:重置 `textLang` 为 `"zh-CN"`,恢复中文默认声音 +- `VoiceSelector` 根据 `textLang` 动态显示对应语言的声音列表 + +##### 3. 声音克隆语言透传 +- 前端:新增 `LOCALE_TO_QWEN_LANG` 映射(`zh-CN→"Chinese"`, `en-US→"English"`, 其他→`"Auto"`) +- 生成请求 payload 加入 `language` 字段(仅声音克隆模式) +- 后端 `GenerateRequest` schema 新增 `language: str = "zh-CN"` 字段(locale 形式) +- `workflow.py`:`language="Chinese"` 硬编码改为 `language=_locale_to_qwen_lang(req.language)`(`zh→"Chinese"`, `en→"English"`, 其他→`"Auto"`);Whisper 对齐同时传入 `language=_locale_to_whisper_lang(req.language)` + +##### 4. Bug 修复:textLang 持久化 +- **问题**: `voice` 已持久化但 `textLang` 未持久化,刷新页面后 `voice` 恢复为英文声音但 `textLang` 默认回中文,导致 VoiceSelector 显示中文声音列表却选中英文声音,无高亮按钮 +- **修复**: 在 `useHomePersistence` 中加入 `textLang` 的 localStorage 读写 + +#### 数据流 + +``` +用户翻译 "English" + → ScriptEditor.onTranslate("English") + → LANG_TO_LOCALE["English"] = "en-US" + → setTextLang("en-US"), setVoice("en-US-GuyNeural") + → VoiceSelector 显示 VOICES["en-US"] = [Guy, Jenny] + → 生成时: + EdgeTTS: payload.voice = "en-US-GuyNeural" + 声音克隆: payload.language = "English" (via getQwenLanguage) +``` + +#### 涉及文件 + +| 文件 | 变更 | 说明 | +|------|------|------| +| `frontend/src/features/home/model/useHomeController.ts` | 修改 | VOICES 多语言 Record、textLang 状态、LANG_TO_LOCALE / LOCALE_TO_QWEN_LANG 映射、翻译自动切换 voice | +| `frontend/src/features/home/model/useHomePersistence.ts` | 修改 | textLang 持久化读写 | +| `backend/app/modules/videos/schemas.py` | 修改 | GenerateRequest 加 `language` 字段 | +| `backend/app/modules/videos/workflow.py` | 修改 | 声音克隆调用处用 `_locale_to_qwen_lang(req.language)` 替代硬编码,Whisper 对齐传入 `_locale_to_whisper_lang(req.language)` | diff --git a/Docs/task_complete.md b/Docs/task_complete.md index 875c449..e72ac9b 100644 --- a/Docs/task_complete.md +++ b/Docs/task_complete.md @@ -10,7 +10,7 @@ > 这里记录了每一天的核心开发内容与 milestone。 -### Day 21: 缺陷修复 + 浮动预览 + 
发布重构 + 架构优化 (Current) +### Day 21: 缺陷修复 + 浮动预览 + 发布重构 + 架构优化 + 多素材生成 (Current) - [x] **Remotion 崩溃容错**: 渲染进程 SIGABRT 退出时检查输出文件,避免误判失败导致标题/字幕丢失。 - [x] **首页作品选择持久化**: 修复 `fetchGeneratedVideos` 无条件覆盖恢复值的问题,新增 `preferVideoId` 参数控制选中逻辑。 - [x] **发布页作品选择持久化**: 根因为签名 URL 不稳定,全面改用 `video.id` 替代 `path` 进行选择/持久化/比较。 @@ -22,6 +22,11 @@ - [x] **后端模块分层**: materials/tools/ref_audios 三个模块补全 router+schemas+service 分层。 - [x] **开发规范更新**: BACKEND_DEV.md 新增渐进原则、DOC_RULES.md 取消 TASK_COMPLETE.md 手动触发约束。 - [x] **文档全面更新**: BACKEND_DEV/README、FRONTEND_DEV、DEPLOY_MANUAL、README.md 同步更新。 +- [x] **多素材视频生成(多机位效果)**: 支持多选素材 + 拖拽排序,按素材数量均分音频时长(对齐 Whisper 字边界)自动切换机位。逐段 LatentSync + FFmpeg 拼接。前端 @dnd-kit 拖拽排序 UI。 +- [x] **字幕开关移除**: 默认启用逐字高亮字幕,移除开关及相关死代码。 +- [x] **视频格式扩展**: 上传支持 mkv/webm/flv/wmv/m4v/ts/mts 等常见格式。 +- [x] **Watchdog 优化**: 健康检查阈值提高到 5 次,新增重启冷却期 120 秒,避免误重启。 +- [x] **多素材 Bug 修复**: 修复标点分句方案对无句末标点文案无效(改为均分方案)、音频时间偏移导致口型不对齐等缺陷。 ### Day 20: 代码质量与安全优化 - [x] **功能性修复**: LatentSync 回退逻辑、任务状态接口认证、User 类型统一。 diff --git a/backend/app/modules/ai/router.py b/backend/app/modules/ai/router.py index 6e075dd..5d4731b 100644 --- a/backend/app/modules/ai/router.py +++ b/backend/app/modules/ai/router.py @@ -24,6 +24,33 @@ class GenerateMetaResponse(BaseModel): tags: list[str] +class TranslateRequest(BaseModel): + """翻译请求""" + text: str + target_lang: str + + +@router.post("/translate") +async def translate_text(req: TranslateRequest): + """ + AI 翻译文案 + + 将文案翻译为指定目标语言 + """ + if not req.text or not req.text.strip(): + raise HTTPException(status_code=400, detail="文案不能为空") + if not req.target_lang or not req.target_lang.strip(): + raise HTTPException(status_code=400, detail="目标语言不能为空") + + try: + logger.info(f"Translating text to {req.target_lang}: {req.text[:50]}...") + translated = await glm_service.translate_text(req.text.strip(), req.target_lang.strip()) + return success_response({"translated_text": translated}) + except Exception as e: + logger.error(f"Translate failed: {e}") + raise 
HTTPException(status_code=500, detail=str(e)) + + @router.post("/generate-meta") async def generate_meta(req: GenerateMetaRequest): """ diff --git a/backend/app/modules/videos/schemas.py b/backend/app/modules/videos/schemas.py index de27491..f2f15bb 100644 --- a/backend/app/modules/videos/schemas.py +++ b/backend/app/modules/videos/schemas.py @@ -1,14 +1,16 @@ from pydantic import BaseModel -from typing import Optional +from typing import Optional, List class GenerateRequest(BaseModel): text: str voice: str = "zh-CN-YunxiNeural" material_path: str + material_paths: Optional[List[str]] = None tts_mode: str = "edgetts" ref_audio_id: Optional[str] = None ref_text: Optional[str] = None + language: str = "zh-CN" title: Optional[str] = None enable_subtitles: bool = True subtitle_style_id: Optional[str] = None diff --git a/backend/app/modules/videos/workflow.py b/backend/app/modules/videos/workflow.py index 0166224..60c6193 100644 --- a/backend/app/modules/videos/workflow.py +++ b/backend/app/modules/videos/workflow.py @@ -1,4 +1,4 @@ -from typing import Optional, Any +from typing import Optional, Any, List from pathlib import Path import time import traceback @@ -24,6 +24,17 @@ from .schemas import GenerateRequest from .task_store import task_store +def _locale_to_whisper_lang(locale: str) -> str: + """'en-US' → 'en', 'zh-CN' → 'zh'""" + return locale.split("-")[0] if "-" in locale else locale + + +def _locale_to_qwen_lang(locale: str) -> str: + """'zh-CN' → 'Chinese', 'en-US' → 'English', 其他 → 'Auto'""" + mapping = {"zh": "Chinese", "en": "English"} + return mapping.get(locale.split("-")[0], "Auto") + + _lipsync_service: Optional[LipSyncService] = None _lipsync_ready: Optional[bool] = None _lipsync_last_check: float = 0 @@ -79,19 +90,107 @@ def _update_task(task_id: str, **updates: Any) -> None: task_store.update(task_id, updates) +# ── 多素材辅助函数 ── + + +def _split_equal(segments: List[dict], material_paths: List[str]) -> List[dict]: + """按素材数量均分音频时长,对齐到最近的 Whisper 字边界。 + 
+ Args: + segments: Whisper 产出的 segment 列表, 每个包含 words (字级时间戳) + material_paths: 素材路径列表 + + Returns: + [{"material_path": "...", "start": 0.0, "end": 5.2, "index": 0}, ...] + """ + # 展平所有 Whisper 字符 + all_chars: List[dict] = [] + for seg in segments: + for w in seg.get("words", []): + all_chars.append(w) + + n = len(material_paths) + + if not all_chars or n == 0: + return [{"material_path": material_paths[0] if material_paths else "", + "start": 0.0, "end": 99999.0, "index": 0}] + + # 素材数不能超过字符数,否则边界会重复 + if n > len(all_chars): + logger.warning(f"[MultiMat] 素材数({n}) > 字符数({len(all_chars)}),裁剪为 {len(all_chars)}") + n = len(all_chars) + + total_start = all_chars[0]["start"] + total_end = all_chars[-1]["end"] + seg_dur = (total_end - total_start) / n + + # 计算 N-1 个分割点,对齐到最近的字边界 + boundaries = [0] # 第一段从第 0 个字开始 + for i in range(1, n): + target_time = total_start + i * seg_dur + # 找到 start 时间最接近 target_time 的字 + best_idx = boundaries[-1] + 1 # 至少比上一个边界后移 1 + best_diff = float("inf") + for j in range(boundaries[-1] + 1, len(all_chars)): + diff = abs(all_chars[j]["start"] - target_time) + if diff < best_diff: + best_diff = diff + best_idx = j + elif diff > best_diff: + break # 时间递增,差值开始变大后可以停了 + boundaries.append(min(best_idx, len(all_chars) - 1)) + boundaries.append(len(all_chars)) # 最后一段到末尾 + + # 按边界生成分配结果 + assignments: List[dict] = [] + for i in range(n): + s_idx = boundaries[i] + e_idx = boundaries[i + 1] + if s_idx >= len(all_chars) or s_idx >= e_idx: + continue + assignments.append({ + "material_path": material_paths[i], + "start": all_chars[s_idx]["start"], + "end": all_chars[e_idx - 1]["end"], + "text": "".join(c["word"] for c in all_chars[s_idx:e_idx]), + "index": len(assignments), + }) + + if not assignments: + return [{"material_path": material_paths[0], "start": 0.0, "end": 99999.0, "index": 0}] + + logger.info(f"[MultiMat] 均分 {len(all_chars)} 字为 {len(assignments)} 段") + for a in assignments: + dur = a["end"] - a["start"] + logger.info(f" 段{a['index']}: 
[{a['start']:.2f}-{a['end']:.2f}s] ({dur:.1f}s) {a['text'][:20]}") + + return assignments + + async def process_video_generation(task_id: str, req: GenerateRequest, user_id: str): temp_files = [] try: start_time = time.time() + + # ── 确定素材列表 ── + material_paths: List[str] = [] + if req.material_paths and len(req.material_paths) > 1: + material_paths = req.material_paths + else: + material_paths = [req.material_path] + + is_multi = len(material_paths) > 1 + _update_task(task_id, status="processing", progress=5, message="正在下载素材...") temp_dir = settings.UPLOAD_DIR / "temp" temp_dir.mkdir(parents=True, exist_ok=True) - input_material_path = temp_dir / f"{task_id}_input.mp4" - temp_files.append(input_material_path) - - await _download_material(req.material_path, input_material_path) + # 单素材模式:下载主素材 + if not is_multi: + input_material_path = temp_dir / f"{task_id}_input.mp4" + temp_files.append(input_material_path) + await _download_material(material_paths[0], input_material_path) _update_task(task_id, message="正在生成语音...", progress=10) @@ -119,7 +218,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id: ref_audio_path=str(ref_audio_local), ref_text=req.ref_text, output_path=str(audio_path), - language="Chinese" + language=_locale_to_qwen_lang(req.language) ) else: _update_task(task_id, message="正在生成语音 (EdgeTTS)...") @@ -128,52 +227,183 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id: tts_time = time.time() - start_time print(f"[Pipeline] TTS completed in {tts_time:.1f}s") - _update_task(task_id, progress=25) - - _update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30) lipsync = _get_lipsync_service() lipsync_video_path = temp_dir / f"{task_id}_lipsync.mp4" temp_files.append(lipsync_video_path) - lipsync_start = time.time() - is_ready = await _check_lipsync_ready() - - if is_ready: - print(f"[LipSync] Starting LatentSync inference...") - _update_task(task_id, progress=35, message="正在运行 LatentSync 
推理...") - await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path)) - else: - print(f"[LipSync] LatentSync not ready, copying original video") - _update_task(task_id, message="唇形同步不可用,使用原始视频...") - import shutil - shutil.copy(str(input_material_path), lipsync_video_path) - - lipsync_time = time.time() - lipsync_start - print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s") - _update_task(task_id, progress=80) - + video = VideoService() captions_path = None - if req.enable_subtitles: - _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82) + + if is_multi: + # ══════════════════════════════════════ + # 多素材流水线 + # ══════════════════════════════════════ + _update_task(task_id, progress=12, message="正在生成字幕 (Whisper)...") captions_path = temp_dir / f"{task_id}_captions.json" temp_files.append(captions_path) try: - await whisper_service.align( + captions_data = await whisper_service.align( audio_path=str(audio_path), text=req.text, - output_path=str(captions_path) + output_path=str(captions_path), + language=_locale_to_whisper_lang(req.language), ) - print(f"[Pipeline] Whisper alignment completed") + print(f"[Pipeline] Whisper alignment completed (multi-material)") except Exception as e: - logger.warning(f"Whisper alignment failed, skipping subtitles: {e}") + logger.warning(f"Whisper alignment failed: {e}") + captions_data = None captions_path = None + _update_task(task_id, progress=15, message="正在分配素材...") + + if captions_data and captions_data.get("segments"): + assignments = _split_equal(captions_data["segments"], material_paths) + else: + # Whisper 失败 → 按时长均分(不依赖字符对齐) + logger.warning("[MultiMat] Whisper 无数据,按时长均分") + audio_dur = video._get_duration(str(audio_path)) + if audio_dur <= 0: + audio_dur = 30.0 # 安全兜底 + seg_dur = audio_dur / len(material_paths) + assignments = [ + {"material_path": material_paths[i], "start": i * seg_dur, + "end": (i + 1) * seg_dur, "index": i} + for i in range(len(material_paths)) + ] + + 
# 扩展段覆盖完整音频范围:首段从0开始,末段到音频结尾 + audio_duration = video._get_duration(str(audio_path)) + if assignments and audio_duration > 0: + assignments[0]["start"] = 0.0 + assignments[-1]["end"] = audio_duration + + num_segments = len(assignments) + print(f"[Pipeline] Multi-material: {num_segments} segments, {len(material_paths)} materials") + + if num_segments == 0: + raise RuntimeError("Multi-material: no valid segments after splitting") + + lipsync_start = time.time() + + # ── 第一步:下载所有素材并检测分辨率 ── + material_locals: List[Path] = [] + resolutions = [] + + for i, assignment in enumerate(assignments): + material_local = temp_dir / f"{task_id}_material_{i}.mp4" + temp_files.append(material_local) + await _download_material(assignment["material_path"], material_local) + material_locals.append(material_local) + resolutions.append(video.get_resolution(str(material_local))) + + # 分辨率不一致时,统一到第一个素材的分辨率 + base_res = resolutions[0] if resolutions else (0, 0) + need_scale = any(r != base_res for r in resolutions) and base_res[0] > 0 + if need_scale: + logger.info(f"[MultiMat] 素材分辨率不一致,统一到 {base_res[0]}x{base_res[1]}") + + # ── 第二步:裁剪每段素材到对应时长 ── + prepared_segments: List[Path] = [] + + for i, assignment in enumerate(assignments): + seg_progress = 15 + int((i / num_segments) * 30) # 15% → 45% + seg_dur = assignment["end"] - assignment["start"] + _update_task( + task_id, + progress=seg_progress, + message=f"正在准备素材 {i+1}/{num_segments}..." 
+ ) + + prepared_path = temp_dir / f"{task_id}_prepared_{i}.mp4" + temp_files.append(prepared_path) + video.prepare_segment( + str(material_locals[i]), seg_dur, str(prepared_path), + target_resolution=base_res if need_scale else None + ) + prepared_segments.append(prepared_path) + + # ── 第二步:拼接所有素材片段 ── + _update_task(task_id, progress=50, message="正在拼接素材片段...") + concat_path = temp_dir / f"{task_id}_concat.mp4" + temp_files.append(concat_path) + video.concat_videos( + [str(p) for p in prepared_segments], + str(concat_path) + ) + + # ── 第三步:一次 LatentSync 推理 ── + is_ready = await _check_lipsync_ready() + + if is_ready: + _update_task(task_id, progress=55, message="正在合成唇形 (LatentSync)...") + print(f"[LipSync] Multi-material: single LatentSync on concatenated video") + try: + await lipsync.generate(str(concat_path), str(audio_path), str(lipsync_video_path)) + except Exception as e: + logger.warning(f"[LipSync] Failed, fallback to concat without lipsync: {e}") + import shutil + shutil.copy(str(concat_path), str(lipsync_video_path)) + else: + print(f"[LipSync] Not ready, using concatenated video without lipsync") + import shutil + shutil.copy(str(concat_path), str(lipsync_video_path)) + + lipsync_time = time.time() - lipsync_start + print(f"[Pipeline] Multi-material prepare + concat + LipSync completed in {lipsync_time:.1f}s") + _update_task(task_id, progress=80) + + # 如果用户关闭了字幕,清除 captions_path(Whisper 仅用于句子切分) + if not req.enable_subtitles: + captions_path = None + + else: + # ══════════════════════════════════════ + # 单素材流水线(原有逻辑) + # ══════════════════════════════════════ + _update_task(task_id, progress=25) + _update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30) + + lipsync_start = time.time() + is_ready = await _check_lipsync_ready() + + if is_ready: + print(f"[LipSync] Starting LatentSync inference...") + _update_task(task_id, progress=35, message="正在运行 LatentSync 推理...") + await lipsync.generate(str(input_material_path), str(audio_path), 
str(lipsync_video_path)) + else: + print(f"[LipSync] LatentSync not ready, copying original video") + _update_task(task_id, message="唇形同步不可用,使用原始视频...") + import shutil + shutil.copy(str(input_material_path), lipsync_video_path) + + lipsync_time = time.time() - lipsync_start + print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s") + _update_task(task_id, progress=80) + + # 单素材模式:Whisper 在 LatentSync 之后 + if req.enable_subtitles: + _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82) + + captions_path = temp_dir / f"{task_id}_captions.json" + temp_files.append(captions_path) + + try: + await whisper_service.align( + audio_path=str(audio_path), + text=req.text, + output_path=str(captions_path), + language=_locale_to_whisper_lang(req.language), + ) + print(f"[Pipeline] Whisper alignment completed") + except Exception as e: + logger.warning(f"Whisper alignment failed, skipping subtitles: {e}") + captions_path = None + _update_task(task_id, progress=85) - video = VideoService() final_audio_path = audio_path if req.bgm_id: _update_task(task_id, message="正在合成背景音乐...", progress=86) diff --git a/backend/app/services/glm_service.py b/backend/app/services/glm_service.py index 05a2e5e..78d7e75 100644 --- a/backend/app/services/glm_service.py +++ b/backend/app/services/glm_service.py @@ -43,6 +43,7 @@ class GLMService: 要求: 1. 标题要简洁有力,能吸引观众点击,不超过10个字 2. 标签要与内容相关,便于搜索和推荐,只要3个 +3. 标题和标签必须使用与口播文案相同的语言(如文案是英文就用英文,日文就用日文) 请严格按以下JSON格式返回(不要包含其他内容): {{"title": "标题", "tags": ["标签1", "标签2", "标签3"]}}""" @@ -120,6 +121,49 @@ class GLMService: + async def translate_text(self, text: str, target_lang: str) -> str: + """ + 将文案翻译为指定语言 + + Args: + text: 原始文案 + target_lang: 目标语言(如 English, 日本語 等) + + Returns: + 翻译后的文案 + """ + prompt = f"""请将以下文案翻译为{target_lang}。 + +原文: +{text} + +要求: +1. 只返回翻译后的文案,不要添加任何解释或说明 +2. 保持原文的语气和风格 +3. 
翻译要自然流畅,符合目标语言的表达习惯""" + + try: + client = self._get_client() + logger.info(f"Using GLM to translate text to {target_lang}") + + import asyncio + response = await asyncio.to_thread( + client.chat.completions.create, + model=settings.GLM_MODEL, + messages=[{"role": "user", "content": prompt}], + thinking={"type": "disabled"}, + max_tokens=2000, + temperature=0.3 + ) + + content = response.choices[0].message.content + logger.info("GLM translation completed") + return content.strip() + + except Exception as e: + logger.error(f"GLM translate error: {e}") + raise Exception(f"AI 翻译失败: {str(e)}") + def _parse_json_response(self, content: str) -> dict: """解析 GLM 返回的 JSON 内容""" # 尝试直接解析 diff --git a/backend/app/services/video_service.py b/backend/app/services/video_service.py index f098225..d0ef6aa 100644 --- a/backend/app/services/video_service.py +++ b/backend/app/services/video_service.py @@ -138,3 +138,109 @@ class VideoService: return output_path else: raise RuntimeError("FFmpeg composition failed") + + def concat_videos(self, video_paths: list, output_path: str) -> str: + """使用 FFmpeg concat demuxer 拼接多个视频片段""" + if not video_paths: + raise ValueError("No video segments to concat") + + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + # 生成 concat list 文件 + list_path = Path(output_path).parent / f"{Path(output_path).stem}_concat.txt" + with open(list_path, "w", encoding="utf-8") as f: + for vp in video_paths: + f.write(f"file '{vp}'\n") + + cmd = [ + "ffmpeg", "-y", + "-f", "concat", + "-safe", "0", + "-i", str(list_path), + "-c", "copy", + output_path, + ] + + try: + if self._run_ffmpeg(cmd): + return output_path + else: + raise RuntimeError("FFmpeg concat failed") + finally: + try: + list_path.unlink(missing_ok=True) + except Exception: + pass + + def split_audio(self, audio_path: str, start: float, end: float, output_path: str) -> str: + """用 FFmpeg 按时间范围切分音频""" + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + duration = end - start + 
if duration <= 0: + raise ValueError(f"Invalid audio split range: start={start}, end={end}, duration={duration}") + + cmd = [ + "ffmpeg", "-y", + "-ss", str(start), + "-t", str(duration), + "-i", audio_path, + "-c", "copy", + output_path, + ] + + if self._run_ffmpeg(cmd): + return output_path + raise RuntimeError(f"FFmpeg audio split failed: {start}-{end}") + + def get_resolution(self, file_path: str) -> tuple: + """获取视频分辨率,返回 (width, height)""" + cmd = [ + 'ffprobe', '-v', 'error', + '-select_streams', 'v:0', + '-show_entries', 'stream=width,height', + '-of', 'csv=p=0', + file_path + ] + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) + parts = result.stdout.strip().split(',') + return (int(parts[0]), int(parts[1])) + except Exception: + return (0, 0) + + def prepare_segment(self, video_path: str, target_duration: float, output_path: str, + target_resolution: tuple = None) -> str: + """将素材视频裁剪或循环到指定时长(无音频)。 + target_resolution: (width, height) 如需统一分辨率则传入,否则保持原分辨率。 + """ + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + video_dur = self._get_duration(video_path) + if video_dur <= 0: + video_dur = target_duration + + needs_loop = target_duration > video_dur + needs_scale = target_resolution is not None + + cmd = ["ffmpeg", "-y"] + if needs_loop: + loop_count = int(target_duration / video_dur) + 1 + cmd.extend(["-stream_loop", str(loop_count)]) + cmd.extend(["-i", video_path, "-t", str(target_duration), "-an"]) + + if needs_scale: + w, h = target_resolution + cmd.extend(["-vf", f"scale={w}:{h}:force_original_aspect_ratio=decrease,pad={w}:{h}:(ow-iw)/2:(oh-ih)/2"]) + + # 需要循环或缩放时必须重编码,否则用 stream copy 保持原画质 + if needs_loop or needs_scale: + cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18"]) + else: + cmd.extend(["-c:v", "copy"]) + + cmd.append(output_path) + + if self._run_ffmpeg(cmd): + return output_path + raise RuntimeError(f"FFmpeg prepare_segment failed: {video_path}") diff --git 
a/backend/app/services/voice_clone_service.py b/backend/app/services/voice_clone_service.py index 37e5def..018d056 100644 --- a/backend/app/services/voice_clone_service.py +++ b/backend/app/services/voice_clone_service.py @@ -48,7 +48,7 @@ class VoiceCloneService: """ # 使用锁确保串行执行,避免 GPU 显存溢出 async with self._lock: - logger.info(f"🎤 Voice Clone: {text[:30]}...") + logger.info(f"🎤 Voice Clone: {text[:30]}... (language={language})") Path(output_path).parent.mkdir(parents=True, exist_ok=True) # 读取参考音频 diff --git a/backend/app/services/whisper_service.py b/backend/app/services/whisper_service.py index 35ad219..207ce3a 100644 --- a/backend/app/services/whisper_service.py +++ b/backend/app/services/whisper_service.py @@ -20,16 +20,23 @@ MAX_CHARS_PER_LINE = 12 def split_word_to_chars(word: str, start: float, end: float) -> list: """ - 将词拆分成单个字符,时间戳线性插值 + 将词拆分成单个字符,时间戳线性插值。 + 保留英文词前的空格(Whisper 输出如 " Hello"),用于正确重建英文字幕。 Args: - word: 词文本 + word: 词文本(可能含前导空格) start: 词开始时间 end: 词结束时间 Returns: 单字符列表,每个包含 word/start/end """ + # 保留前导空格(英文 Whisper 输出常见 " Hello" 形式) + leading_space = "" + if word and not word[0].strip(): + leading_space = " " + word = word.lstrip() + tokens = [] ascii_buffer = "" @@ -54,7 +61,8 @@ def split_word_to_chars(word: str, start: float, end: float) -> list: return [] if len(tokens) == 1: - return [{"word": tokens[0], "start": start, "end": end}] + w = leading_space + tokens[0] if leading_space else tokens[0] + return [{"word": w, "start": start, "end": end}] # 线性插值时间戳 duration = end - start @@ -64,8 +72,11 @@ def split_word_to_chars(word: str, start: float, end: float) -> list: for i, token in enumerate(tokens): token_start = start + i * token_duration token_end = start + (i + 1) * token_duration + w = token + if i == 0 and leading_space: + w = leading_space + w result.append({ - "word": token, + "word": w, "start": round(token_start, 3), "end": round(token_end, 3) }) @@ -108,7 +119,7 @@ def split_segment_to_lines(words: List[dict], max_chars: int = 
MAX_CHARS_PER_LIN if should_break and current_words: segments.append({ - "text": current_text, + "text": current_text.strip(), "start": current_words[0]["start"], "end": current_words[-1]["end"], "words": current_words.copy() @@ -119,7 +130,7 @@ def split_segment_to_lines(words: List[dict], max_chars: int = MAX_CHARS_PER_LIN # 处理剩余的字 if current_words: segments.append({ - "text": current_text, + "text": current_text.strip(), "start": current_words[0]["start"], "end": current_words[-1]["end"], "words": current_words.copy() @@ -162,7 +173,8 @@ class WhisperService: self, audio_path: str, text: str, - output_path: Optional[str] = None + output_path: Optional[str] = None, + language: str = "zh", ) -> dict: """ 对音频进行转录,生成字级别时间戳 @@ -171,12 +183,16 @@ class WhisperService: audio_path: 音频文件路径 text: 原始文本(用于参考,但实际使用 whisper 转录结果) output_path: 可选,输出 JSON 文件路径 + language: 语言代码 (zh/en 等) Returns: 包含字级别时间戳的字典 """ import asyncio + # 英文等西文需要更大的每行字数 + max_chars = 40 if language != "zh" else MAX_CHARS_PER_LINE + def _do_transcribe(): model = self._load_model() @@ -185,7 +201,7 @@ class WhisperService: # 转录并获取字级别时间戳 segments_iter, info = model.transcribe( audio_path, - language="zh", + language=language, word_timestamps=True, # 启用字级别时间戳 vad_filter=True, # 启用 VAD 过滤静音 ) @@ -198,9 +214,10 @@ class WhisperService: all_words = [] if segment.words: for word_info in segment.words: - word_text = word_info.word.strip() - if word_text: + word_text = word_info.word + if word_text.strip(): # 将词拆分成单字,时间戳线性插值 + # 保留前导空格用于英文词间距 chars = split_word_to_chars( word_text, word_info.start, @@ -210,7 +227,7 @@ class WhisperService: # 将长段落按标点和字数拆分成多行 if all_words: - line_segments = split_segment_to_lines(all_words, MAX_CHARS_PER_LINE) + line_segments = split_segment_to_lines(all_words, max_chars) all_segments.extend(line_segments) logger.info(f"Generated {len(all_segments)} subtitle segments") diff --git a/frontend/package-lock.json b/frontend/package-lock.json index cf7cd42..bd37513 100644 --- 
a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,6 +8,9 @@ "name": "frontend", "version": "0.1.0", "dependencies": { + "@dnd-kit/core": "^6.3.1", + "@dnd-kit/sortable": "^10.0.0", + "@dnd-kit/utilities": "^3.2.2", "@supabase/supabase-js": "^2.93.1", "axios": "^1.13.4", "lucide-react": "^0.563.0", @@ -281,6 +284,59 @@ "node": ">=6.9.0" } }, + "node_modules/@dnd-kit/accessibility": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@dnd-kit/accessibility/-/accessibility-3.1.1.tgz", + "integrity": "sha512-2P+YgaXF+gRsIihwwY1gCsQSYnu9Zyj2py8kY5fFvUM1qm2WA2u639R6YNVfU4GWr+ZM5mqEsfHZZLoRONbemw==", + "license": "MIT", + "dependencies": { + "tslib": "^2.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/core": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/@dnd-kit/core/-/core-6.3.1.tgz", + "integrity": "sha512-xkGBRQQab4RLwgXxoqETICr6S5JlogafbhNsidmrkVv2YRs5MLwpjoF2qpiGjQt8S9AoxtIV603s0GIUpY5eYQ==", + "license": "MIT", + "dependencies": { + "@dnd-kit/accessibility": "^3.1.1", + "@dnd-kit/utilities": "^3.2.2", + "tslib": "^2.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/sortable": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@dnd-kit/sortable/-/sortable-10.0.0.tgz", + "integrity": "sha512-+xqhmIIzvAYMGfBYYnbKuNicfSsk4RksY2XdmJhT+HAC01nix6fHCztU68jooFiMUB01Ky3F0FyOvhG/BZrWkg==", + "license": "MIT", + "dependencies": { + "@dnd-kit/utilities": "^3.2.2", + "tslib": "^2.0.0" + }, + "peerDependencies": { + "@dnd-kit/core": "^6.3.0", + "react": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/utilities": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@dnd-kit/utilities/-/utilities-3.2.2.tgz", + "integrity": "sha512-+MKAJEOfaBe5SmV6t34p80MMKhjvUz0vRrvVJbPT0WElzaOJ/1xs+D+KDv+tD/NE5ujfrChEcshd4fLn0wpiqg==", + "license": "MIT", + "dependencies": { + "tslib": "^2.0.0" + }, + 
"peerDependencies": { + "react": ">=16.8.0" + } + }, "node_modules/@emnapi/core": { "version": "1.8.1", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.8.1.tgz", diff --git a/frontend/package.json b/frontend/package.json index 476aadb..ebf5efb 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -9,6 +9,9 @@ "lint": "eslint" }, "dependencies": { + "@dnd-kit/core": "^6.3.1", + "@dnd-kit/sortable": "^10.0.0", + "@dnd-kit/utilities": "^3.2.2", "@supabase/supabase-js": "^2.93.1", "axios": "^1.13.4", "lucide-react": "^0.563.0", diff --git a/frontend/src/features/home/model/useHomeController.ts b/frontend/src/features/home/model/useHomeController.ts index f53533a..c6b6675 100644 --- a/frontend/src/features/home/model/useHomeController.ts +++ b/frontend/src/features/home/model/useHomeController.ts @@ -25,13 +25,64 @@ import { useRefAudios } from "@/features/home/model/useRefAudios"; import { useTitleSubtitleStyles } from "@/features/home/model/useTitleSubtitleStyles"; import { ApiResponse, unwrap } from "@/shared/api/types"; -const VOICES = [ - { id: "zh-CN-YunxiNeural", name: "云溪 (男声-年轻)" }, - { id: "zh-CN-YunjianNeural", name: "云健 (男声-新闻)" }, - { id: "zh-CN-YunyangNeural", name: "云扬 (男声-专业)" }, - { id: "zh-CN-XiaoxiaoNeural", name: "晓晓 (女声-活泼)" }, - { id: "zh-CN-XiaoyiNeural", name: "晓伊 (女声-温柔)" }, -]; +const VOICES: Record = { + "zh-CN": [ + { id: "zh-CN-YunxiNeural", name: "云溪 (男声-年轻)" }, + { id: "zh-CN-YunjianNeural", name: "云健 (男声-新闻)" }, + { id: "zh-CN-YunyangNeural", name: "云扬 (男声-专业)" }, + { id: "zh-CN-XiaoxiaoNeural", name: "晓晓 (女声-活泼)" }, + { id: "zh-CN-XiaoyiNeural", name: "晓伊 (女声-温柔)" }, + ], + "en-US": [ + { id: "en-US-GuyNeural", name: "Guy (Male)" }, + { id: "en-US-JennyNeural", name: "Jenny (Female)" }, + ], + "ja-JP": [ + { id: "ja-JP-KeitaNeural", name: "圭太 (男声)" }, + { id: "ja-JP-NanamiNeural", name: "七海 (女声)" }, + ], + "ko-KR": [ + { id: "ko-KR-InJoonNeural", name: "인준 (男声)" }, + { id: "ko-KR-SunHiNeural", name: "선히 (女声)" }, 
+ ], + "fr-FR": [ + { id: "fr-FR-HenriNeural", name: "Henri (Male)" }, + { id: "fr-FR-DeniseNeural", name: "Denise (Female)" }, + ], + "de-DE": [ + { id: "de-DE-ConradNeural", name: "Conrad (Male)" }, + { id: "de-DE-KatjaNeural", name: "Katja (Female)" }, + ], + "es-ES": [ + { id: "es-ES-AlvaroNeural", name: "Álvaro (Male)" }, + { id: "es-ES-ElviraNeural", name: "Elvira (Female)" }, + ], + "ru-RU": [ + { id: "ru-RU-DmitryNeural", name: "Дмитрий (Male)" }, + { id: "ru-RU-SvetlanaNeural", name: "Светлана (Female)" }, + ], + "it-IT": [ + { id: "it-IT-DiegoNeural", name: "Diego (Male)" }, + { id: "it-IT-ElsaNeural", name: "Elsa (Female)" }, + ], + "pt-BR": [ + { id: "pt-BR-AntonioNeural", name: "Antonio (Male)" }, + { id: "pt-BR-FranciscaNeural", name: "Francisca (Female)" }, + ], +}; + +const LANG_TO_LOCALE: Record = { + "中文": "zh-CN", + "English": "en-US", + "日本語": "ja-JP", + "한국어": "ko-KR", + "Français": "fr-FR", + "Deutsch": "de-DE", + "Español": "es-ES", + "Русский": "ru-RU", + "Italiano": "it-IT", + "Português": "pt-BR", +}; @@ -70,22 +121,17 @@ interface RefAudio { created_at: number; } -interface Material { - id: string; - name: string; - path: string; - size_mb: number; - scene?: string; -} +import type { Material } from "@/shared/types/material"; export const useHomeController = () => { const apiBase = getApiBaseUrl(); - const [selectedMaterial, setSelectedMaterial] = useState(""); + const [selectedMaterials, setSelectedMaterials] = useState([]); const [previewMaterial, setPreviewMaterial] = useState(null); const [text, setText] = useState(""); const [voice, setVoice] = useState("zh-CN-YunxiNeural"); + const [textLang, setTextLang] = useState("zh-CN"); // 使用全局任务状态 const { currentTask, isGenerating, startTask } = useTask(); @@ -96,7 +142,6 @@ export const useHomeController = () => { // 字幕和标题相关状态 const [videoTitle, setVideoTitle] = useState(""); - const [enableSubtitles, setEnableSubtitles] = useState(true); const [selectedSubtitleStyleId, 
setSelectedSubtitleStyleId] = useState(""); const [selectedTitleStyleId, setSelectedTitleStyleId] = useState(""); const [subtitleFontSize, setSubtitleFontSize] = useState(80); @@ -181,8 +226,8 @@ export const useHomeController = () => { { new_name: editMaterialName.trim() } ); const payload = unwrap(res); - if (selectedMaterial === materialId && payload?.id) { - setSelectedMaterial(payload.id); + if (selectedMaterials.includes(materialId) && payload?.id) { + setSelectedMaterials((prev) => prev.map((x) => (x === materialId ? payload.id : x))); } setEditingMaterialId(null); setEditMaterialName(""); @@ -197,6 +242,10 @@ export const useHomeController = () => { // AI 生成标题标签 const [isGeneratingMeta, setIsGeneratingMeta] = useState(false); + // AI 多语言翻译 + const [isTranslating, setIsTranslating] = useState(false); + const [originalText, setOriginalText] = useState(null); + // 在线录音相关 const [isRecording, setIsRecording] = useState(false); const [recordedBlob, setRecordedBlob] = useState(null); @@ -226,11 +275,13 @@ export const useHomeController = () => { uploadError, setUploadError, fetchMaterials, + toggleMaterial, + reorderMaterials, deleteMaterial, handleUpload, } = useMaterials({ - selectedMaterial, - setSelectedMaterial, + selectedMaterials, + setSelectedMaterials, }); const { @@ -338,14 +389,14 @@ export const useHomeController = () => { setText, videoTitle, setVideoTitle, - enableSubtitles, - setEnableSubtitles, ttsMode, setTtsMode, voice, setVoice, - selectedMaterial, - setSelectedMaterial, + textLang, + setTextLang, + selectedMaterials, + setSelectedMaterials, selectedSubtitleStyleId, setSelectedSubtitleStyleId, selectedTitleStyleId, @@ -410,7 +461,8 @@ export const useHomeController = () => { }, [isGenerating, currentTask, fetchGeneratedVideos]); useEffect(() => { - const material = materials.find((item) => item.id === selectedMaterial); + const firstSelected = selectedMaterials[0]; + const material = materials.find((item) => item.id === firstSelected); if 
(!material?.path) { setMaterialDimensions(null); return; @@ -450,7 +502,7 @@ export const useHomeController = () => { video.removeEventListener("loadedmetadata", handleLoaded); video.removeEventListener("error", handleError); }; - }, [materials, selectedMaterial]); + }, [materials, selectedMaterials]); useEffect(() => { @@ -486,12 +538,13 @@ export const useHomeController = () => { }, [selectedBgmId, bgmList]); useEffect(() => { - if (!selectedMaterial) return; - const target = materialItemRefs.current[selectedMaterial]; + const firstSelected = selectedMaterials[0]; + if (!firstSelected) return; + const target = materialItemRefs.current[firstSelected]; if (target) { target.scrollIntoView({ block: "nearest", behavior: "smooth" }); } - }, [selectedMaterial, materials]); + }, [selectedMaterials, materials]); // 【修复】历史视频默认选中逻辑 // 当持久化恢复完成,且列表加载完毕,如果没选中任何视频,默认选中第一个 @@ -639,9 +692,58 @@ export const useHomeController = () => { } }; + // AI 多语言翻译 + const handleTranslate = async (targetLang: string) => { + if (!text.trim()) { + toast.error("请先输入口播文案"); + return; + } + + // 首次翻译时保存原文 + if (originalText === null) { + setOriginalText(text); + } + + setIsTranslating(true); + try { + const { data: res } = await api.post>( + "/api/ai/translate", + { text: text.trim(), target_lang: targetLang } + ); + const payload = unwrap(res); + setText(payload.translated_text || ""); + + // 根据翻译目标语言更新 textLang 并自动切换声音 + const locale = LANG_TO_LOCALE[targetLang] || "zh-CN"; + setTextLang(locale); + if (ttsMode === "edgetts") { + const langVoices = VOICES[locale] || VOICES["zh-CN"]; + setVoice(langVoices[0].id); + } + } catch (err: unknown) { + console.error("AI translate failed:", err); + const axiosErr = err as { response?: { data?: { message?: string } }; message?: string }; + const errorMsg = axiosErr.response?.data?.message || axiosErr.message || String(err); + toast.error(`AI 翻译失败: ${errorMsg}`); + } finally { + setIsTranslating(false); + } + }; + + const handleRestoreOriginal = () => { + 
if (originalText !== null) { + setText(originalText); + setOriginalText(null); + setTextLang("zh-CN"); + if (ttsMode === "edgetts") { + setVoice(VOICES["zh-CN"][0].id); + } + } + }; + // 生成视频 const handleGenerate = async () => { - if (!selectedMaterial || !text.trim()) { + if (selectedMaterials.length === 0 || !text.trim()) { toast.error("请先选择素材并填写文案"); return; } @@ -663,26 +765,33 @@ export const useHomeController = () => { try { // 查找选中的素材对象以获取路径 - const materialObj = materials.find((m) => m.id === selectedMaterial); - if (!materialObj) { + const firstMaterialObj = materials.find((m) => m.id === selectedMaterials[0]); + if (!firstMaterialObj) { toast.error("素材数据异常"); return; } // 构建请求参数 const payload: Record = { - material_path: materialObj.path, + material_path: firstMaterialObj.path, text: text, tts_mode: ttsMode, title: videoTitle.trim() || undefined, - enable_subtitles: enableSubtitles, + enable_subtitles: true, }; - if (enableSubtitles && selectedSubtitleStyleId) { + // 多素材 + if (selectedMaterials.length > 1) { + payload.material_paths = selectedMaterials + .map((id) => materials.find((x) => x.id === id)?.path) + .filter((path): path is string => !!path); + } + + if (selectedSubtitleStyleId) { payload.subtitle_style_id = selectedSubtitleStyleId; } - if (enableSubtitles && subtitleFontSize) { + if (subtitleFontSize) { payload.subtitle_font_size = Math.round(subtitleFontSize); } @@ -698,15 +807,15 @@ export const useHomeController = () => { payload.title_top_margin = Math.round(titleTopMargin); } - if (enableSubtitles) { - payload.subtitle_bottom_margin = Math.round(subtitleBottomMargin); - } + payload.subtitle_bottom_margin = Math.round(subtitleBottomMargin); if (enableBgm && selectedBgmId) { payload.bgm_id = selectedBgmId; payload.bgm_volume = bgmVolume; } + payload.language = textLang; + if (ttsMode === "edgetts") { payload.voice = voice; } else { @@ -774,8 +883,9 @@ export const useHomeController = () => { fetchMaterials, deleteMaterial, handleUpload, - 
selectedMaterial, - setSelectedMaterial, + selectedMaterials, + toggleMaterial, + reorderMaterials, handlePreviewMaterial, editingMaterialId, editMaterialName, @@ -789,6 +899,10 @@ export const useHomeController = () => { setExtractModalOpen, handleGenerateMeta, isGeneratingMeta, + handleTranslate, + isTranslating, + originalText, + handleRestoreOriginal, showStylePreview, setShowStylePreview, videoTitle, @@ -809,17 +923,16 @@ export const useHomeController = () => { setTitleTopMargin, subtitleBottomMargin, setSubtitleBottomMargin, - enableSubtitles, - setEnableSubtitles, resolveAssetUrl, getFontFormat, buildTextShadow, materialDimensions, ttsMode, setTtsMode, - voices: VOICES, + voices: VOICES[textLang] || VOICES["zh-CN"], voice, setVoice, + textLang, refAudios, selectedRefAudio, handleSelectRefAudio, diff --git a/frontend/src/features/home/model/useHomePersistence.ts b/frontend/src/features/home/model/useHomePersistence.ts index 78cc99d..a9012ba 100644 --- a/frontend/src/features/home/model/useHomePersistence.ts +++ b/frontend/src/features/home/model/useHomePersistence.ts @@ -17,14 +17,14 @@ interface UseHomePersistenceOptions { setText: React.Dispatch>; videoTitle: string; setVideoTitle: React.Dispatch>; - enableSubtitles: boolean; - setEnableSubtitles: React.Dispatch>; ttsMode: 'edgetts' | 'voiceclone'; setTtsMode: React.Dispatch>; voice: string; setVoice: React.Dispatch>; - selectedMaterial: string; - setSelectedMaterial: React.Dispatch>; + textLang: string; + setTextLang: React.Dispatch>; + selectedMaterials: string[]; + setSelectedMaterials: React.Dispatch>; selectedSubtitleStyleId: string; setSelectedSubtitleStyleId: React.Dispatch>; selectedTitleStyleId: string; @@ -57,14 +57,14 @@ export const useHomePersistence = ({ setText, videoTitle, setVideoTitle, - enableSubtitles, - setEnableSubtitles, ttsMode, setTtsMode, voice, setVoice, - selectedMaterial, - setSelectedMaterial, + textLang, + setTextLang, + selectedMaterials, + setSelectedMaterials, 
selectedSubtitleStyleId, setSelectedSubtitleStyleId, selectedTitleStyleId, @@ -96,9 +96,9 @@ export const useHomePersistence = ({ const savedText = localStorage.getItem(`vigent_${storageKey}_text`); const savedTitle = localStorage.getItem(`vigent_${storageKey}_title`); - const savedSubtitles = localStorage.getItem(`vigent_${storageKey}_subtitles`); const savedTtsMode = localStorage.getItem(`vigent_${storageKey}_ttsMode`); const savedVoice = localStorage.getItem(`vigent_${storageKey}_voice`); + const savedTextLang = localStorage.getItem(`vigent_${storageKey}_textLang`); const savedMaterial = localStorage.getItem(`vigent_${storageKey}_material`); const savedSubtitleStyle = localStorage.getItem(`vigent_${storageKey}_subtitleStyle`); const savedTitleStyle = localStorage.getItem(`vigent_${storageKey}_titleStyle`); @@ -113,11 +113,23 @@ export const useHomePersistence = ({ setText(savedText || "大家好,欢迎来到我的频道,今天给大家分享一些有趣的内容。"); setVideoTitle(savedTitle ? clampTitle(savedTitle) : ""); - setEnableSubtitles(savedSubtitles !== null ? 
savedSubtitles === 'true' : true); setTtsMode((savedTtsMode as 'edgetts' | 'voiceclone') || 'edgetts'); setVoice(savedVoice || "zh-CN-YunxiNeural"); + if (savedTextLang) setTextLang(savedTextLang); - if (savedMaterial) setSelectedMaterial(savedMaterial); + if (savedMaterial) { + try { + const parsed = JSON.parse(savedMaterial); + if (Array.isArray(parsed)) { + setSelectedMaterials(parsed); + } else { + setSelectedMaterials([savedMaterial]); + } + } catch { + // 旧格式: 单字符串 + setSelectedMaterials([savedMaterial]); + } + } if (savedSubtitleStyle) setSelectedSubtitleStyleId(savedSubtitleStyle); if (savedTitleStyle) setSelectedTitleStyleId(savedTitleStyle); @@ -157,15 +169,15 @@ export const useHomePersistence = ({ isAuthLoading, setBgmVolume, setEnableBgm, - setEnableSubtitles, setSelectedBgmId, - setSelectedMaterial, + setSelectedMaterials, setSelectedSubtitleStyleId, setSelectedTitleStyleId, setSelectedVideoId, setSubtitleFontSize, setSubtitleSizeLocked, setText, + setTextLang, setTitleFontSize, setTitleSizeLocked, setTitleTopMargin, @@ -192,10 +204,6 @@ export const useHomePersistence = ({ return () => clearTimeout(timeout); }, [videoTitle, storageKey, isRestored]); - useEffect(() => { - if (isRestored) localStorage.setItem(`vigent_${storageKey}_subtitles`, String(enableSubtitles)); - }, [enableSubtitles, storageKey, isRestored]); - useEffect(() => { if (isRestored) localStorage.setItem(`vigent_${storageKey}_ttsMode`, ttsMode); }, [ttsMode, storageKey, isRestored]); @@ -205,10 +213,14 @@ export const useHomePersistence = ({ }, [voice, storageKey, isRestored]); useEffect(() => { - if (isRestored && selectedMaterial) { - localStorage.setItem(`vigent_${storageKey}_material`, selectedMaterial); + if (isRestored) localStorage.setItem(`vigent_${storageKey}_textLang`, textLang); + }, [textLang, storageKey, isRestored]); + + useEffect(() => { + if (isRestored && selectedMaterials.length > 0) { + localStorage.setItem(`vigent_${storageKey}_material`, 
JSON.stringify(selectedMaterials)); } - }, [selectedMaterial, storageKey, isRestored]); + }, [selectedMaterials, storageKey, isRestored]); useEffect(() => { if (isRestored && selectedSubtitleStyleId) { diff --git a/frontend/src/features/home/model/useMaterials.ts b/frontend/src/features/home/model/useMaterials.ts index 18957a6..f514166 100644 --- a/frontend/src/features/home/model/useMaterials.ts +++ b/frontend/src/features/home/model/useMaterials.ts @@ -2,23 +2,16 @@ import { useCallback, useState } from "react"; import api from "@/shared/api/axios"; import { ApiResponse, unwrap } from "@/shared/api/types"; import { toast } from "sonner"; - -interface Material { - id: string; - name: string; - scene: string; - size_mb: number; - path: string; -} +import type { Material } from "@/shared/types/material"; interface UseMaterialsOptions { - selectedMaterial: string; - setSelectedMaterial: React.Dispatch>; + selectedMaterials: string[]; + setSelectedMaterials: React.Dispatch>; } export const useMaterials = ({ - selectedMaterial, - setSelectedMaterial, + selectedMaterials, + setSelectedMaterials, }: UseMaterialsOptions) => { const [materials, setMaterials] = useState([]); const [fetchError, setFetchError] = useState(null); @@ -41,12 +34,13 @@ export const useMaterials = ({ setMaterials(nextMaterials); setLastMaterialCount(nextMaterials.length); - setSelectedMaterial((prev) => { - // 如果当前选中的素材在列表中依然存在,保持选中 - const exists = nextMaterials.some((item) => item.id === prev); - if (exists) return prev; + setSelectedMaterials((prev) => { + // 保留已选中且仍存在的 + const existingIds = new Set(nextMaterials.map((m) => m.id)); + const kept = prev.filter((id) => existingIds.has(id)); + if (kept.length > 0) return kept; // 否则默认选中第一个 - return nextMaterials[0]?.id || ""; + return nextMaterials[0]?.id ? 
[nextMaterials[0].id] : []; }); } catch (error) { console.error("获取素材失败:", error); @@ -54,29 +48,58 @@ export const useMaterials = ({ } finally { setIsFetching(false); } - }, [setSelectedMaterial]); + }, [setSelectedMaterials]); + + const MAX_MATERIALS = 4; + + const toggleMaterial = useCallback((id: string) => { + setSelectedMaterials((prev) => { + if (prev.includes(id)) { + // 不能取消最后一个 + if (prev.length <= 1) return prev; + return prev.filter((x) => x !== id); + } + if (prev.length >= MAX_MATERIALS) return prev; + return [...prev, id]; + }); + }, [setSelectedMaterials]); + + const reorderMaterials = useCallback((activeId: string, overId: string) => { + setSelectedMaterials((prev) => { + const oldIndex = prev.indexOf(activeId); + const newIndex = prev.indexOf(overId); + if (oldIndex === -1 || newIndex === -1) return prev; + const next = [...prev]; + next.splice(oldIndex, 1); + next.splice(newIndex, 0, activeId); + return next; + }); + }, [setSelectedMaterials]); const deleteMaterial = useCallback(async (materialId: string) => { if (!confirm("确定要删除这个素材吗?")) return; try { await api.delete(`/api/materials/${materialId}`); fetchMaterials(); - if (selectedMaterial === materialId) { - setSelectedMaterial(""); + if (selectedMaterials.includes(materialId)) { + setSelectedMaterials((prev) => { + const next = prev.filter((id) => id !== materialId); + return next.length > 0 ? 
next : []; + }); } } catch (error) { toast.error("删除失败: " + error); } - }, [fetchMaterials, selectedMaterial, setSelectedMaterial]); + }, [fetchMaterials, selectedMaterials, setSelectedMaterials]); const handleUpload = useCallback(async (e: React.ChangeEvent) => { const file = e.target.files?.[0]; if (!file) return; - const validTypes = ['.mp4', '.mov', '.avi']; + const validTypes = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.wmv', '.m4v', '.ts', '.mts']; const ext = file.name.toLowerCase().slice(file.name.lastIndexOf('.')); if (!validTypes.includes(ext)) { - setUploadError('仅支持 MP4、MOV、AVI 格式'); + setUploadError('不支持的视频格式'); return; } @@ -100,7 +123,22 @@ export const useMaterials = ({ setUploadProgress(100); setIsUploading(false); - fetchMaterials(); + + // 上传后重新拉列表并自动选中新素材 + const { data: res } = await api.get>( + `/api/materials?t=${new Date().getTime()}` + ); + const payload = unwrap(res); + const nextMaterials = payload.materials || []; + setMaterials(nextMaterials); + setLastMaterialCount(nextMaterials.length); + + // 找出新增的素材 ID 并自动选中 + const oldIds = new Set(materials.map((m) => m.id)); + const newIds = nextMaterials.filter((m) => !oldIds.has(m.id)).map((m) => m.id); + if (newIds.length > 0) { + setSelectedMaterials((prev) => [...prev, ...newIds]); + } } catch (err: unknown) { console.error("Upload failed:", err); setIsUploading(false); @@ -122,6 +160,8 @@ export const useMaterials = ({ uploadError, setUploadError, fetchMaterials, + toggleMaterial, + reorderMaterials, deleteMaterial, handleUpload, }; diff --git a/frontend/src/features/home/ui/GenerateActionBar.tsx b/frontend/src/features/home/ui/GenerateActionBar.tsx index 52776c2..c2148f6 100644 --- a/frontend/src/features/home/ui/GenerateActionBar.tsx +++ b/frontend/src/features/home/ui/GenerateActionBar.tsx @@ -4,6 +4,7 @@ interface GenerateActionBarProps { isGenerating: boolean; progress: number; disabled: boolean; + materialCount?: number; onGenerate: () => void; } @@ -11,43 +12,51 @@ export 
function GenerateActionBar({ isGenerating, progress, disabled, + materialCount = 1, onGenerate, }: GenerateActionBarProps) { return ( - + {!isGenerating && materialCount >= 2 && ( +

+ 多素材模式 ({materialCount} 个机位),生成耗时较长 +

)} - + ); } diff --git a/frontend/src/features/home/ui/HomePage.tsx b/frontend/src/features/home/ui/HomePage.tsx index de24151..388428c 100644 --- a/frontend/src/features/home/ui/HomePage.tsx +++ b/frontend/src/features/home/ui/HomePage.tsx @@ -34,8 +34,9 @@ export function HomePage() { fetchMaterials, deleteMaterial, handleUpload, - selectedMaterial, - setSelectedMaterial, + selectedMaterials, + toggleMaterial, + reorderMaterials, handlePreviewMaterial, editingMaterialId, editMaterialName, @@ -49,6 +50,10 @@ export function HomePage() { setExtractModalOpen, handleGenerateMeta, isGeneratingMeta, + handleTranslate, + isTranslating, + originalText, + handleRestoreOriginal, showStylePreview, setShowStylePreview, videoTitle, @@ -69,8 +74,6 @@ export function HomePage() { setTitleTopMargin, subtitleBottomMargin, setSubtitleBottomMargin, - enableSubtitles, - setEnableSubtitles, resolveAssetUrl, getFontFormat, buildTextShadow, @@ -147,7 +150,7 @@ export function HomePage() { {/* 素材选择 */} setExtractModalOpen(true)} onGenerateMeta={handleGenerateMeta} isGeneratingMeta={isGeneratingMeta} + onTranslate={handleTranslate} + isTranslating={isTranslating} + hasOriginalText={originalText !== null} + onRestoreOriginal={handleRestoreOriginal} /> {/* 标题和字幕设置 */} @@ -207,8 +215,6 @@ export function HomePage() { onTitleTopMarginChange={setTitleTopMargin} subtitleBottomMargin={subtitleBottomMargin} onSubtitleBottomMarginChange={setSubtitleBottomMargin} - enableSubtitles={enableSubtitles} - onToggleSubtitles={setEnableSubtitles} resolveAssetUrl={resolveAssetUrl} getFontFormat={getFontFormat} buildTextShadow={buildTextShadow} @@ -276,7 +282,8 @@ export function HomePage() { diff --git a/frontend/src/features/home/ui/MaterialSelector.tsx b/frontend/src/features/home/ui/MaterialSelector.tsx index 11f364e..78f7ad7 100644 --- a/frontend/src/features/home/ui/MaterialSelector.tsx +++ b/frontend/src/features/home/ui/MaterialSelector.tsx @@ -1,17 +1,25 @@ -import type { ChangeEvent, MouseEvent } 
from "react"; -import { Upload, RefreshCw, Eye, Trash2, X, Pencil, Check } from "lucide-react"; - -interface Material { - id: string; - name: string; - scene: string; - size_mb: number; - path: string; -} +import { type ChangeEvent, type MouseEvent } from "react"; +import { Upload, RefreshCw, Eye, Trash2, X, Pencil, Check, GripVertical } from "lucide-react"; +import type { Material } from "@/shared/types/material"; +import { + DndContext, + closestCenter, + KeyboardSensor, + PointerSensor, + useSensor, + useSensors, + type DragEndEvent, +} from "@dnd-kit/core"; +import { + SortableContext, + horizontalListSortingStrategy, + useSortable, +} from "@dnd-kit/sortable"; +import { CSS } from "@dnd-kit/utilities"; interface MaterialSelectorProps { materials: Material[]; - selectedMaterial: string; + selectedMaterials: string[]; isFetching: boolean; lastMaterialCount: number; editingMaterialId: string | null; @@ -23,7 +31,8 @@ interface MaterialSelectorProps { apiBase: string; onUploadChange: (event: ChangeEvent) => void; onRefresh: () => void; - onSelectMaterial: (id: string) => void; + onToggleMaterial: (id: string) => void; + onReorderMaterials: (activeId: string, overId: string) => void; onPreviewMaterial: (path: string) => void; onStartEditing: (material: Material, event: MouseEvent) => void; onEditNameChange: (value: string) => void; @@ -34,9 +43,64 @@ interface MaterialSelectorProps { registerMaterialRef: (id: string, element: HTMLDivElement | null) => void; } +function SortableChip({ + id, + index, + label, + onRemove, +}: { + id: string; + index: number; + label: string; + onRemove: () => void; +}) { + const { + attributes, + listeners, + setNodeRef, + transform, + transition, + isDragging, + } = useSortable({ id }); + + const style = { + transform: CSS.Translate.toString(transform), + transition, + }; + + const circledNumbers = ["\u2460", "\u2461", "\u2462", "\u2463", "\u2464", "\u2465", "\u2466", "\u2467", "\u2468", "\u2469"]; + + return ( +
+ + + + {circledNumbers[index] || `${index + 1}`} + {label} + +
+ ); +} + export function MaterialSelector({ materials, - selectedMaterial, + selectedMaterials, isFetching, lastMaterialCount, editingMaterialId, @@ -48,7 +112,8 @@ export function MaterialSelector({ apiBase, onUploadChange, onRefresh, - onSelectMaterial, + onToggleMaterial, + onReorderMaterials, onPreviewMaterial, onStartEditing, onEditNameChange, @@ -58,20 +123,36 @@ export function MaterialSelector({ onClearUploadError, registerMaterialRef, }: MaterialSelectorProps) { + const sensors = useSensors( + useSensor(PointerSensor, { activationConstraint: { distance: 5 } }), + useSensor(KeyboardSensor) + ); + + const handleDragEnd = (event: DragEndEvent) => { + const { active, over } = event; + if (over && active.id !== over.id) { + onReorderMaterials(String(active.id), String(over.id)); + } + }; + + const selectedSet = new Set(selectedMaterials); + const isFull = selectedMaterials.length >= 4; + const circledNumbers = ["\u2460", "\u2461", "\u2462", "\u2463", "\u2464", "\u2465", "\u2466", "\u2467", "\u2468", "\u2469"]; + return (

📹 视频素材 - (上传自拍视频) + (可多选,最多4个)

@@ -119,6 +200,38 @@ export function MaterialSelector({
)} + {/* 已选素材排列(拖拽排序区) - 仅当选中 >= 2 个时显示 */} + {selectedMaterials.length >= 2 && ( +
+
🎬 机位顺序 (拖拽调整)
+ + +
+ {selectedMaterials.map((id, index) => { + const m = materials.find((x) => x.id === id); + return ( + onToggleMaterial(id)} + /> + ); + })} +
+
+
+
+ )} + {fetchError ? (
获取素材失败: {fetchError} @@ -126,7 +239,7 @@ export function MaterialSelector({ API: {apiBase}/api/materials/
) : isFetching && materials.length === 0 ? ( -
+
{Array.from({ length: Math.min(4, Math.max(1, lastMaterialCount || 1)) }).map((_, index) => (
) : (
- {materials.map((m) => ( -
registerMaterialRef(m.id, el)} - className={`p-3 rounded-lg border transition-all flex items-center justify-between group ${selectedMaterial === m.id - ? "border-purple-500 bg-purple-500/20" - : "border-white/10 bg-white/5 hover:border-white/30" - }`} - > - {editingMaterialId === m.id ? ( -
e.stopPropagation()}> - onEditNameChange(e.target.value)} - className="flex-1 bg-black/40 border border-white/20 rounded-md px-2 py-1 text-xs text-white" - autoFocus - /> - - -
- ) : ( - - )} -
- - {editingMaterialId !== m.id && ( - + +
+ ) : ( + )} - +
+ + {editingMaterialId !== m.id && ( + + )} + +
-
- ))} + ); + })}
)}
diff --git a/frontend/src/features/home/ui/ScriptEditor.tsx b/frontend/src/features/home/ui/ScriptEditor.tsx index 1830df8..e8f0875 100644 --- a/frontend/src/features/home/ui/ScriptEditor.tsx +++ b/frontend/src/features/home/ui/ScriptEditor.tsx @@ -1,4 +1,17 @@ -import { FileText, Loader2, Sparkles } from "lucide-react"; +import { useEffect, useRef, useState } from "react"; +import { FileText, Languages, Loader2, RotateCcw, Sparkles } from "lucide-react"; + +const LANGUAGES = [ + { code: "English", label: "英语 English" }, + { code: "日本語", label: "日语 日本語" }, + { code: "한국어", label: "韩语 한국어" }, + { code: "Français", label: "法语 Français" }, + { code: "Deutsch", label: "德语 Deutsch" }, + { code: "Español", label: "西班牙语 Español" }, + { code: "Русский", label: "俄语 Русский" }, + { code: "Italiano", label: "意大利语 Italiano" }, + { code: "Português", label: "葡萄牙语 Português" }, +]; interface ScriptEditorProps { text: string; @@ -6,6 +19,10 @@ interface ScriptEditorProps { onOpenExtractModal: () => void; onGenerateMeta: () => void; isGeneratingMeta: boolean; + onTranslate: (targetLang: string) => void; + isTranslating: boolean; + hasOriginalText: boolean; + onRestoreOriginal: () => void; } export function ScriptEditor({ @@ -14,14 +31,37 @@ export function ScriptEditor({ onOpenExtractModal, onGenerateMeta, isGeneratingMeta, + onTranslate, + isTranslating, + hasOriginalText, + onRestoreOriginal, }: ScriptEditorProps) { + const [showLangMenu, setShowLangMenu] = useState(false); + const langMenuRef = useRef(null); + + useEffect(() => { + if (!showLangMenu) return; + const handleClickOutside = (e: MouseEvent) => { + if (langMenuRef.current && !langMenuRef.current.contains(e.target as Node)) { + setShowLangMenu(false); + } + }; + document.addEventListener("mousedown", handleClickOutside); + return () => document.removeEventListener("mousedown", handleClickOutside); + }, [showLangMenu]); + + const handleSelectLang = (langCode: string) => { + setShowLangMenu(false); + 
onTranslate(langCode); + }; + return ( -
-
-

+
+
+

✍️ 文案提取与编辑

-
+
+
+ + {showLangMenu && ( +
+ {hasOriginalText && ( + <> + +
+ + )} + {LANGUAGES.map((lang) => ( + + ))} +
+ )} +
)} - {enableSubtitles && subtitleStyles.length > 0 && ( + {subtitleStyles.length > 0 && (
@@ -232,22 +228,6 @@ export function TitleSubtitlePanel({
)} - -
-
- 逐字高亮字幕 -

自动生成卡拉OK效果字幕

-
- -
); } diff --git a/frontend/src/shared/types/material.ts b/frontend/src/shared/types/material.ts new file mode 100644 index 0000000..accd834 --- /dev/null +++ b/frontend/src/shared/types/material.ts @@ -0,0 +1,7 @@ +export interface Material { + id: string; + name: string; + path: string; + size_mb: number; + scene?: string; +} diff --git a/models/Qwen3-TTS/qwen_tts_server.py b/models/Qwen3-TTS/qwen_tts_server.py index c53982c..a0d835c 100644 --- a/models/Qwen3-TTS/qwen_tts_server.py +++ b/models/Qwen3-TTS/qwen_tts_server.py @@ -134,10 +134,14 @@ async def generate( try: print(f"🎤 Generating: {text[:30]}...") print(f"📝 Ref text: {ref_text[:50]}...") + print(f"🌐 Language: {language}") start = time.time() - wavs, sr = _model.generate_voice_clone( + # 在线程池中运行,避免阻塞事件循环导致健康检查超时 + import asyncio + wavs, sr = await asyncio.to_thread( + _model.generate_voice_clone, text=text, language=language, ref_audio=ref_audio_path,