更新

2026-02-09 14:47:19 +08:00
parent e226224119
commit 3129d45b25
23 changed files with 1529 additions and 294 deletions
--- a/backend/app/modules/ai/router.py
+++ b/backend/app/modules/ai/router.py
@@ -24,6 +24,33 @@ class GenerateMetaResponse(BaseModel):
    tags: list[str]


+class TranslateRequest(BaseModel):
+    """Request body for the AI translation endpoint."""
+    # Source copy to translate; the /translate handler rejects blank values.
+    text: str
+    # Target language; the /translate handler rejects blank values.
+    target_lang: str
+
+
+@router.post("/translate")
+async def translate_text(req: TranslateRequest):
+    """Translate a piece of copy into the requested target language.
+
+    Args:
+        req: Request carrying the source ``text`` and the ``target_lang``.
+
+    Returns:
+        ``success_response`` wrapping ``{"translated_text": ...}``.
+
+    Raises:
+        HTTPException: 400 when ``text`` or ``target_lang`` is empty or
+            whitespace-only; 500 when the translation call fails.
+    """
+    # Normalise once so validation and the service call see the same values.
+    text = (req.text or "").strip()
+    target_lang = (req.target_lang or "").strip()
+
+    if not text:
+        raise HTTPException(status_code=400, detail="文案不能为空")
+    if not target_lang:
+        raise HTTPException(status_code=400, detail="目标语言不能为空")
+
+    try:
+        logger.info(f"Translating text to {req.target_lang}: {req.text[:50]}...")
+        translated = await glm_service.translate_text(text, target_lang)
+        return success_response({"translated_text": translated})
+    except Exception as e:
+        # Boundary handler: keep the traceback in the log and chain the cause
+        # so the original failure is not hidden behind the HTTP error.
+        logger.exception(f"Translate failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
@router.post("/generate-meta")
 async def generate_meta(req: GenerateMetaRequest):
    """
--- a/backend/app/modules/videos/schemas.py
+++ b/backend/app/modules/videos/schemas.py
@@ -1,14 +1,16 @@
 from pydantic import BaseModel
-from typing import Optional
+from typing import Optional, List


 class GenerateRequest(BaseModel):
    text: str
    voice: str = "zh-CN-YunxiNeural"
    material_path: str
+    material_paths: Optional[List[str]] = None
    tts_mode: str = "edgetts"
    ref_audio_id: Optional[str] = None
    ref_text: Optional[str] = None
+    language: str = "zh-CN"
    title: Optional[str] = None
    enable_subtitles: bool = True
    subtitle_style_id: Optional[str] = None
--- a/backend/app/modules/videos/workflow.py
+++ b/backend/app/modules/videos/workflow.py
@@ -1,4 +1,4 @@
-from typing import Optional, Any
+from typing import Optional, Any, List
 from pathlib import Path
 import time
 import traceback
@@ -24,6 +24,17 @@ from .schemas import GenerateRequest
 from .task_store import task_store


+def _locale_to_whisper_lang(locale: str) -> str:
+    """Return the part of *locale* that precedes its first hyphen.
+
+    Examples: 'en-US' → 'en', 'zh-CN' → 'zh'. A value without a hyphen is
+    returned unchanged.
+    """
+    primary, _, _ = locale.partition("-")
+    return primary
+
+
+def _locale_to_qwen_lang(locale: str) -> str:
+    """Map a locale tag to a language name.
+
+    The part before the first hyphen decides the result:
+    'zh' → 'Chinese', 'en' → 'English', anything else → 'Auto'.
+    """
+    primary = locale.partition("-")[0]
+    if primary == "zh":
+        return "Chinese"
+    if primary == "en":
+        return "English"
+    return "Auto"
+
+
 _lipsync_service: Optional[LipSyncService] = None
 _lipsync_ready: Optional[bool] = None
 _lipsync_last_check: float = 0
@@ -79,19 +90,107 @@ def _update_task(task_id: str, **updates: Any) -> None:
    task_store.update(task_id, updates)


+# ── 多素材辅助函数 ──
+
+
+def _split_equal(segments: List[dict], material_paths: List[str]) -> List[dict]:
+    """Split the audio timeline evenly across the materials.
+
+    Each cut is snapped to the Whisper word whose start time is closest to
+    the ideal equal-duration split point.
+
+    Args:
+        segments: Whisper segments; each may carry a ``words`` list whose
+            items have ``start``, ``end`` and ``word`` keys.
+        material_paths: Material paths, in playback order.
+
+    Returns:
+        One dict per produced segment, e.g.
+        ``{"material_path": "...", "start": 0.0, "end": 5.2,
+        "text": "...", "index": 0}``. When there are no words or no
+        materials, a single catch-all entry (``start`` 0.0, ``end`` 99999.0,
+        no ``text`` key) is returned instead. When there are more materials
+        than words, only the first ``len(words)`` materials are used.
+    """
+    # Flatten every word-level timestamp into one list.
+    all_chars: List[dict] = [w for seg in segments for w in seg.get("words", [])]
+
+    n = len(material_paths)
+
+    if not all_chars or n == 0:
+        return [{"material_path": material_paths[0] if material_paths else "",
+                 "start": 0.0, "end": 99999.0, "index": 0}]
+
+    total = len(all_chars)
+
+    # There cannot be more segments than words, otherwise cuts would repeat.
+    if n > total:
+        logger.warning(f"[MultiMat] 素材数({n}) > 字符数({len(all_chars)})，裁剪为 {len(all_chars)}")
+        n = total
+
+    total_start = all_chars[0]["start"]
+    total_end = all_chars[-1]["end"]
+    seg_dur = (total_end - total_start) / n
+
+    # Pick the N-1 cut points; boundaries[k] is the index of the first word
+    # of segment k.
+    boundaries = [0]
+    for i in range(1, n):
+        target_time = total_start + i * seg_dur
+        # Each cut moves forward by at least one word ...
+        first = boundaries[-1] + 1
+        # ... and must leave at least one word for every later segment, so
+        # that no segment ends up empty and no material is silently dropped.
+        last = total - (n - i)
+        best_idx = first
+        best_diff = float("inf")
+        for j in range(first, last + 1):
+            diff = abs(all_chars[j]["start"] - target_time)
+            if diff < best_diff:
+                best_diff = diff
+                best_idx = j
+            elif diff > best_diff:
+                # Start times increase, so the distance only grows from here.
+                break
+        boundaries.append(best_idx)
+    boundaries.append(total)  # the last segment runs to the final word
+
+    # Boundaries are strictly increasing, so every segment holds >= 1 word.
+    assignments: List[dict] = [
+        {
+            "material_path": material_paths[i],
+            "start": all_chars[s_idx]["start"],
+            "end": all_chars[e_idx - 1]["end"],
+            "text": "".join(c["word"] for c in all_chars[s_idx:e_idx]),
+            "index": i,
+        }
+        for i, (s_idx, e_idx) in enumerate(zip(boundaries, boundaries[1:]))
+    ]
+
+    logger.info(f"[MultiMat] 均分 {len(all_chars)} 字为 {len(assignments)} 段")
+    for a in assignments:
+        dur = a["end"] - a["start"]
+        logger.info(f"  段{a['index']}: [{a['start']:.2f}-{a['end']:.2f}s] ({dur:.1f}s) {a['text'][:20]}")
+
+    return assignments
+
+
 async def process_video_generation(task_id: str, req: GenerateRequest, user_id: str):
    temp_files = []
    try:
        start_time = time.time()
+
+        # ── 确定素材列表 ──
+        material_paths: List[str] = []
+        if req.material_paths and len(req.material_paths) > 1:
+            material_paths = req.material_paths
+        else:
+            material_paths = [req.material_path]
+
+        is_multi = len(material_paths) > 1
+
        _update_task(task_id, status="processing", progress=5, message="正在下载素材...")

        temp_dir = settings.UPLOAD_DIR / "temp"
        temp_dir.mkdir(parents=True, exist_ok=True)

-        input_material_path = temp_dir / f"{task_id}_input.mp4"
-        temp_files.append(input_material_path)
-
-        await _download_material(req.material_path, input_material_path)
+        # 单素材模式：下载主素材
+        if not is_multi:
+            input_material_path = temp_dir / f"{task_id}_input.mp4"
+            temp_files.append(input_material_path)
+            await _download_material(material_paths[0], input_material_path)

        _update_task(task_id, message="正在生成语音...", progress=10)

@@ -119,7 +218,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
                ref_audio_path=str(ref_audio_local),
                ref_text=req.ref_text,
                output_path=str(audio_path),
-                language="Chinese"
+                language=_locale_to_qwen_lang(req.language)
            )
        else:
            _update_task(task_id, message="正在生成语音 (EdgeTTS)...")
@@ -128,52 +227,183 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:

        tts_time = time.time() - start_time
        print(f"[Pipeline] TTS completed in {tts_time:.1f}s")
-        _update_task(task_id, progress=25)
-
-        _update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)

        lipsync = _get_lipsync_service()
        lipsync_video_path = temp_dir / f"{task_id}_lipsync.mp4"
        temp_files.append(lipsync_video_path)

-        lipsync_start = time.time()
-        is_ready = await _check_lipsync_ready()
-
-        if is_ready:
-            print(f"[LipSync] Starting LatentSync inference...")
-            _update_task(task_id, progress=35, message="正在运行 LatentSync 推理...")
-            await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path))
-        else:
-            print(f"[LipSync] LatentSync not ready, copying original video")
-            _update_task(task_id, message="唇形同步不可用，使用原始视频...")
-            import shutil
-            shutil.copy(str(input_material_path), lipsync_video_path)
-
-        lipsync_time = time.time() - lipsync_start
-        print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
-        _update_task(task_id, progress=80)
-
+        video = VideoService()
        captions_path = None
-        if req.enable_subtitles:
-            _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)
+
+        if is_multi:
+            # ══════════════════════════════════════
+            # 多素材流水线
+            # ══════════════════════════════════════
+            _update_task(task_id, progress=12, message="正在生成字幕 (Whisper)...")

            captions_path = temp_dir / f"{task_id}_captions.json"
            temp_files.append(captions_path)

            try:
-                await whisper_service.align(
+                captions_data = await whisper_service.align(
                    audio_path=str(audio_path),
                    text=req.text,
-                    output_path=str(captions_path)
+                    output_path=str(captions_path),
+                    language=_locale_to_whisper_lang(req.language),
                )
-                print(f"[Pipeline] Whisper alignment completed")
+                print(f"[Pipeline] Whisper alignment completed (multi-material)")
            except Exception as e:
-                logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
+                logger.warning(f"Whisper alignment failed: {e}")
+                captions_data = None
                captions_path = None

+            _update_task(task_id, progress=15, message="正在分配素材...")
+
+            if captions_data and captions_data.get("segments"):
+                assignments = _split_equal(captions_data["segments"], material_paths)
+            else:
+                # Whisper 失败 → 按时长均分（不依赖字符对齐）
+                logger.warning("[MultiMat] Whisper 无数据，按时长均分")
+                audio_dur = video._get_duration(str(audio_path))
+                if audio_dur <= 0:
+                    audio_dur = 30.0  # 安全兜底
+                seg_dur = audio_dur / len(material_paths)
+                assignments = [
+                    {"material_path": material_paths[i], "start": i * seg_dur,
+                     "end": (i + 1) * seg_dur, "index": i}
+                    for i in range(len(material_paths))
+                ]
+
+            # 扩展段覆盖完整音频范围：首段从0开始，末段到音频结尾
+            audio_duration = video._get_duration(str(audio_path))
+            if assignments and audio_duration > 0:
+                assignments[0]["start"] = 0.0
+                assignments[-1]["end"] = audio_duration
+
+            num_segments = len(assignments)
+            print(f"[Pipeline] Multi-material: {num_segments} segments, {len(material_paths)} materials")
+
+            if num_segments == 0:
+                raise RuntimeError("Multi-material: no valid segments after splitting")
+
+            lipsync_start = time.time()
+
+            # ── 第一步：下载所有素材并检测分辨率 ──
+            material_locals: List[Path] = []
+            resolutions = []
+
+            for i, assignment in enumerate(assignments):
+                material_local = temp_dir / f"{task_id}_material_{i}.mp4"
+                temp_files.append(material_local)
+                await _download_material(assignment["material_path"], material_local)
+                material_locals.append(material_local)
+                resolutions.append(video.get_resolution(str(material_local)))
+
+            # 分辨率不一致时，统一到第一个素材的分辨率
+            base_res = resolutions[0] if resolutions else (0, 0)
+            need_scale = any(r != base_res for r in resolutions) and base_res[0] > 0
+            if need_scale:
+                logger.info(f"[MultiMat] 素材分辨率不一致，统一到 {base_res[0]}x{base_res[1]}")
+
+            # ── 第二步：裁剪每段素材到对应时长 ──
+            prepared_segments: List[Path] = []
+
+            for i, assignment in enumerate(assignments):
+                seg_progress = 15 + int((i / num_segments) * 30)  # 15% → 45%
+                seg_dur = assignment["end"] - assignment["start"]
+                _update_task(
+                    task_id,
+                    progress=seg_progress,
+                    message=f"正在准备素材 {i+1}/{num_segments}..."
+                )
+
+                prepared_path = temp_dir / f"{task_id}_prepared_{i}.mp4"
+                temp_files.append(prepared_path)
+                video.prepare_segment(
+                    str(material_locals[i]), seg_dur, str(prepared_path),
+                    target_resolution=base_res if need_scale else None
+                )
+                prepared_segments.append(prepared_path)
+
+            # ── 第二步：拼接所有素材片段 ──
+            _update_task(task_id, progress=50, message="正在拼接素材片段...")
+            concat_path = temp_dir / f"{task_id}_concat.mp4"
+            temp_files.append(concat_path)
+            video.concat_videos(
+                [str(p) for p in prepared_segments],
+                str(concat_path)
+            )
+
+            # ── 第三步：一次 LatentSync 推理 ──
+            is_ready = await _check_lipsync_ready()
+
+            if is_ready:
+                _update_task(task_id, progress=55, message="正在合成唇形 (LatentSync)...")
+                print(f"[LipSync] Multi-material: single LatentSync on concatenated video")
+                try:
+                    await lipsync.generate(str(concat_path), str(audio_path), str(lipsync_video_path))
+                except Exception as e:
+                    logger.warning(f"[LipSync] Failed, fallback to concat without lipsync: {e}")
+                    import shutil
+                    shutil.copy(str(concat_path), str(lipsync_video_path))
+            else:
+                print(f"[LipSync] Not ready, using concatenated video without lipsync")
+                import shutil
+                shutil.copy(str(concat_path), str(lipsync_video_path))
+
+            lipsync_time = time.time() - lipsync_start
+            print(f"[Pipeline] Multi-material prepare + concat + LipSync completed in {lipsync_time:.1f}s")
+            _update_task(task_id, progress=80)
+
+            # 如果用户关闭了字幕，清除 captions_path（Whisper 仅用于句子切分）
+            if not req.enable_subtitles:
+                captions_path = None
+
+        else:
+            # ══════════════════════════════════════
+            # 单素材流水线（原有逻辑）
+            # ══════════════════════════════════════
+            _update_task(task_id, progress=25)
+            _update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)
+
+            lipsync_start = time.time()
+            is_ready = await _check_lipsync_ready()
+
+            if is_ready:
+                print(f"[LipSync] Starting LatentSync inference...")
+                _update_task(task_id, progress=35, message="正在运行 LatentSync 推理...")
+                await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path))
+            else:
+                print(f"[LipSync] LatentSync not ready, copying original video")
+                _update_task(task_id, message="唇形同步不可用，使用原始视频...")
+                import shutil
+                shutil.copy(str(input_material_path), lipsync_video_path)
+
+            lipsync_time = time.time() - lipsync_start
+            print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
+            _update_task(task_id, progress=80)
+
+            # 单素材模式：Whisper 在 LatentSync 之后
+            if req.enable_subtitles:
+                _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)
+
+                captions_path = temp_dir / f"{task_id}_captions.json"
+                temp_files.append(captions_path)
+
+                try:
+                    await whisper_service.align(
+                        audio_path=str(audio_path),
+                        text=req.text,
+                        output_path=str(captions_path),
+                        language=_locale_to_whisper_lang(req.language),
+                    )
+                    print(f"[Pipeline] Whisper alignment completed")
+                except Exception as e:
+                    logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
+                    captions_path = None
+
        _update_task(task_id, progress=85)

-        video = VideoService()
        final_audio_path = audio_path
        if req.bgm_id:
            _update_task(task_id, message="正在合成背景音乐...", progress=86)