更新

2026-02-27 16:11:34 +08:00
parent a1604979f0
commit 0e3502c6f0
113 changed files with 115723 additions and 490 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -57,7 +57,17 @@ class Settings(BaseSettings):
    LATENTSYNC_ENABLE_DEEPCACHE: bool = True        # 启用 DeepCache 加速
    LATENTSYNC_SEED: int = 1247                     # 随机种子 (-1 则随机)
    LATENTSYNC_USE_SERVER: bool = True              # 使用常驻服务 (Persistent Server) 加速
-    
+
+    # MuseTalk 配置
+    MUSETALK_GPU_ID: int = 0                        # GPU ID (默认使用 GPU0)
+    MUSETALK_API_URL: str = "http://localhost:8011"  # 常驻服务地址
+    MUSETALK_BATCH_SIZE: int = 8                    # 推理批大小
+    MUSETALK_VERSION: str = "v15"                   # 模型版本
+    MUSETALK_USE_FLOAT16: bool = True               # 半精度加速
+
+    # 混合唇形同步路由
+    LIPSYNC_DURATION_THRESHOLD: float = 120.0       # 秒，>=此值用 MuseTalk
+
    # Supabase 配置
    SUPABASE_URL: str = ""
    SUPABASE_PUBLIC_URL: str = ""  # 公网访问地址，用于生成前端可访问的 URL
@@ -93,6 +103,11 @@ class Settings(BaseSettings):
        """LatentSync 目录路径 (动态计算)"""
        return self.BASE_DIR.parent.parent / "models" / "LatentSync"

+    @property
+    def MUSETALK_DIR(self) -> Path:
+        """MuseTalk directory path (computed dynamically from BASE_DIR)."""
+        return self.BASE_DIR.parent.parent / "models" / "MuseTalk"
+
    class Config:
        env_file = ".env"
        extra = "ignore"  # 忽略未知的环境变量
--- a/backend/app/modules/ai/router.py
+++ b/backend/app/modules/ai/router.py
@@ -2,6 +2,8 @@
 AI 相关 API 路由
 """

+from typing import Optional
+
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 from loguru import logger
@@ -25,6 +27,12 @@ class GenerateMetaResponse(BaseModel):
    tags: list[str]


+class RewriteRequest(BaseModel):
+    """Rewrite request payload."""
+    text: str  # script text to rewrite
+    custom_prompt: Optional[str] = None  # optional custom prompt; None when not supplied
+
+
 class TranslateRequest(BaseModel):
    """翻译请求"""
    text: str
@@ -73,3 +81,18 @@ async def generate_meta(req: GenerateMetaRequest):
    except Exception as e:
        logger.error(f"Generate meta failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/rewrite")
+async def rewrite_script(req: RewriteRequest):
+    """Rewrite the submitted script text with AI (delegates to glm_service.rewrite_script)."""
+    if not req.text or not req.text.strip():  # reject empty or whitespace-only text with HTTP 400
+        raise HTTPException(status_code=400, detail="文案不能为空")
+
+    try:
+        logger.info(f"Rewriting text: {req.text[:50]}...")  # only the first 50 characters are logged
+        rewritten = await glm_service.rewrite_script(req.text.strip(), req.custom_prompt)
+        return success_response({"rewritten_text": rewritten})
+    except Exception as e:
+        logger.error(f"Rewrite failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))  # any failure is surfaced as HTTP 500
--- a/backend/app/modules/tools/service.py
+++ b/backend/app/modules/tools/service.py
@@ -63,11 +63,15 @@ async def extract_script(file=None, url: Optional[str] = None, rewrite: bool = T
        # 2. 提取文案 (Whisper)
        script = await whisper_service.transcribe(str(audio_path))

-        # 3. AI 改写 (GLM)
+        # 3. AI 改写 (GLM) — 失败时降级返回原文
        rewritten = None
        if rewrite and script and len(script.strip()) > 0:
            logger.info("Rewriting script...")
-            rewritten = await glm_service.rewrite_script(script, custom_prompt)
+            try:
+                rewritten = await glm_service.rewrite_script(script, custom_prompt)
+            except Exception as e:
+                logger.warning(f"GLM rewrite failed, returning original script: {e}")
+                rewritten = None

        return {
            "original_script": script,
--- a/backend/app/modules/videos/workflow.py
+++ b/backend/app/modules/videos/workflow.py
@@ -1,5 +1,6 @@
 from typing import Optional, Any, List
 from pathlib import Path
+import asyncio
 import time
 import traceback
 import httpx
@@ -415,18 +416,21 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:

            lipsync_start = time.time()

-            # ── 第一步：下载所有素材并检测分辨率 ──
+            # ── 第一步：并行下载所有素材并检测分辨率 ──
            material_locals: List[Path] = []
            resolutions = []

-            for i, assignment in enumerate(assignments):
+            async def _download_and_normalize(i: int, assignment: dict):
+                """下载单个素材并归一化方向"""
                material_local = temp_dir / f"{task_id}_material_{i}.mp4"
                temp_files.append(material_local)
                await _download_material(assignment["material_path"], material_local)

-                # 归一化旋转元数据，确保分辨率判断与后续推理一致
                normalized_material = temp_dir / f"{task_id}_material_{i}_norm.mp4"
-                normalized_result = video.normalize_orientation(
+                loop = asyncio.get_event_loop()
+                normalized_result = await loop.run_in_executor(
+                    None,
+                    video.normalize_orientation,
                    str(material_local),
                    str(normalized_material),
                )
@@ -434,8 +438,17 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
                    temp_files.append(normalized_material)
                    material_local = normalized_material

-                material_locals.append(material_local)
-                resolutions.append(video.get_resolution(str(material_local)))
+                res = video.get_resolution(str(material_local))
+                return material_local, res
+
+            download_tasks = [
+                _download_and_normalize(i, assignment)
+                for i, assignment in enumerate(assignments)
+            ]
+            download_results = await asyncio.gather(*download_tasks)
+            for local, res in download_results:
+                material_locals.append(local)
+                resolutions.append(res)

            # 按用户选择的画面比例统一分辨率
            base_res = target_resolution
@@ -443,29 +456,42 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
            if need_scale:
                logger.info(f"[MultiMat] 素材分辨率不一致，统一到 {base_res[0]}x{base_res[1]}")

-            # ── 第二步：裁剪每段素材到对应时长 ──
-            prepared_segments: List[Path] = []
+            # ── 第二步：并行裁剪每段素材到对应时长 ──
+            prepared_segments: List[Path] = [None] * num_segments

-            for i, assignment in enumerate(assignments):
-                seg_progress = 15 + int((i / num_segments) * 30)  # 15% → 45%
+            async def _prepare_one_segment(i: int, assignment: dict):
+                """将单个素材裁剪/循环到对应时长"""
                seg_dur = assignment["end"] - assignment["start"]
-                _update_task(
-                    task_id,
-                    progress=seg_progress,
-                    message=f"正在准备素材 {i+1}/{num_segments}..."
-                )
-
                prepared_path = temp_dir / f"{task_id}_prepared_{i}.mp4"
                temp_files.append(prepared_path)
-                video.prepare_segment(
-                    str(material_locals[i]), seg_dur, str(prepared_path),
-                    # 多素材拼接前统一重编码为同分辨率/同编码，避免 concat 仅保留首段
-                    target_resolution=base_res,
-                    source_start=assignment.get("source_start", 0.0),
-                    source_end=assignment.get("source_end"),
-                    target_fps=25,
+
+                loop = asyncio.get_event_loop()
+                await loop.run_in_executor(
+                    None,
+                    video.prepare_segment,
+                    str(material_locals[i]),
+                    seg_dur,
+                    str(prepared_path),
+                    base_res,
+                    assignment.get("source_start", 0.0),
+                    assignment.get("source_end"),
+                    25,
                )
-                prepared_segments.append(prepared_path)
+                return i, prepared_path
+
+            _update_task(
+                task_id,
+                progress=15,
+                message=f"正在并行准备 {num_segments} 个素材片段..."
+            )
+
+            prepare_tasks = [
+                _prepare_one_segment(i, assignment)
+                for i, assignment in enumerate(assignments)
+            ]
+            prepare_results = await asyncio.gather(*prepare_tasks)
+            for i, path in prepare_results:
+                prepared_segments[i] = path

            # ── 第二步：拼接所有素材片段 ──
            _update_task(task_id, progress=50, message="正在拼接素材片段...")
@@ -553,51 +579,89 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
            print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
            _update_task(task_id, progress=80)

-            # 单素材模式：Whisper 在 LatentSync 之后
-            if req.enable_subtitles:
+            # 单素材模式：Whisper 延迟到下方与 BGM 并行执行
+            if not req.enable_subtitles:
+                captions_path = None
+
+        _update_task(task_id, progress=85)
+
+        # ── Whisper 字幕 + BGM 混音 并行（两者都只依赖 audio_path）──
+        final_audio_path = audio_path
+        _whisper_task = None
+        _bgm_task = None
+
+        # 单素材模式下 Whisper 尚未执行，这里与 BGM 并行启动
+        need_whisper = not is_multi and req.enable_subtitles and captions_path is None
+        if need_whisper:
+            captions_path = temp_dir / f"{task_id}_captions.json"
+            temp_files.append(captions_path)
+            _captions_path_str = str(captions_path)
+
+            async def _run_whisper():
                _update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)
-
-                captions_path = temp_dir / f"{task_id}_captions.json"
-                temp_files.append(captions_path)
-
                try:
                    await whisper_service.align(
                        audio_path=str(audio_path),
                        text=req.text,
-                        output_path=str(captions_path),
+                        output_path=_captions_path_str,
                        language=_locale_to_whisper_lang(req.language),
                        original_text=req.text,
                    )
                    print(f"[Pipeline] Whisper alignment completed")
+                    return True
                except Exception as e:
                    logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
-                    captions_path = None
+                    return False

-        _update_task(task_id, progress=85)
+            _whisper_task = _run_whisper()

-        final_audio_path = audio_path
        if req.bgm_id:
-            _update_task(task_id, message="正在合成背景音乐...", progress=86)
-
            bgm_path = resolve_bgm_path(req.bgm_id)
            if bgm_path:
                mix_output_path = temp_dir / f"{task_id}_audio_mix.wav"
                temp_files.append(mix_output_path)
                volume = req.bgm_volume if req.bgm_volume is not None else 0.2
                volume = max(0.0, min(float(volume), 1.0))
-                try:
-                    video.mix_audio(
-                        voice_path=str(audio_path),
-                        bgm_path=str(bgm_path),
-                        output_path=str(mix_output_path),
-                        bgm_volume=volume
-                    )
-                    final_audio_path = mix_output_path
-                except Exception as e:
-                    logger.warning(f"BGM mix failed, fallback to voice only: {e}")
+                _mix_output = str(mix_output_path)
+                _bgm_path = str(bgm_path)
+                _voice_path = str(audio_path)
+                _volume = volume
+
+                async def _run_bgm():  # returns True when the mix succeeded, False otherwise
+                    _update_task(task_id, message="正在合成背景音乐...", progress=86)
+                    loop = asyncio.get_running_loop()  # preferred over get_event_loop() inside a coroutine
+                    try:
+                        await loop.run_in_executor(  # mix_audio is synchronous; run it in the default executor
+                            None,
+                            video.mix_audio,
+                            _voice_path,
+                            _bgm_path,
+                            _mix_output,
+                            _volume,
+                        )
+                        return True
+                    except Exception as e:  # best effort: a failed mix must not abort the pipeline
+                        logger.warning(f"BGM mix failed, fallback to voice only: {e}")
+                        return False
+
+                _bgm_task = _run_bgm()
            else:
                logger.warning(f"BGM not found: {req.bgm_id}")

+        # 并行等待 Whisper + BGM
+        parallel_tasks = [t for t in (_whisper_task, _bgm_task) if t is not None]
+        if parallel_tasks:
+            results = await asyncio.gather(*parallel_tasks)
+            result_idx = 0
+            if _whisper_task is not None:
+                if not results[result_idx]:
+                    captions_path = None
+                result_idx += 1
+            if _bgm_task is not None:
+                if results[result_idx]:
+                    final_audio_path = mix_output_path
+
+
        use_remotion = (captions_path and captions_path.exists()) or req.title or req.secondary_title

        subtitle_style = None
--- a/backend/app/services/lipsync_service.py
+++ b/backend/app/services/lipsync_service.py
@@ -1,7 +1,7 @@
 """
 唇形同步服务
-通过 subprocess 调用 LatentSync conda 环境进行推理
-配置为使用 GPU1 (CUDA:1)
+混合方案: 短视频用 LatentSync (高质量), 长视频用 MuseTalk (高速度)
+路由阈值: LIPSYNC_DURATION_THRESHOLD (默认 120s)
 """
 import os
 import shutil
@@ -17,15 +17,18 @@ from app.core.config import settings


 class LipSyncService:
-    """唇形同步服务 - LatentSync 1.6 集成 (Subprocess 方式)"""
-    
+    """唇形同步服务 - LatentSync 1.6 + MuseTalk 1.5 混合方案"""
+
    def __init__(self):
        self.use_local = settings.LATENTSYNC_LOCAL
        self.api_url = settings.LATENTSYNC_API_URL
        self.latentsync_dir = settings.LATENTSYNC_DIR
        self.gpu_id = settings.LATENTSYNC_GPU_ID
        self.use_server = settings.LATENTSYNC_USE_SERVER
-        
+
+        # MuseTalk 配置
+        self.musetalk_api_url = settings.MUSETALK_API_URL
+
        # GPU 并发锁 (Serial Queue)
        self._lock = asyncio.Lock()
        
@@ -103,7 +106,7 @@ class LipSyncService:
                "-t", str(target_duration),  # 截取到目标时长
                "-c:v", "libx264",
                "-preset", "fast",
-                "-crf", "18",
+                "-crf", "23",
                "-an",  # 去掉原音频
                output_path
            ]
@@ -268,6 +271,18 @@ class LipSyncService:
                else:
                    actual_video_path = video_path

+                # 混合路由: 长视频走 MuseTalk，短视频走 LatentSync
+                if audio_duration and audio_duration >= settings.LIPSYNC_DURATION_THRESHOLD:
+                    logger.info(
+                        f"🔄 音频 {audio_duration:.1f}s >= {settings.LIPSYNC_DURATION_THRESHOLD}s，路由到 MuseTalk"
+                    )
+                    musetalk_result = await self._call_musetalk_server(
+                        actual_video_path, audio_path, output_path
+                    )
+                    if musetalk_result:
+                        return musetalk_result
+                    logger.warning("⚠️ MuseTalk 不可用，回退到 LatentSync（长视频，会较慢）")
+
                if self.use_server:
                    # 模式 A: 调用常驻服务 (加速模式)
                    return await self._call_persistent_server(actual_video_path, audio_path, output_path)
@@ -352,6 +367,55 @@ class LipSyncService:
                    shutil.copy(video_path, output_path)
                    return output_path
    
+    async def _call_musetalk_server(
+        self, video_path: str, audio_path: str, output_path: str
+    ) -> Optional[str]:
+        """
+        Call the resident MuseTalk service.
+        Returns output_path on success, or None when the service is unavailable or fails (signals the caller to fall back to LatentSync).
+        """
+        server_url = self.musetalk_api_url
+        logger.info(f"⚡ 调用 MuseTalk 服务: {server_url}")
+
+        try:
+            async with httpx.AsyncClient(timeout=3600.0) as client:
+                # Health check: require HTTP 200 and model_loaded before sending work
+                try:
+                    resp = await client.get(f"{server_url}/health", timeout=5.0)
+                    if resp.status_code != 200:
+                        logger.warning("⚠️ MuseTalk 健康检查失败")
+                        return None
+                    health = resp.json()
+                    if not health.get("model_loaded"):
+                        logger.warning("⚠️ MuseTalk 模型未加载")
+                        return None
+                except Exception:  # any error during the health check is treated as "service unavailable"
+                    logger.warning("⚠️ 无法连接 MuseTalk 服务")
+                    return None
+
+                # Send the inference request; all paths are resolved to absolute paths
+                payload = {
+                    "video_path": str(Path(video_path).resolve()),
+                    "audio_path": str(Path(audio_path).resolve()),
+                    "video_out_path": str(Path(output_path).resolve()),
+                    "batch_size": settings.MUSETALK_BATCH_SIZE,
+                }
+
+                response = await client.post(f"{server_url}/lipsync", json=payload)
+
+                if response.status_code == 200:
+                    result = response.json()
+                    if Path(result["output_path"]).exists():  # succeed only if the reported output file exists
+                        logger.info(f"✅ MuseTalk 推理完成: {output_path}")
+                        return output_path
+
+                logger.error(f"❌ MuseTalk 服务报错: {response.text}")  # non-200, or 200 with a missing output file
+                return None
+
+        except Exception as e:  # request/parsing errors also map to None
+            logger.error(f"❌ MuseTalk 调用失败: {e}")
+            return None
+
    async def _call_persistent_server(self, video_path: str, audio_path: str, output_path: str) -> str:
        """调用本地常驻服务 (server.py)"""
        server_url = "http://localhost:8007"
@@ -477,8 +541,18 @@ class LipSyncService:
            except:
                pass
        
+        # 检查 MuseTalk 服务
+        musetalk_ready = False
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                resp = await client.get(f"{self.musetalk_api_url}/health")
+                if resp.status_code == 200:
+                    musetalk_ready = resp.json().get("model_loaded", False)
+        except Exception:
+            pass
+
        return {
-            "model": "LatentSync 1.6",
+            "model": "LatentSync 1.6 + MuseTalk 1.5",
            "conda_env": conda_ok,
            "weights": weights_ok,
            "gpu": gpu_ok,
@@ -486,5 +560,7 @@ class LipSyncService:
            "gpu_id": self.gpu_id,
            "inference_steps": settings.LATENTSYNC_INFERENCE_STEPS,
            "guidance_scale": settings.LATENTSYNC_GUIDANCE_SCALE,
-            "ready": conda_ok and weights_ok and gpu_ok
+            "ready": conda_ok and weights_ok and gpu_ok,
+            "musetalk_ready": musetalk_ready,
+            "lipsync_threshold": settings.LIPSYNC_DURATION_THRESHOLD,
        }
--- a/backend/app/services/video_service.py
+++ b/backend/app/services/video_service.py
@@ -1,14 +1,14 @@
-"""
-视频合成服务
-"""
-import os
-import subprocess
-import json
-import shlex
-from pathlib import Path
-from loguru import logger
-from typing import Optional
-
+"""
+视频合成服务
+"""
+import os
+import subprocess
+import json
+import shlex
+from pathlib import Path
+from loguru import logger
+from typing import Optional
+
 class VideoService:
    def __init__(self):
        pass
@@ -96,7 +96,7 @@ class VideoService:
            "-map", "0:a?",
            "-c:v", "libx264",
            "-preset", "fast",
-            "-crf", "18",
+            "-crf", "23",
            "-c:a", "copy",
            "-movflags", "+faststart",
            output_path,
@@ -113,146 +113,146 @@ class VideoService:

        logger.warning("视频方向归一化失败，回退使用原视频")
        return video_path
-
-    def _run_ffmpeg(self, cmd: list) -> bool:
-        cmd_str = ' '.join(shlex.quote(str(c)) for c in cmd)
-        logger.debug(f"FFmpeg CMD: {cmd_str}")
-        try:
-            # Synchronous call for BackgroundTasks compatibility
-            result = subprocess.run(
-                cmd,
-                shell=False,
-                capture_output=True,
-                text=True,
-                encoding='utf-8',
-            )
-            if result.returncode != 0:
-                logger.error(f"FFmpeg Error: {result.stderr}")
-                return False
-            return True
-        except Exception as e:
-            logger.error(f"FFmpeg Exception: {e}")
-            return False
-
-    def _get_duration(self, file_path: str) -> float:
-        # Synchronous call for BackgroundTasks compatibility
-        # 使用参数列表形式避免 shell=True 的命令注入风险
-        cmd = [
-            'ffprobe', '-v', 'error',
-            '-show_entries', 'format=duration',
-            '-of', 'default=noprint_wrappers=1:nokey=1',
-            file_path
-        ]
-        try:
-            result = subprocess.run(
-                cmd,
-                capture_output=True,
-                text=True,
-            )
-            return float(result.stdout.strip())
-        except Exception:
-            return 0.0
-
-    def mix_audio(
-        self,
-        voice_path: str,
-        bgm_path: str,
-        output_path: str,
-        bgm_volume: float = 0.2
-    ) -> str:
-        """混合人声与背景音乐"""
-        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
-        volume = max(0.0, min(float(bgm_volume), 1.0))
-        filter_complex = (
-            f"[0:a]volume=1.0[a0];"
-            f"[1:a]volume={volume}[a1];"
-            f"[a0][a1]amix=inputs=2:duration=first:dropout_transition=2:normalize=0[aout]"
-        )
-
-        cmd = [
-            "ffmpeg", "-y",
-            "-i", voice_path,
-            "-stream_loop", "-1", "-i", bgm_path,
-            "-filter_complex", filter_complex,
-            "-map", "[aout]",
-            "-c:a", "pcm_s16le",
-            "-shortest",
-            output_path,
-        ]
-
-        if self._run_ffmpeg(cmd):
-            return output_path
-        raise RuntimeError("FFmpeg audio mix failed")
-
-    async def compose(
-        self,
-        video_path: str,
-        audio_path: str,
-        output_path: str,
-        subtitle_path: Optional[str] = None
-    ) -> str:
-        """合成视频"""
-        # Ensure output dir
-        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-        
-        video_duration = self._get_duration(video_path)
-        audio_duration = self._get_duration(audio_path)
-        
-        # Audio loop if needed
-        loop_count = 1
-        if audio_duration > video_duration and video_duration > 0:
-            loop_count = int(audio_duration / video_duration) + 1
-            
-        cmd = ["ffmpeg", "-y"]
-        
-        # Input video (stream_loop must be before -i)
-        if loop_count > 1:
-            cmd.extend(["-stream_loop", str(loop_count)])
-        cmd.extend(["-i", video_path])
-        
-        # Input audio
-        cmd.extend(["-i", audio_path])
-        
-        # Filter complex
-        filter_complex = []
-        
-        # Subtitles (skip for now to mimic previous state or implement basic)
-        # Previous state: subtitles disabled due to font issues
-        # if subtitle_path: ...
-        
-        # Audio map with high quality encoding
-        cmd.extend([
-            "-c:v", "libx264",
-            "-preset", "slow",      # 慢速预设，更好的压缩效率
-            "-crf", "18",           # 高质量（与 LatentSync 一致）
-            "-c:a", "aac",
-            "-b:a", "192k",         # 音频比特率
-            "-shortest"
-        ])
-        # Use audio from input 1
-        cmd.extend(["-map", "0:v", "-map", "1:a"])
-        
-        cmd.append(output_path)
-        
-        if self._run_ffmpeg(cmd):
-            return output_path
-        else:
-            raise RuntimeError("FFmpeg composition failed")
-
+
+    def _run_ffmpeg(self, cmd: list) -> bool:  # True on exit code 0; False on non-zero exit or any exception
+        cmd_str = ' '.join(shlex.quote(str(c)) for c in cmd)  # shell-quoted copy used only for the debug log
+        logger.debug(f"FFmpeg CMD: {cmd_str}")
+        try:
+            # Synchronous call for BackgroundTasks compatibility
+            result = subprocess.run(
+                cmd,
+                shell=False,
+                capture_output=True,
+                text=True,
+                encoding='utf-8',
+            )
+            if result.returncode != 0:
+                logger.error(f"FFmpeg Error: {result.stderr}")
+                return False
+            return True
+        except Exception as e:  # failures are logged and reported as False rather than raised
+            logger.error(f"FFmpeg Exception: {e}")
+            return False
+
+    def _get_duration(self, file_path: str) -> float:
+        # Synchronous call for BackgroundTasks compatibility
+        # Use an argument list to avoid the command-injection risk of shell=True
+        cmd = [
+            'ffprobe', '-v', 'error',
+            '-show_entries', 'format=duration',
+            '-of', 'default=noprint_wrappers=1:nokey=1',
+            file_path
+        ]
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+            )
+            return float(result.stdout.strip())
+        except Exception:
+            return 0.0  # any failure (e.g. ffprobe cannot run, output is not a number) yields 0.0
+
+    def mix_audio(
+        self,
+        voice_path: str,
+        bgm_path: str,
+        output_path: str,
+        bgm_volume: float = 0.2
+    ) -> str:
+        """Mix the voice track with background music; returns output_path, raises RuntimeError if FFmpeg fails."""
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+        volume = max(0.0, min(float(bgm_volume), 1.0))  # clamp the BGM volume to [0.0, 1.0]
+        filter_complex = (
+            f"[0:a]volume=1.0[a0];"
+            f"[1:a]volume={volume}[a1];"
+            f"[a0][a1]amix=inputs=2:duration=first:dropout_transition=2:normalize=0[aout]"
+        )
+
+        cmd = [
+            "ffmpeg", "-y",
+            "-i", voice_path,
+            "-stream_loop", "-1", "-i", bgm_path,  # input 1 (BGM) is looped; input 0 is the voice
+            "-filter_complex", filter_complex,
+            "-map", "[aout]",
+            "-c:a", "pcm_s16le",
+            "-shortest",
+            output_path,
+        ]
+
+        if self._run_ffmpeg(cmd):
+            return output_path
+        raise RuntimeError("FFmpeg audio mix failed")
+
+    async def compose(
+        self,
+        video_path: str,
+        audio_path: str,
+        output_path: str,
+        subtitle_path: Optional[str] = None
+    ) -> str:
+        """Compose the output video: video stream from video_path, audio from audio_path; raises RuntimeError on failure."""
+        # Ensure output dir
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        
+        video_duration = self._get_duration(video_path)
+        audio_duration = self._get_duration(audio_path)
+        
+        # Loop the video input when the audio is longer than the video
+        loop_count = 1
+        if audio_duration > video_duration and video_duration > 0:
+            loop_count = int(audio_duration / video_duration) + 1
+            
+        cmd = ["ffmpeg", "-y"]
+        
+        # Input video (stream_loop must be before -i)
+        if loop_count > 1:
+            cmd.extend(["-stream_loop", str(loop_count)])
+        cmd.extend(["-i", video_path])
+        
+        # Input audio
+        cmd.extend(["-i", audio_path])
+        
+        # Filter complex (NOTE: this list is never added to cmd)
+        filter_complex = []
+        
+        # Subtitles (skip for now to mimic previous state or implement basic)
+        # Previous state: subtitles disabled due to font issues
+        # if subtitle_path: ... (subtitle_path is currently unused)
+        
+        # Audio map with high quality encoding
+        cmd.extend([
+            "-c:v", "libx264",
+            "-preset", "medium",    # balance speed and compression efficiency
+            "-crf", "20",           # final output: high quality (visually lossless)
+            "-c:a", "aac",
+            "-b:a", "192k",         # audio bitrate
+            "-shortest"
+        ])
+        # Use audio from input 1
+        cmd.extend(["-map", "0:v", "-map", "1:a"])
+        
+        cmd.append(output_path)
+        
+        if self._run_ffmpeg(cmd):
+            return output_path
+        else:
+            raise RuntimeError("FFmpeg composition failed")
+
    def concat_videos(self, video_paths: list, output_path: str, target_fps: int = 25) -> str:
        """使用 FFmpeg concat demuxer 拼接多个视频片段"""
-        if not video_paths:
-            raise ValueError("No video segments to concat")
-
-        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
-        # 生成 concat list 文件
-        list_path = Path(output_path).parent / f"{Path(output_path).stem}_concat.txt"
-        with open(list_path, "w", encoding="utf-8") as f:
-            for vp in video_paths:
-                f.write(f"file '{vp}'\n")
-
+        if not video_paths:
+            raise ValueError("No video segments to concat")
+
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+        # 生成 concat list 文件
+        list_path = Path(output_path).parent / f"{Path(output_path).stem}_concat.txt"
+        with open(list_path, "w", encoding="utf-8") as f:
+            for vp in video_paths:
+                f.write(f"file '{vp}'\n")
+
        cmd = [
            "ffmpeg", "-y",
            "-f", "concat",
@@ -264,44 +264,44 @@ class VideoService:
            "-r", str(target_fps),
            "-c:v", "libx264",
            "-preset", "fast",
-            "-crf", "18",
+            "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-movflags", "+faststart",
            output_path,
        ]
-
-        try:
-            if self._run_ffmpeg(cmd):
-                return output_path
-            else:
-                raise RuntimeError("FFmpeg concat failed")
-        finally:
-            try:
-                list_path.unlink(missing_ok=True)
-            except Exception:
-                pass
-
-    def split_audio(self, audio_path: str, start: float, end: float, output_path: str) -> str:
-        """用 FFmpeg 按时间范围切分音频"""
-        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
-        duration = end - start
-        if duration <= 0:
-            raise ValueError(f"Invalid audio split range: start={start}, end={end}, duration={duration}")
-
-        cmd = [
-            "ffmpeg", "-y",
-            "-ss", str(start),
-            "-t", str(duration),
-            "-i", audio_path,
-            "-c", "copy",
-            output_path,
-        ]
-
-        if self._run_ffmpeg(cmd):
-            return output_path
-        raise RuntimeError(f"FFmpeg audio split failed: {start}-{end}")
-
+
+        try:
+            if self._run_ffmpeg(cmd):
+                return output_path
+            else:
+                raise RuntimeError("FFmpeg concat failed")
+        finally:
+            try:
+                list_path.unlink(missing_ok=True)
+            except Exception:
+                pass
+
+    def split_audio(self, audio_path: str, start: float, end: float, output_path: str) -> str:
+        """Cut the start-end time range out of an audio file with FFmpeg (stream copy); returns output_path."""
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+        duration = end - start
+        if duration <= 0:  # reject empty or reversed ranges before invoking FFmpeg
+            raise ValueError(f"Invalid audio split range: start={start}, end={end}, duration={duration}")
+
+        cmd = [
+            "ffmpeg", "-y",
+            "-ss", str(start),
+            "-t", str(duration),
+            "-i", audio_path,
+            "-c", "copy",
+            output_path,
+        ]
+
+        if self._run_ffmpeg(cmd):
+            return output_path
+        raise RuntimeError(f"FFmpeg audio split failed: {start}-{end}")
+
    def get_resolution(self, file_path: str) -> tuple[int, int]:
        """获取视频有效显示分辨率（考虑旋转元数据）。"""
        info = self.get_video_metadata(file_path)
@@ -309,7 +309,7 @@ class VideoService:
            int(info.get("effective_width") or 0),
            int(info.get("effective_height") or 0),
        )
-
+
    def prepare_segment(self, video_path: str, target_duration: float, output_path: str,
                        target_resolution: Optional[tuple] = None, source_start: float = 0.0,
                        source_end: Optional[float] = None, target_fps: Optional[int] = None) -> str:
@@ -353,21 +353,21 @@ class VideoService:
                "-i", video_path,
                "-t", str(available),
                "-an",
-                "-c:v", "libx264", "-preset", "fast", "-crf", "18",
+                "-c:v", "libx264", "-preset", "fast", "-crf", "23",
                trim_temp,
            ]
-            if not self._run_ffmpeg(trim_cmd):
-                raise RuntimeError(f"FFmpeg trim for loop failed: {video_path}")
-            actual_input = trim_temp
-            source_start = 0.0  # 已裁剪，不需要再 seek
-            # 重新计算循环次数（基于裁剪后文件）
-            available = self._get_duration(trim_temp) or available
-
-        loop_count = int(target_duration / available) + 1 if needs_loop else 0
-
-        cmd = ["ffmpeg", "-y"]
-        if needs_loop:
-            cmd.extend(["-stream_loop", str(loop_count)])
+            if not self._run_ffmpeg(trim_cmd):
+                raise RuntimeError(f"FFmpeg trim for loop failed: {video_path}")
+            actual_input = trim_temp
+            source_start = 0.0  # 已裁剪，不需要再 seek
+            # 重新计算循环次数（基于裁剪后文件）
+            available = self._get_duration(trim_temp) or available
+
+        loop_count = int(target_duration / available) + 1 if needs_loop else 0
+
+        cmd = ["ffmpeg", "-y"]
+        if needs_loop:
+            cmd.extend(["-stream_loop", str(loop_count)])
        if source_start > 0:
            cmd.extend(["-ss", str(source_start)])
        cmd.extend(["-i", actual_input, "-t", str(target_duration), "-an"])
@@ -386,20 +386,20 @@ class VideoService:

        # 需要循环、缩放或指定起点时必须重编码，否则用 stream copy 保持原画质
        if needs_loop or needs_scale or source_start > 0 or has_source_end or needs_fps:
-            cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18"])
+            cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "23"])
        else:
            cmd.extend(["-c:v", "copy"])
-
-        cmd.append(output_path)
-
-        try:
-            if self._run_ffmpeg(cmd):
-                return output_path
-            raise RuntimeError(f"FFmpeg prepare_segment failed: {video_path}")
-        finally:
-            # 清理裁剪临时文件
-            if trim_temp:
-                try:
-                    Path(trim_temp).unlink(missing_ok=True)
-                except Exception:
-                    pass
+
+        cmd.append(output_path)
+
+        try:
+            if self._run_ffmpeg(cmd):
+                return output_path
+            raise RuntimeError(f"FFmpeg prepare_segment failed: {video_path}")
+        finally:
+            # 清理裁剪临时文件
+            if trim_temp:
+                try:
+                    Path(trim_temp).unlink(missing_ok=True)
+                except Exception:
+                    pass