更新

2026-02-25 17:51:58 +08:00
parent 0a5a17402c
commit 1717635bfd
27 changed files with 1172 additions and 662 deletions
--- a/backend/app/services/lipsync_service.py
+++ b/backend/app/services/lipsync_service.py
@@ -369,7 +369,7 @@ class LipSyncService:
        }
        
        try:
-            async with httpx.AsyncClient(timeout=1200.0) as client:
+            async with httpx.AsyncClient(timeout=3600.0) as client:
                # 先检查健康状态
                try:
                    resp = await client.get(f"{server_url}/health", timeout=5.0)
--- a/backend/app/services/whisper_service.py
+++ b/backend/app/services/whisper_service.py
@@ -247,19 +247,67 @@ class WhisperService:
                    line_segments = split_segment_to_lines(all_words, max_chars)
                    all_segments.extend(line_segments)

-            # 如果提供了 original_text，用原文替换 Whisper 转录文字
+            # 如果提供了 original_text，用原文替换 Whisper 转录文字，保留语音节奏
            if original_text and original_text.strip() and whisper_first_start is not None:
-                logger.info(f"Using original_text for subtitles (len={len(original_text)}), "
-                            f"Whisper time range: {whisper_first_start:.2f}-{whisper_last_end:.2f}s")
-                # 用 split_word_to_chars 拆分原文
+                # 收集 Whisper 逐字时间戳（保留真实语音节奏）
+                whisper_chars = []
+                for seg in all_segments:
+                    whisper_chars.extend(seg.get("words", []))
+
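+                # Split the original text into per-character entries spanning the Whisper time range; their timestamps are used as-is only in the linear fallback and are replaced by the rhythm mapping below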
                orig_chars = split_word_to_chars(
                    original_text.strip(),
                    whisper_first_start,
                    whisper_last_end
                )
-                if orig_chars:
+
+                if orig_chars and len(whisper_chars) >= 2:
+                    # 将原文字符按比例映射到 Whisper 的时间节奏上
+                    n_w = len(whisper_chars)
+                    n_o = len(orig_chars)
+                    w_starts = [c["start"] for c in whisper_chars]
+                    w_final_end = whisper_chars[-1]["end"]
+
+                    logger.info(
+                        f"Using original_text for subtitles (len={len(original_text)}), "
+                        f"rhythm-mapping {n_o} orig chars onto {n_w} Whisper chars, "
+                        f"time range: {whisper_first_start:.2f}-{whisper_last_end:.2f}s"
+                    )
+
+                    remapped = []
+                    for i, oc in enumerate(orig_chars):
+                        # 原文第 i 个字符对应 Whisper 时间线的位置
+                        pos = (i / n_o) * n_w
+                        idx = min(int(pos), n_w - 1)
+                        frac = pos - idx
+                        t_start = (
+                            w_starts[idx] + frac * (w_starts[idx + 1] - w_starts[idx])
+                            if idx < n_w - 1
+                            else w_starts[idx] + frac * (w_final_end - w_starts[idx])
+                        )
+
+                        # 结束时间 = 下一个字符的开始时间
+                        pos_next = ((i + 1) / n_o) * n_w
+                        idx_n = min(int(pos_next), n_w - 1)
+                        frac_n = pos_next - idx_n
+                        t_end = (
+                            w_starts[idx_n] + frac_n * (w_starts[idx_n + 1] - w_starts[idx_n])
+                            if idx_n < n_w - 1
+                            else w_starts[idx_n] + frac_n * (w_final_end - w_starts[idx_n])
+                        )
+
+                        remapped.append({
+                            "word": oc["word"],
+                            "start": round(t_start, 3),
+                            "end": round(t_end, 3),
+                        })
+
+                    all_segments = split_segment_to_lines(remapped, max_chars)
+                    logger.info(f"Rebuilt {len(all_segments)} subtitle segments (rhythm-mapped)")
+                elif orig_chars:
+                    # Whisper 字符不足，退回线性插值
                    all_segments = split_segment_to_lines(orig_chars, max_chars)
-                    logger.info(f"Rebuilt {len(all_segments)} subtitle segments from original text")
+                    logger.info(f"Rebuilt {len(all_segments)} subtitle segments (linear fallback)")

            logger.info(f"Generated {len(all_segments)} subtitle segments")
            return {"segments": all_segments}