This commit is contained in:
Kevin Wong
2026-02-09 14:47:19 +08:00
parent e226224119
commit 3129d45b25
23 changed files with 1529 additions and 294 deletions

View File

@@ -24,6 +24,33 @@ class GenerateMetaResponse(BaseModel):
tags: list[str]
class TranslateRequest(BaseModel):
"""翻译请求"""
text: str
target_lang: str
@router.post("/translate")
async def translate_text(req: TranslateRequest):
"""
AI 翻译文案
将文案翻译为指定目标语言
"""
if not req.text or not req.text.strip():
raise HTTPException(status_code=400, detail="文案不能为空")
if not req.target_lang or not req.target_lang.strip():
raise HTTPException(status_code=400, detail="目标语言不能为空")
try:
logger.info(f"Translating text to {req.target_lang}: {req.text[:50]}...")
translated = await glm_service.translate_text(req.text.strip(), req.target_lang.strip())
return success_response({"translated_text": translated})
except Exception as e:
logger.error(f"Translate failed: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/generate-meta")
async def generate_meta(req: GenerateMetaRequest):
"""

View File

@@ -1,14 +1,16 @@
from pydantic import BaseModel
from typing import Optional
from typing import Optional, List
class GenerateRequest(BaseModel):
text: str
voice: str = "zh-CN-YunxiNeural"
material_path: str
material_paths: Optional[List[str]] = None
tts_mode: str = "edgetts"
ref_audio_id: Optional[str] = None
ref_text: Optional[str] = None
language: str = "zh-CN"
title: Optional[str] = None
enable_subtitles: bool = True
subtitle_style_id: Optional[str] = None

View File

@@ -1,4 +1,4 @@
from typing import Optional, Any
from typing import Optional, Any, List
from pathlib import Path
import time
import traceback
@@ -24,6 +24,17 @@ from .schemas import GenerateRequest
from .task_store import task_store
def _locale_to_whisper_lang(locale: str) -> str:
"""'en-US''en', 'zh-CN''zh'"""
return locale.split("-")[0] if "-" in locale else locale
def _locale_to_qwen_lang(locale: str) -> str:
"""'zh-CN''Chinese', 'en-US''English', 其他 → 'Auto'"""
mapping = {"zh": "Chinese", "en": "English"}
return mapping.get(locale.split("-")[0], "Auto")
_lipsync_service: Optional[LipSyncService] = None
_lipsync_ready: Optional[bool] = None
_lipsync_last_check: float = 0
@@ -79,19 +90,107 @@ def _update_task(task_id: str, **updates: Any) -> None:
task_store.update(task_id, updates)
# ── 多素材辅助函数 ──
def _split_equal(segments: List[dict], material_paths: List[str]) -> List[dict]:
"""按素材数量均分音频时长,对齐到最近的 Whisper 字边界。
Args:
segments: Whisper 产出的 segment 列表, 每个包含 words (字级时间戳)
material_paths: 素材路径列表
Returns:
[{"material_path": "...", "start": 0.0, "end": 5.2, "index": 0}, ...]
"""
# 展平所有 Whisper 字符
all_chars: List[dict] = []
for seg in segments:
for w in seg.get("words", []):
all_chars.append(w)
n = len(material_paths)
if not all_chars or n == 0:
return [{"material_path": material_paths[0] if material_paths else "",
"start": 0.0, "end": 99999.0, "index": 0}]
# 素材数不能超过字符数,否则边界会重复
if n > len(all_chars):
logger.warning(f"[MultiMat] 素材数({n}) > 字符数({len(all_chars)}),裁剪为 {len(all_chars)}")
n = len(all_chars)
total_start = all_chars[0]["start"]
total_end = all_chars[-1]["end"]
seg_dur = (total_end - total_start) / n
# 计算 N-1 个分割点,对齐到最近的字边界
boundaries = [0] # 第一段从第 0 个字开始
for i in range(1, n):
target_time = total_start + i * seg_dur
# 找到 start 时间最接近 target_time 的字
best_idx = boundaries[-1] + 1 # 至少比上一个边界后移 1
best_diff = float("inf")
for j in range(boundaries[-1] + 1, len(all_chars)):
diff = abs(all_chars[j]["start"] - target_time)
if diff < best_diff:
best_diff = diff
best_idx = j
elif diff > best_diff:
break # 时间递增,差值开始变大后可以停了
boundaries.append(min(best_idx, len(all_chars) - 1))
boundaries.append(len(all_chars)) # 最后一段到末尾
# 按边界生成分配结果
assignments: List[dict] = []
for i in range(n):
s_idx = boundaries[i]
e_idx = boundaries[i + 1]
if s_idx >= len(all_chars) or s_idx >= e_idx:
continue
assignments.append({
"material_path": material_paths[i],
"start": all_chars[s_idx]["start"],
"end": all_chars[e_idx - 1]["end"],
"text": "".join(c["word"] for c in all_chars[s_idx:e_idx]),
"index": len(assignments),
})
if not assignments:
return [{"material_path": material_paths[0], "start": 0.0, "end": 99999.0, "index": 0}]
logger.info(f"[MultiMat] 均分 {len(all_chars)} 字为 {len(assignments)}")
for a in assignments:
dur = a["end"] - a["start"]
logger.info(f"{a['index']}: [{a['start']:.2f}-{a['end']:.2f}s] ({dur:.1f}s) {a['text'][:20]}")
return assignments
async def process_video_generation(task_id: str, req: GenerateRequest, user_id: str):
temp_files = []
try:
start_time = time.time()
# ── 确定素材列表 ──
material_paths: List[str] = []
if req.material_paths and len(req.material_paths) > 1:
material_paths = req.material_paths
else:
material_paths = [req.material_path]
is_multi = len(material_paths) > 1
_update_task(task_id, status="processing", progress=5, message="正在下载素材...")
temp_dir = settings.UPLOAD_DIR / "temp"
temp_dir.mkdir(parents=True, exist_ok=True)
input_material_path = temp_dir / f"{task_id}_input.mp4"
temp_files.append(input_material_path)
await _download_material(req.material_path, input_material_path)
# 单素材模式:下载主素材
if not is_multi:
input_material_path = temp_dir / f"{task_id}_input.mp4"
temp_files.append(input_material_path)
await _download_material(material_paths[0], input_material_path)
_update_task(task_id, message="正在生成语音...", progress=10)
@@ -119,7 +218,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
ref_audio_path=str(ref_audio_local),
ref_text=req.ref_text,
output_path=str(audio_path),
language="Chinese"
language=_locale_to_qwen_lang(req.language)
)
else:
_update_task(task_id, message="正在生成语音 (EdgeTTS)...")
@@ -128,52 +227,183 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
tts_time = time.time() - start_time
print(f"[Pipeline] TTS completed in {tts_time:.1f}s")
_update_task(task_id, progress=25)
_update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)
lipsync = _get_lipsync_service()
lipsync_video_path = temp_dir / f"{task_id}_lipsync.mp4"
temp_files.append(lipsync_video_path)
lipsync_start = time.time()
is_ready = await _check_lipsync_ready()
if is_ready:
print(f"[LipSync] Starting LatentSync inference...")
_update_task(task_id, progress=35, message="正在运行 LatentSync 推理...")
await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path))
else:
print(f"[LipSync] LatentSync not ready, copying original video")
_update_task(task_id, message="唇形同步不可用,使用原始视频...")
import shutil
shutil.copy(str(input_material_path), lipsync_video_path)
lipsync_time = time.time() - lipsync_start
print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
_update_task(task_id, progress=80)
video = VideoService()
captions_path = None
if req.enable_subtitles:
_update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)
if is_multi:
# ══════════════════════════════════════
# 多素材流水线
# ══════════════════════════════════════
_update_task(task_id, progress=12, message="正在生成字幕 (Whisper)...")
captions_path = temp_dir / f"{task_id}_captions.json"
temp_files.append(captions_path)
try:
await whisper_service.align(
captions_data = await whisper_service.align(
audio_path=str(audio_path),
text=req.text,
output_path=str(captions_path)
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
)
print(f"[Pipeline] Whisper alignment completed")
print(f"[Pipeline] Whisper alignment completed (multi-material)")
except Exception as e:
logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
logger.warning(f"Whisper alignment failed: {e}")
captions_data = None
captions_path = None
_update_task(task_id, progress=15, message="正在分配素材...")
if captions_data and captions_data.get("segments"):
assignments = _split_equal(captions_data["segments"], material_paths)
else:
# Whisper 失败 → 按时长均分(不依赖字符对齐)
logger.warning("[MultiMat] Whisper 无数据,按时长均分")
audio_dur = video._get_duration(str(audio_path))
if audio_dur <= 0:
audio_dur = 30.0 # 安全兜底
seg_dur = audio_dur / len(material_paths)
assignments = [
{"material_path": material_paths[i], "start": i * seg_dur,
"end": (i + 1) * seg_dur, "index": i}
for i in range(len(material_paths))
]
# 扩展段覆盖完整音频范围首段从0开始末段到音频结尾
audio_duration = video._get_duration(str(audio_path))
if assignments and audio_duration > 0:
assignments[0]["start"] = 0.0
assignments[-1]["end"] = audio_duration
num_segments = len(assignments)
print(f"[Pipeline] Multi-material: {num_segments} segments, {len(material_paths)} materials")
if num_segments == 0:
raise RuntimeError("Multi-material: no valid segments after splitting")
lipsync_start = time.time()
# ── 第一步:下载所有素材并检测分辨率 ──
material_locals: List[Path] = []
resolutions = []
for i, assignment in enumerate(assignments):
material_local = temp_dir / f"{task_id}_material_{i}.mp4"
temp_files.append(material_local)
await _download_material(assignment["material_path"], material_local)
material_locals.append(material_local)
resolutions.append(video.get_resolution(str(material_local)))
# 分辨率不一致时,统一到第一个素材的分辨率
base_res = resolutions[0] if resolutions else (0, 0)
need_scale = any(r != base_res for r in resolutions) and base_res[0] > 0
if need_scale:
logger.info(f"[MultiMat] 素材分辨率不一致,统一到 {base_res[0]}x{base_res[1]}")
# ── 第二步:裁剪每段素材到对应时长 ──
prepared_segments: List[Path] = []
for i, assignment in enumerate(assignments):
seg_progress = 15 + int((i / num_segments) * 30) # 15% → 45%
seg_dur = assignment["end"] - assignment["start"]
_update_task(
task_id,
progress=seg_progress,
message=f"正在准备素材 {i+1}/{num_segments}..."
)
prepared_path = temp_dir / f"{task_id}_prepared_{i}.mp4"
temp_files.append(prepared_path)
video.prepare_segment(
str(material_locals[i]), seg_dur, str(prepared_path),
target_resolution=base_res if need_scale else None
)
prepared_segments.append(prepared_path)
# ── 第二步:拼接所有素材片段 ──
_update_task(task_id, progress=50, message="正在拼接素材片段...")
concat_path = temp_dir / f"{task_id}_concat.mp4"
temp_files.append(concat_path)
video.concat_videos(
[str(p) for p in prepared_segments],
str(concat_path)
)
# ── 第三步:一次 LatentSync 推理 ──
is_ready = await _check_lipsync_ready()
if is_ready:
_update_task(task_id, progress=55, message="正在合成唇形 (LatentSync)...")
print(f"[LipSync] Multi-material: single LatentSync on concatenated video")
try:
await lipsync.generate(str(concat_path), str(audio_path), str(lipsync_video_path))
except Exception as e:
logger.warning(f"[LipSync] Failed, fallback to concat without lipsync: {e}")
import shutil
shutil.copy(str(concat_path), str(lipsync_video_path))
else:
print(f"[LipSync] Not ready, using concatenated video without lipsync")
import shutil
shutil.copy(str(concat_path), str(lipsync_video_path))
lipsync_time = time.time() - lipsync_start
print(f"[Pipeline] Multi-material prepare + concat + LipSync completed in {lipsync_time:.1f}s")
_update_task(task_id, progress=80)
# 如果用户关闭了字幕,清除 captions_pathWhisper 仅用于句子切分)
if not req.enable_subtitles:
captions_path = None
else:
# ══════════════════════════════════════
# 单素材流水线(原有逻辑)
# ══════════════════════════════════════
_update_task(task_id, progress=25)
_update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)
lipsync_start = time.time()
is_ready = await _check_lipsync_ready()
if is_ready:
print(f"[LipSync] Starting LatentSync inference...")
_update_task(task_id, progress=35, message="正在运行 LatentSync 推理...")
await lipsync.generate(str(input_material_path), str(audio_path), str(lipsync_video_path))
else:
print(f"[LipSync] LatentSync not ready, copying original video")
_update_task(task_id, message="唇形同步不可用,使用原始视频...")
import shutil
shutil.copy(str(input_material_path), lipsync_video_path)
lipsync_time = time.time() - lipsync_start
print(f"[Pipeline] LipSync completed in {lipsync_time:.1f}s")
_update_task(task_id, progress=80)
# 单素材模式Whisper 在 LatentSync 之后
if req.enable_subtitles:
_update_task(task_id, message="正在生成字幕 (Whisper)...", progress=82)
captions_path = temp_dir / f"{task_id}_captions.json"
temp_files.append(captions_path)
try:
await whisper_service.align(
audio_path=str(audio_path),
text=req.text,
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
)
print(f"[Pipeline] Whisper alignment completed")
except Exception as e:
logger.warning(f"Whisper alignment failed, skipping subtitles: {e}")
captions_path = None
_update_task(task_id, progress=85)
video = VideoService()
final_audio_path = audio_path
if req.bgm_id:
_update_task(task_id, message="正在合成背景音乐...", progress=86)