This commit is contained in:
Kevin Wong
2026-02-11 13:48:45 +08:00
parent e33dfc3031
commit 96a298e51c
282 changed files with 93514 additions and 461 deletions

View File

@@ -2,11 +2,11 @@
依赖注入模块:认证和用户获取
"""
from typing import Optional, Any, Dict, cast
from fastapi import Request, HTTPException, Depends, status
from app.core.security import decode_access_token, TokenData
from app.repositories.sessions import get_session
from app.repositories.users import get_user_by_id
from loguru import logger
from fastapi import Request, HTTPException, Depends, status
from app.core.security import decode_access_token
from app.repositories.sessions import get_session, delete_sessions
from app.repositories.users import get_user_by_id, deactivate_user_if_expired
from loguru import logger
async def get_token_from_cookie(request: Request) -> Optional[str]:
@@ -35,8 +35,12 @@ async def get_current_user_optional(
logger.warning(f"Session token 无效: user_id={token_data.user_id}")
return None
user = get_user_by_id(token_data.user_id)
return cast(Optional[Dict[str, Any]], user)
user = cast(Optional[Dict[str, Any]], get_user_by_id(token_data.user_id))
if user and deactivate_user_if_expired(user):
delete_sessions(token_data.user_id)
return None
return user
except Exception as e:
logger.error(f"获取用户信息失败: {e}")
return None
@@ -82,14 +86,12 @@ async def get_current_user(
)
user = cast(Dict[str, Any], user)
if user.get("expires_at"):
from datetime import datetime, timezone
expires_at = datetime.fromisoformat(user["expires_at"].replace("Z", "+00:00"))
if datetime.now(timezone.utc) > expires_at:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="授权已过期,请联系管理员续期"
)
if deactivate_user_if_expired(user):
delete_sessions(token_data.user_id)
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="会员已到期,请续费"
)
return user
except HTTPException:

View File

@@ -1,7 +1,7 @@
"""
认证 API注册、登录、登出、修改密码
"""
from fastapi import APIRouter, HTTPException, Response, status, Request
from fastapi import APIRouter, HTTPException, Response, status, Request, Depends
from pydantic import BaseModel, field_validator
from app.core.security import (
get_password_hash,
@@ -13,7 +13,15 @@ from app.core.security import (
decode_access_token
)
from app.repositories.sessions import create_session, delete_sessions
from app.repositories.users import create_user, get_user_by_id, get_user_by_phone, user_exists_by_phone, update_user
from app.repositories.users import (
create_user,
get_user_by_id,
get_user_by_phone,
user_exists_by_phone,
update_user,
deactivate_user_if_expired,
)
from app.core.deps import get_current_user
from app.core.response import success_response
from loguru import logger
from typing import Optional, Any, cast
@@ -130,22 +138,20 @@ async def login(request: LoginRequest, response: Response):
detail="手机号或密码错误"
)
# 检查是否激活
if not user["is_active"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="账号未激活,请等待管理员审核"
)
# 检查授权是否过期
if user.get("expires_at"):
from datetime import datetime, timezone
expires_at = datetime.fromisoformat(user["expires_at"].replace("Z", "+00:00"))
if datetime.now(timezone.utc) > expires_at:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="授权已过期,请联系管理员续期"
)
# 授权过期时自动停用账号
if deactivate_user_if_expired(user):
delete_sessions(user["id"])
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="会员已到期,请续费"
)
# 检查是否激活
if not user["is_active"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="账号未激活,请等待管理员审核"
)
# 生成新的 session_token (后踢前)
session_token = generate_session_token()
@@ -258,31 +264,9 @@ async def change_password(request: ChangePasswordRequest, req: Request, response
)
@router.get("/me")
async def get_me(request: Request):
"""获取当前用户信息"""
# 从 Cookie 获取用户
token = request.cookies.get("access_token")
if not token:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="未登录"
)
token_data = decode_access_token(token)
if not token_data:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Token 无效"
)
user = cast(dict[str, Any], get_user_by_id(token_data.user_id) or {})
if not user:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="用户不存在"
)
@router.get("/me")
async def get_me(user: dict = Depends(get_current_user)):
"""获取当前用户信息"""
return success_response(UserResponse(
id=user["id"],
phone=user["phone"],

View File

@@ -9,6 +9,7 @@ class GenerateAudioRequest(BaseModel):
ref_audio_id: Optional[str] = None
ref_text: Optional[str] = None
language: str = "zh-CN"
speed: float = 1.0
class RenameAudioRequest(BaseModel):

View File

@@ -25,7 +25,7 @@ from app.modules.generated_audios.schemas import (
BUCKET = "generated-audios"
def _locale_to_qwen_lang(locale: str) -> str:
def _locale_to_tts_lang(locale: str) -> str:
mapping = {"zh": "Chinese", "en": "English"}
return mapping.get(locale.split("-")[0], "Auto")
@@ -73,19 +73,20 @@ async def generate_audio_task(task_id: str, req: GenerateAudioRequest, user_id:
async for chunk in resp.aiter_bytes():
f.write(chunk)
task_store.update(task_id, {"progress": 40, "message": "正在克隆声音 (Qwen3-TTS)..."})
task_store.update(task_id, {"progress": 40, "message": "正在克隆声音..."})
await voice_clone_service.generate_audio(
text=req.text,
ref_audio_path=ref_local,
ref_text=req.ref_text,
output_path=audio_path,
language=_locale_to_qwen_lang(req.language),
language=_locale_to_tts_lang(req.language),
speed=req.speed,
)
finally:
if os.path.exists(ref_local):
os.unlink(ref_local)
else:
task_store.update(task_id, {"progress": 30, "message": "正在生成语音 (EdgeTTS)..."})
task_store.update(task_id, {"progress": 30, "message": "正在生成语音..."})
tts = TTSService()
await tts.generate_audio(req.text, req.voice, audio_path)

View File

@@ -13,7 +13,7 @@ router = APIRouter()
@router.post("")
async def upload_ref_audio(
file: UploadFile = File(...),
ref_text: str = Form(...),
ref_text: str = Form(""),
user: dict = Depends(get_current_user)
):
"""上传参考音频"""
@@ -68,3 +68,21 @@ async def rename_ref_audio(
except Exception as e:
logger.error(f"重命名失败: {e}")
raise HTTPException(status_code=500, detail=f"重命名失败: {str(e)}")
@router.post("/{audio_id:path}/retranscribe")
async def retranscribe_ref_audio(
audio_id: str,
user: dict = Depends(get_current_user)
):
"""重新识别参考音频的文字内容"""
try:
result = await service.retranscribe_ref_audio(audio_id, user["id"])
return success_response(result, message="识别完成")
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"重新识别失败: {e}")
raise HTTPException(status_code=500, detail=f"识别失败: {str(e)}")

View File

@@ -41,16 +41,40 @@ def _get_audio_duration(file_path: str) -> float:
return 0.0
def _convert_to_wav(input_path: str, output_path: str) -> bool:
"""将音频转换为 WAV 格式 (16kHz, mono)"""
def _find_silence_cut_point(file_path: str, max_duration: float) -> float:
"""在 max_duration 附近找一个静音点作为截取位置,找不到则回退到 max_duration"""
try:
subprocess.run([
'ffmpeg', '-y', '-i', input_path,
'-ar', '16000',
'-ac', '1',
'-acodec', 'pcm_s16le',
output_path
], capture_output=True, timeout=60, check=True)
# 用 silencedetect 找所有静音段(阈值 -30dB最短 0.3 秒)
result = subprocess.run(
['ffmpeg', '-i', file_path, '-af',
'silencedetect=noise=-30dB:d=0.3', '-f', 'null', '-'],
capture_output=True, text=True, timeout=30
)
# 解析 silence_end 时间点
import re as _re
ends = [float(m) for m in _re.findall(r'silence_end:\s*([\d.]+)', result.stderr)]
# 找 max_duration 之前最后一个静音结束点(至少 3 秒)
candidates = [t for t in ends if 3.0 <= t <= max_duration]
if candidates:
cut = candidates[-1]
logger.info(f"Found silence cut point at {cut:.1f}s (max={max_duration}s)")
return cut
except Exception as e:
logger.warning(f"Silence detection failed: {e}")
return max_duration
def _convert_to_wav(input_path: str, output_path: str, max_duration: float = 0) -> bool:
"""将音频转换为 WAV 格式 (16kHz, mono),可选截取前 max_duration 秒并淡出"""
try:
cmd = ['ffmpeg', '-y', '-i', input_path]
if max_duration > 0:
cmd += ['-t', str(max_duration)]
# 末尾 0.1 秒淡出,避免截断爆音
fade_start = max(0, max_duration - 0.1)
cmd += ['-af', f'afade=t=out:st={fade_start}:d=0.1']
cmd += ['-ar', '16000', '-ac', '1', '-acodec', 'pcm_s16le', output_path]
subprocess.run(cmd, capture_output=True, timeout=60, check=True)
return True
except Exception as e:
logger.error(f"音频转换失败: {e}")
@@ -67,9 +91,6 @@ async def upload_ref_audio(file, ref_text: str, user_id: str) -> dict:
if ext not in ALLOWED_AUDIO_EXTENSIONS:
raise ValueError(f"不支持的音频格式: {ext}。支持的格式: {', '.join(ALLOWED_AUDIO_EXTENSIONS)}")
if not ref_text or len(ref_text.strip()) < 2:
raise ValueError("参考文字不能为空")
# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp_input:
content = await file.read()
@@ -86,8 +107,31 @@ async def upload_ref_audio(file, ref_text: str, user_id: str) -> dict:
duration = _get_audio_duration(tmp_wav_path)
if duration < 1.0:
raise ValueError("音频时长过短,至少需要 1 秒")
if duration > 60.0:
raise ValueError("音频时长过长,最多 60 秒")
# 超过 10 秒自动在静音点截取CosyVoice 对 3-10 秒效果最好)
MAX_REF_DURATION = 10.0
if duration > MAX_REF_DURATION:
cut_point = _find_silence_cut_point(tmp_wav_path, MAX_REF_DURATION)
logger.info(f"Ref audio {duration:.1f}s > {MAX_REF_DURATION}s, trimming at {cut_point:.1f}s")
trimmed_path = tmp_input_path + "_trimmed.wav"
if not _convert_to_wav(tmp_wav_path, trimmed_path, max_duration=cut_point):
raise RuntimeError("音频截取失败")
os.unlink(tmp_wav_path)
tmp_wav_path = trimmed_path
duration = _get_audio_duration(tmp_wav_path)
# 自动转写参考音频内容
try:
from app.services.whisper_service import whisper_service
transcribed = await whisper_service.transcribe(tmp_wav_path)
if transcribed.strip():
ref_text = transcribed.strip()
logger.info(f"Auto-transcribed ref audio: {ref_text[:50]}...")
except Exception as e:
logger.warning(f"Auto-transcribe failed: {e}")
if not ref_text or not ref_text.strip():
raise ValueError("无法识别音频内容,请确保音频包含清晰的语音")
# 检查重名
existing_files = await storage_service.list_files(BUCKET_REF_AUDIOS, user_id)
@@ -267,3 +311,85 @@ async def rename_ref_audio(audio_id: str, new_name: str, user_id: str) -> dict:
)
return {"name": new_name}
async def retranscribe_ref_audio(audio_id: str, user_id: str) -> dict:
"""重新转写参考音频的 ref_text并截取前 10 秒重新上传(用于迁移旧数据)"""
if not audio_id.startswith(f"{user_id}/"):
raise PermissionError("无权修改此文件")
# 下载音频到临时文件
audio_url = await storage_service.get_signed_url(BUCKET_REF_AUDIOS, audio_id)
tmp_wav_path = None
trimmed_path = None
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
tmp_wav_path = tmp.name
timeout = httpx.Timeout(None)
async with httpx.AsyncClient(timeout=timeout) as client:
async with client.stream("GET", audio_url) as resp:
resp.raise_for_status()
async for chunk in resp.aiter_bytes():
tmp.write(chunk)
# 超过 10 秒则截取前 10 秒并重新上传音频
MAX_REF_DURATION = 10.0
duration = _get_audio_duration(tmp_wav_path)
transcribe_path = tmp_wav_path
need_reupload = False
if duration > MAX_REF_DURATION:
cut_point = _find_silence_cut_point(tmp_wav_path, MAX_REF_DURATION)
logger.info(f"Retranscribe: trimming {audio_id} from {duration:.1f}s at {cut_point:.1f}s")
trimmed_path = tmp_wav_path + "_trimmed.wav"
if _convert_to_wav(tmp_wav_path, trimmed_path, max_duration=cut_point):
transcribe_path = trimmed_path
duration = _get_audio_duration(trimmed_path)
need_reupload = True
# Whisper 转写
from app.services.whisper_service import whisper_service
transcribed = await whisper_service.transcribe(transcribe_path)
if not transcribed or not transcribed.strip():
raise ValueError("无法识别音频内容")
ref_text = transcribed.strip()
logger.info(f"Re-transcribed ref audio {audio_id}: {ref_text[:50]}...")
# 截取过的音频重新上传覆盖原文件
if need_reupload and trimmed_path:
with open(trimmed_path, "rb") as f:
await storage_service.upload_file(
bucket=BUCKET_REF_AUDIOS, path=audio_id,
file_data=f.read(), content_type="audio/wav",
)
logger.info(f"Re-uploaded trimmed audio: {audio_id} ({duration:.1f}s)")
# 更新 metadata
metadata_path = audio_id.replace(".wav", ".json")
try:
meta_url = await storage_service.get_signed_url(BUCKET_REF_AUDIOS, metadata_path)
async with httpx.AsyncClient(timeout=5.0) as client:
resp = await client.get(meta_url)
if resp.status_code == 200:
metadata = resp.json()
else:
raise Exception(f"status {resp.status_code}")
except Exception:
metadata = {}
metadata["ref_text"] = ref_text
metadata["duration_sec"] = duration
await storage_service.upload_file(
bucket=BUCKET_REF_AUDIOS,
path=metadata_path,
file_data=json.dumps(metadata, ensure_ascii=False).encode('utf-8'),
content_type="application/json"
)
return {"ref_text": ref_text, "duration_sec": duration}
finally:
if tmp_wav_path and os.path.exists(tmp_wav_path):
os.unlink(tmp_wav_path)
if trimmed_path and os.path.exists(trimmed_path):
os.unlink(trimmed_path)

View File

@@ -1,5 +1,5 @@
from pydantic import BaseModel
from typing import Optional, List
from typing import Optional, List, Literal
class CustomAssignment(BaseModel):
@@ -7,6 +7,7 @@ class CustomAssignment(BaseModel):
start: float # 音频时间轴起点
end: float # 音频时间轴终点
source_start: float = 0.0 # 源视频截取起点
source_end: Optional[float] = None # 源视频截取终点(可选)
class GenerateRequest(BaseModel):
@@ -30,3 +31,4 @@ class GenerateRequest(BaseModel):
bgm_id: Optional[str] = None
bgm_volume: Optional[float] = 0.2
custom_assignments: Optional[List[CustomAssignment]] = None
output_aspect_ratio: Literal["9:16", "16:9"] = "9:16"

View File

@@ -29,7 +29,7 @@ def _locale_to_whisper_lang(locale: str) -> str:
return locale.split("-")[0] if "-" in locale else locale
def _locale_to_qwen_lang(locale: str) -> str:
def _locale_to_tts_lang(locale: str) -> str:
"""'zh-CN''Chinese', 'en-US''English', 其他 → 'Auto'"""
mapping = {"zh": "Chinese", "en": "English"}
return mapping.get(locale.split("-")[0], "Auto")
@@ -174,17 +174,27 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
# ── 确定素材列表 ──
material_paths: List[str] = []
if req.material_paths and len(req.material_paths) > 1:
if req.custom_assignments and len(req.custom_assignments) > 1:
material_paths = [a.material_path for a in req.custom_assignments if a.material_path]
elif req.material_paths and len(req.material_paths) > 1:
material_paths = req.material_paths
else:
material_paths = [req.material_path]
is_multi = len(material_paths) > 1
target_resolution = (1080, 1920) if req.output_aspect_ratio == "9:16" else (1920, 1080)
logger.info(
f"[Render] 输出画面比例: {req.output_aspect_ratio}, "
f"目标分辨率: {target_resolution[0]}x{target_resolution[1]}"
)
_update_task(task_id, status="processing", progress=5, message="正在下载素材...")
temp_dir = settings.UPLOAD_DIR / "temp"
temp_dir.mkdir(parents=True, exist_ok=True)
video = VideoService()
input_material_path: Optional[Path] = None
# 单素材模式:下载主素材
if not is_multi:
@@ -192,6 +202,16 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
temp_files.append(input_material_path)
await _download_material(material_paths[0], input_material_path)
# 归一化旋转元数据(如 iPhone MOV 1920x1080 + rotation=-90
normalized_input_path = temp_dir / f"{task_id}_input_norm.mp4"
normalized_result = video.normalize_orientation(
str(input_material_path),
str(normalized_input_path),
)
if normalized_result != str(input_material_path):
temp_files.append(normalized_input_path)
input_material_path = normalized_input_path
_update_task(task_id, message="正在生成语音...", progress=10)
audio_path = temp_dir / f"{task_id}_audio.wav"
@@ -218,8 +238,10 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
if resp.status_code == 200:
meta = resp.json()
req.language = meta.get("language", req.language)
if not req.text.strip():
req.text = meta.get("text", req.text)
# 无条件用配音元数据覆盖文案,确保字幕与配音语言一致
meta_text = meta.get("text", "")
if meta_text:
req.text = meta_text
except Exception as e:
logger.warning(f"读取配音元数据失败: {e}")
@@ -238,13 +260,13 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
)
await _download_material(ref_audio_url, ref_audio_local)
_update_task(task_id, message="正在克隆声音 (Qwen3-TTS)...")
_update_task(task_id, message="正在克隆声音...")
await voice_clone_service.generate_audio(
text=req.text,
ref_audio_path=str(ref_audio_local),
ref_text=req.ref_text,
output_path=str(audio_path),
language=_locale_to_qwen_lang(req.language)
language=_locale_to_tts_lang(req.language)
)
else:
_update_task(task_id, message="正在生成语音 (EdgeTTS)...")
@@ -258,7 +280,6 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
lipsync_video_path = temp_dir / f"{task_id}_lipsync.mp4"
temp_files.append(lipsync_video_path)
video = VideoService()
captions_path = None
if is_multi:
@@ -267,7 +288,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
# ══════════════════════════════════════
_update_task(task_id, progress=12, message="正在分配素材...")
if req.custom_assignments:
if req.custom_assignments and len(req.custom_assignments) == len(material_paths):
# 用户自定义分配,跳过 Whisper 均分
assignments = [
{
@@ -275,6 +296,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
"start": a.start,
"end": a.end,
"source_start": a.source_start,
"source_end": a.source_end,
"index": i,
}
for i, a in enumerate(req.custom_assignments)
@@ -290,6 +312,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
text=req.text,
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
original_text=req.text,
)
print(f"[Pipeline] Whisper alignment completed (custom assignments)")
except Exception as e:
@@ -297,6 +320,49 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
captions_path = None
else:
captions_path = None
elif req.custom_assignments:
logger.warning(
f"[MultiMat] custom_assignments 数量({len(req.custom_assignments)})"
f" 与素材数量({len(material_paths)})不一致,回退自动分配"
)
# 原有逻辑Whisper → _split_equal
_update_task(task_id, message="正在生成字幕 (Whisper)...")
captions_path = temp_dir / f"{task_id}_captions.json"
temp_files.append(captions_path)
try:
captions_data = await whisper_service.align(
audio_path=str(audio_path),
text=req.text,
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
original_text=req.text,
)
print(f"[Pipeline] Whisper alignment completed (multi-material)")
except Exception as e:
logger.warning(f"Whisper alignment failed: {e}")
captions_data = None
captions_path = None
_update_task(task_id, progress=15, message="正在分配素材...")
if captions_data and captions_data.get("segments"):
assignments = _split_equal(captions_data["segments"], material_paths)
else:
# Whisper 失败 → 按时长均分(不依赖字符对齐)
logger.warning("[MultiMat] Whisper 无数据,按时长均分")
audio_dur = video._get_duration(str(audio_path))
if audio_dur <= 0:
audio_dur = 30.0 # 安全兜底
seg_dur = audio_dur / len(material_paths)
assignments = [
{"material_path": material_paths[i], "start": i * seg_dur,
"end": (i + 1) * seg_dur, "index": i}
for i in range(len(material_paths))
]
else:
# 原有逻辑Whisper → _split_equal
_update_task(task_id, message="正在生成字幕 (Whisper)...")
@@ -310,6 +376,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
text=req.text,
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
original_text=req.text,
)
print(f"[Pipeline] Whisper alignment completed (multi-material)")
except Exception as e:
@@ -356,12 +423,23 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
material_local = temp_dir / f"{task_id}_material_{i}.mp4"
temp_files.append(material_local)
await _download_material(assignment["material_path"], material_local)
# 归一化旋转元数据,确保分辨率判断与后续推理一致
normalized_material = temp_dir / f"{task_id}_material_{i}_norm.mp4"
normalized_result = video.normalize_orientation(
str(material_local),
str(normalized_material),
)
if normalized_result != str(material_local):
temp_files.append(normalized_material)
material_local = normalized_material
material_locals.append(material_local)
resolutions.append(video.get_resolution(str(material_local)))
# 分辨率不一致时,统一到第一个素材的分辨率
base_res = resolutions[0] if resolutions else (0, 0)
need_scale = any(r != base_res for r in resolutions) and base_res[0] > 0
# 按用户选择的画面比例统一分辨率
base_res = target_resolution
need_scale = any(r != base_res for r in resolutions)
if need_scale:
logger.info(f"[MultiMat] 素材分辨率不一致,统一到 {base_res[0]}x{base_res[1]}")
@@ -381,8 +459,11 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
temp_files.append(prepared_path)
video.prepare_segment(
str(material_locals[i]), seg_dur, str(prepared_path),
target_resolution=base_res if need_scale else None,
# 多素材拼接前统一重编码为同分辨率/同编码,避免 concat 仅保留首段
target_resolution=base_res,
source_start=assignment.get("source_start", 0.0),
source_end=assignment.get("source_end"),
target_fps=25,
)
prepared_segments.append(prepared_path)
@@ -392,7 +473,8 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
temp_files.append(concat_path)
video.concat_videos(
[str(p) for p in prepared_segments],
str(concat_path)
str(concat_path),
target_fps=25,
)
# ── 第三步:一次 LatentSync 推理 ──
@@ -425,23 +507,31 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
# 单素材流水线(原有逻辑)
# ══════════════════════════════════════
# 单素材 + source_start先截取片段
if input_material_path is None:
raise RuntimeError("单素材流程缺少输入素材")
# 单素材:按用户选择画面比例统一到目标分辨率,并应用 source_start
single_source_start = 0.0
single_source_end = None
if req.custom_assignments and len(req.custom_assignments) == 1:
single_source_start = req.custom_assignments[0].source_start
single_source_end = req.custom_assignments[0].source_end
if single_source_start > 0:
_update_task(task_id, progress=20, message="正在截取素材片段...")
audio_dur = video._get_duration(str(audio_path))
if audio_dur <= 0:
audio_dur = 30.0
trimmed_path = temp_dir / f"{task_id}_trimmed.mp4"
temp_files.append(trimmed_path)
video.prepare_segment(
str(input_material_path), audio_dur, str(trimmed_path),
source_start=single_source_start,
)
input_material_path = trimmed_path
_update_task(task_id, progress=20, message="正在准备素材片段...")
audio_dur = video._get_duration(str(audio_path))
if audio_dur <= 0:
audio_dur = 30.0
prepared_single_path = temp_dir / f"{task_id}_prepared_single.mp4"
temp_files.append(prepared_single_path)
video.prepare_segment(
str(input_material_path),
audio_dur,
str(prepared_single_path),
target_resolution=target_resolution,
source_start=single_source_start,
source_end=single_source_end,
)
input_material_path = prepared_single_path
_update_task(task_id, progress=25)
_update_task(task_id, message="正在合成唇形 (LatentSync)...", progress=30)
@@ -476,6 +566,7 @@ async def process_video_generation(task_id: str, req: GenerateRequest, user_id:
text=req.text,
output_path=str(captions_path),
language=_locale_to_whisper_lang(req.language),
original_text=req.text,
)
print(f"[Pipeline] Whisper alignment completed")
except Exception as e:

View File

@@ -1,3 +1,4 @@
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, cast
from app.core.supabase import get_supabase
@@ -37,3 +38,33 @@ def update_user(user_id: str, payload: Dict[str, Any]) -> List[Dict[str, Any]]:
supabase = get_supabase()
result = supabase.table("users").update(payload).eq("id", user_id).execute()
return cast(List[Dict[str, Any]], result.data or [])
def _parse_expires_at(expires_at: Any) -> Optional[datetime]:
try:
expires_at_dt = datetime.fromisoformat(str(expires_at).replace("Z", "+00:00"))
except Exception:
return None
if expires_at_dt.tzinfo is None:
expires_at_dt = expires_at_dt.replace(tzinfo=timezone.utc)
return expires_at_dt.astimezone(timezone.utc)
def deactivate_user_if_expired(user: Dict[str, Any]) -> bool:
expires_at = user.get("expires_at")
if not expires_at:
return False
expires_at_dt = _parse_expires_at(expires_at)
if not expires_at_dt:
return False
if datetime.now(timezone.utc) <= expires_at_dt:
return False
user_id = user.get("id")
if user.get("is_active") and user_id:
update_user(cast(str, user_id), {"is_active": False})
return True

View File

@@ -9,9 +9,110 @@ from pathlib import Path
from loguru import logger
from typing import Optional
class VideoService:
def __init__(self):
pass
class VideoService:
def __init__(self):
pass
def get_video_metadata(self, file_path: str) -> dict:
"""获取视频元信息(含旋转角与有效显示分辨率)"""
cmd = [
"ffprobe", "-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=width,height:stream_side_data=rotation",
"-of", "json",
file_path,
]
default_info = {
"width": 0,
"height": 0,
"rotation": 0,
"effective_width": 0,
"effective_height": 0,
}
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
if result.returncode != 0:
return default_info
payload = json.loads(result.stdout or "{}")
streams = payload.get("streams") or []
if not streams:
return default_info
stream = streams[0]
width = int(stream.get("width") or 0)
height = int(stream.get("height") or 0)
rotation = 0
for side_data in stream.get("side_data_list") or []:
if not isinstance(side_data, dict):
continue
raw_rotation = side_data.get("rotation")
if raw_rotation is None:
continue
try:
rotation = int(round(float(str(raw_rotation))))
except Exception:
rotation = 0
break
norm_rotation = rotation % 360
if norm_rotation > 180:
norm_rotation -= 360
swap_wh = abs(norm_rotation) == 90
effective_width = height if swap_wh else width
effective_height = width if swap_wh else height
return {
"width": width,
"height": height,
"rotation": norm_rotation,
"effective_width": effective_width,
"effective_height": effective_height,
}
except Exception as e:
logger.warning(f"获取视频元信息失败: {e}")
return default_info
def normalize_orientation(self, video_path: str, output_path: str) -> str:
"""将带旋转元数据的视频转为物理方向,避免后续流程忽略 rotation。"""
info = self.get_video_metadata(video_path)
rotation = int(info.get("rotation") or 0)
if rotation == 0:
return video_path
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
logger.info(
f"检测到旋转元数据 rotation={rotation},归一化方向: "
f"{info.get('effective_width', 0)}x{info.get('effective_height', 0)}"
)
cmd = [
"ffmpeg", "-y",
"-i", video_path,
"-map", "0:v:0",
"-map", "0:a?",
"-c:v", "libx264",
"-preset", "fast",
"-crf", "18",
"-c:a", "copy",
"-movflags", "+faststart",
output_path,
]
if self._run_ffmpeg(cmd):
normalized = self.get_video_metadata(output_path)
logger.info(
"视频方向归一化完成: "
f"coded={normalized.get('width', 0)}x{normalized.get('height', 0)}, "
f"rotation={normalized.get('rotation', 0)}"
)
return output_path
logger.warning("视频方向归一化失败,回退使用原视频")
return video_path
def _run_ffmpeg(self, cmd: list) -> bool:
cmd_str = ' '.join(shlex.quote(str(c)) for c in cmd)
@@ -139,8 +240,8 @@ class VideoService:
else:
raise RuntimeError("FFmpeg composition failed")
def concat_videos(self, video_paths: list, output_path: str) -> str:
"""使用 FFmpeg concat demuxer 拼接多个视频片段"""
def concat_videos(self, video_paths: list, output_path: str, target_fps: int = 25) -> str:
"""使用 FFmpeg concat demuxer 拼接多个视频片段"""
if not video_paths:
raise ValueError("No video segments to concat")
@@ -152,14 +253,22 @@ class VideoService:
for vp in video_paths:
f.write(f"file '{vp}'\n")
cmd = [
"ffmpeg", "-y",
"-f", "concat",
"-safe", "0",
"-i", str(list_path),
"-c", "copy",
output_path,
]
cmd = [
"ffmpeg", "-y",
"-f", "concat",
"-safe", "0",
"-fflags", "+genpts",
"-i", str(list_path),
"-an",
"-vsync", "cfr",
"-r", str(target_fps),
"-c:v", "libx264",
"-preset", "fast",
"-crf", "18",
"-pix_fmt", "yuv420p",
"-movflags", "+faststart",
output_path,
]
try:
if self._run_ffmpeg(cmd):
@@ -193,54 +302,60 @@ class VideoService:
return output_path
raise RuntimeError(f"FFmpeg audio split failed: {start}-{end}")
def get_resolution(self, file_path: str) -> tuple:
"""获取视频分辨率,返回 (width, height)"""
cmd = [
'ffprobe', '-v', 'error',
'-select_streams', 'v:0',
'-show_entries', 'stream=width,height',
'-of', 'csv=p=0',
file_path
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
parts = result.stdout.strip().split(',')
return (int(parts[0]), int(parts[1]))
except Exception:
return (0, 0)
def get_resolution(self, file_path: str) -> tuple[int, int]:
"""获取视频有效显示分辨率(考虑旋转元数据)。"""
info = self.get_video_metadata(file_path)
return (
int(info.get("effective_width") or 0),
int(info.get("effective_height") or 0),
)
def prepare_segment(self, video_path: str, target_duration: float, output_path: str,
target_resolution: tuple = None, source_start: float = 0.0) -> str:
"""将素材视频裁剪或循环到指定时长(无音频)。
target_resolution: (width, height) 如需统一分辨率则传入,否则保持原分辨率
source_start: 源视频截取起点(秒),默认 0
"""
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
video_dur = self._get_duration(video_path)
if video_dur <= 0:
video_dur = target_duration
# 可用时长 = 从 source_start 到视频结尾
available = max(video_dur - source_start, 0.1)
needs_loop = target_duration > available
needs_scale = target_resolution is not None
# 当需要循环且有 source_start 时,先裁剪出片段,再循环裁剪后的文件
# 避免 stream_loop 循环整个视频(而不是从 source_start 开始的片段)
actual_input = video_path
trim_temp = None
if needs_loop and source_start > 0:
trim_temp = str(Path(output_path).parent / (Path(output_path).stem + "_trim_tmp.mp4"))
trim_cmd = [
"ffmpeg", "-y",
"-ss", str(source_start),
"-i", video_path,
"-t", str(available),
"-an",
"-c:v", "libx264", "-preset", "fast", "-crf", "18",
trim_temp,
]
def prepare_segment(self, video_path: str, target_duration: float, output_path: str,
target_resolution: Optional[tuple] = None, source_start: float = 0.0,
source_end: Optional[float] = None, target_fps: Optional[int] = None) -> str:
"""将素材视频裁剪或循环到指定时长(无音频)
target_resolution: (width, height) 如需统一分辨率则传入,否则保持原分辨率
source_start: 源视频截取起点(秒),默认 0。
source_end: 源视频截取终点(秒),默认到素材结尾。
target_fps: 输出帧率(可选),用于多素材拼接前统一时间基。
"""
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
video_dur = self._get_duration(video_path)
if video_dur <= 0:
video_dur = target_duration
clip_end = video_dur
if source_end is not None:
try:
source_end_value = float(source_end)
if source_end_value > source_start:
clip_end = min(source_end_value, video_dur)
except Exception:
pass
# 可用时长 = 从 source_start 到视频结尾
available = max(clip_end - source_start, 0.1)
needs_loop = target_duration > available
needs_scale = target_resolution is not None
needs_fps = bool(target_fps and target_fps > 0)
has_source_end = clip_end < video_dur
# 当需要循环且存在截取范围时,先裁剪出片段,再循环裁剪后的文件
# 避免 stream_loop 循环整个视频(而不是截取后的片段)
actual_input = video_path
trim_temp = None
if needs_loop and (source_start > 0 or has_source_end):
trim_temp = str(Path(output_path).parent / (Path(output_path).stem + "_trim_tmp.mp4"))
trim_cmd = [
"ffmpeg", "-y",
"-ss", str(source_start),
"-i", video_path,
"-t", str(available),
"-an",
"-c:v", "libx264", "-preset", "fast", "-crf", "18",
trim_temp,
]
if not self._run_ffmpeg(trim_cmd):
raise RuntimeError(f"FFmpeg trim for loop failed: {video_path}")
actual_input = trim_temp
@@ -253,19 +368,27 @@ class VideoService:
cmd = ["ffmpeg", "-y"]
if needs_loop:
cmd.extend(["-stream_loop", str(loop_count)])
if source_start > 0:
cmd.extend(["-ss", str(source_start)])
cmd.extend(["-i", actual_input, "-t", str(target_duration), "-an"])
if needs_scale:
w, h = target_resolution
cmd.extend(["-vf", f"scale={w}:{h}:force_original_aspect_ratio=decrease,pad={w}:{h}:(ow-iw)/2:(oh-ih)/2"])
# 需要循环、缩放或指定起点时必须重编码,否则用 stream copy 保持原画质
if needs_loop or needs_scale or source_start > 0:
cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18"])
else:
cmd.extend(["-c:v", "copy"])
if source_start > 0:
cmd.extend(["-ss", str(source_start)])
cmd.extend(["-i", actual_input, "-t", str(target_duration), "-an"])
filters = []
if needs_fps:
filters.append(f"fps={int(target_fps)}")
if needs_scale:
w, h = target_resolution
filters.append(f"scale={w}:{h}:force_original_aspect_ratio=decrease,pad={w}:{h}:(ow-iw)/2:(oh-ih)/2")
if filters:
cmd.extend(["-vf", ",".join(filters)])
if needs_fps:
cmd.extend(["-vsync", "cfr", "-r", str(int(target_fps))])
# 需要循环、缩放或指定起点时必须重编码,否则用 stream copy 保持原画质
if needs_loop or needs_scale or source_start > 0 or has_source_end or needs_fps:
cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18"])
else:
cmd.extend(["-c:v", "copy"])
cmd.append(output_path)

View File

@@ -1,37 +1,104 @@
"""
声音克隆服务
通过 HTTP 调用 Qwen3-TTS 独立服务 (端口 8009)
通过 HTTP 调用 CosyVoice 3.0 独立服务 (端口 8010)
"""
import httpx
import asyncio
from pathlib import Path
from typing import Optional
import httpx
from loguru import logger
from app.core.config import settings
# Qwen3-TTS 服务地址
QWEN_TTS_URL = "http://localhost:8009"
# CosyVoice 3.0 服务地址
VOICE_CLONE_URL = "http://localhost:8010"
class VoiceCloneService:
"""声音克隆服务 - 调用 Qwen3-TTS HTTP API"""
"""声音克隆服务 - 调用 CosyVoice 3.0 HTTP API"""
def __init__(self):
self.base_url = QWEN_TTS_URL
self.base_url = VOICE_CLONE_URL
# 健康状态缓存
self._health_cache: Optional[dict] = None
self._health_cache_time: float = 0
# GPU 并发锁 (Serial Queue)
self._lock = asyncio.Lock()
async def _generate_once(
self,
*,
text: str,
ref_audio_data: bytes,
ref_text: str,
language: str,
speed: float = 1.0,
max_retries: int = 4,
) -> bytes:
timeout = httpx.Timeout(240.0)
for attempt in range(max_retries):
try:
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.post(
f"{self.base_url}/generate",
files={"ref_audio": ("ref.wav", ref_audio_data, "audio/wav")},
data={
"text": text,
"ref_text": ref_text,
"language": language,
"speed": str(speed),
},
)
retryable = False
reason = ""
if response.status_code in (429, 502, 503, 504):
retryable = True
reason = f"HTTP {response.status_code}"
elif response.status_code == 500 and (
"生成超时" in response.text or "timeout" in response.text.lower()
):
retryable = True
reason = "upstream timeout"
if retryable and attempt < max_retries - 1:
wait = 8 * (attempt + 1)
logger.warning(
f"Voice clone retryable error ({reason}), retrying in {wait}s "
f"(attempt {attempt + 1}/{max_retries})"
)
await asyncio.sleep(wait)
continue
response.raise_for_status()
return response.content
except httpx.HTTPStatusError as e:
logger.error(f"Voice clone API error: {e.response.status_code} - {e.response.text}")
raise RuntimeError(f"声音克隆服务错误: {e.response.text}")
except httpx.RequestError as e:
if attempt < max_retries - 1:
wait = 6 * (attempt + 1)
logger.warning(
f"Voice clone connection error: {e}; retrying in {wait}s "
f"(attempt {attempt + 1}/{max_retries})"
)
await asyncio.sleep(wait)
continue
logger.error(f"Voice clone connection error: {e}")
raise RuntimeError("无法连接声音克隆服务,请检查服务是否启动")
raise RuntimeError("声音克隆服务繁忙,请稍后重试")
async def generate_audio(
self,
text: str,
ref_audio_path: str,
ref_text: str,
output_path: str,
language: str = "Chinese"
language: str = "Chinese",
speed: float = 1.0,
) -> str:
"""
使用声音克隆生成语音
@@ -51,60 +118,49 @@ class VoiceCloneService:
logger.info(f"🎤 Voice Clone: {text[:30]}... (language={language})")
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# 读取参考音频
text = text.strip()
if not text:
raise RuntimeError("文本为空,无法生成语音")
with open(ref_audio_path, "rb") as f:
ref_audio_data = f.read()
# 调用 Qwen3-TTS 服务
timeout = httpx.Timeout(300.0) # 5分钟超时
async with httpx.AsyncClient(timeout=timeout) as client:
try:
response = await client.post(
f"{self.base_url}/generate",
files={"ref_audio": ("ref.wav", ref_audio_data, "audio/wav")},
data={
"text": text,
"ref_text": ref_text,
"language": language
}
)
response.raise_for_status()
# 保存返回的音频
with open(output_path, "wb") as f:
f.write(response.content)
logger.info(f"✅ Voice clone saved: {output_path}")
return output_path
except httpx.HTTPStatusError as e:
logger.error(f"Qwen3-TTS API error: {e.response.status_code} - {e.response.text}")
raise RuntimeError(f"声音克隆服务错误: {e.response.text}")
except httpx.RequestError as e:
logger.error(f"Qwen3-TTS connection error: {e}")
raise RuntimeError("无法连接声音克隆服务,请检查服务是否启动")
# CosyVoice 内部自带 text_normalize 分段,无需客户端切分
audio_bytes = await self._generate_once(
text=text,
ref_audio_data=ref_audio_data,
ref_text=ref_text,
language=language,
speed=speed,
)
with open(output_path, "wb") as f:
f.write(audio_bytes)
logger.info(f"✅ Voice clone saved: {output_path}")
return output_path
async def check_health(self) -> dict:
"""健康检查"""
import time
# 5分钟缓存
# 30秒缓存
now = time.time()
if self._health_cache and (now - self._health_cache_time) < 300:
return self._health_cache
cached = self._health_cache
if cached is not None and (now - self._health_cache_time) < 30:
return cached
try:
async with httpx.AsyncClient(timeout=5.0) as client:
response = await client.get(f"{self.base_url}/health")
response.raise_for_status()
self._health_cache = response.json()
payload = response.json()
self._health_cache = payload
self._health_cache_time = now
return self._health_cache
return payload
except Exception as e:
logger.warning(f"Qwen3-TTS health check failed: {e}")
logger.warning(f"Voice clone health check failed: {e}")
return {
"service": "Qwen3-TTS Voice Clone",
"model": "0.6B-Base",
"service": "CosyVoice 3.0 Voice Clone",
"model": "unknown",
"ready": False,
"gpu_id": 0,
"error": str(e)

View File

@@ -39,12 +39,22 @@ def split_word_to_chars(word: str, start: float, end: float) -> list:
tokens = []
ascii_buffer = ""
pending_space = False # 记录是否有待处理的空格(用于英文单词间距)
for char in word:
if not char.strip():
# 空格flush ascii_buffer标记下一个 token 需要前导空格
if ascii_buffer:
tokens.append(ascii_buffer)
ascii_buffer = ""
if tokens: # 仅在已有 token 时标记(避免开头重复空格)
pending_space = True
continue
if char.isascii() and char.isalnum():
if pending_space and not ascii_buffer:
ascii_buffer = " " # 将空格前置到新英文单词
pending_space = False
ascii_buffer += char
continue
@@ -52,7 +62,9 @@ def split_word_to_chars(word: str, start: float, end: float) -> list:
tokens.append(ascii_buffer)
ascii_buffer = ""
tokens.append(char)
prefix = " " if pending_space else ""
pending_space = False
tokens.append(prefix + char)
if ascii_buffer:
tokens.append(ascii_buffer)
@@ -175,6 +187,7 @@ class WhisperService:
text: str,
output_path: Optional[str] = None,
language: str = "zh",
original_text: Optional[str] = None,
) -> dict:
"""
对音频进行转录,生成字级别时间戳
@@ -184,6 +197,8 @@ class WhisperService:
text: 原始文本(用于参考,但实际使用 whisper 转录结果)
output_path: 可选,输出 JSON 文件路径
language: 语言代码 (zh/en 等)
original_text: 原始文案。非空时Whisper 仅用于检测总时间范围,
字幕文字用此原文替换(解决语言不匹配问题)
Returns:
包含字级别时间戳的字典
@@ -208,16 +223,19 @@ class WhisperService:
logger.info(f"Detected language: {info.language} (prob: {info.language_probability:.2f})")
# 收集 Whisper 转录结果(始终需要,用于获取时间范围)
all_segments = []
whisper_first_start = None
whisper_last_end = None
for segment in segments_iter:
# 提取每个字的时间戳,并拆分成单字
all_words = []
if segment.words:
for word_info in segment.words:
word_text = word_info.word
if word_text.strip():
# 将词拆分成单字,时间戳线性插值
# 保留前导空格用于英文词间距
if whisper_first_start is None:
whisper_first_start = word_info.start
whisper_last_end = word_info.end
chars = split_word_to_chars(
word_text,
word_info.start,
@@ -225,11 +243,24 @@ class WhisperService:
)
all_words.extend(chars)
# 将长段落按标点和字数拆分成多行
if all_words:
line_segments = split_segment_to_lines(all_words, max_chars)
all_segments.extend(line_segments)
# 如果提供了 original_text用原文替换 Whisper 转录文字
if original_text and original_text.strip() and whisper_first_start is not None:
logger.info(f"Using original_text for subtitles (len={len(original_text)}), "
f"Whisper time range: {whisper_first_start:.2f}-{whisper_last_end:.2f}s")
# 用 split_word_to_chars 拆分原文
orig_chars = split_word_to_chars(
original_text.strip(),
whisper_first_start,
whisper_last_end
)
if orig_chars:
all_segments = split_segment_to_lines(orig_chars, max_chars)
logger.info(f"Rebuilt {len(all_segments)} subtitle segments from original text")
logger.info(f"Generated {len(all_segments)} subtitle segments")
return {"segments": all_segments}
@@ -247,12 +278,13 @@ class WhisperService:
return result
async def transcribe(self, audio_path: str) -> str:
async def transcribe(self, audio_path: str, language: str | None = None) -> str:
"""
仅转录文本(用于提取文案)
Args:
audio_path: 音频/视频文件路径
language: 语言代码None 表示自动检测
Returns:
纯文本内容
@@ -266,7 +298,7 @@ class WhisperService:
# 转录 (无需字级时间戳)
segments_iter, _ = model.transcribe(
audio_path,
language="zh",
language=language,
word_timestamps=False,
vad_filter=True,
)

View File

@@ -20,14 +20,14 @@ logger = logging.getLogger("Watchdog")
# 服务配置
SERVICES = [
{
"name": "vigent2-qwen-tts",
"url": "http://localhost:8009/health",
"name": "vigent2-cosyvoice",
"url": "http://localhost:8010/health",
"failures": 0,
"threshold": 5, # 连续5次失败才重启(5×30s = 2.5分钟容忍期)
"threshold": 3, # 连续3次失败才重启(3×15s ≈ 45秒容忍期)
"timeout": 10.0,
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"],
"restart_cmd": ["pm2", "restart", "vigent2-cosyvoice"],
"cooldown_until": 0, # 重启后的冷却截止时间戳
"cooldown_sec": 120, # 重启后等待120秒再开始检查
"cooldown_sec": 45, # 重启后等待45秒再开始检查
}
]
@@ -45,10 +45,20 @@ async def check_service(service):
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.get(service["url"])
if response.status_code == 200:
if service["failures"] > 0:
logger.info(f"✅ 服务 {service['name']} 已恢复正常")
service["failures"] = 0
return True
ready = True
try:
payload = response.json()
ready = bool(payload.get("ready", True))
except Exception:
payload = {}
if ready:
if service["failures"] > 0:
logger.info(f"✅ 服务 {service['name']} 已恢复正常")
service["failures"] = 0
return True
logger.warning(f"⚠️ 服务 {service['name']} ready=false健康检查未通过: {payload}")
else:
logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
except Exception as e:
@@ -83,8 +93,8 @@ async def main():
for service in SERVICES:
await check_service(service)
# 每 30 秒检查一次
await asyncio.sleep(30)
# 每 15 秒检查一次
await asyncio.sleep(15)
if __name__ == "__main__":
try: