# ViGent2/backend/app/services/voice_clone_service.py

"""
Voice cloning service.
Calls the standalone CosyVoice 3.0 service (port 8010) over HTTP.
"""
import asyncio
from pathlib import Path
from typing import Optional

import httpx
from loguru import logger

# Base URL of the CosyVoice 3.0 service.
VOICE_CLONE_URL = "http://localhost:8010"


class VoiceCloneService:
    """Voice cloning client that calls the CosyVoice 3.0 HTTP API.

    Generation requests are serialized through an ``asyncio.Lock`` so that
    only one request at a time is sent to the service, and successful
    health-check payloads are cached for 30 seconds.
    """

    def __init__(self):
        self.base_url = VOICE_CLONE_URL
        # Health-status cache: last successful payload and the monotonic
        # timestamp at which it was stored.
        self._health_cache: Optional[dict] = None
        self._health_cache_time: float = 0
        # GPU concurrency lock (serial queue): one generation at a time.
        self._lock = asyncio.Lock()

    async def _generate_once(
        self,
        *,
        text: str,
        ref_audio_data: bytes,
        ref_text: str,
        language: str,
        speed: float = 1.0,
        max_retries: int = 4,
    ) -> bytes:
        """POST one synthesis request to ``/generate``, retrying transient failures.

        Args:
            text: Text to synthesize.
            ref_audio_data: Raw bytes of the reference audio, uploaded as ``ref.wav``.
            ref_text: Transcript of the reference audio.
            language: Language name forwarded to the service.
            speed: Speed factor, sent to the service as a string form field.
            max_retries: Maximum number of attempts (including the first one).

        Returns:
            The response body (the generated audio bytes).

        Raises:
            RuntimeError: On a non-retryable HTTP error status, when retries
                are exhausted, or when the service cannot be reached.
        """
        timeout = httpx.Timeout(240.0)

        for attempt in range(max_retries):
            has_attempts_left = attempt < max_retries - 1
            try:
                # A fresh client per attempt; it is closed as soon as the
                # response has been received.
                async with httpx.AsyncClient(timeout=timeout) as client:
                    response = await client.post(
                        f"{self.base_url}/generate",
                        files={"ref_audio": ("ref.wav", ref_audio_data, "audio/wav")},
                        data={
                            "text": text,
                            "ref_text": ref_text,
                            "language": language,
                            "speed": str(speed),
                        },
                    )

                retryable = False
                reason = ""

                if response.status_code in (429, 502, 503, 504):
                    retryable = True
                    reason = f"HTTP {response.status_code}"
                elif response.status_code == 500 and (
                    "生成超时" in response.text or "timeout" in response.text.lower()
                ):
                    # A 500 whose body mentions a timeout is treated as transient.
                    retryable = True
                    reason = "upstream timeout"

                if retryable and has_attempts_left:
                    # Linear backoff: 8s, 16s, 24s, ...
                    wait = 8 * (attempt + 1)
                    logger.warning(
                        f"Voice clone retryable error ({reason}), retrying in {wait}s "
                        f"(attempt {attempt + 1}/{max_retries})"
                    )
                    await asyncio.sleep(wait)
                    continue

                # On the last attempt a retryable status falls through to here
                # and is reported like any other HTTP error.
                response.raise_for_status()
                return response.content

            except httpx.HTTPStatusError as e:
                logger.error(f"Voice clone API error: {e.response.status_code} - {e.response.text}")
                raise RuntimeError(f"声音克隆服务错误: {e.response.text}") from e
            except httpx.RequestError as e:
                if has_attempts_left:
                    # Linear backoff for connection-level failures: 6s, 12s, ...
                    wait = 6 * (attempt + 1)
                    logger.warning(
                        f"Voice clone connection error: {e}; retrying in {wait}s "
                        f"(attempt {attempt + 1}/{max_retries})"
                    )
                    await asyncio.sleep(wait)
                    continue
                logger.error(f"Voice clone connection error: {e}")
                raise RuntimeError("无法连接声音克隆服务，请检查服务是否启动") from e

        # Only reachable when the loop body never ran (max_retries <= 0).
        raise RuntimeError("声音克隆服务繁忙，请稍后重试")

    async def generate_audio(
        self,
        text: str,
        ref_audio_path: str,
        ref_text: str,
        output_path: str,
        language: str = "Chinese",
        speed: float = 1.0,
    ) -> str:
        """Generate speech with the cloned voice and save it to ``output_path``.

        Args:
            text: Text to synthesize; surrounding whitespace is stripped.
            ref_audio_path: Local path of the reference audio file.
            ref_text: Transcript of the reference audio.
            output_path: Path of the output wav file; parent directories are
                created if missing.
            language: Language (Chinese/English/Auto).
            speed: Speed factor forwarded to the service.

        Returns:
            The output file path.

        Raises:
            RuntimeError: If the text is empty after stripping, or if the
                voice clone service request fails.
            OSError: If the reference audio cannot be read or the output
                cannot be written.
        """
        # Validate and load inputs BEFORE queueing on the GPU lock, so invalid
        # requests fail fast instead of waiting behind running generations and
        # leave no output directory behind.
        text = text.strip()
        if not text:
            raise RuntimeError("文本为空，无法生成语音")

        with open(ref_audio_path, "rb") as f:
            ref_audio_data = f.read()

        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Serialize generation with the lock to avoid GPU memory overflow.
        async with self._lock:
            logger.info(f"🎤 Voice Clone: {text[:30]}... (language={language})")

            # CosyVoice segments the text itself (text_normalize), so no
            # client-side splitting is needed.
            audio_bytes = await self._generate_once(
                text=text,
                ref_audio_data=ref_audio_data,
                ref_text=ref_text,
                language=language,
                speed=speed,
            )
            with open(output_path, "wb") as f:
                f.write(audio_bytes)
            logger.info(f"✅ Voice clone saved: {output_path}")
            return output_path

    async def check_health(self) -> dict:
        """Return the service health payload, cached for 30 seconds.

        Only successful responses are cached. On any failure a fallback dict
        with ``"ready": False`` and the error message is returned instead of
        raising.
        """
        import time

        # Use a monotonic clock so wall-clock adjustments cannot keep a stale
        # entry alive or expire a fresh one early.
        now = time.monotonic()
        cached = self._health_cache
        if cached is not None and (now - self._health_cache_time) < 30:
            return cached

        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.base_url}/health")
                response.raise_for_status()
                payload = response.json()
                self._health_cache = payload
                self._health_cache_time = now
                return payload
        except Exception as e:
            # Best-effort check: report the service as not ready, never raise.
            logger.warning(f"Voice clone health check failed: {e}")
            return {
                "service": "CosyVoice 3.0 Voice Clone",
                "model": "unknown",
                "ready": False,
                "gpu_id": 0,
                "error": str(e)
            }


# Singleton instance
voice_clone_service = VoiceCloneService()