# ViGent2/backend/app/services/voice_clone_service.py

"""
声音克隆服务
通过 HTTP 调用 Qwen3-TTS 独立服务 (端口 8009)
"""
import httpx
import asyncio
from pathlib import Path
from typing import Optional
from loguru import logger

from app.core.config import settings

# Base URL of the Qwen3-TTS service; used as the default endpoint by VoiceCloneService.
QWEN_TTS_URL = "http://localhost:8009"


class VoiceCloneService:
    """声音克隆服务 - 调用 Qwen3-TTS HTTP API"""

    def __init__(self):
        self.base_url = QWEN_TTS_URL
        # 健康状态缓存
        self._health_cache: Optional[dict] = None
        self._health_cache_time: float = 0
        # GPU 并发锁 (Serial Queue)
        self._lock = asyncio.Lock()

    async def generate_audio(
        self,
        text: str,
        ref_audio_path: str,
        ref_text: str,
        output_path: str,
        language: str = "Chinese"
    ) -> str:
        """
        使用声音克隆生成语音

        Args:
            text: 要合成的文本
            ref_audio_path: 参考音频本地路径
            ref_text: 参考音频的转写文字
            output_path: 输出 wav 路径
            language: 语言 (Chinese/English/Auto)

        Returns:
            输出文件路径
        """
        # 使用锁确保串行执行，避免 GPU 显存溢出
        async with self._lock:
            logger.info(f"🎤 Voice Clone: {text[:30]}...")
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            # 读取参考音频
            with open(ref_audio_path, "rb") as f:
                ref_audio_data = f.read()

            # 调用 Qwen3-TTS 服务
            timeout = httpx.Timeout(300.0)  # 5分钟超时
            async with httpx.AsyncClient(timeout=timeout) as client:
                try:
                    response = await client.post(
                        f"{self.base_url}/generate",
                        files={"ref_audio": ("ref.wav", ref_audio_data, "audio/wav")},
                        data={
                            "text": text,
                            "ref_text": ref_text,
                            "language": language
                        }
                    )
                    response.raise_for_status()

                    # 保存返回的音频
                    with open(output_path, "wb") as f:
                        f.write(response.content)

                    logger.info(f"✅ Voice clone saved: {output_path}")
                    return output_path

                except httpx.HTTPStatusError as e:
                    logger.error(f"Qwen3-TTS API error: {e.response.status_code} - {e.response.text}")
                    raise RuntimeError(f"声音克隆服务错误: {e.response.text}")
                except httpx.RequestError as e:
                    logger.error(f"Qwen3-TTS connection error: {e}")
                    raise RuntimeError("无法连接声音克隆服务，请检查服务是否启动")

    async def check_health(self) -> dict:
        """健康检查"""
        import time

        # 5分钟缓存
        now = time.time()
        if self._health_cache and (now - self._health_cache_time) < 300:
            return self._health_cache

        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.base_url}/health")
                response.raise_for_status()
                self._health_cache = response.json()
                self._health_cache_time = now
                return self._health_cache
        except Exception as e:
            logger.warning(f"Qwen3-TTS health check failed: {e}")
            return {
                "service": "Qwen3-TTS Voice Clone",
                "model": "0.6B-Base",
                "ready": False,
                "gpu_id": 0,
                "error": str(e)
            }


# Singleton: module-level instance created at import time.
voice_clone_service = VoiceCloneService()