Files
ViGent2/models/CosyVoice/cosyvoice_server.py
Kevin Wong 0e3502c6f0 更新
2026-02-27 16:11:34 +08:00

329 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
CosyVoice 3.0 声音克隆服务
端口: 8010
GPU: 0
启动方式:
conda activate cosyvoice
python cosyvoice_server.py
PM2 启动:
pm2 start run_cosyvoice.sh --name vigent2-cosyvoice
"""
import os
import sys
import tempfile
import time
import asyncio
from pathlib import Path
# Pin this service to GPU 0. Must run before torch/CUDA is first imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# CosyVoice requires the Matcha-TTS submodule on the import path.
SCRIPT_DIR = Path(__file__).parent
sys.path.append(str(SCRIPT_DIR / "third_party" / "Matcha-TTS"))
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import FileResponse
from pydantic import BaseModel
import uvicorn
app = FastAPI(title="CosyVoice 3.0 Voice Clone Service", version="1.0")
# Local directory holding the pretrained Fun-CosyVoice3-0.5B checkpoint.
MODEL_DIR = SCRIPT_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"
# Global model instance, populated once by load_model() at startup.
_model = None
_model_loaded = False
# Set to True after an inference timeout; /generate then refuses requests
# until the process is force-exited and restarted by the supervisor (PM2).
_poisoned = False
# GPU inference lock: serializes requests so only one inference runs at a time.
_inference_lock = asyncio.Lock()
def _schedule_force_exit(reason: str, delay_sec: float = 1.5):
    """Hard-exit the process after a short delay so PM2 restarts it immediately.

    A daemon thread sleeps for ``delay_sec`` seconds, logs ``reason`` and then
    calls ``os._exit(1)``, bypassing normal interpreter shutdown.
    """
    import threading

    def _terminate():
        time.sleep(delay_sec)
        print(f"💥 Force exiting process: {reason}")
        os._exit(1)

    worker = threading.Thread(target=_terminate, daemon=True)
    worker.start()
def load_model():
    """Load the CosyVoice 3.0 checkpoint into module globals; no-op if loaded."""
    global _model, _model_loaded
    if _model_loaded:
        return
    print(f"🔄 Loading CosyVoice 3.0 model from {MODEL_DIR}...")
    t0 = time.time()
    # Imported lazily: cosyvoice pulls in torch/CUDA, which is slow at import time.
    from cosyvoice.cli.cosyvoice import AutoModel
    _model = AutoModel(model_dir=str(MODEL_DIR), fp16=True)
    _model_loaded = True
    elapsed = time.time() - t0
    print(f"✅ CosyVoice 3.0 model loaded in {elapsed:.1f}s")
class HealthResponse(BaseModel):
    """Response schema for GET /health."""
    service: str  # human-readable service name
    model: str  # model checkpoint identifier
    ready: bool  # True when model loaded, CUDA available and not poisoned
    gpu_id: int  # CUDA device index this service is pinned to
def _startup_selftest():
    """Run one short inference at startup to verify the GPU inference path.

    Synthesizes "你好" against a 0.5 s silent reference clip and checks that
    the model yields at least one ``tts_speech`` segment.

    Returns:
        bool: True if the test inference produced audio, False otherwise.
    """
    import torch
    print("🔍 Running startup self-test inference...")
    start = time.time()
    test_text = "你好"
    # Use 0.5 s of silence @ 24 kHz as the reference audio.
    ref_audio_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            ref_audio_path = tmp.name
        import torchaudio
        silence = torch.zeros(1, 12000)  # 0.5 s @ 24 kHz
        torchaudio.save(ref_audio_path, silence, 24000)
        # CosyVoice3 prompt format: system prompt + transcript of the reference.
        # (plain string — the original used an f-string with no placeholders)
        prompt_text = "You are a helpful assistant.<|endofprompt|>你好"
        results = list(_model.inference_zero_shot(
            test_text,
            prompt_text,
            ref_audio_path,
            stream=False,
            text_frontend=True,
        ))
        if not results:
            raise RuntimeError("Self-test returned empty results")
        segments = [r["tts_speech"] for r in results if isinstance(r, dict) and "tts_speech" in r]
        if not segments:
            raise RuntimeError("Self-test returned no tts_speech segments")
        torch.cuda.empty_cache()
        print(f"✅ Self-test passed in {time.time() - start:.1f}s "
              f"(output shape: {segments[0].shape})")
        return True
    except Exception as e:
        print(f"❌ Self-test FAILED: {e}")
        import traceback
        traceback.print_exc()
        # Best-effort GPU cleanup; never mask the original failure.
        # (narrowed from a bare `except:` that also swallowed SystemExit)
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass
        return False
    finally:
        if ref_audio_path:
            # Remove the temporary reference clip; ignore "already gone".
            try:
                os.unlink(ref_audio_path)
            except OSError:
                pass
@app.on_event("startup")
async def startup():
    """Preload the model at service startup, then run a self-test inference."""
    global _model_loaded
    try:
        load_model()
    except Exception as exc:
        print(f"❌ Model loading failed: {exc}")
        import traceback
        traceback.print_exc()
        return
    # Self-test inference — a failure marks the service as not ready.
    ok = _startup_selftest()
    if not ok:
        _model_loaded = False
        print("⚠️ Self-test failed, marking service as NOT ready")
@app.get("/health", response_model=HealthResponse)
async def health():
    """Health check.

    ``ready`` is True only when the model is loaded, CUDA is available and the
    service has not been poisoned by a previous inference timeout.
    """
    gpu_ok = False
    try:
        import torch
        gpu_ok = torch.cuda.is_available()
    except Exception:
        # torch missing or CUDA probe failed — report GPU as unavailable.
        # (narrowed from a bare `except:` that also swallowed SystemExit)
        pass
    return HealthResponse(
        service="CosyVoice 3.0 Voice Clone",
        model="Fun-CosyVoice3-0.5B",
        ready=_model_loaded and gpu_ok and not _poisoned,
        gpu_id=0
    )
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    text: str = Form(...),
    ref_text: str = Form(...),
    language: str = Form("Chinese"),
    speed: float = Form(1.0),
):
    """
    Voice-clone synthesis.

    Args:
        ref_audio: reference audio file (WAV)
        text: text to synthesize
        ref_text: transcript of the reference audio
        language: compatibility parameter (CosyVoice auto-detects language)
        speed: speed multiplier forwarded to the model

    Returns:
        the generated audio file (WAV)

    Raises:
        HTTPException: 503 when not ready or poisoned, 429 when the GPU is
            busy, 500 on generation failure or timeout.
    """
    global _poisoned
    if not _model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if _poisoned:
        raise HTTPException(status_code=503, detail="Service poisoned after timeout, waiting for restart")
    # Best-effort fast-fail; the lock below still serializes actual GPU access.
    if _inference_lock.locked():
        raise HTTPException(status_code=429, detail="GPU busy, please retry later")
    import torch
    import torchaudio
    # Persist the uploaded reference audio to a temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_ref:
        content = await ref_audio.read()
        tmp_ref.write(content)
        ref_audio_path = tmp_ref.name
    # Trim overlong reference audio to the first 10 s (CosyVoice suggests 3-10 s).
    MAX_REF_SEC = 10
    try:
        info = torchaudio.info(ref_audio_path)
        ref_dur = info.num_frames / info.sample_rate
        if ref_dur > MAX_REF_SEC:
            print(f"✂️ Ref audio too long ({ref_dur:.1f}s), trimming to {MAX_REF_SEC}s")
            wav, sr = torchaudio.load(ref_audio_path, num_frames=int(info.sample_rate * MAX_REF_SEC))
            torchaudio.save(ref_audio_path, wav, sr)
    except Exception as e:
        print(f"⚠️ Could not check ref audio duration: {e}")
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp; close the
    # fd immediately — torchaudio.save() writes to the path, not the fd.
    out_fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(out_fd)
    try:
        async with _inference_lock:
            print(f"🎤 Generating: {text[:50]}... ({len(text)} chars)")
            print(f"📝 Ref text: {ref_text[:50]}...")
            print(f"🌐 Language: {language}")
            print(f"⚡ Speed: {speed}")
            start = time.time()
            # Timeout guard: 60 s base + 2 s per character, capped at 300 s.
            timeout_sec = min(60 + len(text) * 2, 300)
            # CosyVoice3 prompt_text format.
            prompt_text = f"You are a helpful assistant.<|endofprompt|>{ref_text}"

            def _do_inference():
                """Run the blocking model inference in a worker thread."""
                results = list(_model.inference_zero_shot(
                    text,
                    prompt_text,
                    ref_audio_path,
                    stream=False,
                    speed=speed,
                    text_frontend=True,
                ))
                if not results:
                    raise RuntimeError("CosyVoice returned empty results")
                segments = [r["tts_speech"] for r in results if isinstance(r, dict) and "tts_speech" in r]
                if not segments:
                    raise RuntimeError("CosyVoice returned no tts_speech segments")
                if len(segments) == 1:
                    merged = segments[0]
                else:
                    # Join segments with a 50 ms silence gap between them.
                    gap = torch.zeros((segments[0].shape[0], int(_model.sample_rate * 0.05)), dtype=segments[0].dtype)
                    parts = [segments[0]]
                    for seg in segments[1:]:
                        parts.append(gap)
                        parts.append(seg)
                    merged = torch.cat(parts, dim=-1)
                return merged, _model.sample_rate

            try:
                speech, sr = await asyncio.wait_for(
                    asyncio.to_thread(_do_inference),
                    timeout=timeout_sec,
                )
            except asyncio.TimeoutError:
                # The worker thread may still hold the GPU — poison the service
                # and force-exit so PM2 restarts us with a clean GPU state.
                _poisoned = True
                print(f"⏰ Generation timed out after {timeout_sec}s for {len(text)} chars — service POISONED")
                torch.cuda.empty_cache()
                _schedule_force_exit("generation timeout")
                raise HTTPException(status_code=500, detail=f"生成超时({timeout_sec}s),请缩短文本后重试")
            torch.cuda.empty_cache()
            torchaudio.save(output_path, speech, sr)
            duration = speech.shape[-1] / sr
            print(f"✅ Generated in {time.time() - start:.1f}s, duration: {duration:.1f}s")
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename="output.wav",
            background=None
        )
    except HTTPException:
        # The output file was never returned — remove it before propagating.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    except Exception as e:
        print(f"❌ Generation failed: {e}")
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass
        # Do not leak the (unreturned) output file on failure.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Always remove the uploaded reference clip.
        try:
            os.unlink(ref_audio_path)
        except OSError:
            pass
@app.on_event("shutdown")
async def shutdown():
    """Best-effort cleanup of leftover temporary WAV files on shutdown.

    NOTE(review): the /tmp/tmp*.wav glob also matches temp files created by
    other processes — consider a dedicated temp directory for this service.
    """
    import glob
    for path in glob.glob("/tmp/tmp*.wav"):
        try:
            os.unlink(path)
        except OSError:
            # Already removed or not deletable — ignore.
            # (narrowed from a bare `except:` that also swallowed SystemExit)
            pass
if __name__ == "__main__":
    # Serve on all interfaces; port 8010 is this service's assigned port
    # (see the module docstring for conda/PM2 launch instructions).
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8010,
        log_level="info"
    )