# ViGent2/backend/scripts/watchdog.py


import asyncio
import httpx
import logging
import subprocess
import time
from datetime import datetime

# Logging configuration: every record goes to both a log file and the console.
# The file handler is opened as UTF-8 explicitly because the log messages
# contain Chinese text and emoji; relying on the locale's default encoding
# would make file writes fail on any non-UTF-8 locale.
file_handler = logging.FileHandler("watchdog.log", encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        file_handler,
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("Watchdog")

# Monitored services. Each entry combines static settings with runtime state
# ("failures", "cooldown_until") that the watchdog updates in place.
SERVICES = [
    {
        # --- static settings ---
        "name": "vigent2-cosyvoice",
        "url": "http://localhost:8010/health",
        "restart_cmd": ["pm2", "restart", "vigent2-cosyvoice"],
        "timeout": 10.0,
        # Restart only after 3 consecutive failures (3 x 15s ≈ 45s tolerance).
        "threshold": 3,
        # After a restart, wait 45 seconds before health checks resume.
        "cooldown_sec": 45,
        # --- runtime state ---
        "failures": 0,
        # Timestamp until which checks are skipped following a restart.
        "cooldown_until": 0,
    },
]

async def check_service(service):
    """Probe one service's health endpoint and restart it after repeated failures.

    Args:
        service: Mutable service entry. Reads "name", "url", "threshold" and
            "restart_cmd", plus the optional keys "timeout" (HTTP timeout,
            default 10.0), "cooldown_until" (default 0), "cooldown_sec"
            (default 120) and "restart_timeout" (seconds allowed for the
            restart command, default 60.0). Updates "failures" and
            "cooldown_until" in place.

    Returns:
        True if the service is healthy or the check was skipped because the
        cooldown is still active; the string "restarting" if the failure
        threshold was reached and the restart command completed successfully;
        False for any other failed check (including a failed restart).
    """
    # Skip the probe entirely while the cooldown window is still open.
    now = time.time()
    if now < service.get("cooldown_until", 0):
        remaining = int(service["cooldown_until"] - now)
        logger.debug(f"⏳ 服务 {service['name']} 冷却中，剩余 {remaining}s")
        return True

    try:
        timeout = service.get("timeout", 10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(service["url"])
            if response.status_code == 200:
                # A 200 with a missing, non-JSON or non-object body counts as
                # healthy; only an explicit falsy "ready" field fails the check.
                try:
                    payload = response.json()
                except ValueError:
                    payload = {}
                if isinstance(payload, dict):
                    ready = bool(payload.get("ready", True))
                else:
                    ready = True

                if ready:
                    if service["failures"] > 0:
                        logger.info(f"✅ 服务 {service['name']} 已恢复正常")
                    service["failures"] = 0
                    return True

                logger.warning(f"⚠️ 服务 {service['name']} ready=false，健康检查未通过: {payload}")
            else:
                logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
    except Exception as e:
        # Any error while probing (connection refused, timeout, ...) is
        # treated as a failed health check.
        logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")

    # Failure handling: count the miss and restart once the threshold is hit.
    service["failures"] += 1
    logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")

    if service["failures"] >= service['threshold']:
        logger.error(f"🚨 服务 {service['name']} 已达到失败阈值，正在重启...")
        restart_timeout = service.get("restart_timeout", 60.0)
        try:
            # subprocess.run blocks, so run it in the default executor to keep
            # the event loop responsive, and bound it with a timeout so a hung
            # restart command cannot stall the watchdog forever.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    service["restart_cmd"],
                    check=True,
                    timeout=restart_timeout,
                ),
            )
            logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
            service["failures"] = 0
            # Start a cooldown so the service has time to boot and load its
            # models before health checks resume.
            service["cooldown_until"] = time.time() + service.get("cooldown_sec", 120)
            return "restarting"
        except Exception as restart_error:
            # The failure counter is left at/above the threshold, so the
            # restart is attempted again on the next failed check.
            logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")

    return False

async def main():
    """Run the watchdog loop: check every configured service, forever.

    Each service is given an initial cooldown at startup so that a service
    which is still booting is not immediately counted as failing.
    """
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    startup_grace_sec = 60
    poll_interval_sec = 15

    # Initial grace period before the first real health check.
    for entry in SERVICES:
        entry["cooldown_until"] = time.time() + startup_grace_sec

    while True:
        for entry in SERVICES:
            await check_service(entry)

        # Pause between polling rounds.
        await asyncio.sleep(poll_interval_sec)

if __name__ == "__main__":
    # Script entry point: run the watchdog loop until interrupted.
    # Ctrl+C (KeyboardInterrupt) is handled as a normal stop and only logged.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 看门狗已停止")