import asyncio
import logging
import subprocess
import time
from datetime import datetime

import httpx

# Logging: every record goes both to watchdog.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("watchdog.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("Watchdog")

# Seconds between two rounds of health checks.
CHECK_INTERVAL_SEC = 15
# Grace period granted to every service when the watchdog itself starts.
STARTUP_GRACE_SEC = 60
# Upper bound for a restart command when the service entry has no
# "restart_timeout" key; keeps a hung command from stalling the watchdog.
DEFAULT_RESTART_TIMEOUT_SEC = 60.0

# Service configuration. Each entry also carries the mutable runtime state
# ("failures", "cooldown_until") that check_service() updates in place.
SERVICES = [
    {
        "name": "vigent2-cosyvoice",
        "url": "http://localhost:8010/health",
        "failures": 0,
        # Restart only after 3 consecutive failures
        # (3 x 15 s ~= 45 s of tolerance).
        "threshold": 3,
        "timeout": 10.0,
        "restart_cmd": ["pm2", "restart", "vigent2-cosyvoice"],
        # Timestamp (time.time()) until which checks are skipped after a restart.
        "cooldown_until": 0,
        # Seconds to wait after a restart before checking again.
        "cooldown_sec": 45,
    }
]


async def _probe_health(service):
    """Return True if the service's health endpoint reports it as healthy.

    Healthy means: the GET on service["url"] succeeds with status 200 and the
    JSON body, if it can be parsed, does not carry a falsy "ready" field.
    Every failure mode is logged as a warning and reported as False.
    """
    try:
        timeout = service.get("timeout", 10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(service["url"])

        if response.status_code != 200:
            logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
            return False

        # The body is optional: anything that is not a JSON object with a
        # "ready" key is treated as ready, so plain 200 responses still pass.
        ready = True
        try:
            payload = response.json()
            ready = bool(payload.get("ready", True))
        except Exception:
            payload = {}

        if not ready:
            logger.warning(f"⚠️ 服务 {service['name']} ready=false,健康检查未通过: {payload}")
            return False
        return True
    except Exception as e:
        logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")
        return False


async def _run_restart_cmd(service):
    """Run service["restart_cmd"] without blocking the event loop.

    The command runs in the loop's default executor with check=True and a
    timeout (service["restart_timeout"] if present, otherwise
    DEFAULT_RESTART_TIMEOUT_SEC), so a non-zero exit status raises
    subprocess.CalledProcessError and a hang raises subprocess.TimeoutExpired.
    """
    restart_timeout = service.get("restart_timeout", DEFAULT_RESTART_TIMEOUT_SEC)
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(
        None,
        lambda: subprocess.run(
            service["restart_cmd"], check=True, timeout=restart_timeout
        ),
    )


async def check_service(service):
    """Check one service's health and restart it after repeated failures.

    Args:
        service: One entry of SERVICES. Its "failures" and "cooldown_until"
            fields are updated in place.

    Returns:
        True if the service is healthy or still inside its cooldown window,
        "restarting" if the failure threshold was reached and the restart
        command completed successfully, and False for a failed check that did
        not lead to a successful restart.
    """
    # Skip the check entirely while the service is cooling down.
    now = time.time()
    if now < service.get("cooldown_until", 0):
        remaining = int(service["cooldown_until"] - now)
        logger.debug(f"⏳ 服务 {service['name']} 冷却中,剩余 {remaining}s")
        return True

    if await _probe_health(service):
        if service["failures"] > 0:
            logger.info(f"✅ 服务 {service['name']} 已恢复正常")
            service["failures"] = 0
        return True

    # Failure handling.
    service["failures"] += 1
    logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")

    if service["failures"] >= service['threshold']:
        logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
        try:
            await _run_restart_cmd(service)
            logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
            service["failures"] = 0
            # Start a cooldown so the service can finish starting up and
            # loading its model before it is checked again.
            service["cooldown_until"] = time.time() + service.get("cooldown_sec", 120)
            return "restarting"
        except Exception as restart_error:
            # The failure counter is left untouched, so the next failed check
            # is at the threshold again and retries the restart.
            logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")

    return False


async def main():
    """Run the watchdog loop: check every service, sleep, repeat forever."""
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    # Give every service an initial cooldown at startup so that one which is
    # still coming up is not immediately counted as failing.
    for service in SERVICES:
        service["cooldown_until"] = time.time() + STARTUP_GRACE_SEC

    while True:
        for service in SERVICES:
            await check_service(service)

        # Wait before the next round of checks.
        await asyncio.sleep(CHECK_INTERVAL_SEC)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 看门狗已停止")