"""Watchdog that polls service health endpoints and restarts failing services."""

import asyncio
import logging
import subprocess
import time
from datetime import datetime

import httpx

# Logging configuration: every record goes both to watchdog.log and to a
# stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("watchdog.log"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("Watchdog")

# Pause between two rounds of health checks, in seconds.
CHECK_INTERVAL_SECONDS = 30

# Upper bound for a restart command when a service entry does not define
# "restart_timeout"; keeps a hung restart from stalling the watchdog forever.
DEFAULT_RESTART_TIMEOUT = 60.0

# Service configuration. "failures" is runtime state that check_service()
# mutates in place; the remaining keys are static settings.
SERVICES = [
    {
        "name": "vigent2-qwen-tts",
        "url": "http://localhost:8009/health",
        "failures": 0,
        "threshold": 3,
        "timeout": 10.0,
        "restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"],
    }
]


def _run_restart_command(command, timeout):
    """Run *command* synchronously, raising on failure or after *timeout* seconds."""
    subprocess.run(command, check=True, timeout=timeout)


async def check_service(service):
    """Check the health of a single service and restart it when needed.

    The service is healthy when a GET on its URL answers with HTTP 200. Any
    other status code, or any exception raised while probing, counts as one
    failure. Once the consecutive failure count reaches the threshold, the
    restart command is executed.

    Args:
        service: Mutable service entry. Reads "name", "url", "threshold" and
            "restart_cmd", plus the optional "timeout" (HTTP timeout, default
            10.0) and "restart_timeout" (seconds allowed for the restart
            command, default DEFAULT_RESTART_TIMEOUT). Updates "failures".

    Returns:
        True when the service is healthy, the string "restarting" when the
        restart command completed successfully, and False otherwise.
    """
    try:
        timeout = service.get("timeout", 10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(service["url"])
            if response.status_code == 200:
                # Success: announce the recovery only if failures were pending.
                if service["failures"] > 0:
                    logger.info(f"✅ 服务 {service['name']} 已恢复正常")
                    service["failures"] = 0
                return True
            logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
    except Exception as e:
        logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")

    # Failure handling.
    service["failures"] += 1
    logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")

    if service["failures"] >= service['threshold']:
        logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
        try:
            # Run the blocking command in the default executor so the event
            # loop (and the checks of other services) keeps running, and bound
            # it with a timeout so a hung command cannot freeze the watchdog.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                _run_restart_command,
                service["restart_cmd"],
                service.get("restart_timeout", DEFAULT_RESTART_TIMEOUT),
            )
            logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
            # Resetting the counter doubles as the post-restart grace period:
            # the service must fail "threshold" more checks before the next
            # restart is attempted.
            service["failures"] = 0
            return "restarting"
        except Exception as restart_error:
            # The counter is left untouched, so the restart is retried on the
            # next check cycle.
            logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")

    return False


async def main():
    """Run the watchdog loop: check every service, sleep, repeat forever."""
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    while True:
        # Check all services concurrently; each coroutine only mutates its
        # own service entry.
        await asyncio.gather(*(check_service(service) for service in SERVICES))

        # Wait before starting the next round of checks.
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 看门狗已停止")