This commit is contained in:
Kevin Wong
2026-02-10 13:31:29 +08:00
parent 3129d45b25
commit e33dfc3031
38 changed files with 2956 additions and 282 deletions

View File

@@ -23,20 +23,28 @@ SERVICES = [
"name": "vigent2-qwen-tts",
"url": "http://localhost:8009/health",
"failures": 0,
"threshold": 3,
"threshold": 5, # 连续5次失败才重启5×30s = 2.5分钟容忍期)
"timeout": 10.0,
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"]
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"],
"cooldown_until": 0, # 重启后的冷却截止时间戳
"cooldown_sec": 120, # 重启后等待120秒再开始检查
}
]
async def check_service(service):
"""检查单个服务健康状态"""
# 冷却期内跳过检查
now = time.time()
if now < service.get("cooldown_until", 0):
remaining = int(service["cooldown_until"] - now)
logger.debug(f"⏳ 服务 {service['name']} 冷却中,剩余 {remaining}s")
return True
try:
timeout = service.get("timeout", 10.0)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.get(service["url"])
if response.status_code == 200:
# 成功
if service["failures"] > 0:
logger.info(f"✅ 服务 {service['name']} 已恢复正常")
service["failures"] = 0
@@ -45,35 +53,36 @@ async def check_service(service):
logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
except Exception as e:
logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")
# 失败处理
service["failures"] += 1
logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']}")
if service["failures"] >= service['threshold']:
logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
try:
subprocess.run(service["restart_cmd"], check=True)
logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
# 重启后给予一段宽限期 (例如 60秒) 不检查,等待服务启动
service["failures"] = 0 # 重置计数
return "restarting"
service["failures"] = 0
# 设置冷却期,等待服务完成启动和模型加载
service["cooldown_until"] = time.time() + service.get("cooldown_sec", 120)
return "restarting"
except Exception as restart_error:
logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")
return False
async def main():
    """Run the watchdog loop, polling every entry in SERVICES forever.

    On startup each service's ``cooldown_until`` is set 60 seconds into the
    future, then every 30 seconds each service is probed exactly once, in
    order, via ``check_service``. The coroutine does not return under normal
    operation.
    """
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    # Initial grace period: a service that is still booting when the watchdog
    # starts must not be counted as failing.
    for service in SERVICES:
        service["cooldown_until"] = time.time() + 60

    while True:
        # Probe services one after another (sequentially, not concurrently).
        # Each service is checked exactly once per cycle so its failure
        # counter advances at the 30-second cadence. check_service sets its
        # own post-restart cooldown, so the return value needs no handling.
        for service in SERVICES:
            await check_service(service)

        # Wait 30 seconds before the next round of checks.
        await asyncio.sleep(30)