更新
This commit is contained in:
@@ -23,20 +23,28 @@ SERVICES = [
|
||||
"name": "vigent2-qwen-tts",
|
||||
"url": "http://localhost:8009/health",
|
||||
"failures": 0,
|
||||
"threshold": 3,
|
||||
"threshold": 5, # 连续5次失败才重启(5×30s = 2.5分钟容忍期)
|
||||
"timeout": 10.0,
|
||||
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"]
|
||||
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"],
|
||||
"cooldown_until": 0, # 重启后的冷却截止时间戳
|
||||
"cooldown_sec": 120, # 重启后等待120秒再开始检查
|
||||
}
|
||||
]
|
||||
|
||||
async def check_service(service):
|
||||
"""检查单个服务健康状态"""
|
||||
# 冷却期内跳过检查
|
||||
now = time.time()
|
||||
if now < service.get("cooldown_until", 0):
|
||||
remaining = int(service["cooldown_until"] - now)
|
||||
logger.debug(f"⏳ 服务 {service['name']} 冷却中,剩余 {remaining}s")
|
||||
return True
|
||||
|
||||
try:
|
||||
timeout = service.get("timeout", 10.0)
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
response = await client.get(service["url"])
|
||||
if response.status_code == 200:
|
||||
# 成功
|
||||
if service["failures"] > 0:
|
||||
logger.info(f"✅ 服务 {service['name']} 已恢复正常")
|
||||
service["failures"] = 0
|
||||
@@ -45,35 +53,36 @@ async def check_service(service):
|
||||
logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")
|
||||
|
||||
|
||||
# 失败处理
|
||||
service["failures"] += 1
|
||||
logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")
|
||||
|
||||
|
||||
if service["failures"] >= service['threshold']:
|
||||
logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
|
||||
try:
|
||||
subprocess.run(service["restart_cmd"], check=True)
|
||||
logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
|
||||
# 重启后给予一段宽限期 (例如 60秒) 不检查,等待服务启动
|
||||
service["failures"] = 0 # 重置计数
|
||||
return "restarting"
|
||||
service["failures"] = 0
|
||||
# 设置冷却期,等待服务完成启动和模型加载
|
||||
service["cooldown_until"] = time.time() + service.get("cooldown_sec", 120)
|
||||
return "restarting"
|
||||
except Exception as restart_error:
|
||||
logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")
|
||||
|
||||
|
||||
return False
|
||||
|
||||
async def main():
    """Run the watchdog loop: poll every service in SERVICES forever.

    On startup every service receives an initial cooldown so that a service
    which is still booting is not counted as failed. Afterwards each service
    is checked exactly once per cycle, one after another, followed by a fixed
    pause before the next cycle.

    This coroutine loops indefinitely and does not return on its own.
    """
    startup_grace_sec = 60  # initial cooldown applied to every service
    poll_interval_sec = 30  # pause between two polling cycles

    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    # Give all services an initial cooldown at startup, so a service that has
    # not come up yet is not immediately judged as failing.
    startup_deadline = time.time() + startup_grace_sec
    for service in SERVICES:
        service["cooldown_until"] = startup_deadline

    while True:
        # Check each service once per cycle. The checks run sequentially, not
        # concurrently. check_service() records the post-restart cooldown on
        # the service dict itself, so its return value needs no handling here;
        # calling it a second time in the same cycle would double-count
        # failures.
        for service in SERVICES:
            await check_service(service)

        # Wait before starting the next polling cycle.
        await asyncio.sleep(poll_interval_sec)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user