# ViGent2 service watchdog script.
# (Listing metadata from the source viewer: 104 lines, 3.5 KiB, Python.)

import asyncio
import functools
import logging
import subprocess
import time
from datetime import datetime

import httpx

# Configure logging: INFO and above go both to the file "watchdog.log"
# (relative to the current working directory) and to the console stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("watchdog.log"),
        logging.StreamHandler(),
    ],
)
# Module-wide logger used by every function in this script.
logger = logging.getLogger("Watchdog")

# Service configuration.
# Each entry is a mutable record: "failures" and "cooldown_until" are runtime
# state updated in place by the health checker; the other keys are settings.
SERVICES = [
    {
        "name": "vigent2-cosyvoice",
        "url": "http://localhost:8010/health",
        "failures": 0,
        # Restart only after 3 consecutive failures (3 x 15s ~= 45s tolerance).
        "threshold": 3,
        "timeout": 10.0,
        "restart_cmd": ["pm2", "restart", "vigent2-cosyvoice"],
        # Timestamp (time.time() based) until which checks are skipped.
        "cooldown_until": 0,
        # After a restart, wait 45 seconds before checking again.
        "cooldown_sec": 45,
    },
]

async def check_service(service):
    """Check one service's health endpoint and restart it after repeated failures.

    Args:
        service: Mutable service record. Reads ``name``, ``url``,
            ``failures``, ``threshold`` and ``restart_cmd``, plus the optional
            keys ``timeout`` (HTTP timeout, default 10.0), ``cooldown_until``
            (default 0), ``cooldown_sec`` (default 120) and
            ``restart_timeout`` (seconds allowed for the restart command,
            default 60). ``failures`` and ``cooldown_until`` are updated in
            place.

    Returns:
        ``True`` when the service is healthy or still inside its cooldown
        window, ``"restarting"`` when the restart command completed
        successfully, and ``False`` for a failed check that did not lead to a
        successful restart.
    """
    # Skip the check entirely while the service is in its cooldown window.
    now = time.time()
    cooldown_until = service.get("cooldown_until", 0)
    if now < cooldown_until:
        remaining = int(cooldown_until - now)
        logger.debug(f"⏳ 服务 {service['name']} 冷却中,剩余 {remaining}s")
        return True

    try:
        timeout = service.get("timeout", 10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(service["url"])
            if response.status_code == 200:
                ready = True
                try:
                    payload = response.json()
                    ready = bool(payload.get("ready", True))
                except Exception:
                    # Deliberate best-effort: a 200 response whose body is not
                    # a JSON object is still treated as healthy.
                    payload = {}

                if ready:
                    if service["failures"] > 0:
                        logger.info(f"✅ 服务 {service['name']} 已恢复正常")
                    service["failures"] = 0
                    return True

                logger.warning(f"⚠️ 服务 {service['name']} ready=false,健康检查未通过: {payload}")
            else:
                logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
    except Exception as e:
        # Any connection/timeout/configuration error counts as a failed check.
        logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")

    # Failure handling: count the failure and restart once the threshold is hit.
    service["failures"] += 1
    logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")

    if service["failures"] >= service['threshold']:
        logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
        try:
            # Run the blocking restart command in the default executor so the
            # event loop is not stalled, and bound it with a timeout so a hung
            # restart command cannot freeze the watchdog forever.
            run_restart = functools.partial(
                subprocess.run,
                service["restart_cmd"],
                check=True,
                timeout=service.get("restart_timeout", 60),
            )
            await asyncio.get_running_loop().run_in_executor(None, run_restart)
            logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
            service["failures"] = 0
            # Start a cooldown so the service can finish starting up (and
            # loading its models) before it is checked again.
            service["cooldown_until"] = time.time() + service.get("cooldown_sec", 120)
            return "restarting"
        except Exception as restart_error:
            # Covers a non-zero exit (CalledProcessError), a timeout
            # (TimeoutExpired) and a missing executable (OSError). The failure
            # counter is left untouched, so the next check retries the restart.
            logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")

    return False

async def main(check_interval=15.0, startup_grace=60.0):
    """Run the watchdog loop: check every configured service, sleep, repeat.

    Args:
        check_interval: Seconds to sleep between two rounds of checks
            (default 15, the previously hard-coded value).
        startup_grace: Initial cooldown, in seconds, given to every service
            when the watchdog starts (default 60, the previously hard-coded
            value).

    This coroutine never returns on its own; it runs until it is cancelled or
    an exception propagates out of it.
    """
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    # Give every service an initial cooldown so that a service which is still
    # starting up is not immediately counted as failing.
    for service in SERVICES:
        service["cooldown_until"] = time.time() + startup_grace

    while True:
        # Services are checked one after another within a round.
        for service in SERVICES:
            await check_service(service)

        # Wait before starting the next round of checks.
        await asyncio.sleep(check_interval)

# Script entry point: run the watchdog until interrupted with Ctrl+C.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 看门狗已停止")