85 lines
2.7 KiB
Python
85 lines
2.7 KiB
Python
|
|
import asyncio
|
|
import httpx
|
|
import logging
|
|
import subprocess
|
|
import time
|
|
from datetime import datetime
|
|
|
|
# Logging setup: every record goes both to watchdog.log and to the console.
# The file handler is opened as UTF-8 explicitly because the log messages in
# this file contain Chinese text and emoji, which the platform default
# encoding is not guaranteed to be able to encode.
file_handler = logging.FileHandler("watchdog.log", encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[file_handler, logging.StreamHandler()],
)
logger = logging.getLogger("Watchdog")
|
|
|
|
# Services under watch. Each entry is a mutable dict; check_service() reads
# the settings below and updates "failures" in place.
SERVICES = [
    {
        # Label used in log messages.
        "name": "vigent2-qwen-tts",
        # Health endpoint requested with an HTTP GET.
        "url": "http://localhost:8009/health",
        # Consecutive-failure counter (runtime state, starts at zero).
        "failures": 0,
        # Number of consecutive failures that triggers a restart.
        "threshold": 3,
        # HTTP client timeout passed to httpx, in seconds.
        "timeout": 10.0,
        # Argument list executed to restart the service.
        "restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"],
    },
]
|
|
|
|
async def check_service(service):
    """Probe one service's health endpoint and restart it after repeated failures.

    Sends a GET request to ``service["url"]``. An HTTP 200 response resets
    ``service["failures"]`` to 0. Any other status code, or any exception
    raised while making the request, increments the counter; once it reaches
    ``service["threshold"]`` the command in ``service["restart_cmd"]`` is run.

    This function does not wait after a restart. It only resets the failure
    counter, so at least ``threshold`` further failed checks are needed before
    the same service is restarted again.

    Args:
        service: Mutable dict with the keys ``name``, ``url``, ``failures``,
            ``threshold`` and ``restart_cmd``. Optional keys: ``timeout``
            (HTTP timeout in seconds, default 10.0) and ``restart_timeout``
            (seconds the restart command may run, default 60.0).
            ``failures`` is updated in place.

    Returns:
        ``True`` if the endpoint answered with HTTP 200, the string
        ``"restarting"`` if the restart command completed successfully, and
        ``False`` for a failed check that did not lead to a successful restart.
    """
    name = service["name"]

    try:
        timeout = service.get("timeout", 10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(service["url"])
    except Exception as e:
        # Deliberately broad: whatever goes wrong while probing counts as one
        # failed check and must not take the watchdog down. The exception type
        # is logged because some exceptions carry an empty message.
        logger.warning(f"⚠️ 无法连接服务 {name}: {type(e).__name__}: {e}")
    else:
        if response.status_code == 200:
            if service["failures"] > 0:
                logger.info(f"✅ 服务 {name} 已恢复正常")
            service["failures"] = 0
            return True
        logger.warning(f"⚠️ 服务 {name} 返回状态码 {response.status_code}")

    # Failure path: count it and decide whether a restart is due.
    service["failures"] += 1
    logger.warning(f"❌ 服务 {name} 连续失败 {service['failures']}/{service['threshold']} 次")

    if service["failures"] < service["threshold"]:
        return False

    logger.error(f"🚨 服务 {name} 已达到失败阈值,正在重启...")
    try:
        restart_cmd = service["restart_cmd"]
        restart_timeout = service.get("restart_timeout", 60.0)
        # subprocess.run blocks, so it is pushed to the default executor to
        # keep the event loop responsive. The timeout stops a hung restart
        # command from stalling this coroutine forever.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            lambda: subprocess.run(restart_cmd, check=True, timeout=restart_timeout),
        )
    except Exception as restart_error:
        # The counter is left at or above the threshold, so the restart is
        # attempted again on the next failed check.
        logger.error(f"💥 重启服务 {name} 失败: {restart_error}")
        return False

    logger.info(f"♻️ 服务 {name} 重启命令已发送")
    service["failures"] = 0  # start counting afresh after the restart
    return "restarting"
|
|
|
|
async def main(*, check_interval=30.0, restart_grace=60.0):
    """Run the watchdog loop: check every configured service, sleep, repeat.

    Services in ``SERVICES`` are checked one after another with
    ``check_service``. When a check reports ``"restarting"``, that service is
    skipped until ``restart_grace`` seconds have passed, so that a service
    which is still starting up is not counted as failing.

    Args:
        check_interval: Seconds to sleep between rounds of checks.
        restart_grace: Seconds during which a service is not checked after
            its restart command has been issued.

    This coroutine never returns; it runs until it is cancelled or
    interrupted.
    """
    logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")

    # Service name -> time.monotonic() value before which checks are skipped.
    # A monotonic clock is used so wall-clock adjustments cannot shorten or
    # stretch the grace period.
    resume_at = {}

    while True:
        # Check each service in turn.
        for service in SERVICES:
            name = service["name"]
            deadline = resume_at.get(name)
            if deadline is not None:
                if time.monotonic() < deadline:
                    continue  # still inside the post-restart grace period
                del resume_at[name]

            result = await check_service(service)
            if result == "restarting":
                resume_at[name] = time.monotonic() + restart_grace

        await asyncio.sleep(check_interval)
|
|
|
|
if __name__ == "__main__":
    # Ctrl+C is the normal way to stop the watchdog, so it ends with a log
    # line instead of a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 看门狗已停止")
|