更新
This commit is contained in:
84
backend/scripts/watchdog.py
Normal file
84
backend/scripts/watchdog.py
Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler("watchdog.log"),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger("Watchdog")
|
||||
|
||||
# 服务配置
|
||||
SERVICES = [
|
||||
{
|
||||
"name": "vigent2-qwen-tts",
|
||||
"url": "http://localhost:8009/health",
|
||||
"failures": 0,
|
||||
"threshold": 3,
|
||||
"timeout": 10.0,
|
||||
"restart_cmd": ["pm2", "restart", "vigent2-qwen-tts"]
|
||||
}
|
||||
]
|
||||
|
||||
async def check_service(service):
|
||||
"""检查单个服务健康状态"""
|
||||
try:
|
||||
timeout = service.get("timeout", 10.0)
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
response = await client.get(service["url"])
|
||||
if response.status_code == 200:
|
||||
# 成功
|
||||
if service["failures"] > 0:
|
||||
logger.info(f"✅ 服务 {service['name']} 已恢复正常")
|
||||
service["failures"] = 0
|
||||
return True
|
||||
else:
|
||||
logger.warning(f"⚠️ 服务 {service['name']} 返回状态码 {response.status_code}")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ 无法连接服务 {service['name']}: {str(e)}")
|
||||
|
||||
# 失败处理
|
||||
service["failures"] += 1
|
||||
logger.warning(f"❌ 服务 {service['name']} 连续失败 {service['failures']}/{service['threshold']} 次")
|
||||
|
||||
if service["failures"] >= service['threshold']:
|
||||
logger.error(f"🚨 服务 {service['name']} 已达到失败阈值,正在重启...")
|
||||
try:
|
||||
subprocess.run(service["restart_cmd"], check=True)
|
||||
logger.info(f"♻️ 服务 {service['name']} 重启命令已发送")
|
||||
# 重启后给予一段宽限期 (例如 60秒) 不检查,等待服务启动
|
||||
service["failures"] = 0 # 重置计数
|
||||
return "restarting"
|
||||
except Exception as restart_error:
|
||||
logger.error(f"💥 重启服务 {service['name']} 失败: {restart_error}")
|
||||
|
||||
return False
|
||||
|
||||
async def main():
|
||||
logger.info("🛡️ ViGent2 服务看门狗 (Watchdog) 已启动")
|
||||
|
||||
while True:
|
||||
# 并发检查所有服务
|
||||
for service in SERVICES:
|
||||
result = await check_service(service)
|
||||
if result == "restarting":
|
||||
# 如果有服务重启,额外等待包含启动时间
|
||||
pass
|
||||
|
||||
# 每 30 秒检查一次
|
||||
await asyncio.sleep(30)
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(main())
|
||||
except KeyboardInterrupt:
|
||||
logger.info("🛑 看门狗已停止")
|
||||
Reference in New Issue
Block a user