Refactor health check functionality in the POE project. Remove the readiness endpoint from health_check.py and improve error handling for sensor status retrieval. Reduce logging noise and adjust the health check server startup process in main.py. Modify the Nomad job configuration (network mode and resource allocation) to improve overall system performance and stability.
This commit is contained in:
@@ -10,29 +10,28 @@ job "poe-sensor" {
|
||||
group "sensor-bridge" {
|
||||
count = 1
|
||||
|
||||
# Network configuration - using bridge mode with port mapping for better isolation
|
||||
# Network configuration - using host mode for better compatibility
|
||||
network {
|
||||
mode = "bridge"
|
||||
mode = "host"
|
||||
port "health" {
|
||||
static = 8080
|
||||
to = 8080
|
||||
}
|
||||
}
|
||||
|
||||
# Restart policy - more lenient for startup issues
|
||||
# Restart policy
|
||||
restart {
|
||||
attempts = 5
|
||||
attempts = 3
|
||||
interval = "30m"
|
||||
delay = "30s"
|
||||
delay = "15s"
|
||||
mode = "fail"
|
||||
}
|
||||
|
||||
# Update strategy
|
||||
update {
|
||||
max_parallel = 1
|
||||
min_healthy_time = "120s" # Increased from 60s
|
||||
healthy_deadline = "10m" # Increased from 5m
|
||||
progress_deadline = "15m" # Increased from 10m
|
||||
min_healthy_time = "90s"
|
||||
healthy_deadline = "8m"
|
||||
progress_deadline = "12m"
|
||||
auto_revert = true
|
||||
canary = 0
|
||||
}
|
||||
@@ -51,12 +50,12 @@ job "poe-sensor" {
|
||||
check {
|
||||
type = "http"
|
||||
path = "/health"
|
||||
interval = "30s" # Reduced frequency
|
||||
timeout = "30s" # Increased timeout
|
||||
initial_status = "critical" # Start as critical until proven healthy
|
||||
interval = "30s"
|
||||
timeout = "20s"
|
||||
initial_status = "critical"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "30s" # More time for graceful shutdown
|
||||
limit = 2
|
||||
grace = "20s"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,25 +68,11 @@ job "poe-sensor" {
|
||||
command = "/bin/bash"
|
||||
args = [
|
||||
"-c",
|
||||
<<EOF
|
||||
cd local/poe-sensor &&
|
||||
echo "Starting POE Sensor installation..." &&
|
||||
apt-get update -qq &&
|
||||
apt-get install -y procps curl &&
|
||||
python -m pip install --upgrade pip &&
|
||||
echo "Installing Python dependencies..." &&
|
||||
python -m pip install -r requirements.txt &&
|
||||
echo "Testing dependencies..." &&
|
||||
python -c 'import pymodbus, paho.mqtt.client; print("Dependencies installed successfully")' &&
|
||||
echo "Starting health check server..." &&
|
||||
python -c 'from health_check import HealthCheckServer; import time; server = HealthCheckServer(); server.start(); time.sleep(2); print("Health check server started")' &
|
||||
echo "Starting main application..." &&
|
||||
python main.py
|
||||
EOF
|
||||
"cd local/poe-sensor && echo 'Starting POE Sensor...' && apt-get update -qq && apt-get install -y procps curl && python -m pip install --upgrade pip && python -m pip install -r requirements.txt && echo 'Dependencies installed' && python -c 'import pymodbus, paho.mqtt.client; print(\"Dependencies OK\")' && echo 'Starting application...' && python main.py"
|
||||
]
|
||||
}
|
||||
|
||||
# Git artifact - using SSH similar to qc-scanner
|
||||
# Git artifact
|
||||
artifact {
|
||||
source = "git::ssh://git@gitea.service.mesh:2222/Mei_Sheng_Textiles/POE-sensor.git"
|
||||
destination = "local/poe-sensor"
|
||||
@@ -101,23 +86,21 @@ EOF
|
||||
env {
|
||||
LOG_LEVEL = "INFO"
|
||||
PYTHONUNBUFFERED = "1"
|
||||
PYTHONDONTWRITEBYTECODE = "1" # Prevent .pyc files to save memory
|
||||
PYTHONMALLOC = "malloc" # Use system malloc for better memory management
|
||||
PYTHONDONTWRITEBYTECODE = "1"
|
||||
PYTHONMALLOC = "malloc"
|
||||
TZ = "Asia/Ho_Chi_Minh"
|
||||
# MQTT configuration (can be overridden by config.py)
|
||||
MQTT_BROKER = "mqtt.service.mesh"
|
||||
MQTT_PORT = "1883"
|
||||
MQTT_USERNAME = "relay"
|
||||
MQTT_PASSWORD = "Sey@K9c&Q4^"
|
||||
# Health check configuration
|
||||
HEALTH_CHECK_ENABLED = "true"
|
||||
HEALTH_CHECK_PORT = "8080"
|
||||
}
|
||||
|
||||
# Resource allocation - increased for stability
|
||||
# Resource allocation
|
||||
resources {
|
||||
cpu = 512 # Increased from 256
|
||||
memory = 1024 # Increased from 512
|
||||
cpu = 256
|
||||
memory = 512
|
||||
}
|
||||
|
||||
# Logs configuration
|
||||
|
Reference in New Issue
Block a user