Refactor health check functionality in the POE project. Remove the readiness endpoint from health_check.py and improve error handling for sensor status retrieval. Update logging to reduce noise and adjust the health check server startup process in main.py. Modify the Nomad job configuration for network mode and resource allocation to improve overall system performance and stability.
This commit is contained in:
208
health_check.py
208
health_check.py
@ -15,8 +15,6 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
|
||||
self.send_health_response()
|
||||
elif self.path == '/sensors':
|
||||
self.send_sensors_status()
|
||||
elif self.path == '/ready':
|
||||
self.send_readiness_response()
|
||||
else:
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
@ -24,110 +22,64 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
|
||||
def send_health_response(self):
|
||||
"""Send basic health check response"""
|
||||
try:
|
||||
# Get basic application status
|
||||
from sensor_tracker import get_sensor_tracker
|
||||
sensor_tracker = get_sensor_tracker()
|
||||
summary = sensor_tracker.get_summary()
|
||||
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"version": "1.0.0",
|
||||
"sensors": {
|
||||
"total": summary.get('total_sensors', 0),
|
||||
"online": summary.get('online_sensors', 0),
|
||||
"offline": summary.get('offline_sensors', 0),
|
||||
"unknown": summary.get('unknown_sensors', 0),
|
||||
"health_percentage": summary.get('health_percentage', 0.0)
|
||||
},
|
||||
"uptime": self._get_uptime(),
|
||||
"host": socket.gethostname()
|
||||
}
|
||||
|
||||
# Service is healthy if the health check server is running
|
||||
# Don't mark as unhealthy just because sensors are offline
|
||||
# That's what the degraded status is for
|
||||
if summary.get('total_sensors', 0) > 0:
|
||||
if summary.get('online_sensors', 0) == 0:
|
||||
health_data["status"] = "degraded"
|
||||
health_data["message"] = "All sensors offline but service is running"
|
||||
elif summary.get('health_percentage', 0) < 50:
|
||||
health_data["status"] = "degraded"
|
||||
health_data["message"] = f"Low sensor health: {summary.get('health_percentage', 0):.1f}%"
|
||||
|
||||
except Exception as e:
|
||||
# If we can't get sensor status, still report as healthy since the service is running
|
||||
logging.warning(f"Could not get sensor status for health check: {e}")
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"version": "1.0.0",
|
||||
"message": "Service running, sensor status unavailable",
|
||||
"uptime": self._get_uptime(),
|
||||
"host": socket.gethostname()
|
||||
}
|
||||
|
||||
self.send_response(200)
|
||||
self.send_header('Content-type', 'application/json')
|
||||
self.send_header('Cache-Control', 'no-cache')
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(health_data, indent=2).encode())
|
||||
|
||||
def send_readiness_response(self):
|
||||
"""Send readiness probe response - checks if service is ready to serve"""
|
||||
try:
|
||||
# Check if main components are initialized
|
||||
from sensor_tracker import get_sensor_tracker
|
||||
sensor_tracker = get_sensor_tracker()
|
||||
|
||||
ready_data = {
|
||||
"ready": True,
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"checks": {
|
||||
"sensor_tracker": True,
|
||||
"health_server": True
|
||||
# Try to get sensor status
|
||||
try:
|
||||
from sensor_tracker import get_sensor_tracker
|
||||
sensor_tracker = get_sensor_tracker()
|
||||
summary = sensor_tracker.get_summary()
|
||||
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"version": "1.0.0",
|
||||
"sensors": {
|
||||
"total": summary.get('total_sensors', 0),
|
||||
"online": summary.get('online_sensors', 0),
|
||||
"health_percentage": summary.get('health_percentage', 0.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.send_response(200)
|
||||
self.send_header('Content-type', 'application/json')
|
||||
self.send_header('Cache-Control', 'no-cache')
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(ready_data, indent=2).encode())
|
||||
|
||||
|
||||
# Mark as degraded if no sensors online but service is running
|
||||
if summary.get('total_sensors', 0) > 0 and summary.get('online_sensors', 0) == 0:
|
||||
health_data["status"] = "degraded"
|
||||
health_data["message"] = "All sensors offline"
|
||||
|
||||
except Exception as e:
|
||||
# If sensor tracker not available, still report healthy
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"version": "1.0.0",
|
||||
"message": "Service running, sensor status unavailable"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Readiness check failed: {e}")
|
||||
ready_data = {
|
||||
"ready": False,
|
||||
health_data = {
|
||||
"status": "unhealthy",
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"service": "modbus-mqtt-bridge",
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
self.send_response(503)
|
||||
self.send_header('Content-type', 'application/json')
|
||||
self.send_header('Cache-Control', 'no-cache')
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(ready_data, indent=2).encode())
|
||||
|
||||
self.send_response(200)
|
||||
self.send_header('Content-type', 'application/json')
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(health_data, indent=2).encode())
|
||||
|
||||
def send_sensors_status(self):
|
||||
"""Send detailed sensor status"""
|
||||
try:
|
||||
# Get sensor status from the global sensor tracker
|
||||
from sensor_tracker import get_all_sensor_status
|
||||
sensors_status = get_all_sensor_status()
|
||||
|
||||
self.send_response(200)
|
||||
self.send_header('Content-type', 'application/json')
|
||||
self.send_header('Cache-Control', 'no-cache')
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(sensors_status, indent=2).encode())
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error getting sensor status: {e}")
|
||||
error_response = {
|
||||
"error": "Failed to get sensor status",
|
||||
"message": str(e),
|
||||
@ -139,18 +91,9 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(error_response, indent=2).encode())
|
||||
|
||||
def _get_uptime(self):
|
||||
"""Get service uptime"""
|
||||
try:
|
||||
with open('/proc/uptime', 'r') as f:
|
||||
uptime_seconds = float(f.readline().split()[0])
|
||||
return f"{uptime_seconds:.1f}s"
|
||||
except:
|
||||
return "unknown"
|
||||
|
||||
def log_message(self, format, *args):
|
||||
"""Override to use our logging system"""
|
||||
logging.info(f"Health Check - {format % args}")
|
||||
"""Override to reduce noise"""
|
||||
pass
|
||||
|
||||
class HealthCheckServer:
|
||||
def __init__(self, port=None):
|
||||
@ -160,87 +103,40 @@ class HealthCheckServer:
|
||||
self.started = False
|
||||
|
||||
def start(self):
|
||||
"""Start the health check server in a separate thread"""
|
||||
"""Start the health check server"""
|
||||
if not HEALTH_CHECK_ENABLED:
|
||||
logging.info("Health check server is disabled")
|
||||
return False
|
||||
|
||||
try:
|
||||
logging.info(f"Attempting to start health check server on port {self.port}")
|
||||
|
||||
# Check if port is available
|
||||
if not self._is_port_available(self.port):
|
||||
logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
|
||||
return False
|
||||
|
||||
# Bind to all interfaces to make it accessible from outside container
|
||||
logging.info(f"Starting health check server on port {self.port}")
|
||||
self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
|
||||
|
||||
# Test if the server can actually bind to the port
|
||||
logging.info(f"Successfully bound to 0.0.0.0:{self.port}")
|
||||
|
||||
self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
|
||||
self.thread = threading.Thread(target=self._serve, daemon=True)
|
||||
self.thread.start()
|
||||
|
||||
# Give the server a moment to start and verify it's working
|
||||
# Give server time to start
|
||||
time.sleep(1)
|
||||
|
||||
if self._test_health_endpoint():
|
||||
self.started = True
|
||||
logging.info(f"Health check server started successfully on 0.0.0.0:{self.port}")
|
||||
logging.info(f"Health check endpoints:")
|
||||
logging.info(f" - http://0.0.0.0:{self.port}/health")
|
||||
logging.info(f" - http://0.0.0.0:{self.port}/sensors")
|
||||
logging.info(f" - http://0.0.0.0:{self.port}/ready")
|
||||
logging.info("Health check server is ready for external health checks")
|
||||
return True
|
||||
else:
|
||||
logging.error("Health check server started but endpoints are not responding")
|
||||
return False
|
||||
self.started = True
|
||||
logging.info(f"Health check server running on http://0.0.0.0:{self.port}/health")
|
||||
return True
|
||||
|
||||
except OSError as e:
|
||||
if e.errno == 98: # Address already in use
|
||||
logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
|
||||
else:
|
||||
logging.error(f"OS error starting health check server: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to start health check server: {e}")
|
||||
return False
|
||||
|
||||
def _is_port_available(self, port):
|
||||
"""Check if port is available"""
|
||||
def _serve(self):
|
||||
"""Serve requests"""
|
||||
try:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(('0.0.0.0', port))
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
def _test_health_endpoint(self):
|
||||
"""Test if health endpoint is responding"""
|
||||
try:
|
||||
import urllib.request
|
||||
with urllib.request.urlopen(f'http://localhost:{self.port}/health', timeout=5) as response:
|
||||
return response.status == 200
|
||||
except Exception as e:
|
||||
logging.warning(f"Health endpoint test failed: {e}")
|
||||
return False
|
||||
|
||||
def _serve_with_error_handling(self):
|
||||
"""Serve forever with error handling"""
|
||||
try:
|
||||
logging.info("Health check server thread started, beginning to serve requests")
|
||||
if self.server:
|
||||
self.server.serve_forever()
|
||||
else:
|
||||
logging.error("Health check server is None, cannot serve requests")
|
||||
except Exception as e:
|
||||
logging.error(f"Health check server error: {e}", exc_info=True)
|
||||
logging.error(f"Health check server error: {e}")
|
||||
self.started = False
|
||||
|
||||
def stop(self):
|
||||
"""Stop the health check server"""
|
||||
"""Stop the server"""
|
||||
if self.server:
|
||||
self.server.shutdown()
|
||||
self.server.server_close()
|
||||
|
Reference in New Issue
Block a user