Files
POE-sensor/health_check.py

138 lines
5.5 KiB
Python

import json
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime, timezone
import logging
import time
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
class HealthCheckHandler(BaseHTTPRequestHandler):
    """HTTP request handler exposing JSON health endpoints.

    Routes (query strings are ignored when matching):
        GET /health   -- overall service status plus a sensor summary.
        GET /sensors  -- detailed sensor status from the sensor tracker.

    Any other path is answered with an empty 404 response.
    """

    def do_GET(self):
        """Dispatch a GET request to the matching endpoint."""
        # Strip the query string so probes such as "/health?ts=123" still match.
        route = self.path.partition('?')[0]
        if route == '/health':
            self.send_health_response()
        elif route == '/sensors':
            self.send_sensors_status()
        else:
            self.send_response(404)
            self.end_headers()

    def send_health_response(self):
        """Send basic health check response.

        Always answers 200: the service is considered reachable as long as
        this handler runs. The JSON ``status`` field is ``"degraded"`` when
        sensors are configured but none are online, otherwise ``"healthy"``.
        If the sensor summary cannot be obtained or serialized, a reduced
        payload without sensor details is sent instead.
        """
        try:
            # Imported lazily, inside the try, so that a missing or broken
            # tracker module degrades the payload instead of failing the probe.
            from sensor_tracker import get_sensor_tracker
            summary = get_sensor_tracker().get_summary()
            total_sensors = summary.get('total_sensors', 0)
            online_sensors = summary.get('online_sensors', 0)
            health_data = {
                "status": "healthy",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "version": "1.0.0",
                "sensors": {
                    "total": total_sensors,
                    "online": online_sensors,
                    "health_percentage": summary.get('health_percentage', 0.0)
                }
            }
            # Sensors configured but none online: report as degraded, but keep
            # answering 200 because the health check server itself is working.
            if total_sensors > 0 and online_sensors == 0:
                health_data["status"] = "degraded"
                health_data["message"] = "All sensors offline"
            # Serialize inside the try so an unserializable summary value
            # falls back to the reduced payload before any header is written.
            body = json.dumps(health_data, indent=2).encode()
        except Exception as e:
            # If we can't get sensor status, still report as healthy since
            # the service is running.
            logging.warning(f"Could not get sensor status for health check: {e}")
            health_data = {
                "status": "healthy",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "version": "1.0.0",
                "message": "Service running, sensor status unavailable"
            }
            body = json.dumps(health_data, indent=2).encode()
        self._send_json_body(200, body)

    def send_sensors_status(self):
        """Send detailed sensor status.

        Answers 200 with the tracker's status serialized as JSON. If the
        status cannot be fetched or serialized, answers 500 with a JSON error
        payload instead of letting the exception escape the handler (which
        would leave the client with an empty reply).
        """
        try:
            # Get sensor status from the global sensor tracker.
            from sensor_tracker import get_all_sensor_status
            body = json.dumps(get_all_sensor_status(), indent=2).encode()
            status_code = 200
        except Exception as e:
            logging.error(f"Could not get detailed sensor status: {e}", exc_info=True)
            error_data = {
                "status": "error",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "message": "Sensor status unavailable"
            }
            body = json.dumps(error_data, indent=2).encode()
            status_code = 500
        self._send_json_body(status_code, body)

    def _send_json_body(self, status_code, body):
        """Write a complete JSON response: status line, headers, then body.

        Args:
            status_code: HTTP status code to send.
            body: Already-encoded JSON document (bytes).
        """
        self.send_response(status_code)
        self.send_header('Content-type', 'application/json')
        # Lets clients know where the body ends without waiting for close.
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Override to use our logging system"""
        logging.info(f"Health Check - {format % args}")
class HealthCheckServer:
    """Runs the health check HTTP endpoints on a background daemon thread."""

    def __init__(self, port=HEALTH_CHECK_PORT):
        """Remember the port to listen on; nothing is bound until start()."""
        self.port = port
        self.server = None  # HTTPServer instance while the server is up
        self.thread = None  # daemon thread running serve_forever()

    def start(self):
        """Start the health check server in a separate thread.

        Does nothing when HEALTH_CHECK_ENABLED is false or when this instance
        already has a server running.

        Raises:
            OSError: If the port cannot be bound (e.g. already in use).
            Exception: Any other failure during startup is logged and
                re-raised so the problem stays visible to the caller.
        """
        # Function-scope import: only this method needs the errno constants.
        import errno

        if not HEALTH_CHECK_ENABLED:
            logging.info("Health check server is disabled")
            return
        if self.server is not None:
            # A second start() would fail to bind and would also overwrite the
            # reference needed to stop the server that is already running.
            logging.warning("Health check server is already running")
            return
        try:
            logging.info(f"Attempting to start health check server on port {self.port}")
            # Bind to all interfaces to make it accessible from outside container.
            # The constructor binds and listens, so reaching the next line
            # means the port was available.
            self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
            logging.info(f"Successfully bound to 0.0.0.0:{self.port}")
            self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
            self.thread.start()
            # Give the server a moment to start
            time.sleep(0.5)
            logging.info(f"Health check server started on 0.0.0.0:{self.port}")
            logging.info("Health check endpoints:")
            logging.info(f" - http://0.0.0.0:{self.port}/health")
            logging.info(f" - http://0.0.0.0:{self.port}/sensors")
            logging.info("Health check server is ready for external health checks")
        except OSError as e:
            # Release the listening socket if the failure happened after bind.
            self._teardown()
            # errno.EADDRINUSE is the portable spelling; the numeric value
            # differs between platforms (98 is Linux only).
            if e.errno == errno.EADDRINUSE:
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
            else:
                logging.error(f"OS error starting health check server: {e}")
            raise
        except Exception as e:
            self._teardown()
            logging.error(f"Failed to start health check server: {e}")
            raise  # Re-raise to make the issue visible

    def _serve_with_error_handling(self):
        """Serve forever with error handling"""
        try:
            logging.info("Health check server thread started, beginning to serve requests")
            if self.server:
                self.server.serve_forever()
            else:
                logging.error("Health check server is None, cannot serve requests")
        except Exception as e:
            logging.error(f"Health check server error: {e}", exc_info=True)

    def _teardown(self):
        """Shut down the serve loop (if running), close the socket, reset state."""
        server, thread = self.server, self.thread
        if server is None:
            return
        # shutdown() blocks until serve_forever() exits, so it must only be
        # called when the serving thread actually exists and is running;
        # otherwise it would wait forever.
        if thread is not None and thread.is_alive():
            server.shutdown()
            thread.join(timeout=5)
        server.server_close()
        # Cleared only after shutdown so the serving thread never observes
        # self.server as None while a shutdown is waiting on it.
        self.server = None
        self.thread = None

    def stop(self):
        """Stop the health check server.

        Safe to call when the server was never started or is already stopped.
        """
        if self.server is None:
            return
        self._teardown()
        logging.info("Health check server stopped")