Files
POE-sensor/health_check.py

253 lines
10 KiB
Python

import json
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime, timezone
import logging
import time
import socket
import os
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
class HealthCheckHandler(BaseHTTPRequestHandler):
    """Serve JSON health, readiness and sensor-status endpoints over HTTP GET.

    Routes:
        ``/health``  -- liveness summary, always answered with 200.
        ``/sensors`` -- detailed sensor status (200, or 500 on failure).
        ``/ready``   -- readiness probe (200, or 503 on failure).

    Any other path is answered with an empty 404.
    """

    def do_GET(self):
        """Dispatch a GET request to the matching endpoint handler."""
        # Route on the path component only, so a probe that appends a query
        # string (e.g. "/health?ts=123") is not answered with a 404.
        route = self.path.split('?', 1)[0].split('#', 1)[0]
        if route == '/health':
            self.send_health_response()
        elif route == '/sensors':
            self.send_sensors_status()
        elif route == '/ready':
            self.send_readiness_response()
        else:
            self.send_response(404)
            self.end_headers()

    @staticmethod
    def _encode_json(payload):
        """Serialise *payload* to pretty-printed JSON bytes.

        Raises ``TypeError``/``ValueError`` if *payload* is not serialisable.
        Callers invoke this *before* sending any header so that such a failure
        can still be turned into a clean error response.
        """
        return json.dumps(payload, indent=2).encode()

    def _send_json_body(self, status_code, body):
        """Write a complete JSON response with an already-encoded *body*."""
        self.send_response(status_code)
        self.send_header('Content-type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.send_header('Cache-Control', 'no-cache')
        self.end_headers()
        self.wfile.write(body)

    def send_health_response(self):
        """Send the basic health check response (always HTTP 200).

        The ``status`` field is ``"healthy"`` unless sensors are registered
        and either none is online or the reported health percentage is below
        50, in which case it is ``"degraded"``. If the sensor summary cannot
        be obtained or serialised, a reduced ``"healthy"`` payload is sent,
        because the service itself is evidently still answering requests.
        """
        try:
            # Imported lazily, as in the rest of this handler.
            from sensor_tracker import get_sensor_tracker
            summary = get_sensor_tracker().get_summary()
            total = summary.get('total_sensors', 0)
            online = summary.get('online_sensors', 0)
            percentage = summary.get('health_percentage', 0)
            health_data = {
                "status": "healthy",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "version": "1.0.0",
                "sensors": {
                    "total": total,
                    "online": online,
                    "offline": summary.get('offline_sensors', 0),
                    "unknown": summary.get('unknown_sensors', 0),
                    "health_percentage": summary.get('health_percentage', 0.0)
                },
                "uptime": self._get_uptime(),
                "host": socket.gethostname()
            }
            # Offline sensors never make the service "unhealthy"; they only
            # downgrade it to "degraded".
            if total > 0:
                if online == 0:
                    health_data["status"] = "degraded"
                    health_data["message"] = "All sensors offline but service is running"
                elif percentage < 50:
                    health_data["status"] = "degraded"
                    health_data["message"] = f"Low sensor health: {percentage:.1f}%"
            # Encode inside the try: a non-serialisable summary value then
            # falls back to the reduced payload instead of a broken response.
            body = self._encode_json(health_data)
        except Exception as e:
            # If we can't get sensor status, still report as healthy since
            # the service is running.
            logging.warning(f"Could not get sensor status for health check: {e}")
            body = self._encode_json({
                "status": "healthy",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "version": "1.0.0",
                "message": "Service running, sensor status unavailable",
                "uptime": self._get_uptime(),
                "host": socket.gethostname()
            })
        self._send_json_body(200, body)

    def send_readiness_response(self):
        """Send the readiness probe response.

        Answers 200 with ``"ready": true`` when ``get_sensor_tracker()`` can
        be imported and called without raising, otherwise 503 with
        ``"ready": false`` and the error text.
        """
        try:
            from sensor_tracker import get_sensor_tracker
            # Only the absence of an exception matters for readiness.
            get_sensor_tracker()
            status_code = 200
            body = self._encode_json({
                "ready": True,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "checks": {
                    "sensor_tracker": True,
                    "health_server": True
                }
            })
        except Exception as e:
            logging.error(f"Readiness check failed: {e}")
            status_code = 503
            body = self._encode_json({
                "ready": False,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "service": "modbus-mqtt-bridge",
                "error": str(e)
            })
        # Headers are sent only after the outcome is known, so a failure can
        # never append a second status line to an already started response.
        self._send_json_body(status_code, body)

    def send_sensors_status(self):
        """Send the detailed sensor status from ``get_all_sensor_status()``.

        Answers 200 with the status object, or 500 with an error description
        when the status cannot be obtained or is not JSON-serialisable.
        """
        try:
            from sensor_tracker import get_all_sensor_status
            body = self._encode_json(get_all_sensor_status())
            status_code = 200
        except Exception as e:
            logging.error(f"Error getting sensor status: {e}")
            status_code = 500
            body = self._encode_json({
                "error": "Failed to get sensor status",
                "message": str(e),
                "timestamp": datetime.now(timezone.utc).isoformat()
            })
        self._send_json_body(status_code, body)

    def _get_uptime(self):
        """Return the uptime read from ``/proc/uptime`` as ``"<seconds>s"``.

        NOTE: ``/proc/uptime`` reports the uptime of the host system, not of
        this process. Returns ``"unknown"`` when the file is missing,
        unreadable or malformed (e.g. on non-Linux platforms).
        """
        try:
            with open('/proc/uptime', 'r') as f:
                uptime_seconds = float(f.readline().split()[0])
            return f"{uptime_seconds:.1f}s"
        except (OSError, ValueError, IndexError):
            return "unknown"

    def log_message(self, format, *args):
        """Route the base class's access log lines through ``logging``."""
        logging.info(f"Health Check - {format % args}")
class HealthCheckServer:
    """Run the health check HTTP endpoints on a background daemon thread.

    Attributes:
        port: TCP port the server binds to on all IPv4 interfaces.
        server: The live ``HTTPServer`` instance, or ``None`` when not bound.
        thread: The daemon thread running ``serve_forever``, or ``None``.
        started: ``True`` once the self-probe of ``/health`` has succeeded.
    """

    def __init__(self, port=None):
        """Choose the port: *port*, else ``$HEALTH_CHECK_PORT``, else config."""
        self.port = port or int(os.getenv('HEALTH_CHECK_PORT', HEALTH_CHECK_PORT))
        self.server = None
        self.thread = None
        self.started = False

    def start(self):
        """Start the health check server in a separate thread.

        Returns:
            ``True`` when the server is bound, serving, and its own
            ``/health`` endpoint answered 200. ``False`` when the server is
            disabled by configuration, the port is unavailable, or start-up
            failed; in the failure cases the socket and thread are released
            again so a later retry can rebind the port.
        """
        if not HEALTH_CHECK_ENABLED:
            logging.info("Health check server is disabled")
            return False

        import errno  # local import: only needed to classify bind errors

        # Tracks whether *this* call created self.server, so a failed attempt
        # never tears down a server it does not own.
        owns_server = False
        try:
            logging.info(f"Attempting to start health check server on port {self.port}")
            if not self._is_port_available(self.port):
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
                return False
            # Bind to all interfaces to make it accessible from outside the
            # container. The constructor binds and listens, so any failure to
            # acquire the port surfaces here as OSError.
            self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
            owns_server = True
            logging.info(f"Successfully bound to 0.0.0.0:{self.port}")
            self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
            self.thread.start()
            # No warm-up sleep is needed: the socket is already listening, so
            # the probe's connection is queued until the thread accepts it.
            if self._test_health_endpoint():
                self.started = True
                logging.info(f"Health check server started successfully on 0.0.0.0:{self.port}")
                logging.info("Health check endpoints:")
                logging.info(f" - http://0.0.0.0:{self.port}/health")
                logging.info(f" - http://0.0.0.0:{self.port}/sensors")
                logging.info(f" - http://0.0.0.0:{self.port}/ready")
                logging.info("Health check server is ready for external health checks")
                return True
            logging.error("Health check server started but endpoints are not responding")
        except OSError as e:
            if e.errno == errno.EADDRINUSE:
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
            else:
                logging.error(f"OS error starting health check server: {e}")
        except Exception as e:
            logging.error(f"Failed to start health check server: {e}")

        # Failure path: do not leave a half-started server holding the port.
        if owns_server:
            self._shutdown_server()
        return False

    def _is_port_available(self, port):
        """Return ``True`` if a TCP socket can currently bind to *port*.

        This is only an early, best-effort check; the authoritative result is
        the bind performed by ``HTTPServer`` in :meth:`start`.
        """
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
                # HTTPServer binds with SO_REUSEADDR; probe the same way so a
                # port that only has lingering TIME_WAIT connections (e.g.
                # right after a restart) is not reported as busy.
                probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                probe.bind(('0.0.0.0', port))
                return True
        except OSError:
            return False

    def _test_health_endpoint(self):
        """Return ``True`` if a local GET of ``/health`` answers with 200."""
        try:
            import urllib.request
            # An empty ProxyHandler disables proxies picked up from the
            # environment (http_proxy etc.), which would otherwise route this
            # loopback request through an external proxy.
            opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
            # The server is bound to IPv4 only, so address it by IPv4 loopback
            # rather than depending on how "localhost" resolves.
            with opener.open(f'http://127.0.0.1:{self.port}/health', timeout=5) as response:
                return response.status == 200
        except Exception as e:
            logging.warning(f"Health endpoint test failed: {e}")
            return False

    def _serve_with_error_handling(self):
        """Thread target: serve requests until shut down, logging any crash."""
        try:
            logging.info("Health check server thread started, beginning to serve requests")
            server = self.server
            if server:
                server.serve_forever()
            else:
                logging.error("Health check server is None, cannot serve requests")
        except Exception as e:
            logging.error(f"Health check server error: {e}", exc_info=True)
            self.started = False

    def _shutdown_server(self):
        """Release the server socket and thread; return ``True`` if one existed."""
        server, worker = self.server, self.thread
        if server is not None:
            # shutdown() blocks until serve_forever() has exited, so it is
            # only safe to call while the serving thread is alive.
            if worker is not None and worker.is_alive():
                server.shutdown()
                worker.join(timeout=5)
            server.server_close()
        self.server = None
        self.thread = None
        self.started = False
        return server is not None

    def stop(self):
        """Stop the health check server; a no-op if it is not running."""
        if self._shutdown_server():
            logging.info("Health check server stopped")

    def is_running(self):
        """Return ``True`` if the server started and its thread is alive."""
        return bool(self.started and self.thread is not None and self.thread.is_alive())