"""HTTP health, readiness and sensor-status endpoints for the bridge service.

``HealthCheckHandler`` answers ``GET /health``, ``GET /sensors`` and
``GET /ready`` with JSON; ``HealthCheckServer`` runs that handler on a
background daemon thread.
"""

import errno
import json
import logging
import os
import socket
import threading
import time
from datetime import datetime, timezone
from http.server import BaseHTTPRequestHandler, HTTPServer

from config import HEALTH_CHECK_ENABLED, HEALTH_CHECK_PORT


class HealthCheckHandler(BaseHTTPRequestHandler):
    """Request handler serving the JSON health-check endpoints."""

    def do_GET(self):
        """Dispatch a GET request by exact path; unknown paths get a bare 404."""
        if self.path == '/health':
            self.send_health_response()
        elif self.path == '/sensors':
            self.send_sensors_status()
        elif self.path == '/ready':
            self.send_readiness_response()
        else:
            self.send_response(404)
            self.end_headers()

    @staticmethod
    def _now_iso():
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _encode_json(payload):
        """Serialize *payload* to indented, encoded JSON bytes.

        Raises whatever ``json.dumps`` raises for non-serializable payloads;
        callers serialize *before* writing any header so such a failure can
        still be turned into a clean error response.
        """
        return json.dumps(payload, indent=2).encode()

    def _send_body(self, status, body):
        """Write a complete JSON response with the given status and body bytes."""
        self.send_response(status)
        self.send_header('Content-type', 'application/json')
        self.send_header('Cache-Control', 'no-cache')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _collect_health_data(self):
        """Build the ``/health`` payload from the sensor tracker summary.

        The status starts as "healthy" and is downgraded to "degraded" when
        sensors are registered but none is online, or when the reported
        health percentage is below 50. Offline sensors never make the
        service "unhealthy": the service itself is still running.
        """
        from sensor_tracker import get_sensor_tracker

        summary = get_sensor_tracker().get_summary()
        total = summary.get('total_sensors', 0)
        online = summary.get('online_sensors', 0)
        health_pct = summary.get('health_percentage', 0.0)

        health_data = {
            "status": "healthy",
            "timestamp": self._now_iso(),
            "service": "modbus-mqtt-bridge",
            "version": "1.0.0",
            "sensors": {
                "total": total,
                "online": online,
                "offline": summary.get('offline_sensors', 0),
                "unknown": summary.get('unknown_sensors', 0),
                "health_percentage": health_pct,
            },
            "uptime": self._get_uptime(),
            "host": socket.gethostname(),
        }

        if total > 0:
            if online == 0:
                health_data["status"] = "degraded"
                health_data["message"] = "All sensors offline but service is running"
            elif health_pct < 50:
                health_data["status"] = "degraded"
                health_data["message"] = f"Low sensor health: {health_pct:.1f}%"
        return health_data

    def send_health_response(self):
        """Send basic health check response (always HTTP 200).

        If the sensor summary cannot be obtained or serialized, a reduced
        "healthy" payload is sent instead, since answering at all shows the
        service is running.
        """
        try:
            body = self._encode_json(self._collect_health_data())
        except Exception as e:
            # If we can't get sensor status, still report as healthy since
            # the service is running
            logging.warning(f"Could not get sensor status for health check: {e}")
            body = self._encode_json({
                "status": "healthy",
                "timestamp": self._now_iso(),
                "service": "modbus-mqtt-bridge",
                "version": "1.0.0",
                "message": "Service running, sensor status unavailable",
                "uptime": self._get_uptime(),
                "host": socket.gethostname(),
            })
        self._send_body(200, body)

    def send_readiness_response(self):
        """Send readiness probe response - checks if service is ready to serve.

        Responds 200 when the sensor tracker can be imported and obtained,
        503 with the error text otherwise.
        """
        try:
            # Check if main components are initialized
            from sensor_tracker import get_sensor_tracker

            get_sensor_tracker()
            body = self._encode_json({
                "ready": True,
                "timestamp": self._now_iso(),
                "service": "modbus-mqtt-bridge",
                "checks": {
                    "sensor_tracker": True,
                    "health_server": True,
                },
            })
        except Exception as e:
            logging.error(f"Readiness check failed: {e}")
            self._send_body(503, self._encode_json({
                "ready": False,
                "timestamp": self._now_iso(),
                "service": "modbus-mqtt-bridge",
                "error": str(e),
            }))
            return
        self._send_body(200, body)

    def send_sensors_status(self):
        """Send detailed sensor status.

        Responds 200 with the result of ``get_all_sensor_status()``, or 500
        with an error payload if it cannot be obtained or serialized.
        """
        try:
            # Get sensor status from the global sensor tracker
            from sensor_tracker import get_all_sensor_status

            # Serialize inside the try, before any header is written, so a
            # non-serializable status yields a clean 500 instead of a second
            # status line inside a half-sent 200 response.
            body = self._encode_json(get_all_sensor_status())
        except Exception as e:
            logging.error(f"Error getting sensor status: {e}")
            self._send_body(500, self._encode_json({
                "error": "Failed to get sensor status",
                "message": str(e),
                "timestamp": self._now_iso(),
            }))
            return
        self._send_body(200, body)

    def _get_uptime(self):
        """Return the first field of ``/proc/uptime`` as ``"<seconds>s"``.

        Returns ``"unknown"`` when the file cannot be read or parsed (for
        example on systems without ``/proc``).

        NOTE(review): on Linux ``/proc/uptime`` reports time since system
        boot, not since this process started - confirm that is intended.
        """
        try:
            with open('/proc/uptime', 'r') as f:
                uptime_seconds = float(f.readline().split()[0])
            return f"{uptime_seconds:.1f}s"
        except (OSError, ValueError, IndexError):
            return "unknown"

    def log_message(self, format, *args):
        """Override to use our logging system"""
        logging.info(f"Health Check - {format % args}")


class HealthCheckServer:
    """Run ``HealthCheckHandler`` on a background daemon thread."""

    def __init__(self, port=None):
        """Store the port to serve on.

        Args:
            port: Port number. When falsy, the ``HEALTH_CHECK_PORT``
                environment variable is used, falling back to the
                ``HEALTH_CHECK_PORT`` value imported from ``config``.
        """
        self.port = port or int(os.getenv('HEALTH_CHECK_PORT', HEALTH_CHECK_PORT))
        self.server = None
        self.thread = None
        self.started = False

    def start(self):
        """Start the health check server in a separate thread.

        Returns:
            True when the server is bound and its ``/health`` endpoint
            answered 200; False when the server is disabled, the port is
            unavailable, or start-up failed. On failure any partially
            started server is shut down and its socket closed.
        """
        if not HEALTH_CHECK_ENABLED:
            logging.info("Health check server is disabled")
            return False

        try:
            logging.info(f"Attempting to start health check server on port {self.port}")

            # Check if port is available
            if not self._is_port_available(self.port):
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
                return False

            # Bind to all interfaces to make it accessible from outside container
            self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
            logging.info(f"Successfully bound to 0.0.0.0:{self.port}")

            self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
            self.thread.start()

            # No fixed sleep is needed: the socket is already listening once
            # HTTPServer() returns, so the probe below simply waits (up to
            # its timeout) until the serving thread picks the request up.
            if self._test_health_endpoint():
                self.started = True
                logging.info(f"Health check server started successfully on 0.0.0.0:{self.port}")
                logging.info(f"Health check endpoints:")
                logging.info(f" - http://0.0.0.0:{self.port}/health")
                logging.info(f" - http://0.0.0.0:{self.port}/sensors")
                logging.info(f" - http://0.0.0.0:{self.port}/ready")
                logging.info("Health check server is ready for external health checks")
                return True

            logging.error("Health check server started but endpoints are not responding")
            self._teardown()
            return False

        except OSError as e:
            if e.errno == errno.EADDRINUSE:
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
            else:
                logging.error(f"OS error starting health check server: {e}")
            self._teardown()
            return False
        except Exception as e:
            logging.error(f"Failed to start health check server: {e}")
            self._teardown()
            return False

    def _is_port_available(self, port):
        """Check if port is available by trying to bind a throwaway socket.

        The probe sets ``SO_REUSEADDR`` because ``HTTPServer`` binds with
        address reuse enabled; without it the probe could reject a port the
        real server would have been able to bind.
        """
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                s.bind(('0.0.0.0', port))
                return True
        except OSError:
            return False

    def _test_health_endpoint(self):
        """Test if health endpoint is responding with HTTP 200.

        Connects straight to 127.0.0.1 so that proxy environment variables
        cannot redirect the loopback probe.
        """
        import http.client

        try:
            conn = http.client.HTTPConnection('127.0.0.1', self.port, timeout=5)
            try:
                conn.request('GET', '/health')
                response = conn.getresponse()
                # Drain the body so the server can finish writing it before
                # the connection is closed.
                response.read()
                return response.status == 200
            finally:
                conn.close()
        except Exception as e:
            logging.warning(f"Health endpoint test failed: {e}")
            return False

    def _serve_with_error_handling(self):
        """Serve forever with error handling (thread target)."""
        try:
            logging.info("Health check server thread started, beginning to serve requests")
            server = self.server
            if server:
                server.serve_forever()
            else:
                logging.error("Health check server is None, cannot serve requests")
        except Exception as e:
            logging.error(f"Health check server error: {e}", exc_info=True)
            self.started = False

    def _teardown(self):
        """Shut down the serving loop, close the socket and reset state.

        Returns:
            True if there was a server to tear down, False otherwise.
        """
        server, thread = self.server, self.thread
        self.server = None
        self.thread = None
        self.started = False
        if server is None:
            return False
        # shutdown() blocks until serve_forever() exits, so it is only safe
        # to call when the serving thread is actually alive.
        if thread is not None and thread.is_alive():
            server.shutdown()
            thread.join(timeout=5)
        server.server_close()
        return True

    def stop(self):
        """Stop the health check server"""
        if self._teardown():
            logging.info("Health check server stopped")

    def is_running(self):
        """Check if server is running.

        Returns:
            True only when start-up succeeded and the serving thread is
            still alive.
        """
        return bool(self.started and self.thread is not None and self.thread.is_alive())