diff --git a/health_check.py b/health_check.py index de5f4fa..9b278b3 100644 --- a/health_check.py +++ b/health_check.py @@ -3,6 +3,7 @@ import threading from http.server import HTTPServer, BaseHTTPRequestHandler from datetime import datetime, timezone import logging +import time from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED @@ -18,12 +19,41 @@ class HealthCheckHandler(BaseHTTPRequestHandler): def send_health_response(self): """Send basic health check response""" - health_data = { - "status": "healthy", - "timestamp": datetime.now(timezone.utc).isoformat(), - "service": "modbus-mqtt-bridge", - "version": "1.0.0" - } + try: + # Get basic application status + from sensor_tracker import get_sensor_tracker + sensor_tracker = get_sensor_tracker() + summary = sensor_tracker.get_summary() + + health_data = { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "service": "modbus-mqtt-bridge", + "version": "1.0.0", + "sensors": { + "total": summary.get('total_sensors', 0), + "online": summary.get('online_sensors', 0), + "health_percentage": summary.get('health_percentage', 0.0) + } + } + + # Consider the service unhealthy if no sensors are working + if summary.get('total_sensors', 0) > 0 and summary.get('online_sensors', 0) == 0: + # If we have sensors configured but none are online, report as degraded but still healthy + # (since the health check server itself is working) + health_data["status"] = "degraded" + health_data["message"] = "All sensors offline" + + except Exception as e: + # If we can't get sensor status, still report as healthy since the service is running + logging.warning(f"Could not get sensor status for health check: {e}") + health_data = { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "service": "modbus-mqtt-bridge", + "version": "1.0.0", + "message": "Service running, sensor status unavailable" + } self.send_response(200) self.send_header('Content-type', 'application/json') @@ -59,17 +89,45 @@ class HealthCheckServer: return try: + logging.info(f"Attempting to start health check server on port {self.port}") # Bind to all interfaces to make it accessible from outside container self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler) - self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) + + # Test if the server can actually bind to the port + logging.info(f"Successfully bound to 0.0.0.0:{self.port}") + + self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True) self.thread.start() + + # Give the server a moment to start + time.sleep(0.5) + logging.info(f"Health check server started on 0.0.0.0:{self.port}") logging.info(f"Health check endpoints:") logging.info(f" - http://0.0.0.0:{self.port}/health") logging.info(f" - http://0.0.0.0:{self.port}/sensors") + logging.info("Health check server is ready for external health checks") + + except OSError as e: + if e.errno == 98: # Address already in use + logging.error(f"Port {self.port} is already in use. Cannot start health check server.") + else: + logging.error(f"OS error starting health check server: {e}") + raise e except Exception as e: logging.error(f"Failed to start health check server: {e}") raise e # Re-raise to make the issue visible + + def _serve_with_error_handling(self): + """Serve forever with error handling""" + try: + logging.info("Health check server thread started, beginning to serve requests") + if self.server: + self.server.serve_forever() + else: + logging.error("Health check server is None, cannot serve requests") + except Exception as e: + logging.error(f"Health check server error: {e}", exc_info=True) def stop(self): """Stop the health check server""" diff --git a/main.py b/main.py index 3d35d87..ded51d4 100644 --- a/main.py +++ b/main.py @@ -11,7 +11,16 @@ Usage: Author: POE Project """ +import time +import logging from sensor_bridge import main_loop if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + logging.info("POE Sensor Bridge starting up...") + + # Give the system a moment to initialize + time.sleep(2) + + logging.info("Starting main sensor loop...") main_loop() \ No newline at end of file diff --git a/poe-sensor.nomad b/poe-sensor.nomad index 21c6b3f..c31f778 100644 --- a/poe-sensor.nomad +++ b/poe-sensor.nomad @@ -29,8 +29,8 @@ job "poe-sensor" { # Update strategy update { max_parallel = 1 - min_healthy_time = "30s" - healthy_deadline = "3m" + min_healthy_time = "60s" + healthy_deadline = "5m" progress_deadline = "10m" auto_revert = true canary = 0 @@ -50,11 +50,12 @@ job "poe-sensor" { check { type = "http" path = "/health" - interval = "30s" - timeout = "10s" + interval = "60s" + timeout = "15s" + initial_status = "passing" check_restart { - limit = 3 - grace = "10s" + limit = 2 + grace = "15s" } } } diff --git a/sensor_bridge.py b/sensor_bridge.py index 0a9e3d9..d1d43b6 100644 --- a/sensor_bridge.py +++ b/sensor_bridge.py @@ -236,8 +236,8 @@ def main_loop(): logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors") logging.info("System status can be monitored at:") - logging.info(f" - Health: http://localhost:8080/health") - logging.info(f" - Sensors: http://localhost:8080/sensors") + logging.info(f" - Health: http://0.0.0.0:8080/health") + logging.info(f" - Sensors: http://0.0.0.0:8080/sensors") # Main loop to read and publish data from all hosts while True: