From cb0ab4c4d3d1f97feed814f1e7f58544b1f6418f Mon Sep 17 00:00:00 2001 From: Naab2k3 Date: Mon, 23 Jun 2025 10:00:08 +0700 Subject: [PATCH] Enhance health check functionality and logging in POE project. Added detailed sensor status reporting to health check response, improved error handling during server startup, and updated logging format for better visibility. Adjusted health check parameters in Nomad configuration for increased stability and monitoring accuracy. --- health_check.py | 72 +++++++++++++++++++++++++++++++++++++++++++----- main.py | 9 ++++++ poe-sensor.nomad | 13 +++++---- sensor_bridge.py | 4 +-- 4 files changed, 83 insertions(+), 15 deletions(-) diff --git a/health_check.py b/health_check.py index de5f4fa..9b278b3 100644 --- a/health_check.py +++ b/health_check.py @@ -3,6 +3,7 @@ import threading from http.server import HTTPServer, BaseHTTPRequestHandler from datetime import datetime, timezone import logging +import time from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED @@ -18,12 +19,41 @@ class HealthCheckHandler(BaseHTTPRequestHandler): def send_health_response(self): """Send basic health check response""" - health_data = { - "status": "healthy", - "timestamp": datetime.now(timezone.utc).isoformat(), - "service": "modbus-mqtt-bridge", - "version": "1.0.0" - } + try: + # Get basic application status + from sensor_tracker import get_sensor_tracker + sensor_tracker = get_sensor_tracker() + summary = sensor_tracker.get_summary() + + health_data = { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "service": "modbus-mqtt-bridge", + "version": "1.0.0", + "sensors": { + "total": summary.get('total_sensors', 0), + "online": summary.get('online_sensors', 0), + "health_percentage": summary.get('health_percentage', 0.0) + } + } + + # Consider the service unhealthy if no sensors are working + if summary.get('total_sensors', 0) > 0 and summary.get('online_sensors', 0) == 0: + # If we have sensors configured but none are online, report as degraded but still healthy + # (since the health check server itself is working) + health_data["status"] = "degraded" + health_data["message"] = "All sensors offline" + + except Exception as e: + # If we can't get sensor status, still report as healthy since the service is running + logging.warning(f"Could not get sensor status for health check: {e}") + health_data = { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "service": "modbus-mqtt-bridge", + "version": "1.0.0", + "message": "Service running, sensor status unavailable" + } self.send_response(200) self.send_header('Content-type', 'application/json') @@ -59,17 +89,45 @@ class HealthCheckServer: return try: + logging.info(f"Attempting to start health check server on port {self.port}") # Bind to all interfaces to make it accessible from outside container self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler) - self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) + + # Test if the server can actually bind to the port + logging.info(f"Successfully bound to 0.0.0.0:{self.port}") + + self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True) self.thread.start() + + # Give the server a moment to start + time.sleep(0.5) + logging.info(f"Health check server started on 0.0.0.0:{self.port}") logging.info(f"Health check endpoints:") logging.info(f" - http://0.0.0.0:{self.port}/health") logging.info(f" - http://0.0.0.0:{self.port}/sensors") + logging.info("Health check server is ready for external health checks") + + except OSError as e: + if e.errno == 98: # Address already in use + logging.error(f"Port {self.port} is already in use. Cannot start health check server.") + else: + logging.error(f"OS error starting health check server: {e}") + raise e except Exception as e: logging.error(f"Failed to start health check server: {e}") raise e # Re-raise to make the issue visible + + def _serve_with_error_handling(self): + """Serve forever with error handling""" + try: + logging.info("Health check server thread started, beginning to serve requests") + if self.server: + self.server.serve_forever() + else: + logging.error("Health check server is None, cannot serve requests") + except Exception as e: + logging.error(f"Health check server error: {e}", exc_info=True) def stop(self): """Stop the health check server""" diff --git a/main.py b/main.py index 3d35d87..ded51d4 100644 --- a/main.py +++ b/main.py @@ -11,7 +11,16 @@ Usage: Author: POE Project """ +import time +import logging from sensor_bridge import main_loop if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + logging.info("POE Sensor Bridge starting up...") + + # Give the system a moment to initialize + time.sleep(2) + + logging.info("Starting main sensor loop...") main_loop() \ No newline at end of file diff --git a/poe-sensor.nomad b/poe-sensor.nomad index 21c6b3f..c31f778 100644 --- a/poe-sensor.nomad +++ b/poe-sensor.nomad @@ -29,8 +29,8 @@ job "poe-sensor" { # Update strategy update { max_parallel = 1 - min_healthy_time = "30s" - healthy_deadline = "3m" + min_healthy_time = "60s" + healthy_deadline = "5m" progress_deadline = "10m" auto_revert = true canary = 0 @@ -50,11 +50,12 @@ job "poe-sensor" { check { type = "http" path = "/health" - interval = "30s" - timeout = "10s" + interval = "60s" + timeout = "15s" + initial_status = "passing" check_restart { - limit = 3 - grace = "10s" + limit = 2 + grace = "15s" } } } diff --git a/sensor_bridge.py b/sensor_bridge.py index 0a9e3d9..d1d43b6 100644 --- a/sensor_bridge.py +++ b/sensor_bridge.py @@ -236,8 +236,8 @@ def main_loop(): logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors") logging.info("System status can be monitored at:") - logging.info(f" - Health: http://localhost:8080/health") - logging.info(f" - Sensors: http://localhost:8080/sensors") + logging.info(f" - Health: http://0.0.0.0:8080/health") + logging.info(f" - Sensors: http://0.0.0.0:8080/sensors") # Main loop to read and publish data from all hosts while True: