Enhance health check functionality and logging in the POE project. Add detailed sensor status reporting to the health check response, improve error handling during server startup, and update the logging format for better visibility. Adjust health check parameters in the Nomad configuration for increased stability and monitoring accuracy.

This commit is contained in:
Naab2k3
2025-06-23 10:00:08 +07:00
parent b1e111e3f4
commit cb0ab4c4d3
4 changed files with 83 additions and 15 deletions

View File

@ -3,6 +3,7 @@ import threading
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime, timezone from datetime import datetime, timezone
import logging import logging
import time
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
@ -18,12 +19,41 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
def send_health_response(self): def send_health_response(self):
"""Send basic health check response""" """Send basic health check response"""
health_data = { try:
"status": "healthy", # Get basic application status
"timestamp": datetime.now(timezone.utc).isoformat(), from sensor_tracker import get_sensor_tracker
"service": "modbus-mqtt-bridge", sensor_tracker = get_sensor_tracker()
"version": "1.0.0" summary = sensor_tracker.get_summary()
}
health_data = {
"status": "healthy",
"timestamp": datetime.now(timezone.utc).isoformat(),
"service": "modbus-mqtt-bridge",
"version": "1.0.0",
"sensors": {
"total": summary.get('total_sensors', 0),
"online": summary.get('online_sensors', 0),
"health_percentage": summary.get('health_percentage', 0.0)
}
}
# Flag the service as degraded when sensors are configured but none are online
if summary.get('total_sensors', 0) > 0 and summary.get('online_sensors', 0) == 0:
# If we have sensors configured but none are online, report as degraded but still healthy
# (since the health check server itself is working)
health_data["status"] = "degraded"
health_data["message"] = "All sensors offline"
except Exception as e:
# If we can't get sensor status, still report as healthy since the service is running
logging.warning(f"Could not get sensor status for health check: {e}")
health_data = {
"status": "healthy",
"timestamp": datetime.now(timezone.utc).isoformat(),
"service": "modbus-mqtt-bridge",
"version": "1.0.0",
"message": "Service running, sensor status unavailable"
}
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'application/json') self.send_header('Content-type', 'application/json')
@ -59,18 +89,46 @@ class HealthCheckServer:
return return
try: try:
logging.info(f"Attempting to start health check server on port {self.port}")
# Bind to all interfaces to make it accessible from outside container # Bind to all interfaces to make it accessible from outside container
self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler) self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
# HTTPServer binds the socket in its constructor, so reaching this line means the bind succeeded
logging.info(f"Successfully bound to 0.0.0.0:{self.port}")
self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
self.thread.start() self.thread.start()
# Give the server a moment to start
time.sleep(0.5)
logging.info(f"Health check server started on 0.0.0.0:{self.port}") logging.info(f"Health check server started on 0.0.0.0:{self.port}")
logging.info(f"Health check endpoints:") logging.info(f"Health check endpoints:")
logging.info(f" - http://0.0.0.0:{self.port}/health") logging.info(f" - http://0.0.0.0:{self.port}/health")
logging.info(f" - http://0.0.0.0:{self.port}/sensors") logging.info(f" - http://0.0.0.0:{self.port}/sensors")
logging.info("Health check server is ready for external health checks")
except OSError as e:
if e.errno == 98: # Address already in use
logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
else:
logging.error(f"OS error starting health check server: {e}")
raise e
except Exception as e: except Exception as e:
logging.error(f"Failed to start health check server: {e}") logging.error(f"Failed to start health check server: {e}")
raise e # Re-raise to make the issue visible raise e # Re-raise to make the issue visible
def _serve_with_error_handling(self):
"""Serve forever with error handling"""
try:
logging.info("Health check server thread started, beginning to serve requests")
if self.server:
self.server.serve_forever()
else:
logging.error("Health check server is None, cannot serve requests")
except Exception as e:
logging.error(f"Health check server error: {e}", exc_info=True)
def stop(self): def stop(self):
"""Stop the health check server""" """Stop the health check server"""
if self.server: if self.server:

View File

@ -11,7 +11,16 @@ Usage:
Author: POE Project Author: POE Project
""" """
import time
import logging
from sensor_bridge import main_loop from sensor_bridge import main_loop
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("POE Sensor Bridge starting up...")
# Give the system a moment to initialize
time.sleep(2)
logging.info("Starting main sensor loop...")
main_loop() main_loop()

View File

@ -29,8 +29,8 @@ job "poe-sensor" {
# Update strategy # Update strategy
update { update {
max_parallel = 1 max_parallel = 1
min_healthy_time = "30s" min_healthy_time = "60s"
healthy_deadline = "3m" healthy_deadline = "5m"
progress_deadline = "10m" progress_deadline = "10m"
auto_revert = true auto_revert = true
canary = 0 canary = 0
@ -50,11 +50,12 @@ job "poe-sensor" {
check { check {
type = "http" type = "http"
path = "/health" path = "/health"
interval = "30s" interval = "60s"
timeout = "10s" timeout = "15s"
initial_status = "passing"
check_restart { check_restart {
limit = 3 limit = 2
grace = "10s" grace = "15s"
} }
} }
} }

View File

@ -236,8 +236,8 @@ def main_loop():
logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors") logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors")
logging.info("System status can be monitored at:") logging.info("System status can be monitored at:")
logging.info(f" - Health: http://localhost:8080/health") logging.info(f" - Health: http://0.0.0.0:8080/health")
logging.info(f" - Sensors: http://localhost:8080/sensors") logging.info(f" - Sensors: http://0.0.0.0:8080/sensors")
# Main loop to read and publish data from all hosts # Main loop to read and publish data from all hosts
while True: while True: