Enhance health check functionality and logging in the POE project. Add detailed sensor status reporting to the health check response, improve error handling during server startup, and update the logging format for better visibility. Adjust the health check parameters in the Nomad configuration for increased stability and monitoring accuracy.
This commit is contained in:
@ -3,6 +3,7 @@ import threading
|
|||||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
|
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
|
||||||
|
|
||||||
@ -18,12 +19,41 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
def send_health_response(self):
|
def send_health_response(self):
|
||||||
"""Send basic health check response"""
|
"""Send basic health check response"""
|
||||||
health_data = {
|
try:
|
||||||
"status": "healthy",
|
# Get basic application status
|
||||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
from sensor_tracker import get_sensor_tracker
|
||||||
"service": "modbus-mqtt-bridge",
|
sensor_tracker = get_sensor_tracker()
|
||||||
"version": "1.0.0"
|
summary = sensor_tracker.get_summary()
|
||||||
}
|
|
||||||
|
health_data = {
|
||||||
|
"status": "healthy",
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"service": "modbus-mqtt-bridge",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"sensors": {
|
||||||
|
"total": summary.get('total_sensors', 0),
|
||||||
|
"online": summary.get('online_sensors', 0),
|
||||||
|
"health_percentage": summary.get('health_percentage', 0.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Consider the service unhealthy if no sensors are working
|
||||||
|
if summary.get('total_sensors', 0) > 0 and summary.get('online_sensors', 0) == 0:
|
||||||
|
# If we have sensors configured but none are online, report as degraded but still healthy
|
||||||
|
# (since the health check server itself is working)
|
||||||
|
health_data["status"] = "degraded"
|
||||||
|
health_data["message"] = "All sensors offline"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# If we can't get sensor status, still report as healthy since the service is running
|
||||||
|
logging.warning(f"Could not get sensor status for health check: {e}")
|
||||||
|
health_data = {
|
||||||
|
"status": "healthy",
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"service": "modbus-mqtt-bridge",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"message": "Service running, sensor status unavailable"
|
||||||
|
}
|
||||||
|
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.send_header('Content-type', 'application/json')
|
self.send_header('Content-type', 'application/json')
|
||||||
@ -59,18 +89,46 @@ class HealthCheckServer:
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logging.info(f"Attempting to start health check server on port {self.port}")
|
||||||
# Bind to all interfaces to make it accessible from outside container
|
# Bind to all interfaces to make it accessible from outside container
|
||||||
self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
|
self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
|
||||||
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
|
|
||||||
|
# Test if the server can actually bind to the port
|
||||||
|
logging.info(f"Successfully bound to 0.0.0.0:{self.port}")
|
||||||
|
|
||||||
|
self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
|
||||||
self.thread.start()
|
self.thread.start()
|
||||||
|
|
||||||
|
# Give the server a moment to start
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
logging.info(f"Health check server started on 0.0.0.0:{self.port}")
|
logging.info(f"Health check server started on 0.0.0.0:{self.port}")
|
||||||
logging.info(f"Health check endpoints:")
|
logging.info(f"Health check endpoints:")
|
||||||
logging.info(f" - http://0.0.0.0:{self.port}/health")
|
logging.info(f" - http://0.0.0.0:{self.port}/health")
|
||||||
logging.info(f" - http://0.0.0.0:{self.port}/sensors")
|
logging.info(f" - http://0.0.0.0:{self.port}/sensors")
|
||||||
|
logging.info("Health check server is ready for external health checks")
|
||||||
|
|
||||||
|
except OSError as e:
|
||||||
|
if e.errno == 98: # Address already in use
|
||||||
|
logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
|
||||||
|
else:
|
||||||
|
logging.error(f"OS error starting health check server: {e}")
|
||||||
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to start health check server: {e}")
|
logging.error(f"Failed to start health check server: {e}")
|
||||||
raise e # Re-raise to make the issue visible
|
raise e # Re-raise to make the issue visible
|
||||||
|
|
||||||
|
def _serve_with_error_handling(self):
|
||||||
|
"""Serve forever with error handling"""
|
||||||
|
try:
|
||||||
|
logging.info("Health check server thread started, beginning to serve requests")
|
||||||
|
if self.server:
|
||||||
|
self.server.serve_forever()
|
||||||
|
else:
|
||||||
|
logging.error("Health check server is None, cannot serve requests")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Health check server error: {e}", exc_info=True)
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
"""Stop the health check server"""
|
"""Stop the health check server"""
|
||||||
if self.server:
|
if self.server:
|
||||||
|
9
main.py
9
main.py
@ -11,7 +11,16 @@ Usage:
|
|||||||
Author: POE Project
|
Author: POE Project
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
from sensor_bridge import main_loop
|
from sensor_bridge import main_loop
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logging.info("POE Sensor Bridge starting up...")
|
||||||
|
|
||||||
|
# Give the system a moment to initialize
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
logging.info("Starting main sensor loop...")
|
||||||
main_loop()
|
main_loop()
|
@ -29,8 +29,8 @@ job "poe-sensor" {
|
|||||||
# Update strategy
|
# Update strategy
|
||||||
update {
|
update {
|
||||||
max_parallel = 1
|
max_parallel = 1
|
||||||
min_healthy_time = "30s"
|
min_healthy_time = "60s"
|
||||||
healthy_deadline = "3m"
|
healthy_deadline = "5m"
|
||||||
progress_deadline = "10m"
|
progress_deadline = "10m"
|
||||||
auto_revert = true
|
auto_revert = true
|
||||||
canary = 0
|
canary = 0
|
||||||
@ -50,11 +50,12 @@ job "poe-sensor" {
|
|||||||
check {
|
check {
|
||||||
type = "http"
|
type = "http"
|
||||||
path = "/health"
|
path = "/health"
|
||||||
interval = "30s"
|
interval = "60s"
|
||||||
timeout = "10s"
|
timeout = "15s"
|
||||||
|
initial_status = "passing"
|
||||||
check_restart {
|
check_restart {
|
||||||
limit = 3
|
limit = 2
|
||||||
grace = "10s"
|
grace = "15s"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -236,8 +236,8 @@ def main_loop():
|
|||||||
|
|
||||||
logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors")
|
logging.info(f"Starting monitoring of {len(MODBUS_HOSTS)} sensors")
|
||||||
logging.info("System status can be monitored at:")
|
logging.info("System status can be monitored at:")
|
||||||
logging.info(f" - Health: http://localhost:8080/health")
|
logging.info(f" - Health: http://0.0.0.0:8080/health")
|
||||||
logging.info(f" - Sensors: http://localhost:8080/sensors")
|
logging.info(f" - Sensors: http://0.0.0.0:8080/sensors")
|
||||||
|
|
||||||
# Main loop to read and publish data from all hosts
|
# Main loop to read and publish data from all hosts
|
||||||
while True:
|
while True:
|
||||||
|
Reference in New Issue
Block a user