253 lines
10 KiB
Python
253 lines
10 KiB
Python
import json
|
|
import threading
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
from datetime import datetime, timezone
|
|
import logging
|
|
import time
|
|
import socket
|
|
import os
|
|
|
|
from config import HEALTH_CHECK_PORT, HEALTH_CHECK_ENABLED
|
|
|
|
class HealthCheckHandler(BaseHTTPRequestHandler):
    """Answer HTTP GET probes with small JSON documents.

    Routes (the query string, if any, is ignored):

    * ``/health``  -- always 200; ``status`` is ``"healthy"`` or ``"degraded"``.
    * ``/ready``   -- 200 when the sensor tracker can be obtained, else 503.
    * ``/sensors`` -- 200 with the result of
      ``sensor_tracker.get_all_sensor_status()``, or 500 with an error document.

    Any other path gets an empty 404.
    """

    _SERVICE_NAME = "modbus-mqtt-bridge"
    _SERVICE_VERSION = "1.0.0"

    def do_GET(self):
        """Dispatch a GET request to the handler registered for its path."""
        # Probes frequently append a query string (cache busters, tags);
        # only the path component selects the route.
        route = self.path.split('?', 1)[0].split('#', 1)[0]

        routes = {
            '/health': self.send_health_response,
            '/sensors': self.send_sensors_status,
            '/ready': self.send_readiness_response,
        }
        handler = routes.get(route)
        if handler is None:
            self.send_response(404)
            self.send_header('Content-Length', '0')
            self.end_headers()
            return
        handler()

    @staticmethod
    def _encode_json(payload):
        """Serialize *payload* to indented, encoded JSON bytes.

        Raises whatever ``json.dumps`` raises for non-serializable values.
        """
        return json.dumps(payload, indent=2).encode()

    def _send_json_bytes(self, status, body):
        """Write a complete JSON response whose body is already encoded.

        Keeping serialization out of this method guarantees that nothing can
        fail between the status line and the body for encoding reasons.
        """
        self.send_response(status)
        self.send_header('Content-type', 'application/json')
        self.send_header('Cache-Control', 'no-cache')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_json(self, status, payload):
        """Serialize *payload* first, then send it with HTTP *status*."""
        self._send_json_bytes(status, self._encode_json(payload))

    def _timestamp(self):
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def send_health_response(self):
        """Send the liveness document; the HTTP status is always 200.

        The service counts as healthy as long as this handler can answer.
        Sensor problems only downgrade ``status`` to ``"degraded"``: when
        sensors are configured and none is online, or when the reported
        health percentage is below 50. If the sensor summary cannot be
        obtained or serialized, a reduced document without sensor details
        is sent instead.
        """
        try:
            from sensor_tracker import get_sensor_tracker
            summary = get_sensor_tracker().get_summary()

            health_data = {
                "status": "healthy",
                "timestamp": self._timestamp(),
                "service": self._SERVICE_NAME,
                "version": self._SERVICE_VERSION,
                "sensors": {
                    "total": summary.get('total_sensors', 0),
                    "online": summary.get('online_sensors', 0),
                    "offline": summary.get('offline_sensors', 0),
                    "unknown": summary.get('unknown_sensors', 0),
                    "health_percentage": summary.get('health_percentage', 0.0),
                },
                "uptime": self._get_uptime(),
                "host": socket.gethostname(),
            }

            # Offline sensors never make the service "unhealthy"; that is
            # what the degraded status is for.
            if summary.get('total_sensors', 0) > 0:
                if summary.get('online_sensors', 0) == 0:
                    health_data["status"] = "degraded"
                    health_data["message"] = "All sensors offline but service is running"
                elif summary.get('health_percentage', 0) < 50:
                    health_data["status"] = "degraded"
                    health_data["message"] = f"Low sensor health: {summary.get('health_percentage', 0):.1f}%"

            # Encode inside the try so an unserializable summary value falls
            # back to the reduced document instead of failing mid-response.
            body = self._encode_json(health_data)
        except Exception as e:
            # Sensor status is optional for liveness: still report healthy.
            logging.warning(f"Could not get sensor status for health check: {e}")
            body = self._encode_json({
                "status": "healthy",
                "timestamp": self._timestamp(),
                "service": self._SERVICE_NAME,
                "version": self._SERVICE_VERSION,
                "message": "Service running, sensor status unavailable",
                "uptime": self._get_uptime(),
                "host": socket.gethostname(),
            })

        self._send_json_bytes(200, body)

    def send_readiness_response(self):
        """Send the readiness document: 200 when ready, 503 otherwise.

        Readiness is established by importing ``sensor_tracker`` and calling
        ``get_sensor_tracker()`` without an exception; the returned object is
        not inspected.
        """
        try:
            from sensor_tracker import get_sensor_tracker
            get_sensor_tracker()
        except Exception as e:
            logging.error(f"Readiness check failed: {e}")
            self._send_json(503, {
                "ready": False,
                "timestamp": self._timestamp(),
                "service": self._SERVICE_NAME,
                "error": str(e),
            })
            return

        self._send_json(200, {
            "ready": True,
            "timestamp": self._timestamp(),
            "service": self._SERVICE_NAME,
            "checks": {
                "sensor_tracker": True,
                "health_server": True,
            },
        })

    def send_sensors_status(self):
        """Send the detailed sensor status, or a 500 error document.

        The status is fetched and serialized before any header is written,
        so a failure in either step yields one clean 500 response rather
        than an error document appended to a half-sent 200.
        """
        try:
            from sensor_tracker import get_all_sensor_status
            body = self._encode_json(get_all_sensor_status())
        except Exception as e:
            logging.error(f"Error getting sensor status: {e}")
            self._send_json(500, {
                "error": "Failed to get sensor status",
                "message": str(e),
                "timestamp": self._timestamp(),
            })
            return

        self._send_json_bytes(200, body)

    def _get_uptime(self):
        """Return the first field of ``/proc/uptime`` formatted as ``"<s>s"``.

        Returns ``"unknown"`` when the file is missing, unreadable or not in
        the expected format (for example on non-Linux hosts).

        NOTE(review): ``/proc/uptime`` reports system uptime, not the time
        since this service started -- confirm that is what consumers of the
        ``uptime`` field expect.
        """
        try:
            with open('/proc/uptime', 'r') as f:
                uptime_seconds = float(f.readline().split()[0])
        except (OSError, ValueError, IndexError):
            return "unknown"
        return f"{uptime_seconds:.1f}s"

    def log_message(self, format, *args):
        """Route the base class's access log through ``logging`` at INFO."""
        logging.info(f"Health Check - {format % args}")
|
|
|
|
class HealthCheckServer:
    """Run ``HealthCheckHandler`` on a background daemon thread.

    Attributes:
        port: TCP port to listen on.
        server: The ``HTTPServer`` instance while one exists, else ``None``.
        thread: The serving thread while one exists, else ``None``.
        started: True once the server answered its own ``/health`` probe.
    """

    def __init__(self, port=None):
        """Store the port; nothing is bound until :meth:`start`.

        Args:
            port: Port to listen on. When falsy, the ``HEALTH_CHECK_PORT``
                environment variable is used, falling back to the
                ``HEALTH_CHECK_PORT`` value imported from ``config``.
        """
        self.port = port or int(os.getenv('HEALTH_CHECK_PORT', HEALTH_CHECK_PORT))
        self.server = None
        self.thread = None
        self.started = False

    def start(self):
        """Start the health check server in a separate thread.

        Returns:
            True when the server is up and answered a ``/health`` request
            (or was already running); False when health checks are disabled
            or startup failed. On failure any partially started server is
            torn down again.
        """
        if not HEALTH_CHECK_ENABLED:
            logging.info("Health check server is disabled")
            return False

        if self.is_running():
            logging.info(f"Health check server already running on port {self.port}")
            return True

        import errno

        try:
            logging.info(f"Attempting to start health check server on port {self.port}")

            # Bind directly instead of probing the port first: a separate
            # probe is racy and does not use the same socket options as
            # HTTPServer, so it can reject a port the server could bind.
            # Bind to all interfaces so the endpoint is reachable from
            # outside a container.
            self.server = HTTPServer(('0.0.0.0', self.port), HealthCheckHandler)
            logging.info(f"Successfully bound to 0.0.0.0:{self.port}")

            self.thread = threading.Thread(target=self._serve_with_error_handling, daemon=True)
            self.thread.start()

            # No sleep needed: the socket is already listening once
            # HTTPServer is constructed, so the probe below simply waits in
            # the accept backlog until the thread begins serving.
            if self._test_health_endpoint():
                self.started = True
                logging.info(f"Health check server started successfully on 0.0.0.0:{self.port}")
                logging.info("Health check endpoints:")
                logging.info(f" - http://0.0.0.0:{self.port}/health")
                logging.info(f" - http://0.0.0.0:{self.port}/sensors")
                logging.info(f" - http://0.0.0.0:{self.port}/ready")
                logging.info("Health check server is ready for external health checks")
                return True

            logging.error("Health check server started but endpoints are not responding")
            self._teardown()
            return False

        except OSError as e:
            if e.errno == errno.EADDRINUSE:
                logging.error(f"Port {self.port} is already in use. Cannot start health check server.")
            else:
                logging.error(f"OS error starting health check server: {e}")
            self._teardown()
            return False
        except Exception as e:
            logging.error(f"Failed to start health check server: {e}")
            self._teardown()
            return False

    def _is_port_available(self, port):
        """Return True if a TCP socket can currently be bound to *port*.

        The result is only a snapshot; another process may take the port
        right afterwards. :meth:`start` therefore does not rely on it.
        """
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                # Mirror HTTPServer's address-reuse option so the answer
                # matches what the real bind would do. Skipped on Windows,
                # where the option would let the bind succeed on a busy port.
                if os.name != 'nt' and hasattr(socket, 'SO_REUSEADDR'):
                    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                s.bind(('0.0.0.0', port))
                return True
        except OSError:
            return False

    def _test_health_endpoint(self):
        """Return True if ``GET /health`` on the loopback answers 200.

        Uses ``http.client`` against 127.0.0.1 so the probe cannot be
        redirected through an ``http_proxy`` configured in the environment.
        """
        import http.client

        try:
            conn = http.client.HTTPConnection('127.0.0.1', self.port, timeout=5)
            try:
                conn.request('GET', '/health')
                return conn.getresponse().status == 200
            finally:
                conn.close()
        except Exception as e:
            logging.warning(f"Health endpoint test failed: {e}")
            return False

    def _serve_with_error_handling(self):
        """Thread target: serve until shutdown, logging any fatal error."""
        try:
            logging.info("Health check server thread started, beginning to serve requests")
            if self.server:
                self.server.serve_forever()
            else:
                logging.error("Health check server is None, cannot serve requests")
        except Exception as e:
            logging.error(f"Health check server error: {e}", exc_info=True)
            self.started = False

    def _teardown(self):
        """Shut down and release the server, if any.

        Returns:
            True if a server existed and was closed, False otherwise.
        """
        server, worker = self.server, self.thread
        self.started = False
        if server is None:
            self.thread = None
            return False

        # shutdown() waits for serve_forever() to finish, so only call it
        # when a serving thread exists. self.server is cleared afterwards,
        # so a thread that has not yet entered serve_forever() still finds it.
        if worker is not None and worker.is_alive():
            server.shutdown()
        server.server_close()
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=5)

        self.server = None
        self.thread = None
        return True

    def stop(self):
        """Stop the health check server; a no-op when none is running."""
        if self._teardown():
            logging.info("Health check server stopped")

    def is_running(self):
        """Return True if the server started and its thread is still alive."""
        return bool(self.started and self.thread is not None and self.thread.is_alive())
|