Files
nomad_mcp/mcp_server.py
Nicolas Koehl dc2fe4c425 Enhance log analysis with advanced filtering and search capabilities
Add comprehensive log analysis features to improve troubleshooting workflows:
- Time-based filtering (8pm-6am shifts) with HH:MM format support
- Multi-level log filtering (ERROR, WARNING, EMERGENCY, etc.)
- Full-text search across log content with case-insensitive matching
- Proper line break formatting for readable output
- Line count limiting for large log files

New REST API endpoints:
- /api/logs/errors/{job_id} - Get only error/warning logs
- /api/logs/search/{job_id} - Search logs for specific terms
- Enhanced /api/logs/job/{job_id} with filtering parameters

New MCP tools:
- get_error_logs - Streamlined error analysis
- search_job_logs - Pattern-based log searching
- Enhanced get_job_logs with all filtering options

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-02 17:42:35 +07:00

925 lines
34 KiB
Python

#!/usr/bin/env python3
"""
Nomad MCP Server for Claude Desktop App
Provides MCP tools for managing HashiCorp Nomad jobs
"""
import asyncio
import json
import logging
import os
import sys
import requests
from typing import Any, Dict, List, Optional
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
import mcp.server.stdio
import mcp.types as types
from app.services.nomad_client import NomadService
from app.schemas.claude_api import ClaudeJobSpecification
# Configure root logging at INFO level. basicConfig's default handler writes
# to stderr; presumably that matters here because main() serves over the stdio
# transport (confirm that stdout must stay free of log output).
logging.basicConfig(level=logging.INFO)
# Module-wide logger used by the tool handlers to report failures.
logger = logging.getLogger("nomad-mcp")
# Create the server instance; the @server.list_tools() and @server.call_tool()
# decorators below register their handlers on this object.
server = Server("nomad-mcp")
def _schema_property(kind: str, description: str, **extra: Any) -> Dict[str, Any]:
    """Build one JSON-schema property: type, description, then any extra keys in call order."""
    return {"type": kind, "description": description, **extra}


def _namespace_property(description: str = "Nomad namespace") -> Dict[str, Any]:
    """Build the namespace property shared by every tool (defaults to 'development')."""
    return _schema_property("string", description, default="development")


def _tool(name: str, description: str, properties: Dict[str, Any], required=None) -> types.Tool:
    """Assemble a Tool whose input schema is an object with the given properties."""
    schema: Dict[str, Any] = {"type": "object", "properties": properties}
    # Tools without mandatory arguments carry no "required" key at all.
    if required is not None:
        schema["required"] = list(required)
    return types.Tool(name=name, description=description, inputSchema=schema)


@server.list_tools()
async def handle_list_tools() -> List[types.Tool]:
    """Return the definitions of every Nomad management tool this server offers.

    Each entry pairs a tool name and description with the JSON schema of the
    arguments that handle_call_tool reads for that tool.
    """

    def job_id(description: str) -> Dict[str, Any]:
        # job_id is always a plain string; only its description varies per tool.
        return _schema_property("string", description)

    def log_type() -> Dict[str, Any]:
        return _schema_property(
            "string",
            "Type of logs: stdout or stderr",
            enum=["stdout", "stderr"],
            default="stderr",
        )

    def time_filters() -> Dict[str, Any]:
        # Optional HH:MM window used by the log tools.
        return {
            "start_time": _schema_property(
                "string",
                "Start time filter (HH:MM format, e.g., '20:00' for 8 PM)",
            ),
            "end_time": _schema_property(
                "string",
                "End time filter (HH:MM format, e.g., '06:00' for 6 AM)",
            ),
        }

    return [
        _tool(
            "list_nomad_jobs",
            "List all Nomad jobs in a namespace",
            {"namespace": _namespace_property()},
        ),
        _tool(
            "get_job_status",
            "Get the status of a specific Nomad job",
            {
                "job_id": job_id("ID of the job to check"),
                "namespace": _namespace_property(),
            },
            ["job_id"],
        ),
        _tool(
            "stop_job",
            "Stop a running Nomad job",
            {
                "job_id": job_id("ID of the job to stop"),
                "namespace": _namespace_property(),
                "purge": _schema_property(
                    "boolean", "Whether to purge the job", default=False
                ),
            },
            ["job_id"],
        ),
        _tool(
            "restart_job",
            "Restart a Nomad job",
            {
                "job_id": job_id("ID of the job to restart"),
                "namespace": _namespace_property(),
            },
            ["job_id"],
        ),
        _tool(
            "create_job",
            "Create a new Nomad job",
            {
                "job_id": job_id("Unique ID for the job"),
                "name": _schema_property("string", "Display name for the job"),
                "type": _schema_property(
                    "string", "Job type (service, batch, etc.)", default="service"
                ),
                "datacenters": _schema_property(
                    "array",
                    "List of datacenters to run the job in",
                    items={"type": "string"},
                    default=["jm"],
                ),
                "namespace": _namespace_property(),
                "docker_image": _schema_property("string", "Docker image to run"),
                "count": _schema_property(
                    "integer", "Number of instances to run", default=1
                ),
                "cpu": _schema_property(
                    "integer", "CPU allocation in MHz", default=100
                ),
                "memory": _schema_property(
                    "integer", "Memory allocation in MB", default=128
                ),
                "ports": _schema_property(
                    "array", "Port mappings", items={"type": "object"}, default=[]
                ),
                "env_vars": _schema_property(
                    "object", "Environment variables for the container", default={}
                ),
            },
            ["job_id", "name", "docker_image"],
        ),
        _tool(
            "get_job_logs",
            "Get logs for a Nomad job with advanced filtering options",
            {
                "job_id": job_id("ID of the job to get logs for"),
                "namespace": _namespace_property(),
                "log_type": log_type(),
                **time_filters(),
                "log_level": _schema_property(
                    "string",
                    "Filter by log level: ERROR, WARNING, INFO, DEBUG, EMERGENCY, CRITICAL",
                ),
                "search": _schema_property("string", "Search term to filter logs"),
                "lines": _schema_property(
                    "integer", "Number of recent log lines to return"
                ),
                "limit": _schema_property(
                    "integer", "Number of allocations to check", default=1
                ),
            },
            ["job_id"],
        ),
        _tool(
            "get_error_logs",
            "Get only error and warning logs for a Nomad job, useful for troubleshooting",
            {
                "job_id": job_id("ID of the job to get error logs for"),
                "namespace": _namespace_property(),
                **time_filters(),
            },
            ["job_id"],
        ),
        _tool(
            "search_job_logs",
            "Search Nomad job logs for specific terms or patterns",
            {
                "job_id": job_id("ID of the job to search logs for"),
                "search_term": _schema_property(
                    "string", "Term or pattern to search for in logs"
                ),
                "namespace": _namespace_property(),
                "log_type": log_type(),
                "lines": _schema_property(
                    "integer", "Number of matching lines to return", default=50
                ),
            },
            ["job_id", "search_term"],
        ),
        _tool(
            "submit_job_file",
            "Submit a Nomad job from HCL or JSON file content",
            {
                "file_content": _schema_property(
                    "string", "Content of the Nomad job file (HCL or JSON format)"
                ),
                "file_type": _schema_property(
                    "string",
                    "Type of file content: 'hcl' or 'json'",
                    enum=["hcl", "json"],
                    default="json",
                ),
                "namespace": _namespace_property(
                    "Nomad namespace to submit the job to"
                ),
            },
            ["file_content"],
        ),
        _tool(
            "get_allocation_status",
            "Get detailed status of job allocations",
            {
                "job_id": job_id("ID of the job to check allocations for"),
                "namespace": _namespace_property(),
            },
            ["job_id"],
        ),
        _tool(
            "get_job_evaluations",
            "Get evaluations for a job to understand placement and failures",
            {
                "job_id": job_id("ID of the job to get evaluations for"),
                "namespace": _namespace_property(),
            },
            ["job_id"],
        ),
        _tool(
            "force_evaluate_job",
            "Force a new evaluation for a job (retry failed placements)",
            {
                "job_id": job_id("ID of the job to force evaluate"),
                "namespace": _namespace_property(),
            },
            ["job_id"],
        ),
    ]
# Message returned by every tool that needs a job_id but did not receive one.
_MISSING_JOB_ID = "Error: job_id is required"
# Upper bound, in seconds, for every outbound HTTP request made by the tools.
_HTTP_TIMEOUT_SECONDS = 30


def _text_response(text: str) -> List[types.TextContent]:
    """Wrap *text* in the single-item content list that tool calls return."""
    return [types.TextContent(type="text", text=text)]


def _json_response(payload: Any) -> List[types.TextContent]:
    """Serialize *payload* as indented JSON and wrap it as a tool response."""
    return _text_response(json.dumps(payload, indent=2))


def _quote_path_segment(value: Any) -> str:
    """Percent-encode *value* so it can be used as a single URL path segment."""
    from urllib.parse import quote

    # safe="" also encodes "/", so a job id cannot add path segments or a query.
    return quote(str(value), safe="")


def _request_logs(endpoint: str, job_id: str, params: Dict[str, Any]) -> requests.Response:
    """GET ``{BASE_URL}/api/logs/{endpoint}/{job_id}`` with the given query parameters."""
    base_url = os.getenv("BASE_URL", "http://localhost:8000")
    url = f"{base_url}/api/logs/{endpoint}/{_quote_path_segment(job_id)}"
    return requests.get(url, params=params, timeout=_HTTP_TIMEOUT_SECONDS)


def _copy_present(arguments: Dict[str, Any], params: Dict[str, Any], keys) -> None:
    """Copy each truthy argument named in *keys* into *params*, keeping key order."""
    for key in keys:
        if arguments.get(key):
            params[key] = arguments[key]


def _tool_list_nomad_jobs(nomad_service, arguments, namespace):
    """List jobs in the namespace, reduced to id/name/status/type."""
    summaries = [
        {
            "id": job.get("ID"),
            "name": job.get("Name"),
            "status": job.get("Status"),
            "type": job.get("Type"),
            "namespace": namespace,
        }
        for job in nomad_service.list_jobs()
    ]
    return _json_response(summaries)


def _tool_get_job_status(nomad_service, arguments, namespace):
    """Report a job's status together with its most recently created allocation."""
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    job = nomad_service.get_job(job_id)
    allocations = nomad_service.get_allocations(job_id)
    latest_alloc = None
    if allocations:
        # Newest allocation wins; allocations lacking CreateTime sort as oldest.
        latest_alloc = max(allocations, key=lambda a: a.get("CreateTime", 0))
    status = job.get("Status", "unknown")
    return _json_response({
        "job_id": job_id,
        "status": status,
        "message": f"Job {job_id} is {status}",
        "details": {
            "job": job,
            "latest_allocation": latest_alloc,
        },
    })


def _tool_stop_job(nomad_service, arguments, namespace):
    """Stop a job, optionally purging it."""
    job_id = arguments.get("job_id")
    purge = arguments.get("purge", False)
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    result = nomad_service.stop_job(job_id, purge=purge)
    return _json_response({
        "success": True,
        "job_id": job_id,
        "status": "stopped",
        "message": f"Job {job_id} has been stopped" + (" and purged" if purge else ""),
        "details": result,
    })


def _tool_restart_job(nomad_service, arguments, namespace):
    """Restart a job by stopping it and re-submitting its current specification."""
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    # Capture the spec first: it is what gets re-submitted after the stop.
    job_spec = nomad_service.get_job(job_id)
    # NOTE: if start_job fails after stop_job succeeded, the job is left stopped.
    nomad_service.stop_job(job_id)
    result = nomad_service.start_job(job_spec)
    return _json_response({
        "success": True,
        "job_id": job_id,
        "status": "restarted",
        "message": f"Job {job_id} has been restarted",
        "details": result,
    })


def _tool_create_job(nomad_service, arguments, namespace):
    """Create and start a job from the simplified ClaudeJobSpecification fields."""
    for arg in ("job_id", "name", "docker_image"):
        if not arguments.get(arg):
            return _text_response(f"Error: {arg} is required")
    job_spec = ClaudeJobSpecification(**arguments)
    # The specification's own namespace overrides the one derived from the arguments.
    if job_spec.namespace:
        nomad_service.namespace = job_spec.namespace
    result = nomad_service.start_job(job_spec.to_nomad_job_spec())
    return _json_response({
        "success": True,
        "job_id": job_spec.job_id,
        "status": "started",
        "message": f"Job {job_spec.job_id} has been created and started",
        "details": result,
    })


def _tool_get_job_logs(nomad_service, arguments, namespace):
    """Fetch job logs through the REST API, applying any optional filters.

    Returns the raw log text on success and a JSON error object otherwise.
    *nomad_service* is unused; logs come from the HTTP endpoint.
    """
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    params = {
        "namespace": arguments.get("namespace", namespace),
        "log_type": arguments.get("log_type", "stderr"),
        "limit": arguments.get("limit", 1),
        "plain_text": True,
        "formatted": True,
    }
    _copy_present(
        arguments, params, ("start_time", "end_time", "log_level", "search", "lines")
    )
    try:
        response = _request_logs("job", job_id, params)
        if response.status_code == 200:
            return _text_response(response.text)
        message = f"Failed to get logs: {response.status_code} - {response.text}"
    except Exception as e:
        message = f"Error getting logs: {str(e)}"
    return _json_response({
        "success": False,
        "job_id": job_id,
        "message": message,
        "logs": None,
    })


def _tool_get_error_logs(nomad_service, arguments, namespace):
    """Fetch only error/warning logs for a job through the REST API.

    *nomad_service* is unused; logs come from the HTTP endpoint.
    """
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    params = {
        "namespace": arguments.get("namespace", namespace),
        "plain_text": True,
    }
    _copy_present(arguments, params, ("start_time", "end_time"))
    try:
        response = _request_logs("errors", job_id, params)
        if response.status_code == 200:
            return _text_response(response.text)
        return _text_response(
            f"Error: Failed to get error logs: {response.status_code} - {response.text}"
        )
    except Exception as e:
        return _text_response(f"Error getting error logs: {str(e)}")


def _tool_search_job_logs(nomad_service, arguments, namespace):
    """Search a job's logs for a term through the REST API.

    *nomad_service* is unused; logs come from the HTTP endpoint.
    """
    job_id = arguments.get("job_id")
    search_term = arguments.get("search_term")
    if not job_id or not search_term:
        return _text_response("Error: job_id and search_term are required")
    params = {
        "q": search_term,
        "namespace": arguments.get("namespace", namespace),
        "log_type": arguments.get("log_type", "stderr"),
        "lines": arguments.get("lines", 50),
        "plain_text": True,
    }
    try:
        response = _request_logs("search", job_id, params)
        if response.status_code == 200:
            return _text_response(response.text)
        return _text_response(
            f"Error: Failed to search logs: {response.status_code} - {response.text}"
        )
    except Exception as e:
        return _text_response(f"Error searching logs: {str(e)}")


def _tool_submit_job_file(nomad_service, arguments, namespace):
    """Submit a job from raw file content; only JSON is currently parsed."""
    file_content = arguments.get("file_content")
    file_type = arguments.get("file_type", "json")
    if not file_content:
        return _text_response("Error: file_content is required")
    try:
        kind = file_type.lower()
        if kind == "hcl":
            return _text_response(
                "Error: HCL parsing not yet implemented. Please provide JSON format."
            )
        if kind != "json":
            return _text_response(
                f"Error: Unsupported file type '{file_type}'. Use 'json' or 'hcl'."
            )
        job_spec = json.loads(file_content)
        result = nomad_service.start_job(job_spec)
        return _json_response({
            "success": True,
            "job_id": result.get("job_id"),
            "status": "submitted",
            "message": f"Job {result.get('job_id')} has been submitted from {file_type} file",
            "details": result,
        })
    except json.JSONDecodeError as e:
        return _text_response(f"Error: Invalid JSON format - {str(e)}")
    except Exception as e:
        return _text_response(f"Error submitting job: {str(e)}")


def _tool_get_allocation_status(nomad_service, arguments, namespace):
    """Summarize every allocation of a job."""
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    allocations = nomad_service.get_allocations(job_id)
    detailed_allocations = [
        {
            "allocation_id": alloc.get("ID"),
            "name": alloc.get("Name"),
            "client_status": alloc.get("ClientStatus"),
            "desired_status": alloc.get("DesiredStatus"),
            "job_id": alloc.get("JobID"),
            "task_group": alloc.get("TaskGroup"),
            "node_id": alloc.get("NodeID"),
            "create_time": alloc.get("CreateTime"),
            "modify_time": alloc.get("ModifyTime"),
            "task_states": alloc.get("TaskStates", {}),
            "failed": alloc.get("Failed", False),
            "deployment_status": alloc.get("DeploymentStatus", {}),
        }
        for alloc in allocations
    ]
    return _json_response({
        "job_id": job_id,
        "total_allocations": len(allocations),
        "allocations": detailed_allocations,
        "message": f"Found {len(allocations)} allocations for job {job_id}",
    })


def _tool_get_job_evaluations(nomad_service, arguments, namespace):
    """Summarize the evaluations recorded for a job."""
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    try:
        evaluations = nomad_service.get_job_evaluations(job_id)
        simplified_evals = [
            {
                "eval_id": eval_item.get("ID"),
                "status": eval_item.get("Status"),
                "type": eval_item.get("Type"),
                "triggered_by": eval_item.get("TriggeredBy"),
                "job_id": eval_item.get("JobID"),
                "create_time": eval_item.get("CreateTime"),
                "modify_time": eval_item.get("ModifyTime"),
                "wait_until": eval_item.get("WaitUntil"),
                "blocked_eval": eval_item.get("BlockedEval"),
                "failed_tg_allocs": eval_item.get("FailedTGAllocs", {}),
                "class_eligibility": eval_item.get("ClassEligibility", {}),
                "quota_limit_reached": eval_item.get("QuotaLimitReached"),
            }
            for eval_item in evaluations
        ]
        return _json_response({
            "job_id": job_id,
            "total_evaluations": len(evaluations),
            "evaluations": simplified_evals,
            "message": f"Found {len(evaluations)} evaluations for job {job_id}",
        })
    except Exception as e:
        return _text_response(f"Error getting evaluations: {str(e)}")


def _tool_force_evaluate_job(nomad_service, arguments, namespace):
    """Force a new evaluation by POSTing to the job's /evaluate endpoint."""
    job_id = arguments.get("job_id")
    if not job_id:
        return _text_response(_MISSING_JOB_ID)
    try:
        client = nomad_service.client
        # NOTE(review): the scheme is hard-coded to http although the verify
        # flag below only matters for https; confirm whether TLS clusters
        # need to be supported here.
        nomad_addr = f"http://{client.host}:{client.port}"
        url = f"{nomad_addr}/v1/job/{_quote_path_segment(job_id)}/evaluate"
        headers = {}
        token = getattr(client, "token", None)
        if token:
            headers["X-Nomad-Token"] = token
        response = requests.post(
            url=url,
            headers=headers,
            params={"namespace": nomad_service.namespace},
            verify=os.getenv("NOMAD_SKIP_VERIFY", "false").lower() != "true",
            # Without a timeout an unresponsive Nomad agent would block this call forever.
            timeout=_HTTP_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            return _text_response(
                f"Error: Failed to force evaluation - {response.text}"
            )
        response_data = response.json()
        return _json_response({
            "success": True,
            "job_id": job_id,
            "eval_id": response_data.get("EvalID"),
            "status": "evaluation_forced",
            "message": f"Forced evaluation for job {job_id}",
            "details": response_data,
        })
    except Exception as e:
        return _text_response(f"Error forcing evaluation: {str(e)}")


# Tool name -> implementation; each takes (nomad_service, arguments, namespace).
_TOOL_HANDLERS: Dict[str, Any] = {
    "list_nomad_jobs": _tool_list_nomad_jobs,
    "get_job_status": _tool_get_job_status,
    "stop_job": _tool_stop_job,
    "restart_job": _tool_restart_job,
    "create_job": _tool_create_job,
    "get_job_logs": _tool_get_job_logs,
    "get_error_logs": _tool_get_error_logs,
    "search_job_logs": _tool_search_job_logs,
    "submit_job_file": _tool_submit_job_file,
    "get_allocation_status": _tool_get_allocation_status,
    "get_job_evaluations": _tool_get_job_evaluations,
    "force_evaluate_job": _tool_force_evaluate_job,
}


@server.call_tool()
async def handle_call_tool(name: str, arguments: Dict[str, Any]) -> List[types.TextContent]:
    """Handle tool calls from Claude.

    Args:
        name: Name of the tool to run; must be a key of ``_TOOL_HANDLERS``.
        arguments: Tool arguments. ``namespace`` (default ``"development"``)
            selects the Nomad namespace for every tool.

    Returns:
        A single-item list of text content: the tool's result (JSON or plain
        log text) or a message starting with ``Error``. Exceptions are never
        propagated; they are logged and reported as ``Error: <message>``.
    """
    handler = _TOOL_HANDLERS.get(name)
    if handler is None:
        # Reject unknown tools before constructing a Nomad service for them.
        return _text_response(f"Error: Unknown tool '{name}'")
    try:
        arguments = arguments or {}
        nomad_service = NomadService()
        namespace = arguments.get("namespace", "development")
        nomad_service.namespace = namespace
        # NOTE: handlers are synchronous (requests / NomadService calls), so the
        # event loop is blocked for the duration of the call.
        return handler(nomad_service, arguments, namespace)
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Error in tool '%s': %s", name, e)
        return _text_response(f"Error: {str(e)}")
async def main():
    """Start the Nomad MCP server and serve it over the stdio transport."""
    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
        # Advertise the capabilities derived from the registered handlers.
        capabilities = server.get_capabilities(
            notification_options=NotificationOptions(),
            experimental_capabilities={},
        )
        init_options = InitializationOptions(
            server_name="nomad-mcp",
            server_version="1.0.0",
            capabilities=capabilities,
        )
        await server.run(read_stream, write_stream, init_options)
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())