Update README.md

This commit is contained in:
2025-02-26 15:25:39 +07:00
parent d6acf632e3
commit baf1723a50
69 changed files with 5525 additions and 0 deletions

396
app/routers/jobs.py Normal file
View File

@ -0,0 +1,396 @@
from fastapi import APIRouter, Depends, HTTPException, Body, Query
from typing import Dict, Any, List, Optional
import json
import logging
from app.services.nomad_client import NomadService
from app.services.config_service import ConfigService
from app.schemas.job import JobResponse, JobOperation, JobSpecification
# Router that every endpoint in this module registers itself on.
router = APIRouter()
# Shared service instances, created once when the module is imported.
# Handlers that need a specific namespace build their own NomadService
# and set its namespace instead of mutating this shared one.
nomad_service = NomadService()
config_service = ConfigService()
# Configure logging
logger = logging.getLogger(__name__)
@router.get("/", response_model=List[JobResponse])
async def list_jobs():
    """Return all jobs, each tagged with its repository when one is configured.

    The job dictionaries returned by the Nomad service are annotated in place:
    a "repository" key is added whenever the config service maps the job's
    "ID" to a repository.
    """
    all_jobs = nomad_service.list_jobs()

    for entry in all_jobs:
        identifier = entry.get("ID")
        if not identifier:
            # Without an ID there is nothing to look the repository up by.
            continue

        linked_repo = config_service.get_repository_from_job(identifier)
        if not linked_repo:
            continue

        entry["repository"] = linked_repo

    return all_jobs
@router.get("/{job_id}", response_model=JobResponse)
async def get_job(job_id: str):
    """Return a single job, tagged with its repository when one is configured."""
    job_data = nomad_service.get_job(job_id)

    linked_repo = config_service.get_repository_from_job(job_id)
    if not linked_repo:
        # No repository mapping for this job: return it untouched.
        return job_data

    job_data["repository"] = linked_repo
    return job_data
@router.post("/", response_model=JobOperation)
async def start_job(job_spec: JobSpecification = Body(...)):
    """Submit the posted job specification to Nomad and return the operation result."""
    # The service layer works on plain dictionaries, not on the schema object.
    spec_payload = job_spec.dict()
    operation = nomad_service.start_job(spec_payload)
    return operation
@router.delete("/{job_id}", response_model=JobOperation)
async def stop_job(job_id: str, purge: bool = Query(False)):
    """Stop the job with the given ID; `purge` is forwarded to the Nomad service."""
    operation = nomad_service.stop_job(job_id, purge)
    return operation
@router.get("/{job_id}/allocations")
async def get_job_allocations(job_id: str):
    """Return every allocation the Nomad service reports for the given job."""
    job_allocations = nomad_service.get_allocations(job_id)
    return job_allocations
@router.get("/{job_id}/latest-allocation")
async def get_latest_allocation(job_id: str):
    """Return the most recently created allocation of a job.

    Raises:
        HTTPException: 404 when the job has no allocations.
    """
    job_allocations = nomad_service.get_allocations(job_id)
    if not job_allocations:
        raise HTTPException(status_code=404, detail=f"No allocations found for job {job_id}")

    def created_at(allocation):
        # Allocations without a "CreateTime" are treated as the oldest.
        return allocation.get("CreateTime", 0)

    # max() yields the first item with the largest key, i.e. the same element
    # a descending sort would place at index 0.
    return max(job_allocations, key=created_at)
@router.get("/{job_id}/status")
async def get_job_status(job_id: str, namespace: str = Query(None, description="Nomad namespace")):
    """Get the current status of a job, including deployment and latest allocation.

    Args:
        job_id: ID of the job to inspect.
        namespace: Nomad namespace to query. When omitted, the namespace the
            freshly created NomadService already carries is used.

    Returns:
        A dict with "job_id", "namespace", "status", "stable" and
        "submitted_at", plus optional "deployment" and "latest_allocation"
        sections when that information could be fetched.

    Raises:
        HTTPException: an HTTPException raised while fetching the job itself
            is propagated unchanged (so a missing job keeps its original
            status code); any other failure is reported as a 500.
    """
    try:
        # Use a dedicated client so a namespace override never leaks into the
        # shared module-level service.
        custom_nomad = NomadService()
        if namespace:
            custom_nomad.namespace = namespace
            logger.info(f"Getting job status for {job_id} in namespace {namespace}")
        else:
            logger.info(f"Getting job status for {job_id} in default namespace (development)")

        job = custom_nomad.get_job(job_id)
        status = {
            "job_id": job_id,
            "namespace": namespace or custom_nomad.namespace,
            "status": job.get("Status", "unknown"),
            "stable": job.get("Stable", False),
            "submitted_at": job.get("SubmitTime", 0),
        }

        # Deployment info is optional: log and carry on if it cannot be read.
        try:
            deployment = custom_nomad.get_deployment_status(job_id)
            if deployment:
                status["deployment"] = {
                    "id": deployment.get("ID"),
                    "status": deployment.get("Status"),
                    "description": deployment.get("StatusDescription"),
                }
        except Exception as e:
            logger.warning(f"Failed to get deployment for job {job_id}: {str(e)}")

        # Allocation info is optional as well.
        try:
            allocations = custom_nomad.get_allocations(job_id)
            if allocations:
                # First allocation with the largest CreateTime, i.e. the newest.
                latest_alloc = max(allocations, key=lambda a: a.get("CreateTime", 0))
                status["latest_allocation"] = {
                    "id": latest_alloc.get("ID"),
                    "status": latest_alloc.get("ClientStatus"),
                    "description": latest_alloc.get("ClientDescription", ""),
                    "created_at": latest_alloc.get("CreateTime", 0),
                }
        except Exception as e:
            logger.warning(f"Failed to get allocations for job {job_id}: {str(e)}")

        return status
    except HTTPException:
        # Keep the status code chosen further down (e.g. 404 for an unknown
        # job) instead of flattening everything into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get job status: {str(e)}") from e
@router.get("/{job_id}/specification")
async def get_job_specification(job_id: str, namespace: str = Query(None, description="Nomad namespace"), raw: bool = Query(False)):
    """Return the specification of a job, optionally from a specific namespace.

    With `raw` set, or when the fetched job has no "JobID" key, the job is
    returned exactly as the Nomad service delivered it; otherwise a condensed
    summary is returned. Any failure is reported as a 404.

    NOTE(review): the condensed branch is gated on a "JobID" key but then
    reads "ID" -- confirm which key NomadService.get_job actually returns,
    otherwise this branch may never be taken.
    """
    try:
        # A dedicated client keeps the namespace override local to this request.
        client = NomadService()
        if not namespace:
            logger.info(f"Getting job specification for {job_id} in default namespace (development)")
        else:
            client.namespace = namespace
            logger.info(f"Getting job specification for {job_id} in namespace {namespace}")

        job_data = client.get_job(job_id)

        if raw or "JobID" not in job_data:
            return job_data

        # Extract just the job specification part.
        return {
            "id": job_data.get("ID"),
            "name": job_data.get("Name"),
            "type": job_data.get("Type"),
            "status": job_data.get("Status"),
            "datacenters": job_data.get("Datacenters", []),
            "namespace": job_data.get("Namespace"),
            "task_groups": job_data.get("TaskGroups", []),
            "meta": job_data.get("Meta", {}),
        }
    except Exception as err:
        raise HTTPException(status_code=404, detail=f"Failed to get job specification: {str(err)}")
@router.post("/{job_id}/restart")
async def restart_job(job_id: str):
    """Restart a job by stopping it and starting it again.

    The current specification is read first, the job is stopped, and the same
    specification is then submitted again.

    Returns:
        A dict with "job_id", "status" ("restarted") and the "eval_id" taken
        from the start result.

    Raises:
        HTTPException: an HTTPException raised by the Nomad service is
            propagated unchanged (so an unknown job keeps its original status
            code); any other failure is reported as a 500.
    """
    try:
        # Get the current job specification before the job is stopped.
        job_spec = nomad_service.get_job(job_id)

        # Stop the job
        nomad_service.stop_job(job_id)

        # Start the job with the original specification
        result = nomad_service.start_job(job_spec)

        return {
            "job_id": job_id,
            "status": "restarted",
            "eval_id": result.get("eval_id"),
        }
    except HTTPException:
        # Do not turn a meaningful upstream status (e.g. 404) into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to restart job: {str(e)}") from e
@router.get("/by-repository/{repository}")
async def get_job_by_repository(repository: str):
    """Look up the job mapped to a repository and return it with the repository attached.

    Raises:
        HTTPException: 404 when the repository has no job mapping, and 404
            when fetching the mapped job fails for any reason.
    """
    mapping = config_service.get_job_from_repository(repository)
    if not mapping:
        raise HTTPException(status_code=404, detail=f"No job found for repository: {repository}")

    job_id = mapping.get("job_id")
    target_namespace = mapping.get("namespace")

    try:
        if target_namespace:
            # A mapped namespace gets its own client so the shared one stays untouched.
            client = NomadService()
            client.namespace = target_namespace
        else:
            client = nomad_service

        found_job = client.get_job(job_id)
        found_job["repository"] = repository
        return found_job
    except Exception as err:
        raise HTTPException(status_code=404, detail=f"Job not found: {job_id}, Error: {str(err)}")
def _default_job_spec(job_id: str, repository: str) -> Dict[str, Any]:
    """Build the fallback single-task Docker job spec for a repository."""
    return {
        "ID": job_id,
        "Name": job_id,
        "Type": "service",
        "Datacenters": ["jm"],  # Default datacenter
        "TaskGroups": [
            {
                "Name": "app",
                "Count": 1,
                "Tasks": [
                    {
                        "Name": job_id.split('-')[0],  # Use first part of job ID as task name
                        "Driver": "docker",
                        "Config": {
                            "image": f"registry.dev.meisheng.group/{repository}:latest",
                            "force_pull": True,
                            "ports": ["http"],
                        },
                        "Resources": {
                            "CPU": 500,
                            "MemoryMB": 512,
                        },
                    }
                ],
                "Networks": [
                    {
                        "DynamicPorts": [
                            {
                                "Label": "http",
                                "Value": 0,
                                "To": 8000,
                            }
                        ]
                    }
                ],
            }
        ],
        "Meta": {
            "repository": repository,
        },
    }


def _started_response(job_id, repository, namespace, result) -> Dict[str, Any]:
    """Shape the payload returned once a job has been submitted."""
    return {
        "job_id": job_id,
        "repository": repository,
        "status": "started",
        "eval_id": result.get("eval_id"),
        "namespace": namespace,
    }


@router.post("/by-repository/{repository}/start")
async def start_job_by_repository(repository: str):
    """Start a job by its associated repository.

    The specification of the already registered job is re-submitted when the
    job exists. If fetching or starting it fails with a 404, the specification
    is taken from the repository configuration instead, or, when none is
    configured, from a built-in default Docker service spec.

    Returns:
        A dict with "job_id", "repository", "status" ("started"), "eval_id"
        and "namespace".

    Raises:
        HTTPException: 404 when the repository has no job mapping. Any
            HTTPException other than a 404 raised by the first attempt is
            propagated unchanged instead of being swallowed.
    """
    # Same logger as the module-level one; kept local so the handler stays
    # self-contained.
    logger = logging.getLogger(__name__)

    job_info = config_service.get_job_from_repository(repository)
    if not job_info:
        raise HTTPException(status_code=404, detail=f"No job found for repository: {repository}")

    job_id = job_info.get("job_id")
    namespace = job_info.get("namespace")
    logger.info(f"Starting job for repository {repository}, job_id: {job_id}, namespace: {namespace}")

    # Create a custom service with the specific namespace if provided
    custom_nomad = NomadService()
    if namespace:
        logger.info(f"Setting custom_nomad.namespace to {namespace}")
        custom_nomad.namespace = namespace

    # Log the current namespace being used
    logger.info(f"Nomad client namespace: {custom_nomad.namespace}")

    try:
        # Get the job specification from an existing job
        job_spec = custom_nomad.get_job(job_id)
        logger.info(f"Retrieved job specification for {job_id} from existing job")

        if isinstance(job_spec, dict):
            # Ensure namespace is explicitly set
            if namespace:
                logger.info(f"Setting namespace in job spec to {namespace}")
                job_spec["Namespace"] = namespace
            logger.info(f"Job spec keys: {job_spec.keys()}")

        # Start the job with the retrieved specification
        result = custom_nomad.start_job(job_spec)
        return _started_response(job_id, repository, namespace, result)
    except HTTPException as e:
        if e.status_code != 404:
            # Only "not found" has a fallback; every other upstream error is a
            # real failure the caller must see.
            raise
        logger.info(f"Job {job_id} not found, attempting to get specification from config")

    # Fallback: the job is not registered yet, so build its spec from config.
    job_spec = config_service.get_job_spec_from_repository(repository)
    if not job_spec:
        logger.warning(f"No job specification found for repository {repository}, creating a default one")
        job_spec = _default_job_spec(job_id, repository)

    # Set the namespace explicitly in the job spec. Guarded by the dict check
    # because the configured spec's type is not established in this module.
    if namespace and isinstance(job_spec, dict):
        logger.info(f"Setting namespace in default job spec to {namespace}")
        job_spec["Namespace"] = namespace

    logger.info(f"Starting job {job_id} with specification")

    # Log the job specification structure
    if isinstance(job_spec, dict):
        logger.info(f"Job spec keys: {job_spec.keys()}")
        if "Namespace" in job_spec:
            logger.info(f"Job spec namespace: {job_spec['Namespace']}")

    # Start the job with the specification
    result = custom_nomad.start_job(job_spec)
    return _started_response(job_id, repository, namespace, result)
@router.post("/by-repository/{repository}/stop")
async def stop_job_by_repository(repository: str, purge: bool = Query(False)):
    """Stop the job mapped to a repository.

    Raises:
        HTTPException: 404 when the repository has no job mapping.
    """
    mapping = config_service.get_job_from_repository(repository)
    if not mapping:
        raise HTTPException(status_code=404, detail=f"No job found for repository: {repository}")

    job_id = mapping.get("job_id")
    namespace = mapping.get("namespace")

    # A per-request client keeps a namespace override away from the shared service.
    client = NomadService()
    if namespace:
        client.namespace = namespace

    stop_result = client.stop_job(job_id, purge)

    response = {
        "job_id": job_id,
        "repository": repository,
        "status": "stopped",
        "eval_id": stop_result.get("eval_id"),
        "namespace": namespace,
    }
    return response
@router.post("/by-repository/{repository}/restart")
async def restart_job_by_repository(repository: str):
    """Restart the job mapped to a repository by stopping and re-submitting it.

    Raises:
        HTTPException: 404 when the repository has no job mapping.
    """
    mapping = config_service.get_job_from_repository(repository)
    if not mapping:
        raise HTTPException(status_code=404, detail=f"No job found for repository: {repository}")

    job_id = mapping.get("job_id")
    namespace = mapping.get("namespace")

    # A per-request client keeps a namespace override away from the shared service.
    client = NomadService()
    if namespace:
        client.namespace = namespace

    # Capture the specification before stopping, then submit it again.
    current_spec = client.get_job(job_id)
    client.stop_job(job_id)
    start_result = client.start_job(current_spec)

    response = {
        "job_id": job_id,
        "repository": repository,
        "status": "restarted",
        "eval_id": start_result.get("eval_id"),
        "namespace": namespace,
    }
    return response