Machine Learning Model Deployment: From Training to Production
11/29/2025 • AI • By Tech Writers
Machine Learning · MLOps · Model Deployment · AI Production · TensorFlow · PyTorch
Deploying ML models to production is a major challenge. A model that performs well in a Jupyter notebook does not automatically work reliably in production. This article covers the end-to-end process of ML model deployment, from serialization through serving, containerization, and monitoring.
ML Deployment Stages
1. Model Preparation
import tensorflow as tf
import pickle
import json
# Save trained model
def save_model(model, model_path, metadata):
    """Persist a trained model, its preprocessing pipeline, and metadata.

    Keras models are saved in HDF5 format; any other model is pickled.
    The preprocessor and a JSON metadata file are written alongside it so a
    serving process can reconstruct the full inference pipeline.

    Args:
        model: A trained estimator (tf.keras.Model or any picklable model).
        model_path: Directory to write artifacts into (created if missing).
        metadata: Dict with keys 'preprocessor', 'model_type', 'version',
            'features', 'target', 'metrics', 'created_at'.
    """
    import os
    # Robustness fix: create the target directory so open()/model.save() don't fail.
    os.makedirs(model_path, exist_ok=True)

    # Save model (format depends on framework)
    if isinstance(model, tf.keras.Model):
        model.save(f"{model_path}/model.h5")
    else:
        with open(f"{model_path}/model.pkl", 'wb') as f:
            pickle.dump(model, f)

    # Save preprocessing pipeline so serving applies identical transforms
    with open(f"{model_path}/preprocessor.pkl", 'wb') as f:
        pickle.dump(metadata['preprocessor'], f)

    # Save model metadata (everything except the unpicklable-to-JSON preprocessor)
    metadata_info = {
        'model_type': metadata['model_type'],
        'version': metadata['version'],
        'features': metadata['features'],
        'target': metadata['target'],
        'metrics': metadata['metrics'],
        'created_at': metadata['created_at']
    }
    with open(f"{model_path}/metadata.json", 'w') as f:
        json.dump(metadata_info, f, indent=2)

    print(f"Model saved to {model_path}")
# Example usage: describe the trained model, then persist everything.
# NOTE(review): assumes `model` and `scaler` were produced by an earlier
# training step not shown here — confirm before running.
model_metadata = {
    'model_type': 'RandomForestClassifier',
    'version': '1.0.0',
    'features': ['age', 'income', 'credit_score'],
    'target': 'loan_approval',
    'metrics': {
        'accuracy': 0.95,
        'precision': 0.93,
        'recall': 0.94,
        'f1_score': 0.935,
    },
    'preprocessor': scaler,
    'created_at': '2026-01-15',
}

save_model(model, './models/loan_predictor', model_metadata)
2. Create API Endpoint
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import numpy as np
import pickle
import json
# Create the FastAPI application object that serves the model endpoints below
app = FastAPI(title="ML Model API")
# Load model at startup
class ModelLoader:
    """Loads model artifacts written by save_model() and serves predictions.

    Artifacts are read once at construction time so each request reuses the
    same in-memory model, preprocessor, and metadata.
    """

    def __init__(self, model_path):
        """Load model, preprocessor, and metadata from `model_path`.

        NOTE: pickle.load is unsafe on untrusted files — only load artifacts
        your own training pipeline produced.
        """
        with open(f"{model_path}/model.pkl", 'rb') as f:
            self.model = pickle.load(f)
        with open(f"{model_path}/preprocessor.pkl", 'rb') as f:
            self.preprocessor = pickle.load(f)
        with open(f"{model_path}/metadata.json", 'r') as f:
            self.metadata = json.load(f)

    def predict(self, features):
        """Score one feature vector.

        Args:
            features: Flat list of raw feature values (single sample).

        Returns:
            Dict with 'prediction' (int class), 'probability' (float for the
            positive class), and 'model_version' from the metadata file.
        """
        # Preprocess input — wrap in a list because transform expects a 2D batch
        features_scaled = self.preprocessor.transform([features])
        # Make prediction; predict_proba gives [neg_prob, pos_prob] per sample
        prediction = self.model.predict(features_scaled)
        probability = self.model.predict_proba(features_scaled)
        return {
            'prediction': int(prediction[0]),
            'probability': float(probability[0][1]),
            'model_version': self.metadata['version']
        }
# Initialize model
model_loader = ModelLoader('./models/loan_predictor')
# Define request/response models
class PredictionRequest(BaseModel):
    """Input schema for the prediction endpoints (one loan application)."""
    age: int
    income: float
    credit_score: int
class PredictionResponse(BaseModel):
    """Output schema: predicted class, positive-class probability, model version."""
    prediction: int
    probability: float
    model_version: str
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Score a single loan application with the globally loaded model.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # Feature order must match the metadata 'features' list used in training
        features = [request.age, request.income, request.credit_score]
        result = model_loader.predict(features)
        return result
    except Exception as e:
        # Surface failures as a 500 instead of crashing the worker; chain the
        # original exception for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Liveness/readiness probe target; reports the loaded model version."""
    return {
        "status": "healthy",
        "model_version": model_loader.metadata['version']
    }
@app.get("/model/info")
async def model_info():
    """Return the full metadata.json contents for the deployed model."""
    return model_loader.metadata
3. Containerization
# Dockerfile
FROM python:3.9-slim
WORKDIR /app

# Install dependencies first so Docker layer caching skips reinstalls
# when only application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose API port
EXPOSE 8000

# Health check
# BUG FIX: python:3.9-slim does not include curl, so the original
# `curl -f ...` probe always failed. Use Python's stdlib instead.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# docker-compose.yml
version: '3.8'

services:
  ml-api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/models
      - LOG_LEVEL=info
    volumes:
      # Mount models from the host so they can be updated without a rebuild
      - ./models:/models
    deploy:
      # NOTE: 'deploy.resources' limits are honored by Swarm and by
      # `docker compose` v2+; plain docker-compose v1 ignores them.
      resources:
        limits:
          cpus: '2'
          memory: 4G
    restart: unless-stopped

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data

volumes:
  redis_data:
MLOps Best Practices
1. Model Versioning
import mlflow
import mlflow.sklearn

# Setup MLflow tracking server and experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("loan_prediction")

# Log model training
# NOTE(review): `model`, `X_train`, `y_train`, `X_test`, `y_test`, and
# `accuracy_score` are assumed to come from earlier training code not shown
# here — confirm before running.
with mlflow.start_run():
    # Train model
    model.fit(X_train, y_train)

    # Log hyperparameters used for this run
    mlflow.log_params({
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5
    })

    # Log evaluation metrics on the held-out set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)

    # Log and register the model in the MLflow model registry
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="LoanPredictor"
    )

    # Log supporting artifacts alongside the run
    mlflow.log_artifact('preprocessing.pkl')
    mlflow.log_artifact('feature_importance.png')
2. A/B Testing
from fastapi import Request
import random
class ABTestRouter:
    """Routes prediction traffic between two model variants for A/B testing."""

    def __init__(self):
        self.models = {
            'model_a': ModelLoader('./models/v1'),
            'model_b': ModelLoader('./models/v2')
        }
        # Fraction of traffic each variant receives; values should sum to 1.0
        self.traffic_split = {'model_a': 0.8, 'model_b': 0.2}

    def get_model(self, user_id: str):
        """Route traffic based on user_id so each user always sees one variant."""
        import hashlib
        # BUG FIX: builtin hash() is salted per process (PYTHONHASHSEED), so
        # the same user could be routed to different variants across workers
        # or restarts. Use a stable digest for deterministic routing.
        hash_value = int(hashlib.md5(user_id.encode('utf-8')).hexdigest(), 16) % 100
        if hash_value < self.traffic_split['model_a'] * 100:
            return 'model_a', self.models['model_a']
        else:
            return 'model_b', self.models['model_b']

    async def log_prediction(self, user_id: str, model_name: str, result: dict):
        """Record which variant served which user for offline analysis.

        BUG FIX: predict() awaited this method, but it was never defined,
        so every A/B prediction raised AttributeError. Minimal stdout
        implementation — replace with a real analytics/logging sink.
        """
        print(f"ab_test user={user_id} variant={model_name} "
              f"prediction={result.get('prediction')}")

    async def predict(self, user_id: str, features: list):
        """Score `features` with the variant assigned to `user_id`."""
        model_name, model = self.get_model(user_id)
        result = model.predict(features)
        result['model_variant'] = model_name
        # Log for analysis of the experiment
        await self.log_prediction(user_id, model_name, result)
        return result
# Single router instance shared by all requests
ab_router = ABTestRouter()

@app.post("/predict/ab")
async def predict_ab(request: PredictionRequest, user_id: str):
    """A/B-tested prediction endpoint; user_id determines the model variant."""
    features = [request.age, request.income, request.credit_score]
    return await ab_router.predict(user_id, features)
3. Model Monitoring
import prometheus_client
from prometheus_client import Counter, Histogram, Gauge
# Define metrics
# Counter: monotonically increasing prediction count, split by model version
# and predicted class so variant outcomes can be compared.
prediction_counter = Counter(
'ml_predictions_total',
'Total number of predictions',
['model_version', 'prediction']
)
# Histogram: end-to-end prediction latency per model version.
prediction_latency = Histogram(
'ml_prediction_latency_seconds',
'Prediction latency in seconds',
['model_version']
)
# Gauge: last known accuracy; set externally by an evaluation job
# (no code in this file updates it).
model_accuracy = Gauge(
'ml_model_accuracy',
'Current model accuracy',
['model_version']
)
# Monitor predictions
@app.post("/predict/monitored")
async def predict_monitored(request: PredictionRequest):
    """Prediction endpoint instrumented with Prometheus latency/count metrics."""
    import time
    start_time = time.time()

    # Make prediction with the globally loaded model
    features = [request.age, request.income, request.credit_score]
    result = model_loader.predict(features)

    # Record metrics: latency histogram and per-class prediction counter
    latency = time.time() - start_time
    prediction_latency.labels(
        model_version=result['model_version']
    ).observe(latency)
    prediction_counter.labels(
        model_version=result['model_version'],
        prediction=result['prediction']
    ).inc()
    return result
@app.get("/metrics")
async def metrics():
    """Expose Prometheus metrics in the text exposition format.

    BUG FIX: returning the raw bytes from generate_latest() lets FastAPI
    JSON-encode them with content type application/json, which Prometheus
    scrapers reject. Wrap them in a Response with the Prometheus media type.
    """
    from fastapi import Response
    from prometheus_client import CONTENT_TYPE_LATEST
    return Response(
        content=prometheus_client.generate_latest(),
        media_type=CONTENT_TYPE_LATEST,
    )
Deployment Strategies
1. Kubernetes Deployment
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-model-api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ml-model-api
  template:
    metadata:
      labels:
        app: ml-model-api
    spec:
      containers:
        - name: api
          image: myregistry/ml-model-api:v1.0.0
          ports:
            - containerPort: 8000
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          # Restart the container if /health stops responding
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Only route traffic to pods whose model has finished loading
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: ml-model-service
spec:
  selector:
    app: ml-model-api
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8000
  type: LoadBalancer
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-model-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-model-api
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
2. Serverless Deployment
# AWS Lambda handler
import json
import boto3
import pickle
# Client created once per execution environment; reused across warm invocations
s3 = boto3.client('s3')
def lambda_handler(event, context):
    """AWS Lambda entry point for loan predictions.

    The unpickled model is cached as a function attribute so warm
    invocations skip the S3 download entirely.

    Args:
        event: API Gateway proxy event; event['body'] is a JSON string
            with 'age', 'income', and 'credit_score'.
        context: Lambda context object (unused).

    Returns:
        Proxy-style response dict with statusCode 200 and a JSON body
        containing the integer prediction.
    """
    # Load model from S3 only on cold start
    if not hasattr(lambda_handler, 'model'):
        model_data = s3.get_object(
            Bucket='ml-models',
            Key='loan_predictor/model.pkl'
        )
        # NOTE(review): pickle.loads on bucket content — ensure the bucket
        # is locked down; unpickling untrusted data executes arbitrary code.
        lambda_handler.model = pickle.loads(model_data['Body'].read())

    # Parse input from the API Gateway proxy payload
    body = json.loads(event['body'])
    features = [body['age'], body['income'], body['credit_score']]

    # Predict — wrap in a list because predict expects a 2D batch
    prediction = lambda_handler.model.predict([features])
    return {
        'statusCode': 200,
        'body': json.dumps({
            'prediction': int(prediction[0])
        })
    }
Conclusion
ML model deployment is a complex process that requires:
- ✅ Proper model serialization
- ✅ Robust API design
- ✅ Monitoring and logging
- ✅ A/B testing capability
- ✅ Scalable infrastructure
Start implementing MLOps today!
Resources
Already deployed ML models? Share your experience! 🚀