Machine Learning Model Deployment: From Training to Production
11/29/2025 • AI • By Tech Writers
Machine Learning · MLOps · Model Deployment · AI Production · TensorFlow · PyTorch
Deploying ML models to production is a major challenge. A model that performs well in a Jupyter notebook does not automatically work reliably in production. This article covers the end-to-end process of ML model deployment, from serialization through serving, containerization, and monitoring.
ML Deployment Stages
1. Model Preparation
import tensorflow as tf
import pickle
import json
# Save trained model
def save_model(model, model_path, metadata):
    """Persist a trained model, its preprocessing pipeline, and metadata.

    Keras models are saved in HDF5 format; any other model is pickled.
    The preprocessor and a JSON metadata file are written alongside it so a
    serving process can reconstruct the full inference pipeline.

    Args:
        model: A trained estimator (tf.keras.Model or any picklable model).
        model_path: Directory to write artifacts into (created if missing).
        metadata: Dict with keys 'preprocessor', 'model_type', 'version',
            'features', 'target', 'metrics', 'created_at'.
    """
    import os
    # Robustness fix: create the target directory so open()/model.save() don't fail.
    os.makedirs(model_path, exist_ok=True)

    # Save model (format depends on framework)
    if isinstance(model, tf.keras.Model):
        model.save(f"{model_path}/model.h5")
    else:
        with open(f"{model_path}/model.pkl", 'wb') as f:
            pickle.dump(model, f)

    # Save preprocessing pipeline so serving applies identical transforms
    with open(f"{model_path}/preprocessor.pkl", 'wb') as f:
        pickle.dump(metadata['preprocessor'], f)

    # Save model metadata (everything except the unpicklable-to-JSON preprocessor)
    metadata_info = {
        'model_type': metadata['model_type'],
        'version': metadata['version'],
        'features': metadata['features'],
        'target': metadata['target'],
        'metrics': metadata['metrics'],
        'created_at': metadata['created_at']
    }
    with open(f"{model_path}/metadata.json", 'w') as f:
        json.dump(metadata_info, f, indent=2)

    print(f"Model saved to {model_path}")
# Example usage: describe the trained model, then persist everything.
# NOTE(review): assumes `model` and `scaler` were produced by an earlier
# training step not shown here — confirm before running.
model_metadata = {
    'model_type': 'RandomForestClassifier',
    'version': '1.0.0',
    'features': ['age', 'income', 'credit_score'],
    'target': 'loan_approval',
    'metrics': {
        'accuracy': 0.95,
        'precision': 0.93,
        'recall': 0.94,
        'f1_score': 0.935,
    },
    'preprocessor': scaler,
    'created_at': '2026-01-15',
}

save_model(model, './models/loan_predictor', model_metadata)
2. Create API Endpoint
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import numpy as np
import pickle
import json
# Create the FastAPI application object that serves the model endpoints below
app = FastAPI(title="ML Model API")
# Load model at startup
class ModelLoader:
    """Loads model artifacts written by save_model() and serves predictions.

    Artifacts are read once at construction time so each request reuses the
    same in-memory model, preprocessor, and metadata.
    """

    def __init__(self, model_path):
        """Load model, preprocessor, and metadata from `model_path`.

        NOTE: pickle.load is unsafe on untrusted files — only load artifacts
        your own training pipeline produced.
        """
        with open(f"{model_path}/model.pkl", 'rb') as f:
            self.model = pickle.load(f)
        with open(f"{model_path}/preprocessor.pkl", 'rb') as f:
            self.preprocessor = pickle.load(f)
        with open(f"{model_path}/metadata.json", 'r') as f:
            self.metadata = json.load(f)

    def predict(self, features):
        """Score one feature vector.

        Args:
            features: Flat list of raw feature values (single sample).

        Returns:
            Dict with 'prediction' (int class), 'probability' (float for the
            positive class), and 'model_version' from the metadata file.
        """
        # Preprocess input — wrap in a list because transform expects a 2D batch
        features_scaled = self.preprocessor.transform([features])
        # Make prediction; predict_proba gives [neg_prob, pos_prob] per sample
        prediction = self.model.predict(features_scaled)
        probability = self.model.predict_proba(features_scaled)
        return {
            'prediction': int(prediction[0]),
            'probability': float(probability[0][1]),
            'model_version': self.metadata['version']
        }
# Initialize model
model_loader = ModelLoader('./models/loan_predictor')
# Define request/response models
class PredictionRequest(BaseModel):
    """Input schema for the prediction endpoints (one loan application)."""
    age: int
    income: float
    credit_score: int
class PredictionResponse(BaseModel):
    """Output schema: predicted class, positive-class probability, model version."""
    prediction: int
    probability: float
    model_version: str
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Score a single loan application with the globally loaded model.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # Feature order must match the metadata 'features' list used in training
        features = [request.age, request.income, request.credit_score]
        result = model_loader.predict(features)
        return result
    except Exception as e:
        # Surface failures as a 500 instead of crashing the worker; chain the
        # original exception for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Liveness/readiness probe target; reports the loaded model version."""
    return {
        "status": "healthy",
        "model_version": model_loader.metadata['version']
    }
@app.get("/model/info")
async def model_info():
    """Return the full metadata.json contents for the deployed model."""
    return model_loader.metadata
3. Containerization
# Dockerfile
FROM python:3.9-slim
WORKDIR /app

# Install dependencies first so Docker layer caching skips reinstalls
# when only application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose API port
EXPOSE 8000

# Health check
# BUG FIX: python:3.9-slim does not include curl, so the original
# `curl -f ...` probe always failed. Use Python's stdlib instead.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# docker-compose.yml
version: '3.8'

services:
  ml-api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/models
      - LOG_LEVEL=info
    volumes:
      # Mount models from the host so they can be updated without a rebuild
      - ./models:/models
    deploy:
      # NOTE: 'deploy.resources' limits are honored by Swarm and by
      # `docker compose` v2+; plain docker-compose v1 ignores them.
      resources:
        limits:
          cpus: '2'
          memory: 4G
    restart: unless-stopped

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data

volumes:
  redis_data:
MLOps Best Practices
1. Model Versioning
import mlflow
import mlflow.sklearn

# Setup MLflow tracking server and experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("loan_prediction")

# Log model training
# NOTE(review): `model`, `X_train`, `y_train`, `X_test`, `y_test`, and
# `accuracy_score` are assumed to come from earlier training code not shown
# here — confirm before running.
with mlflow.start_run():
    # Train model
    model.fit(X_train, y_train)

    # Log hyperparameters used for this run
    mlflow.log_params({
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5
    })

    # Log evaluation metrics on the held-out set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)

    # Log and register the model in the MLflow model registry
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="LoanPredictor"
    )

    # Log supporting artifacts alongside the run
    mlflow.log_artifact('preprocessing.pkl')
    mlflow.log_artifact('feature_importance.png')
2. A/B Testing
from fastapi import Request
import random
class ABTestRouter:
    """Routes prediction traffic between two model variants for A/B testing."""

    def __init__(self):
        self.models = {
            'model_a': ModelLoader('./models/v1'),
            'model_b': ModelLoader('./models/v2')
        }
        # Fraction of traffic each variant receives; values should sum to 1.0
        self.traffic_split = {'model_a': 0.8, 'model_b': 0.2}

    def get_model(self, user_id: str):
        """Route traffic based on user_id so each user always sees one variant."""
        import hashlib
        # BUG FIX: builtin hash() is salted per process (PYTHONHASHSEED), so
        # the same user could be routed to different variants across workers
        # or restarts. Use a stable digest for deterministic routing.
        hash_value = int(hashlib.md5(user_id.encode('utf-8')).hexdigest(), 16) % 100
        if hash_value < self.traffic_split['model_a'] * 100:
            return 'model_a', self.models['model_a']
        else:
            return 'model_b', self.models['model_b']

    async def log_prediction(self, user_id: str, model_name: str, result: dict):
        """Record which variant served which user for offline analysis.

        BUG FIX: predict() awaited this method, but it was never defined,
        so every A/B prediction raised AttributeError. Minimal stdout
        implementation — replace with a real analytics/logging sink.
        """
        print(f"ab_test user={user_id} variant={model_name} "
              f"prediction={result.get('prediction')}")

    async def predict(self, user_id: str, features: list):
        """Score `features` with the variant assigned to `user_id`."""
        model_name, model = self.get_model(user_id)
        result = model.predict(features)
        result['model_variant'] = model_name
        # Log for analysis of the experiment
        await self.log_prediction(user_id, model_name, result)
        return result
# Single router instance shared by all requests
ab_router = ABTestRouter()

@app.post("/predict/ab")
async def predict_ab(request: PredictionRequest, user_id: str):
    """A/B-tested prediction endpoint; user_id determines the model variant."""
    features = [request.age, request.income, request.credit_score]
    return await ab_router.predict(user_id, features)
3. Model Monitoring
import prometheus_client
from prometheus_client import Counter, Histogram, Gauge
# Define metrics
# Counter: monotonically increasing prediction count, split by model version
# and predicted class so variant outcomes can be compared.
prediction_counter = Counter(
'ml_predictions_total',
'Total number of predictions',
['model_version', 'prediction']
)
# Histogram: end-to-end prediction latency per model version.
prediction_latency = Histogram(
'ml_prediction_latency_seconds',
'Prediction latency in seconds',
['model_version']
)
# Gauge: last known accuracy; set externally by an evaluation job
# (no code in this file updates it).
model_accuracy = Gauge(
'ml_model_accuracy',
'Current model accuracy',
['model_version']
)
# Monitor predictions
@app.post("/predict/monitored")
async def predict_monitored(request: PredictionRequest):
    """Prediction endpoint instrumented with Prometheus latency/count metrics."""
    import time
    start_time = time.time()

    # Make prediction with the globally loaded model
    features = [request.age, request.income, request.credit_score]
    result = model_loader.predict(features)

    # Record metrics: latency histogram and per-class prediction counter
    latency = time.time() - start_time
    prediction_latency.labels(
        model_version=result['model_version']
    ).observe(latency)
    prediction_counter.labels(
        model_version=result['model_version'],
        prediction=result['prediction']
    ).inc()
    return result
@app.get("/metrics")
async def metrics():
    """Expose Prometheus metrics in the text exposition format.

    BUG FIX: returning the raw bytes from generate_latest() lets FastAPI
    JSON-encode them with content type application/json, which Prometheus
    scrapers reject. Wrap them in a Response with the Prometheus media type.
    """
    from fastapi import Response
    from prometheus_client import CONTENT_TYPE_LATEST
    return Response(
        content=prometheus_client.generate_latest(),
        media_type=CONTENT_TYPE_LATEST,
    )
Deployment Strategies
1. Kubernetes Deployment
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-model-api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ml-model-api
  template:
    metadata:
      labels:
        app: ml-model-api
    spec:
      containers:
        - name: api
          image: myregistry/ml-model-api:v1.0.0
          ports:
            - containerPort: 8000
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          # Restart the container if /health stops responding
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Only route traffic to pods whose model has finished loading
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: ml-model-service
spec:
  selector:
    app: ml-model-api
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8000
  type: LoadBalancer
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-model-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-model-api
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
2. Serverless Deployment
# AWS Lambda handler
import json
import boto3
import pickle
# Client created once per execution environment; reused across warm invocations
s3 = boto3.client('s3')
def lambda_handler(event, context):
    """AWS Lambda entry point for loan predictions.

    The unpickled model is cached as a function attribute so warm
    invocations skip the S3 download entirely.

    Args:
        event: API Gateway proxy event; event['body'] is a JSON string
            with 'age', 'income', and 'credit_score'.
        context: Lambda context object (unused).

    Returns:
        Proxy-style response dict with statusCode 200 and a JSON body
        containing the integer prediction.
    """
    # Load model from S3 only on cold start
    if not hasattr(lambda_handler, 'model'):
        model_data = s3.get_object(
            Bucket='ml-models',
            Key='loan_predictor/model.pkl'
        )
        # NOTE(review): pickle.loads on bucket content — ensure the bucket
        # is locked down; unpickling untrusted data executes arbitrary code.
        lambda_handler.model = pickle.loads(model_data['Body'].read())

    # Parse input from the API Gateway proxy payload
    body = json.loads(event['body'])
    features = [body['age'], body['income'], body['credit_score']]

    # Predict — wrap in a list because predict expects a 2D batch
    prediction = lambda_handler.model.predict([features])
    return {
        'statusCode': 200,
        'body': json.dumps({
            'prediction': int(prediction[0])
        })
    }
Conclusion
ML model deployment is a complex process that requires:
- ✅ Proper model serialization
- ✅ Robust API design
- ✅ Monitoring and logging
- ✅ A/B testing capability
- ✅ Scalable infrastructure
Start implementing MLOps today!
Resources
Already deployed ML models? Share your experience! 🚀