Monitoring CI/CD Pipelines

Pipeline Metrics

# Track key metrics
# Conceptual checklist of pipeline measurements worth collecting.
# NOTE(review): no tool shown in this guide reads this `metrics:` key; the
# YAML list form appears to be illustrative only.
metrics:
  - Build duration
  - Test duration
  - Deployment frequency
  - Success rate
  - Failure rate
  - Mean time to recovery (MTTR)

GitHub Actions Monitoring

# Times the build job and reports its duration and final status to an
# external metrics endpoint (URL stored in the METRICS_ENDPOINT secret).
name: Monitor Pipeline

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # Start the clock before anything else so checkout time is included and
      # START_TIME is already exported when the reporting step runs.
      - name: Record start time
        run: echo "START_TIME=$(date +%s)" >> "$GITHUB_ENV"

      - uses: actions/checkout@v4

      - name: Build
        run: npm run build

      # Runs even when the build fails so failures are recorded too.
      # Reporting is best-effort: an unreachable metrics endpoint must not
      # turn a green build red, hence continue-on-error.
      - name: Record metrics
        if: always()
        continue-on-error: true
        env:
          # Pass workflow expressions through the environment instead of
          # splicing them into the script text, and quote them in the shell.
          METRICS_ENDPOINT: ${{ secrets.METRICS_ENDPOINT }}
          JOB_STATUS: ${{ job.status }}
        run: |
          END_TIME=$(date +%s)
          DURATION=$((END_TIME - START_TIME))

          curl --fail --silent --show-error --max-time 10 \
            -X POST "$METRICS_ENDPOINT" \
            -d "pipeline=build&duration=$DURATION&status=$JOB_STATUS"

Pipeline Dashboard

// Express metrics endpoint
//
// Receives pipeline results via POST /metrics/pipeline and exposes them in
// Prometheus text format on GET /metrics.
const express = require('express');
const prometheus = require('prom-client');

const app = express();
const register = new prometheus.Registry();

// Parse form-encoded bodies (what `curl -d "a=b&c=d"` sends) and JSON bodies.
// Without a body parser `req.body` is undefined and the handler would throw.
app.use(express.urlencoded({ extended: false }));
app.use(express.json());

// Pipeline duration
const pipelineDuration = new prometheus.Histogram({
  name: 'pipeline_duration_seconds',
  help: 'Pipeline execution duration',
  labelNames: ['pipeline', 'status'],
  // prom-client's default buckets stop at 10 seconds, which would put almost
  // every pipeline run in +Inf. Use buckets from 30 seconds to 1 hour.
  buckets: [30, 60, 120, 300, 600, 900, 1800, 3600],
  registers: [register]
});

// Pipeline runs
const pipelineRuns = new prometheus.Counter({
  name: 'pipeline_runs_total',
  help: 'Total pipeline runs',
  labelNames: ['pipeline', 'status'],
  registers: [register]
});

/**
 * Returns true when `value` is a non-empty string.
 * @param {unknown} value
 * @returns {boolean}
 */
function isNonEmptyString(value) {
  return typeof value === 'string' && value !== '';
}

/**
 * Records one pipeline run.
 *
 * Expects `pipeline`, `status` (non-empty strings) and `duration`
 * (non-negative number of seconds, numeric string accepted) in the body.
 * Responds 400 on invalid input, 200 once the sample is recorded.
 *
 * NOTE(review): `pipeline` and `status` become label values, so every distinct
 * value creates new time series; restrict them if callers are not trusted.
 */
app.post('/metrics/pipeline', (req, res) => {
  const { pipeline, duration, status } = req.body || {};

  // Form-encoded values are strings; observe() requires a real number.
  // Reject null/'' explicitly because Number() would coerce both to 0.
  const seconds = duration === null || duration === undefined || duration === ''
    ? NaN
    : Number(duration);

  if (
    !isNonEmptyString(pipeline) ||
    !isNonEmptyString(status) ||
    !Number.isFinite(seconds) ||
    seconds < 0
  ) {
    res.status(400).json({
      error: 'pipeline and status must be non-empty strings; duration must be a non-negative number'
    });
    return;
  }

  pipelineDuration.observe({ pipeline, status }, seconds);
  pipelineRuns.inc({ pipeline, status });

  res.sendStatus(200);
});

// Scrape endpoint for Prometheus.
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});

app.listen(3000);

Grafana Dashboard

{
  "dashboard": {
    "title": "CI/CD Pipeline Metrics",
    "panels": [
      {
        "title": "Build Duration",
        "targets": [{
          "expr": "histogram_quantile(0.95, sum by (le) (rate(pipeline_duration_seconds_bucket{pipeline=\"build\"}[1h])))"
        }]
      },
      {
        "title": "Success Rate",
        "targets": [{
          "expr": "sum(rate(pipeline_runs_total{status=\"success\"}[5m])) / sum(rate(pipeline_runs_total[5m]))"
        }]
      },
      {
        "title": "Deployment Frequency",
        "targets": [{
          "expr": "sum(rate(pipeline_runs_total{pipeline=\"deploy\"}[1h]))"
        }]
      }
    ]
  }
}

Alerting

# Prometheus alerts
groups:
  - name: pipeline_alerts
    rules:
      # Fires when more than 10% of pipeline runs in the last 5 minutes
      # failed. Both sides are summed so the division is a single ratio
      # across all pipelines instead of a per-label-set match.
      - alert: HighFailureRate
        expr: |
          sum(rate(pipeline_runs_total{status="failure"}[5m]))
            / sum(rate(pipeline_runs_total[5m])) > 0.1
        for: 5m
        annotations:
          summary: "High pipeline failure rate"

      # Fires when the average build duration exceeds 600 seconds.
      # A histogram exports only _bucket, _sum and _count series, so the
      # average is derived from _sum / _count; the bare metric name matches
      # nothing and would never alert.
      - alert: SlowBuild
        expr: |
          sum(rate(pipeline_duration_seconds_sum{pipeline="build"}[15m]))
            / sum(rate(pipeline_duration_seconds_count{pipeline="build"}[15m])) > 600
        for: 5m
        annotations:
          summary: "Build taking too long"

Slack Notifications

# GitHub Actions with Slack
# Runs the deploy script and posts the outcome to Slack.
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      # deploy.sh is referenced by relative path, so the repository must be
      # checked out first (assumes the script lives in the repo root).
      - uses: actions/checkout@v4

      - name: Deploy
        run: ./deploy.sh

      # action-slack v3 reads the webhook from the SLACK_WEBHOOK_URL
      # environment variable; it is not a `with:` input.
      - name: Notify success
        if: success()
        uses: 8398a7/action-slack@v3
        with:
          status: success
          text: 'Deployment succeeded'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

      - name: Notify failure
        if: failure()
        uses: 8398a7/action-slack@v3
        with:
          status: failure
          text: 'Deployment failed'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

Log Aggregation

# Fluentd for log collection
# Input: accept events over the forward protocol on TCP port 24224.
<source>
  @type forward
  port 24224
</source>

# Output: send every event whose tag starts with "cicd." to the Elasticsearch
# instance reachable at host "elasticsearch", port 9200.
<match cicd.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  # NOTE(review): per the elasticsearch output plugin, logstash_format writes
  # to date-stamped indices named after logstash_prefix (here "cicd-<date>");
  # confirm against the plugin version in use.
  logstash_format true
  logstash_prefix cicd
</match>

DORA Metrics

/**
 * Track DORA metrics.
 *
 * Each method runs one aggregate query through a `db` object that must be in
 * scope and whose `query(sql)` resolves to `{ rows: [...] }`.
 * NOTE(review): the SQL uses `::float` casts and INTERVAL literals, so a
 * PostgreSQL client is assumed; confirm against the actual `db` in use.
 */
class DORAMetrics {
  /**
   * Average number of deployments per day over the last 7 days.
   * @returns {Promise<*>} the `per_day` column of the single result row
   */
  async getDeploymentFrequency() {
    // Cast before dividing: COUNT(*) is an integer, and integer / integer
    // truncates (e.g. 10 deployments / 7 would report 1 instead of ~1.43).
    const deployments = await db.query(`
      SELECT COUNT(*)::float / 7 as per_day
      FROM deployments
      WHERE created_at > NOW() - INTERVAL '7 days'
    `);
    return deployments.rows[0].per_day;
  }

  /**
   * Average commit-to-deploy time for deployments in the last 30 days.
   * @returns {Promise<*>} the `avg_lead_time` column (NULL when no rows match)
   */
  async getLeadTime() {
    const leadTime = await db.query(`
      SELECT AVG(deployed_at - committed_at) as avg_lead_time
      FROM deployments
      WHERE deployed_at > NOW() - INTERVAL '30 days'
    `);
    return leadTime.rows[0].avg_lead_time;
  }

  /**
   * Average detection-to-resolution time for incidents resolved in the last
   * 30 days.
   * @returns {Promise<*>} the `avg_mttr` column (NULL when no rows match)
   */
  async getMTTR() {
    const mttr = await db.query(`
      SELECT AVG(resolved_at - detected_at) as avg_mttr
      FROM incidents
      WHERE resolved_at > NOW() - INTERVAL '30 days'
    `);
    return mttr.rows[0].avg_mttr;
  }

  /**
   * Fraction of deployments in the last 30 days whose status is 'failed'.
   * @returns {Promise<*>} the `failure_rate` column; NULL when there were no
   *   deployments in the window
   */
  async getChangeFailureRate() {
    // NULLIF turns a zero denominator into NULL so an empty window yields
    // NULL instead of a division-by-zero error.
    const failures = await db.query(`
      SELECT
        COUNT(CASE WHEN status = 'failed' THEN 1 END)::float /
        NULLIF(COUNT(*), 0)::float as failure_rate
      FROM deployments
      WHERE deployed_at > NOW() - INTERVAL '30 days'
    `);
    return failures.rows[0].failure_rate;
  }
}

Pipeline Health Check

/**
 * Summarises pipeline health from the average build duration and the success
 * rate.
 *
 * Relies on `getAvgBuildDuration()` and `getSuccessRate()` being in scope.
 * The deployment-frequency lookup was dropped because its result was never
 * read.
 *
 * @returns {Promise<{status: string, issues: string[]}>} `status` is
 *   'healthy' when no threshold is violated, 'degraded' when only the build
 *   duration is too high, and 'unhealthy' whenever the success rate is too
 *   low; `issues` lists one message per violated threshold.
 */
async function checkPipelineHealth() {
  // NOTE(review): 600 is presumably seconds (10 minutes); confirm the unit
  // returned by getAvgBuildDuration().
  const MAX_BUILD_DURATION = 600;
  const MIN_SUCCESS_RATE = 0.95;

  // The two lookups are independent, so run them concurrently.
  const [buildDuration, successRate] = await Promise.all([
    getAvgBuildDuration(),
    getSuccessRate()
  ]);

  const health = {
    status: 'healthy',
    issues: []
  };

  if (buildDuration > MAX_BUILD_DURATION) {
    health.status = 'degraded';
    health.issues.push('Build duration too high');
  }

  // Checked last on purpose: a low success rate overrides 'degraded'.
  if (successRate < MIN_SUCCESS_RATE) {
    health.status = 'unhealthy';
    health.issues.push('Success rate below 95%');
  }

  return health;
}

Cost Monitoring

/**
 * Track CI/CD costs.
 *
 * Expects `getBuildMinutes()` and `getStorageUsage()` to be available on the
 * instance; they are not defined in this class.
 */
class CostMonitor {
  /**
   * Prices the current build-minute and storage usage.
   * @returns {Promise<{compute: number, storage: number, total: number}>}
   *   compute and storage cost in dollars plus their sum
   */
  async calculateCosts() {
    const minutesUsed = await this.getBuildMinutes();
    const gigabytesStored = await this.getStorageUsage();

    const compute = minutesUsed * 0.008; // $0.008 per minute
    const storage = gigabytesStored * 0.25; // $0.25 per GB

    return { compute, storage, total: compute + storage };
  }
}

Interview Tips

  • Explain metrics: Build duration, success rate, DORA
  • Show monitoring: Prometheus, Grafana
  • Demonstrate alerting: Slack notifications
  • Discuss logs: Centralized aggregation
  • Mention health: Pipeline health checks
  • Show costs: Track CI/CD expenses

Summary

Monitor CI/CD pipelines with metrics like build duration, success rate, and deployment frequency. Use Prometheus and Grafana for visualization. Implement alerting for failures. Track DORA metrics for performance. Aggregate logs centrally. Monitor pipeline health and costs. Essential for maintaining efficient CI/CD operations.

Test Your Knowledge

Take a quick quiz to test your understanding of this topic.

Test Your CI/CD Knowledge

Ready to put your skills to the test? Take our interactive CI/CD quiz and get instant feedback on your answers.