API Monitoring
Why Monitor APIs?
- Performance: Track response times
- Availability: Detect downtime
- Errors: Identify issues quickly
- Usage: Understand traffic patterns
- Capacity: Plan for scaling
Key Metrics
// Golden signals: the four key metrics to track for any service
// (latency, traffic, errors, saturation)
const metrics = {
latency: 'Response time',
traffic: 'Requests per second',
errors: 'Error rate',
saturation: 'Resource utilization'
};
Prometheus Metrics
const prometheus = require('prom-client');
// Use a dedicated registry rather than the global default one
const register = new prometheus.Registry();
// Built-in Node.js process metrics (CPU, memory, event loop, GC)
prometheus.collectDefaultMetrics({ register });
// Histogram of request durations; bucket boundaries are in seconds
const httpRequestDuration = new prometheus.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status'],
buckets: [0.1, 0.5, 1, 2, 5],
registers: [register]
});
// Monotonic counter of all requests, labeled for per-route/status rate queries
const httpRequestsTotal = new prometheus.Counter({
name: 'http_requests_total',
help: 'Total HTTP requests',
labelNames: ['method', 'route', 'status'],
registers: [register]
});
// Gauge of in-flight requests (inc'd/dec'd by the middleware below)
const activeConnections = new prometheus.Gauge({
name: 'active_connections',
help: 'Number of active connections',
registers: [register]
});
// Metrics middleware: records request duration, request count, and the
// number of in-flight requests for every response.
app.use((req, res, next) => {
  const start = Date.now();
  activeConnections.inc();
  let recorded = false;
  const record = () => {
    // Guard: both 'finish' and 'close' can fire for the same response
    if (recorded) return;
    recorded = true;
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern (e.g. '/users/:id') over the raw
    // path to keep label cardinality bounded
    const route = req.route?.path || req.path;
    httpRequestDuration.observe(
      { method: req.method, route, status: res.statusCode },
      duration
    );
    httpRequestsTotal.inc({
      method: req.method,
      route,
      status: res.statusCode
    });
    activeConnections.dec();
  };
  res.on('finish', record);
  // 'finish' never fires if the client aborts mid-response; without 'close'
  // the activeConnections gauge would leak upward over time
  res.on('close', record);
  next();
});
// Expose all registered metrics in Prometheus text exposition format
// for the Prometheus server to scrape
app.get('/metrics', async (req, res) => {
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
});
Application Performance Monitoring (APM)
// APM agent setup. NOTE(review): these are three alternative agents shown
// side-by-side for comparison — a real service would pick one. Agents must
// be required before other modules so they can instrument them.
// New Relic (configured via newrelic.js / environment variables)
require('newrelic');
// Datadog. NOTE(review): this `tracer` name is also declared in the
// distributed-tracing section below — the snippets are standalone examples.
const tracer = require('dd-trace').init();
// Elastic APM
const apm = require('elastic-apm-node').start({
serviceName: 'user-api',
serverUrl: process.env.APM_SERVER_URL
});
// Manually-managed Elastic APM transaction around one route handler
app.get('/api/users/:id', async (req, res) => {
const transaction = apm.startTransaction('GET /api/users/:id', 'request');
try {
const user = await User.findById(req.params.id);
res.json(user);
transaction.result = 'success';
} catch (error) {
transaction.result = 'error';
// Report the error to APM before responding
apm.captureError(error);
res.status(500).json({ error: error.message });
} finally {
// Always close the transaction, success or failure
transaction.end();
}
});
Health Checks
// Liveness probe: responds 200 as long as the process can serve requests
const basicHealthHandler = (req, res) => {
  res.status(200).json({ status: 'healthy' });
};
app.get('/health', basicHealthHandler);
// Detailed health check
app.get('/health/detailed', async (req, res) => {
const health = {
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
checks: {}
};
// Database check
try {
await db.ping();
health.checks.database = { status: 'up' };
} catch (error) {
health.checks.database = { status: 'down', error: error.message };
health.status = 'unhealthy';
}
// Redis check
try {
await redis.ping();
health.checks.redis = { status: 'up' };
} catch (error) {
health.checks.redis = { status: 'down', error: error.message };
health.status = 'degraded';
}
const statusCode = health.status === 'healthy' ? 200 : 503;
res.status(statusCode).json(health);
});
Logging
const winston = require('winston');
// Structured JSON logger: timestamps on every entry, full stack traces on errors
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
transports: [
// errors only, in a separate file for fast triage
new winston.transports.File({ filename: 'error.log', level: 'error' }),
// everything at 'info' level and above
new winston.transports.File({ filename: 'combined.log' })
]
});
// Access-log middleware: emits one structured entry per completed response
app.use((req, res, next) => {
  const startedAt = Date.now();
  res.on('finish', () => {
    const entry = {
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: Date.now() - startedAt,
      ip: req.ip,
      userAgent: req.headers['user-agent']
    };
    logger.info(entry);
  });
  next();
});
// Centralized error handler. The 4-argument signature is what marks this
// middleware as an error handler for Express.
app.use((err, req, res, next) => {
  logger.error({
    message: err.message,
    stack: err.stack,
    path: req.path,
    method: req.method
  });
  // If a response is already in flight we cannot send another one —
  // delegate to Express's default error handler instead
  if (res.headersSent) {
    return next(err);
  }
  res.status(500).json({ error: 'Internal server error' });
});
Distributed Tracing
const opentelemetry = require('@opentelemetry/api');
// SimpleSpanProcessor is exported by the SDK package, not @opentelemetry/api
// (the original `opentelemetry.SimpleSpanProcessor` is undefined and would
// throw "SimpleSpanProcessor is not a constructor")
const { NodeTracerProvider, SimpleSpanProcessor } = require('@opentelemetry/sdk-trace-node');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
// Setup tracing: provider -> exporter -> span processor
const provider = new NodeTracerProvider();
const exporter = new JaegerExporter({
  serviceName: 'user-api',
  endpoint: 'http://localhost:14268/api/traces'
});
// SimpleSpanProcessor exports spans synchronously as they end (fine for
// demos; production would use BatchSpanProcessor)
provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
provider.register();
// Named tracer for this service
const tracer = opentelemetry.trace.getTracer('user-api');
// Manual spans around the database and cache work within one request
app.get('/api/users/:id', async (req, res) => {
const span = tracer.startSpan('GET /api/users/:id');
try {
// Database span
// NOTE(review): `{ parent: span }` is not honored by the current
// @opentelemetry/api — parenting is done via context
// (trace.setSpan / context.with). Verify against the SDK version in use.
const dbSpan = tracer.startSpan('database.query', { parent: span });
const user = await User.findById(req.params.id);
dbSpan.end();
// Cache span
const cacheSpan = tracer.startSpan('cache.set', { parent: span });
await cache.set(`user:${req.params.id}`, user);
cacheSpan.end();
res.json(user);
} finally {
// The root span always ends, even if the handler throws
span.end();
}
});
Alerting
// Prometheus alerting rules (YAML, loaded by the Prometheus server)
const alertRules = `
groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        # Fires when >5% of requests return 5xx over a 5-minute window
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} requests/sec"
      - alert: HighLatency
        # histogram_quantile must be fed the per-bucket rate of the
        # _bucket series (raw histogram metric name is invalid here);
        # this matches the Grafana panel query below
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }}s"
`;
// Send alerts to Slack
const sendSlackAlert = async (alert) => {
await fetch(process.env.SLACK_WEBHOOK, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
text: `🚨 ${alert.summary}`,
attachments: [{
color: alert.severity === 'critical' ? 'danger' : 'warning',
fields: [
{ title: 'Description', value: alert.description },
{ title: 'Severity', value: alert.severity }
]
}]
})
});
};
Uptime Monitoring
// Pingdom-style monitoring
const axios = require('axios');
/**
 * Probe a URL with a 5-second timeout and report availability plus latency.
 * @param {string} url - Endpoint to check.
 * @returns {Promise<{status: string, statusCode?: number, error?: string, responseTime: number}>}
 */
async function checkEndpoint(url) {
  const startedAt = Date.now();
  try {
    const response = await axios.get(url, { timeout: 5000 });
    return {
      status: 'up',
      statusCode: response.status,
      responseTime: Date.now() - startedAt
    };
  } catch (error) {
    // axios rejects on timeouts, network errors, and non-2xx responses alike
    return {
      status: 'down',
      error: error.message,
      responseTime: Date.now() - startedAt
    };
  }
}
// Check every minute
setInterval(async () => {
const result = await checkEndpoint('https://api.example.com/health');
if (result.status === 'down') {
await sendSlackAlert({
summary: 'API is down',
description: result.error,
severity: 'critical'
});
}
}, 60000);
Dashboard Example
// Grafana dashboard definition (JSON model); each panel's `expr` is a
// PromQL query against the metrics registered above
const dashboard = {
title: 'API Metrics',
panels: [
{
title: 'Request Rate',
targets: [{
expr: 'rate(http_requests_total[5m])'
}]
},
{
title: 'Error Rate',
targets: [{
expr: 'rate(http_requests_total{status=~"5.."}[5m])'
}]
},
{
// p95 latency from the histogram's per-bucket rates
title: 'Response Time (p95)',
targets: [{
expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))'
}]
},
{
title: 'Active Connections',
targets: [{
expr: 'active_connections'
}]
}
]
};
.NET Monitoring
// ASP.NET Core: register Application Insights telemetry collection
// (called during service configuration, e.g. in Program.cs)
services.AddApplicationInsightsTelemetry();
// Custom metrics wrapper around the Application Insights TelemetryClient.
// NOTE(review): _telemetry is never assigned here — this presumably relies
// on constructor injection, but no constructor is shown; verify.
public class MetricsService
{
private readonly TelemetryClient _telemetry;
// Records a request telemetry item; the response code is inferred
// from the success flag (200 on success, 500 on failure)
public void TrackRequest(string name, TimeSpan duration, bool success)
{
_telemetry.TrackRequest(name, DateTimeOffset.Now, duration,
success ? "200" : "500", success);
}
// Records a numeric custom metric under the given name
public void TrackMetric(string name, double value)
{
_telemetry.TrackMetric(name, value);
}
}
Interview Tips
- Explain metrics: Latency, traffic, errors, saturation
- Show Prometheus: Metrics collection
- Demonstrate logging: Structured logging
- Discuss tracing: Distributed tracing
- Mention alerting: Proactive notifications
- Show health checks: Endpoint monitoring
Summary
Monitor REST APIs with Prometheus for metrics collection, structured logging with Winston, distributed tracing with OpenTelemetry, health check endpoints, APM tools like New Relic or Datadog, alerting with Prometheus Alertmanager, uptime monitoring, and Grafana dashboards. Track golden signals: latency, traffic, errors, saturation. Essential for production API reliability.
Test Your Knowledge
Take a quick quiz to test your understanding of this topic.