Monitoring and Observability

Key Metrics

// Names of the metrics worth tracking, grouped by the area they describe.
const keyMetrics = {
  performance: [
    'Query latency',
    'Throughput (ops/sec)',
    'Connection count',
  ],
  resources: [
    'CPU usage',
    'Memory usage',
    'Disk I/O',
    'Network I/O',
  ],
  database: [
    'Document count',
    'Index size',
    'Replication lag',
    'Cache hit ratio',
  ],
  errors: [
    'Failed operations',
    'Connection errors',
    'Timeout errors',
  ],
};

MongoDB Monitoring

Server Status

// Get server status
const status = await db.adminCommand({ serverStatus: 1 });

// Log the sections of the serverStatus document that matter for monitoring
console.log({
  connections: status.connections,
  opcounters: status.opcounters,
  mem: status.mem,
  network: status.network,
  repl: status.repl
});

// Monitor connections
const connections = {
  current: status.connections.current,
  available: status.connections.available,
  totalCreated: status.connections.totalCreated
};

// `available` is the number of connections still unused, not the total limit,
// so the capacity to compare against is current + available.
const connectionCapacity = connections.current + connections.available;

// Warn once more than 80% of the capacity is in use; the capacity check
// avoids dividing by zero if the server reports no connections at all.
if (connectionCapacity > 0 && connections.current / connectionCapacity > 0.8) {
  console.warn('Connection pool nearing capacity');
}

Database Stats

// Copies the listed properties of `source` into a new object, in the order given.
const pickFields = (source, fields) =>
  Object.fromEntries(fields.map((field) => [field, source[field]]));

// Database statistics
const dbStats = await db.stats();

console.log(
  pickFields(dbStats, ['collections', 'dataSize', 'indexSize', 'storageSize', 'avgObjSize'])
);

// Collection statistics
const collStats = await db.collection('users').stats();

console.log(
  pickFields(collStats, ['count', 'size', 'avgObjSize', 'storageSize', 'totalIndexSize'])
);

Profiler

// Turn the profiler on
db.setProfilingLevel(2);  // Level 2: log all operations
db.setProfilingLevel(1, { slowms: 100 });  // Level 1 with slowms: log slow queries

// Fetch the ten most recent profile entries that took longer than 100 ms
const slowQueries = await db.system.profile
  .find({ millis: { $gt: 100 } })
  .sort({ ts: -1 })
  .limit(10)
  .toArray();

// Print a compact summary of each entry
for (const { op, ns, millis, command } of slowQueries) {
  console.log({ op, ns, millis, query: command });
}

// Turn the profiler off again
db.setProfilingLevel(0);

Current Operations

// Ask the server for its in-progress operations
const currentOps = await db.adminCommand({ currentOp: true, $all: true });

// Keep only the operations that have been running for more than five seconds
const longRunning = currentOps.inprog.filter(({ secs_running }) => secs_running > 5);

// Print a compact summary of each long-running operation
for (const { opid, op, ns, secs_running, command } of longRunning) {
  console.log({ opid, op, ns, secs_running, query: command });
}

// Kill the first long-running operation, if there is one
if (longRunning.length > 0) {
  const [{ opid }] = longRunning;
  await db.adminCommand({ killOp: 1, op: opid });
}

Prometheus Integration

const client = require('prom-client');

// Registry that collects every metric defined below
const register = new client.Registry();

// Adds `registers: [register]` to a metric configuration.
const withRegistry = (config) => ({ ...config, registers: [register] });

// Query duration histogram, labelled by collection and operation
const queryDuration = new client.Histogram(
  withRegistry({
    name: 'mongodb_query_duration_seconds',
    help: 'MongoDB query duration in seconds',
    labelNames: ['collection', 'operation'],
  })
);

// Gauge holding the current connection count
const connectionCount = new client.Gauge(
  withRegistry({
    name: 'mongodb_connections_current',
    help: 'Current number of MongoDB connections',
  })
);

// Error counter, labelled by error type
const errorCounter = new client.Counter(
  withRegistry({
    name: 'mongodb_errors_total',
    help: 'Total number of MongoDB errors',
    labelNames: ['type'],
  })
);

// Instrument queries
/**
 * User lookups wrapped with Prometheus instrumentation: every call is timed
 * with `queryDuration` and every failure is counted with `errorCounter`.
 */
class MonitoredUserService {
  /**
   * Looks up one document in the `users` collection by `_id`.
   *
   * @param {*} userId - Value matched against the `_id` field.
   * @returns {Promise<*>} Whatever `findOne` resolves to.
   * @throws Rethrows any error raised by the lookup, after counting it.
   */
  async getUser(userId) {
    const end = queryDuration.startTimer({ collection: 'users', operation: 'findOne' });

    try {
      return await db.collection('users').findOne({ _id: userId });
    } catch (error) {
      // Count the failure under the error's name, then let the caller handle it
      errorCounter.inc({ type: error.name });
      throw error;
    } finally {
      // Stop the timer on both paths so failed queries are observed too;
      // previously a rejected lookup left the timer running forever.
      end();
    }
  }
}

// Collect metrics periodically (every 10 seconds)
setInterval(async () => {
  try {
    const status = await db.adminCommand({ serverStatus: 1 });
    connectionCount.set(status.connections.current);
  } catch (error) {
    // Nothing awaits a timer callback, so a failed serverStatus call would
    // otherwise surface as an unhandled promise rejection. Log it and let
    // the next tick try again.
    console.error('Failed to collect MongoDB connection metrics:', error);
  }
}, 10000);

// Serve the registry's metrics over HTTP
app.get('/metrics', async (_req, res) => {
  // Use the content type supplied by the registry
  res.set('Content-Type', register.contentType);

  const payload = await register.metrics();
  res.end(payload);
});

Grafana Dashboard

// Example Prometheus queries for Grafana

const grafanaQueries = {
  // Average query duration over the last five minutes
  queryLatency: 'rate(mongodb_query_duration_seconds_sum[5m]) / rate(mongodb_query_duration_seconds_count[5m])',

  // Observed queries per second
  throughput: 'rate(mongodb_query_duration_seconds_count[5m])',

  // Errors per second
  errorRate: 'rate(mongodb_errors_total[5m])',

  // Current connection gauge
  connections: 'mongodb_connections_current',

  // Rate of queries slower than one second. A histogram bucket counts
  // observations less than or equal to `le`, so the slow ones are
  // "all observations minus the le=1 bucket". The label is written "1"
  // because the exporter renders bucket bounds as plain numbers.
  slowQueries: 'sum(rate(mongodb_query_duration_seconds_count[5m])) - sum(rate(mongodb_query_duration_seconds_bucket{le="1"}[5m]))'
};

Application Performance Monitoring

const { MongoClient } = require('mongodb');

/**
 * Wraps a MongoClient created with command monitoring enabled and logs
 * every command lifecycle event to the console.
 */
class APMMongoClient {
  /**
   * @param {string} uri - Connection string handed to MongoClient.
   */
  constructor(uri) {
    const options = { monitorCommands: true };
    this.client = new MongoClient(uri, options);

    this.setupMonitoring();
  }

  /**
   * Registers one listener each for the started, succeeded and failed
   * command events.
   */
  setupMonitoring() {
    const { client } = this;

    // A command was sent
    client.on('commandStarted', ({ requestId, commandName, databaseName }) => {
      console.log('Command started:', {
        requestId,
        command: commandName,
        database: databaseName,
      });
    });

    // A command completed successfully
    client.on('commandSucceeded', ({ requestId, duration, reply }) => {
      console.log('Command succeeded:', { requestId, duration, reply });
    });

    // A command failed: reported on stderr
    client.on('commandFailed', ({ requestId, duration, failure }) => {
      console.error('Command failed:', { requestId, duration, failure });
    });
  }
}

Health Checks

// Express health check endpoint.
// Responds 200 when the database answers a ping, the replica set has a
// primary and no secondary trails it by more than 10 seconds; 503 otherwise.
app.get('/health', async (req, res) => {
  try {
    // Check MongoDB connection
    await db.admin().ping();

    // Check replica set status (member state 1 is PRIMARY)
    const replStatus = await db.adminCommand({ replSetGetStatus: 1 });
    const primary = replStatus.members.find(m => m.state === 1);

    if (!primary) {
      return res.status(503).json({
        status: 'unhealthy',
        reason: 'No primary in replica set'
      });
    }

    // Check replication lag: how far the slowest secondary (state 2) trails
    // the primary's last applied operation, in milliseconds. Comparing the
    // primary's optime with the wall clock would only measure how long ago
    // the last write happened, and would flag an idle but healthy set.
    const primaryOptime = primary.optimeDate.getTime();
    const lag = replStatus.members
      .filter(m => m.state === 2)
      .reduce((worst, m) => Math.max(worst, primaryOptime - m.optimeDate.getTime()), 0);

    if (lag > 10000) {
      return res.status(503).json({
        status: 'degraded',
        reason: 'High replication lag',
        lag
      });
    }

    res.json({
      status: 'healthy',
      database: 'connected',
      replication: 'ok'
    });
  } catch (error) {
    // Any failure above (ping, command, malformed status) means unhealthy
    res.status(503).json({
      status: 'unhealthy',
      error: error.message
    });
  }
});

Alerting

/**
 * Periodic threshold checks against MongoDB server metrics, with alerts
 * written to the console and posted to a Slack webhook.
 */
class AlertingService {
  /**
   * Reads serverStatus and raises an alert for each threshold exceeded:
   * connection usage above 80%, resident/virtual memory ratio above 90%,
   * and (when the server reports a `repl` section) replication lag above
   * 10 seconds.
   *
   * @returns {Promise<void>}
   */
  async checkMetrics() {
    const status = await db.adminCommand({ serverStatus: 1 });

    // High connection usage.
    // `available` is the number of unused connections, so the capacity is
    // current + available; dividing by `available` alone overstates usage.
    const { current, available } = status.connections;
    const capacity = current + available;
    const connUsage = capacity > 0 ? current / capacity : 0;
    if (connUsage > 0.8) {
      await this.sendAlert('High connection usage', {
        current,
        available,
        usage: `${(connUsage * 100).toFixed(2)}%`
      });
    }

    // High memory usage
    // NOTE(review): resident/virtual compares two sizes of the same process,
    // not usage against the host's RAM; confirm this is the intended signal.
    const memUsage = status.mem.resident / status.mem.virtual;
    if (memUsage > 0.9) {
      await this.sendAlert('High memory usage', {
        resident: status.mem.resident,
        virtual: status.mem.virtual,
        usage: `${(memUsage * 100).toFixed(2)}%`
      });
    }

    // Replication lag.
    // serverStatus.repl has no `lag` field, so the lag is derived from
    // replSetGetStatus instead of reading a value that is always undefined.
    if (status.repl) {
      const lag = await this.getReplicationLagSeconds();
      if (lag > 10) {
        await this.sendAlert('High replication lag', { lag });
      }
    }
  }

  /**
   * Computes how far the slowest secondary trails the primary.
   *
   * @returns {Promise<number>} Lag in seconds; 0 when there is no primary
   *   or no secondary to compare against.
   */
  async getReplicationLagSeconds() {
    const { members = [] } = await db.adminCommand({ replSetGetStatus: 1 });

    // Member state 1 is PRIMARY, state 2 is SECONDARY
    const primary = members.find((member) => member.state === 1);
    if (!primary) {
      return 0;
    }

    const primaryOptime = primary.optimeDate.getTime();
    const lagMs = members
      .filter((member) => member.state === 2)
      .reduce(
        (worst, member) => Math.max(worst, primaryOptime - member.optimeDate.getTime()),
        0
      );

    return lagMs / 1000;
  }

  /**
   * Logs the alert and, when SLACK_WEBHOOK_URL is set, posts it to Slack.
   * Delivery problems are logged, not thrown, so one failed notification
   * does not abort the remaining metric checks.
   *
   * @param {string} title - Short alert headline.
   * @param {Object} details - Key/value pairs rendered as Slack fields.
   * @returns {Promise<void>}
   */
  async sendAlert(title, details) {
    // Send to Slack, PagerDuty, etc.
    console.error(`ALERT: ${title}`, details);

    // Example: Slack webhook
    const webhookUrl = process.env.SLACK_WEBHOOK_URL;
    if (!webhookUrl) {
      // No webhook configured: the console line above is the alert
      return;
    }

    try {
      const response = await fetch(webhookUrl, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          text: `🚨 ${title}`,
          attachments: [{
            color: 'danger',
            fields: Object.entries(details).map(([key, value]) => ({
              title: key,
              value: String(value),
              short: true
            }))
          }]
        })
      });

      if (!response.ok) {
        console.error(`Slack webhook responded with HTTP ${response.status}`);
      }
    } catch (error) {
      console.error('Failed to deliver alert to Slack:', error);
    }
  }
}

// Run checks every minute
setInterval(() => {
  // Nothing awaits the promise returned here, so handle a rejection in
  // place instead of leaving it as an unhandled rejection.
  alertingService.checkMetrics().catch((error) => {
    console.error('Metric check failed:', error);
  });
}, 60000);

Cassandra Monitoring

// nodetool commands for monitoring, keyed by subcommand name.
// Each value is the full command line: "nodetool <subcommand>".
const cassandraMonitoring = Object.fromEntries(
  [
    'status',
    'info',
    'tpstats',
    'cfstats',
    'compactionstats',
    'proxyhistograms',
  ].map((subcommand) => [subcommand, `nodetool ${subcommand}`])
);

// Example: Parse nodetool output
const { exec } = require('child_process');
const { promisify } = require('util');
const execPromise = promisify(exec);

/**
 * Runs `nodetool tpstats` and extracts the ReadStage thread pool counters.
 *
 * @returns {Promise<{readStage?: {active: number, pending: number, blocked: number}}>}
 *   An object with a `readStage` entry when a ReadStage row is found,
 *   otherwise an empty object.
 */
async function getCassandraMetrics() {
  const { stdout } = await execPromise('nodetool tpstats');

  // Parse thread pool stats
  const metrics = {};

  for (const line of stdout.split('\n')) {
    // Trim first so indented rows and trailing "\r" do not shift the columns,
    // and match the first column exactly, not as a substring
    const parts = line.trim().split(/\s+/);
    if (parts[0] !== 'ReadStage') {
      continue;
    }

    // Columns 1, 2 and 4 are read; column 3 is skipped
    // (presumably the completed count; confirm against your nodetool version)
    metrics.readStage = {
      active: Number.parseInt(parts[1], 10),
      pending: Number.parseInt(parts[2], 10),
      blocked: Number.parseInt(parts[4], 10)
    };
  }

  return metrics;
}

Redis Monitoring

const redis = require('redis');
const client = redis.createClient();
await client.connect();

// Get Redis info
const info = await client.info();

// Parse info sections.
// Each pattern captures the text between two section headers; a section
// that is missing (or followed by a different header) yields '' instead
// of throwing on a null match.
const sections = {
  server: info.match(/# Server([\s\S]*?)# Clients/)?.[1] ?? '',
  clients: info.match(/# Clients([\s\S]*?)# Memory/)?.[1] ?? '',
  memory: info.match(/# Memory([\s\S]*?)# Persistence/)?.[1] ?? '',
  stats: info.match(/# Stats([\s\S]*?)# Replication/)?.[1] ?? ''
};

// Monitor memory usage
const memoryInfo = await client.info('memory');
// A missing field parses to NaN, which fails both comparisons below
const usedMemory = Number.parseInt(memoryInfo.match(/used_memory:(\d+)/)?.[1], 10);
const maxMemory = Number.parseInt(memoryInfo.match(/maxmemory:(\d+)/)?.[1], 10);

// maxmemory of 0 means no limit is configured; without the `> 0` guard
// any non-zero usage would be reported as high.
if (maxMemory > 0 && usedMemory > maxMemory * 0.9) {
  console.warn('Redis memory usage high');
}

// Monitor slow log: print the ten entries returned
const slowLog = await client.slowLogGet(10);
for (const { id, timestamp, duration, command } of slowLog) {
  console.log({ id, timestamp, duration, command });
}

DynamoDB Monitoring

const { CloudWatchClient, GetMetricStatisticsCommand } = require('@aws-sdk/client-cloudwatch');

const cloudwatch = new CloudWatchClient({ region: 'us-east-1' });

/**
 * Fetches the ConsumedReadCapacityUnits datapoints of one table for the
 * last hour, in five-minute periods.
 *
 * @param {string} tableName - Value of the `TableName` dimension.
 * @returns {Promise<*>} The `Datapoints` property of the CloudWatch response.
 */
async function getDynamoDBMetrics(tableName) {
  const ONE_HOUR_MS = 3600000;
  const FIVE_MINUTES_S = 300;

  // Query window: the hour ending now
  const windowEnd = new Date();
  const windowStart = new Date(windowEnd.getTime() - ONE_HOUR_MS);

  const request = {
    Namespace: 'AWS/DynamoDB',
    MetricName: 'ConsumedReadCapacityUnits',
    Dimensions: [{ Name: 'TableName', Value: tableName }],
    StartTime: windowStart,
    EndTime: windowEnd,
    Period: FIVE_MINUTES_S,
    Statistics: ['Sum', 'Average', 'Maximum'],
  };

  const { Datapoints } = await cloudwatch.send(new GetMetricStatisticsCommand(request));
  return Datapoints;
}

.NET Monitoring

using MongoDB.Driver;
using MongoDB.Driver.Core.Events;

/// <summary>
/// Creates a MongoClient that logs command lifecycle events to the console
/// and exposes a small snapshot of the server's status.
/// </summary>
public class MonitoringService
{
    private readonly IMongoClient _client;

    /// <summary>
    /// Builds the client settings, subscribes to the command events and
    /// creates the client.
    /// </summary>
    public MonitoringService()
    {
        // NOTE(review): `connectionString` is not declared anywhere in this
        // snippet; it must be supplied by surrounding code - confirm.
        var settings = MongoClientSettings.FromConnectionString(connectionString);

        // Subscribe to command events
        settings.ClusterConfigurator = cb =>
        {
            cb.Subscribe<CommandStartedEvent>(e =>
            {
                Console.WriteLine($"Command started: {e.CommandName}");
            });

            cb.Subscribe<CommandSucceededEvent>(e =>
            {
                Console.WriteLine($"Command succeeded: {e.CommandName} ({e.Duration})");
            });

            cb.Subscribe<CommandFailedEvent>(e =>
            {
                Console.WriteLine($"Command failed: {e.CommandName} - {e.Failure}");
            });
        };

        _client = new MongoClient(settings);
    }

    /// <summary>
    /// Runs the serverStatus command against the admin database and copies
    /// the current connection count, resident memory and uptime into a
    /// <c>ServerStatus</c> object.
    /// </summary>
    /// <returns>The populated <c>ServerStatus</c>.</returns>
    public async Task<ServerStatus> GetServerStatus()
    {
        var database = _client.GetDatabase("admin");
        var command = new BsonDocument("serverStatus", 1);
        var result = await database.RunCommandAsync<BsonDocument>(command);

        // ToInt32() converts any numeric BSON value, whereas AsInt32 throws
        // InvalidCastException unless the value is stored as Int32 (the
        // server reports uptime as a double, for example).
        return new ServerStatus
        {
            Connections = result["connections"]["current"].ToInt32(),
            Memory = result["mem"]["resident"].ToInt32(),
            Uptime = result["uptime"].ToInt32()
        };
    }
}

Monitoring Best Practices

// Checklist of monitoring practices, one short imperative per entry.
const monitoringBestPractices = [
  // What to measure
  'Monitor key metrics: latency, throughput, errors',
  'Set up alerts for anomalies',
  'Track resource usage: CPU, memory, disk, network',
  'Monitor replication lag',
  'Profile slow queries',

  // Tooling
  'Implement health checks',
  'Use APM tools',
  'Create dashboards (Grafana)',
  'Log all errors',

  // Ongoing hygiene
  'Monitor connection pool usage',
  'Track index usage',
  'Set up automated alerts',
  'Review metrics regularly',
  'Establish baselines',
];

Interview Tips

  • Explain metrics: Latency, throughput, errors, resources
  • Show tools: Prometheus, Grafana, CloudWatch
  • Demonstrate profiling: Slow query analysis
  • Discuss alerting: Automated notifications
  • Mention health checks: Endpoint monitoring
  • Show examples: MongoDB, Cassandra, Redis, DynamoDB

Summary

Monitor NoSQL databases with key metrics: query latency, throughput, error rates, and resource usage. Use the MongoDB profiler for slow queries and serverStatus for server metrics. Integrate with Prometheus and Grafana for visualization. Implement health check endpoints. Set up automated alerts for anomalies. Monitor replication lag, connection pool usage, and index efficiency. Use APM tools for application-level monitoring. Track CloudWatch metrics for DynamoDB. Monitoring is essential for maintaining production database health and performance.

Test Your Knowledge

Take a quick quiz to test your understanding of this topic.

Test Your NoSQL Knowledge

Ready to put your skills to the test? Take our interactive NoSQL quiz and get instant feedback on your answers.