Monitoring Microservices
The Three Pillars
1. Metrics
const prometheus = require('prom-client');

// Dedicated registry so the /metrics endpoint exports only our metrics.
const register = new prometheus.Registry();

// Counter: monotonically increasing request total, labelled per route/status.
const httpRequests = new prometheus.Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
});

// Histogram: request latency distribution (seconds), for percentile queries.
const httpDuration = new prometheus.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration',
  labelNames: ['method', 'path'],
});

// Gauge: a value that can rise and fall — current open connections.
const activeConnections = new prometheus.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
});

// Attach every metric to the custom registry.
for (const metric of [httpRequests, httpDuration, activeConnections]) {
  register.registerMetric(metric);
}
// Middleware
// Per-request metrics middleware: counts requests and records latency
// once the response has been written.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route template (e.g. '/users/:id') over the raw
    // URL path: raw paths with embedded ids create unbounded label
    // cardinality in Prometheus. Fall back to req.path when no route
    // matched (404s, unrouted middleware).
    const path = req.route?.path ?? req.path;
    httpRequests.inc({
      method: req.method,
      path,
      status: res.statusCode,
    });
    httpDuration.observe({ method: req.method, path }, duration);
  });
  next();
});
// Metrics endpoint
app.get('/metrics', async (req, res) => {
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
});2. Logs
const winston = require('winston');

// Emit structured JSON with timestamps so log aggregators (ELK, Loki)
// can index fields instead of parsing free text.
const jsonFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.json(),
);

const logger = winston.createLogger({
  format: jsonFormat,
  transports: [
    new winston.transports.Console(),
    new winston.transports.File({ filename: 'app.log' }),
  ],
});
// Structured logging
// Log every incoming request with enough context to correlate it across
// services (correlation id) and tie it back to a user.
app.use((req, res, next) => {
  const meta = {
    method: req.method,
    path: req.path,
    correlationId: req.headers['x-correlation-id'],
    userId: req.user?.id,
  };
  logger.info('Request received', meta);
  next();
});
// Error logging
app.use((err, req, res, next) => {
logger.error('Error occurred', {
error: err.message,
stack: err.stack,
correlationId: req.headers['x-correlation-id']
});
next(err);
});3. Traces
const { trace } = require('@opentelemetry/api');
const tracer = trace.getTracer('user-service');
app.post('/users', async (req, res) => {
const span = tracer.startSpan('create-user');
try {
span.setAttribute('user.email', req.body.email);
const user = await User.create(req.body);
span.setStatus({ code: SpanStatusCode.OK });
res.json(user);
} catch (error) {
span.recordException(error);
span.setStatus({ code: SpanStatusCode.ERROR });
res.status(500).json({ error: error.message });
} finally {
span.end();
}
});Application Performance Monitoring (APM)
// New Relic
require('newrelic');
// Datadog
const tracer = require('dd-trace').init();
// Elastic APM
const apm = require('elastic-apm-node').start({
serviceName: 'user-service',
serverUrl: 'http://apm-server:8200'
});Health Checks
/**
 * Aggregates named async health checks into a single UP/DOWN report.
 */
class HealthCheck {
  constructor() {
    // name -> async function that throws when the dependency is unhealthy
    this.checks = new Map();
  }

  /**
   * Register a named check.
   * @param {string} name - Dependency identifier (e.g. 'database').
   * @param {() => Promise<void>} checkFn - Resolves when healthy, throws when not.
   */
  register(name, checkFn) {
    this.checks.set(name, checkFn);
  }

  /**
   * Run every registered check and summarize the results.
   * The checks are independent, so they run in parallel (Promise.all)
   * rather than sequentially — total latency is the slowest single
   * check, not the sum of all of them.
   * @returns {Promise<{status: 'UP'|'DOWN', checks: Object}>}
   */
  async execute() {
    const entries = [...this.checks.entries()];
    const settled = await Promise.all(
      entries.map(async ([name, checkFn]) => {
        try {
          await checkFn();
          return [name, { status: 'UP' }];
        } catch (error) {
          return [name, { status: 'DOWN', error: error.message }];
        }
      }),
    );
    const results = Object.fromEntries(settled);
    const healthy = settled.every(([, result]) => result.status === 'UP');
    return {
      status: healthy ? 'UP' : 'DOWN',
      checks: results,
    };
  }
}
const healthCheck = new HealthCheck();

// One check per critical dependency; each rejects when the dependency
// is unreachable, which execute() reports as DOWN.
healthCheck.register('database', () => db.ping());
healthCheck.register('cache', () => redis.ping());
healthCheck.register('external-api', () => axios.get('http://external-api/health'));

// Health endpoint: 200 when every dependency is UP, 503 otherwise
// (503 tells load balancers to stop routing traffic here).
app.get('/health', async (req, res) => {
  const result = await healthCheck.execute();
  res.status(result.status === 'UP' ? 200 : 503).json(result);
});

Alerting
/**
 * Tracks per-metric thresholds and raises alerts when a measured value
 * exceeds one.
 */
class AlertManager {
  constructor() {
    // metric name -> numeric threshold
    this.thresholds = new Map();
    // history of every alert raised (newest last)
    this.alerts = [];
  }

  /**
   * Configure the alerting threshold for a metric.
   * @param {string} metric
   * @param {number} threshold - An alert fires when a value is strictly greater.
   */
  setThreshold(metric, threshold) {
    this.thresholds.set(metric, threshold);
  }

  /**
   * Compare a measured value against the configured threshold.
   * Uses an explicit undefined check (not truthiness) so a legitimate
   * threshold of 0 still triggers alerts — `if (threshold && ...)` would
   * silently skip it.
   */
  check(metric, value) {
    const threshold = this.thresholds.get(metric);
    if (threshold !== undefined && value > threshold) {
      this.alert(metric, value, threshold);
    }
  }

  /** Record the alert and dispatch it to the alerting backend. */
  alert(metric, value, threshold) {
    const alert = {
      metric,
      value,
      threshold,
      timestamp: new Date(),
    };
    this.alerts.push(alert);
    // sendAlert is async: don't leave the promise floating. A failed
    // delivery must not crash the process with an unhandled rejection.
    this.sendAlert(alert).catch((err) => {
      console.error('Failed to deliver alert', err);
    });
  }

  /** Forward the alert to the notification system (PagerDuty, Slack, email, etc.). */
  async sendAlert(alert) {
    await axios.post('http://alerting-service/alerts', alert);
  }
}
const alertManager = new AlertManager();
alertManager.setThreshold('error_rate', 0.05);    // alert above 5% errors
alertManager.setThreshold('response_time', 1000); // alert above 1000ms latency

// Poll the metrics backend once a minute and evaluate each threshold.
const POLL_INTERVAL_MS = 60000;
setInterval(async () => {
  const errorRate = await getErrorRate();
  alertManager.check('error_rate', errorRate);
  const responseTime = await getAvgResponseTime();
  alertManager.check('response_time', responseTime);
}, POLL_INTERVAL_MS);

Dashboards
# Grafana dashboard config
apiVersion: 1
providers:
- name: 'Microservices'
folder: 'Services'
type: file
options:
path: /var/lib/grafana/dashboards
# Dashboard JSON
{
"dashboard": {
"title": "User Service",
"panels": [
{
"title": "Request Rate",
"targets": [
{
"expr": "rate(http_requests_total[5m])"
}
]
},
{
"title": "Error Rate",
"targets": [
{
"expr": "rate(http_requests_total{status=~\"5..\"}[5m])"
}
]
},
{
"title": "Response Time",
"targets": [
{
"expr": "histogram_quantile(0.95, http_request_duration_seconds)"
}
]
}
]
}
}
Service Level Objectives (SLOs)
/**
 * Tracks Service Level Objectives and evaluates them against live metrics.
 */
class SLOMonitor {
  constructor() {
    // name -> { target, window, compare }
    this.slos = new Map();
  }

  /**
   * Define an SLO.
   * @param {string} name - Metric name used in the backing query.
   * @param {number} target - Objective value.
   * @param {string} window - Lookback window (e.g. '30d').
   * @param {(actual: number, target: number) => boolean} [compare]
   *   How "met" is decided. Defaults to actual >= target, which suits
   *   availability-style SLOs. For latency or error-rate SLOs — where
   *   LOWER is better — pass (a, t) => a <= t, otherwise the result is
   *   inverted (the original code always used >=, which mis-reported
   *   those objectives).
   */
  define(name, target, window, compare = (actual, target) => actual >= target) {
    this.slos.set(name, { target, window, compare });
  }

  /**
   * Measure the SLO and report whether the objective is currently met.
   * @param {string} name - A previously defined SLO name.
   * @returns {Promise<{name: string, target: number, actual: number, met: boolean}>}
   * @throws {Error} When the SLO name was never defined (instead of the
   *   opaque TypeError the undefined lookup used to produce).
   */
  async check(name) {
    const slo = this.slos.get(name);
    if (!slo) {
      throw new Error(`Unknown SLO: ${name}`);
    }
    const actual = await this.measure(name, slo.window);
    return {
      name,
      target: slo.target,
      actual,
      met: slo.compare(actual, slo.target),
    };
  }

  /**
   * Query the metrics backend for the windowed average of the metric.
   * NOTE(review): prom-client exposes no query() API — this presumably
   * targets a PromQL HTTP client; confirm which client `prometheus`
   * refers to here.
   */
  async measure(name, window) {
    const query = `avg_over_time(${name}[${window}])`;
    const result = await prometheus.query(query);
    return result.value;
  }
}
const sloMonitor = new SLOMonitor();
// 99.9% availability
sloMonitor.define('availability', 0.999, '30d');
// 95th percentile < 200ms
sloMonitor.define('latency_p95', 200, '30d');
// Error rate < 0.1%
sloMonitor.define('error_rate', 0.001, '30d');Distributed Tracing
// Correlation ID middleware
app.use((req, res, next) => {
const correlationId = req.headers['x-correlation-id'] || generateId();
req.correlationId = correlationId;
res.setHeader('x-correlation-id', correlationId);
next();
});
// Propagate to downstream services
async function callService(url, data, req) {
return axios.post(url, data, {
headers: {
'x-correlation-id': req.correlationId
}
});
}Best Practices
Monitor the Four Golden Signals:
- Latency
- Traffic
- Errors
- Saturation
Use structured logging
Implement distributed tracing
Set up alerts for SLOs
Create dashboards
Monitor dependencies
Track business metrics
Interview Tips
- Explain pillars: Metrics, logs, traces
- Show implementation: Prometheus, Winston, OpenTelemetry
- Demonstrate health checks: Liveness and readiness
- Discuss alerting: Thresholds and notifications
- Mention SLOs: Service level objectives
- Show dashboards: Grafana visualization
Summary
Monitoring microservices requires metrics (Prometheus), logs (Winston/ELK), and traces (OpenTelemetry). Implement health checks, alerting, and dashboards. Track the four golden signals: latency, traffic, errors, and saturation. Use distributed tracing with correlation IDs. Define and monitor SLOs. Essential for maintaining reliable microservices.
Test Your Knowledge
Take a quick quiz to test your understanding of this topic.