Files
drone-detector/docs/container-health-endpoint.md
2025-09-14 18:14:42 +02:00

6.5 KiB

Container Health & Metrics Endpoint Implementation

Creative Container Monitoring Solutions

1. Internal Health/Metrics Endpoints

Add these endpoints to each container for self-reporting metrics:

Backend Container (Node.js Example)

const express = require('express');
const os = require('os');
const fs = require('fs');

// BUG FIX: the original snippet registered a route on `app` without ever
// creating it — the example could not run as shown.
const app = express();

// Health & Metrics endpoint — each container self-reports its vitals so the
// management backend can poll a single JSON document per container.
app.get('/health/metrics', (req, res) => {
  const memUsage = process.memoryUsage();
  const cpuUsage = process.cpuUsage(); // cumulative microseconds since process start

  res.json({
    container: process.env.CONTAINER_NAME || 'backend',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(), // seconds
    memory: {
      usage: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
      total: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
      percentage: `${Math.round((memUsage.heapUsed / memUsage.heapTotal) * 100)}%`
    },
    cpu: {
      user: cpuUsage.user,
      system: cpuUsage.system,
      // BUG FIX: os.loadavg() is a 1-minute load average (runnable process
      // count), not a percentage — the original appended a misleading '%'.
      load: os.loadavg()[0].toFixed(2)
    },
    network: {
      connections: getActiveConnections(),
      requests_per_minute: getRequestRate()
    },
    disk: {
      logs: getDiskUsage('/var/log'),
      temp: getDiskUsage('/tmp')
    },
    health: 'healthy',
    version: process.env.APP_VERSION || '1.0.0'
  });
});

function getActiveConnections() {
  // Counts ESTABLISHED TCP connections by shelling out to netstat.
  // Returns a non-negative integer, or 'N/A' when netstat is unavailable
  // or its output cannot be parsed.
  try {
    const stdout = require('child_process').execSync(
      'netstat -an | grep ESTABLISHED | wc -l',
      { encoding: 'utf8' }
    );
    // Explicit radix, plus a NaN guard so garbage output degrades to 'N/A'
    // instead of propagating NaN into the metrics payload.
    const count = Number.parseInt(stdout.trim(), 10);
    return Number.isNaN(count) ? 'N/A' : count;
  } catch (e) {
    return 'N/A';
  }
}

function getRequestRate() {
  // Placeholder: a real deployment would increment this counter from
  // request-tracking middleware. Falls back to 0 when nothing has counted yet.
  if (global.requestCounter) {
    return global.requestCounter;
  }
  return 0;
}

function getDiskUsage(targetPath) {
  // Total size of all regular files under `targetPath`, rounded to MB,
  // or 'N/A' when the path is unreadable.
  // BUG FIX: fs.statSync(dir).size reports only the directory entry itself
  // (typically ~4KB), not the contents — directories must be walked.
  const sizeOf = (p) => {
    const stats = fs.statSync(p);
    if (!stats.isDirectory()) {
      return stats.size;
    }
    return fs
      .readdirSync(p)
      .reduce((total, entry) => total + sizeOf(`${p}/${entry}`), 0);
  };

  try {
    return `${Math.round(sizeOf(targetPath) / 1024 / 1024)}MB`;
  } catch (e) {
    return 'N/A';
  }
}

Frontend Container (Nginx + JS Example)

# Add to nginx.conf
# Static health response: `return 200` answers directly from nginx without
# touching any upstream, and `access_log off` keeps health polling noise
# out of the access logs.
# NOTE(review): $connections_active is provided by the stub_status module —
# confirm nginx is built with ngx_http_stub_status_module before relying on it.
location /health/metrics {
    access_log off;
    return 200 '{"container":"frontend","status":"healthy","nginx_version":"$nginx_version","connections":"$connections_active","timestamp":"$time_iso8601"}';
    add_header Content-Type application/json;
}

2. Prometheus-style Metrics Scraping

// In management.js
const scrapePrometheusMetrics = async (containerUrl) => {
  try {
    const response = await fetch(`${containerUrl}/metrics`);
    const metricsText = await response.text();
    
    // Parse Prometheus format
    const metrics = {};
    metricsText.split('\n').forEach(line => {
      if (line.startsWith('container_cpu_usage')) {
        metrics.cpu = line.split(' ')[1] + '%';
      }
      if (line.startsWith('container_memory_usage_bytes')) {
        const bytes = parseInt(line.split(' ')[1]);
        metrics.memory = Math.round(bytes / 1024 / 1024) + 'MB';
      }
    });
    
    return metrics;
  } catch (error) {
    return { error: 'Prometheus metrics unavailable' };
  }
};

3. Socket.IO Real-time Metrics Broadcasting

// Each container broadcasts its metrics via Socket.IO
const io = require('socket.io-client');
const socket = io('http://management-backend:3000');

// Push a sample every 10 seconds; socket.io-client buffers and reconnects
// automatically, so transient management-backend outages are tolerated.
setInterval(() => {
  const metrics = {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now()
  };

  socket.emit('container_metrics', metrics);
}, 10000); // Every 10 seconds

// Management backend collects these (server-side `io` from socket.io).
// BUG FIX: a socket.io server does not receive custom events directly —
// it emits 'connection' per client, and event listeners belong on each
// connected client socket.
io.on('connection', (client) => {
  client.on('container_metrics', (metrics) => {
    containerMetricsCache[metrics.container] = metrics;
  });
});

4. Log File Tailing Approach

// Parse container logs for metrics
const tailContainerLogs = async (containerName) => {
  try {
    const { stdout } = await execAsync(`docker logs --tail 50 ${containerName} | grep "METRICS:"`);
    const logLines = stdout.split('\n').filter(line => line.includes('METRICS:'));
    
    if (logLines.length > 0) {
      const lastMetric = logLines[logLines.length - 1];
      const metricsJson = lastMetric.split('METRICS:')[1];
      return JSON.parse(metricsJson);
    }
  } catch (error) {
    return { error: 'Log metrics unavailable' };
  }
};

// Containers log metrics in structured format
// (the "METRICS: " prefix is what tailContainerLogs greps for above).
// NOTE(review): getCurrentCPU/getCurrentMemory are assumed to be defined
// elsewhere in the container code — confirm before copying this snippet.
console.log(`METRICS: ${JSON.stringify({
  cpu: getCurrentCPU(),
  memory: getCurrentMemory(),
  timestamp: new Date().toISOString()
})}`);

5. Shared Volume Metrics Files

// Each container writes metrics to shared volume
const writeMetricsToFile = () => {
  const metrics = {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now()
  };
  
  fs.writeFileSync(`/shared/metrics/${process.env.CONTAINER_NAME}.json`, JSON.stringify(metrics));
};

// Management reads from shared volume
const readSharedMetrics = () => {
  const metricsDir = '/shared/metrics';
  const files = fs.readdirSync(metricsDir);
  
  return files.reduce((acc, file) => {
    if (file.endsWith('.json')) {
      const metrics = JSON.parse(fs.readFileSync(path.join(metricsDir, file)));
      acc[file.replace('.json', '')] = metrics;
    }
    return acc;
  }, {});
};

6. Database-based Metrics Collection

// Containers insert metrics into shared database
const recordMetrics = async () => {
  await db.query(`
    INSERT INTO container_metrics (container_name, cpu_usage, memory_usage, timestamp)
    VALUES (?, ?, ?, ?)
  `, [process.env.CONTAINER_NAME, getCurrentCPU(), getCurrentMemory(), new Date()]);
};

// Management queries latest metrics
const getLatestMetrics = async () => {
  const result = await db.query(`
    SELECT container_name, cpu_usage, memory_usage, timestamp
    FROM container_metrics
    WHERE timestamp > NOW() - INTERVAL 1 MINUTE
    ORDER BY timestamp DESC
  `);
  
  return result.reduce((acc, row) => {
    acc[row.container_name] = {
      cpu: row.cpu_usage,
      memory: row.memory_usage,
      lastUpdate: row.timestamp
    };
    return acc;
  }, {});
};

Implementation Priority

  1. Health Endpoints - Most reliable, direct communication
  2. Socket.IO Broadcasting - Real-time, low overhead
  3. Prometheus Metrics - Industry standard, rich data
  4. Shared Volume Files - Simple, filesystem-based
  5. Log Tailing - Works with existing logging
  6. Database Collection - Persistent, queryable history

Benefits

  • Fallback Chain: Multiple methods ensure metrics are always available
  • Self-Reporting: Containers know their own state best
  • Real-time: Direct communication provides immediate updates
  • Standardized: Each method can provide consistent metric format
  • Resilient: If one method fails, others still work