# Container Health & Metrics Endpoint Implementation
## Creative Container Monitoring Solutions
### 1. Internal Health/Metrics Endpoints
Add these endpoints to each container for self-reporting metrics:
#### Backend Container (Node.js Example)
```javascript
const express = require('express');
const os = require('os');
const { execSync } = require('child_process');

// In an existing service, reuse the app that is already created
const app = express();

// Health & metrics endpoint
app.get('/health/metrics', (req, res) => {
  const memUsage = process.memoryUsage();
  const cpuUsage = process.cpuUsage();

  res.json({
    container: process.env.CONTAINER_NAME || 'backend',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    memory: {
      usage: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
      total: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
      percentage: `${Math.round((memUsage.heapUsed / memUsage.heapTotal) * 100)}%`
    },
    cpu: {
      user: cpuUsage.user,     // microseconds of user CPU time since process start
      system: cpuUsage.system, // microseconds of system CPU time since process start
      load: os.loadavg()[0].toFixed(2) // 1-minute load average (not a percentage)
    },
    network: {
      connections: getActiveConnections(),
      requests_per_minute: getRequestRate()
    },
    disk: {
      logs: getDiskUsage('/var/log'),
      temp: getDiskUsage('/tmp')
    },
    health: 'healthy',
    version: process.env.APP_VERSION || '1.0.0'
  });
});

function getActiveConnections() {
  try {
    // Requires netstat to be installed in the container image
    const netstat = execSync('netstat -an | grep ESTABLISHED | wc -l', { encoding: 'utf8' });
    return parseInt(netstat.trim(), 10);
  } catch (e) {
    return 'N/A';
  }
}

function getRequestRate() {
  // Maintained by a request-counting middleware (see the sketch below)
  return global.requestCounter || 0;
}

function getDiskUsage(dirPath) {
  try {
    // du reports the directory's total content size; fs.statSync would only
    // return the size of the directory entry itself
    const du = execSync(`du -sm ${dirPath}`, { encoding: 'utf8' });
    return `${parseInt(du.trim(), 10)}MB`;
  } catch (e) {
    return 'N/A';
  }
}
```
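`getRequestRate()` relies on a counter that some other part of the service maintains. A minimal sketch of that counter as Express middleware, using the `global.requestCounter` placeholder from above (the one-minute window mechanism is an assumption):

```javascript
// Request-rate tracking sketch: count requests in the current one-minute window
// and publish the previous window's total as "requests per minute".
// Register this middleware before the routes so every request is counted.
let windowCount = 0;
global.requestCounter = 0;

app.use((req, res, next) => {
  windowCount++;
  next();
});

setInterval(() => {
  global.requestCounter = windowCount; // expose the last full minute's count
  windowCount = 0;
}, 60 * 1000);
```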
#### Frontend Container (Nginx + JS Example)
```nginx
# Add to nginx.conf
location /health/metrics {
    access_log off;
    default_type application/json;
    # $connections_active is provided by the stub_status module; nginx must be built with it
    return 200 '{"container":"frontend","status":"healthy","nginx_version":"$nginx_version","connections":"$connections_active","timestamp":"$time_iso8601"}';
}
```
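On the management side, these endpoints can be polled and aggregated. A sketch assuming Node 18+ (for the global `fetch` and `AbortController`) and assuming the service names and ports below; adjust them to the real compose network:

```javascript
// Poll each container's /health/metrics endpoint and collect the results.
// The URLs are assumptions about the compose service names and ports.
const CONTAINERS = {
  backend: 'http://backend:3000/health/metrics',
  frontend: 'http://frontend/health/metrics'
};

const pollContainerMetrics = async () => {
  const results = {};
  for (const [name, url] of Object.entries(CONTAINERS)) {
    try {
      const controller = new AbortController();
      const timeout = setTimeout(() => controller.abort(), 2000); // 2s timeout per container
      const response = await fetch(url, { signal: controller.signal });
      clearTimeout(timeout);
      results[name] = await response.json();
    } catch (error) {
      results[name] = { container: name, health: 'unreachable', error: 'health endpoint unreachable' };
    }
  }
  return results;
};
```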
### 2. Prometheus-style Metrics Scraping
```javascript
// In management.js
// Metric names follow the cAdvisor-style container_* conventions; adjust them
// to whatever exporter the containers actually run.
const scrapePrometheusMetrics = async (containerUrl) => {
  try {
    const response = await fetch(`${containerUrl}/metrics`);
    const metricsText = await response.text();

    // Parse the Prometheus text exposition format line by line
    const metrics = {};
    metricsText.split('\n').forEach(line => {
      if (line.startsWith('container_cpu_usage')) {
        // Cumulative CPU seconds; a percentage would need the rate between two scrapes
        metrics.cpu = parseFloat(line.split(' ')[1]);
      }
      if (line.startsWith('container_memory_usage_bytes')) {
        const bytes = parseInt(line.split(' ')[1], 10);
        metrics.memory = Math.round(bytes / 1024 / 1024) + 'MB';
      }
    });
    return metrics;
  } catch (error) {
    return { error: 'Prometheus metrics unavailable' };
  }
};
```
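The scraper above expects each container to already expose a Prometheus-format `/metrics` endpoint. For the Node.js containers, one way to provide it is the `prom-client` package (an addition, not part of the existing stack); note that its default collectors emit `process_*`/`nodejs_*` metric names rather than the cAdvisor-style `container_*` names parsed above, so the parser would need to match whichever exporter is actually used:

```javascript
// Prometheus exposition sketch using the prom-client package.
// collectDefaultMetrics() registers process CPU, memory, event-loop and GC metrics.
const client = require('prom-client');
client.collectDefaultMetrics();

app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.end(await client.register.metrics());
});
```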
### 3. Socket.IO Real-time Metrics Broadcasting
```javascript
// --- Container side (socket.io-client): push metrics every 10 seconds ---
const { io } = require('socket.io-client');
const socket = io('http://management-backend:3000');

setInterval(() => {
  const metrics = {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now()
  };
  socket.emit('container_metrics', metrics);
}, 10000); // Every 10 seconds

// --- Management backend side (socket.io server): collect the broadcasts ---
const { Server } = require('socket.io');
const ioServer = new Server(3000); // standalone Socket.IO server on port 3000
const containerMetricsCache = {};

ioServer.on('connection', (client) => {
  client.on('container_metrics', (metrics) => {
    containerMetricsCache[metrics.container] = metrics;
  });
});
```
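`getCurrentCPU()` and `getCurrentMemory()` are used as helpers throughout the remaining examples but are not defined in this document. One possible sketch based on `process.cpuUsage()` deltas between samples (the exact approach is an assumption):

```javascript
// Hypothetical implementations of the getCurrentCPU()/getCurrentMemory() helpers.
// CPU is estimated from process.cpuUsage() deltas between calls; memory is the
// process resident set size in MB.
let lastCpu = process.cpuUsage();
let lastSample = Date.now();

function getCurrentCPU() {
  const now = Date.now();
  const cpu = process.cpuUsage();
  const elapsedMicros = (now - lastSample) * 1000;
  const usedMicros = (cpu.user - lastCpu.user) + (cpu.system - lastCpu.system);
  lastCpu = cpu;
  lastSample = now;
  if (elapsedMicros === 0) return '0%';
  return `${Math.round((usedMicros / elapsedMicros) * 100)}%`;
}

function getCurrentMemory() {
  return `${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB`;
}
```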
### 4. Log File Tailing Approach
```javascript
// Parse container logs for metrics (requires access to the Docker CLI/socket)
const { exec } = require('child_process');
const { promisify } = require('util');
const execAsync = promisify(exec);

const tailContainerLogs = async (containerName) => {
  try {
    const { stdout } = await execAsync(`docker logs --tail 50 ${containerName} | grep "METRICS:"`);
    const logLines = stdout.split('\n').filter(line => line.includes('METRICS:'));
    if (logLines.length > 0) {
      const lastMetric = logLines[logLines.length - 1];
      const metricsJson = lastMetric.split('METRICS:')[1].trim();
      return JSON.parse(metricsJson);
    }
    return { error: 'No METRICS lines found' };
  } catch (error) {
    return { error: 'Log metrics unavailable' };
  }
};

// Containers log metrics in a structured, grep-friendly format
console.log(`METRICS: ${JSON.stringify({
  cpu: getCurrentCPU(),
  memory: getCurrentMemory(),
  timestamp: new Date().toISOString()
})}`);
```
### 5. Shared Volume Metrics Files
```javascript
// Each container writes its metrics to a shared volume (mounted at /shared/metrics)
const fs = require('fs');
const path = require('path');

const writeMetricsToFile = () => {
  const metrics = {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now()
  };
  fs.writeFileSync(
    path.join('/shared/metrics', `${process.env.CONTAINER_NAME}.json`),
    JSON.stringify(metrics)
  );
};

// Management backend reads all metrics files from the shared volume
const readSharedMetrics = () => {
  const metricsDir = '/shared/metrics';
  const files = fs.readdirSync(metricsDir);
  return files.reduce((acc, file) => {
    if (file.endsWith('.json')) {
      const metrics = JSON.parse(fs.readFileSync(path.join(metricsDir, file), 'utf8'));
      acc[file.replace('.json', '')] = metrics;
    }
    return acc;
  }, {});
};
```
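A crashed container simply stops updating its file, so the reading side should probably treat stale files as unhealthy. A small sketch reusing the `fs` and `path` imports above (the 30-second threshold is an arbitrary assumption):

```javascript
// Treat metrics files that have not been updated recently as a sign the
// container is down. The 30-second threshold is an assumption.
const isMetricsFileStale = (file, maxAgeMs = 30 * 1000) => {
  const { mtimeMs } = fs.statSync(path.join('/shared/metrics', file));
  return Date.now() - mtimeMs > maxAgeMs;
};
```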
### 6. Database-based Metrics Collection
```javascript
// Containers insert metrics into a shared database (MySQL-style syntax assumed)
const recordMetrics = async () => {
  await db.query(`
    INSERT INTO container_metrics (container_name, cpu_usage, memory_usage, timestamp)
    VALUES (?, ?, ?, ?)
  `, [process.env.CONTAINER_NAME, getCurrentCPU(), getCurrentMemory(), new Date()]);
};

// Management queries the latest metrics per container from the last minute
const getLatestMetrics = async () => {
  const rows = await db.query(`
    SELECT container_name, cpu_usage, memory_usage, timestamp
    FROM container_metrics
    WHERE timestamp > NOW() - INTERVAL 1 MINUTE
    ORDER BY timestamp DESC
  `);
  // Rows are newest-first, so keep only the first (most recent) row per container.
  // Note: some drivers (e.g. mysql2) return [rows, fields]; adjust accordingly.
  return rows.reduce((acc, row) => {
    if (!acc[row.container_name]) {
      acc[row.container_name] = {
        cpu: row.cpu_usage,
        memory: row.memory_usage,
        lastUpdate: row.timestamp
      };
    }
    return acc;
  }, {});
};
```
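The `container_metrics` table is not defined elsewhere; a plausible MySQL-style schema created from Node at startup, plus a recording interval matching the 10-second cadence used in the Socket.IO example (both are assumptions):

```javascript
// Hypothetical schema for the container_metrics table referenced above.
const ensureMetricsTable = async () => {
  await db.query(`
    CREATE TABLE IF NOT EXISTS container_metrics (
      id INT AUTO_INCREMENT PRIMARY KEY,
      container_name VARCHAR(64) NOT NULL,
      cpu_usage VARCHAR(16),
      memory_usage VARCHAR(16),
      timestamp DATETIME NOT NULL,
      INDEX idx_container_time (container_name, timestamp)
    )
  `);
};

// Record metrics on the same 10-second cadence used elsewhere in this document
setInterval(recordMetrics, 10000);
```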
## Implementation Priority
1. **Health Endpoints** - Most reliable, direct communication
2. **Socket.IO Broadcasting** - Real-time, low overhead
3. **Prometheus Metrics** - Industry standard, rich data
4. **Shared Volume Files** - Simple, filesystem-based
5. **Log Tailing** - Works with existing logging
6. **Database Collection** - Persistent, queryable history
## Benefits
- **Fallback Chain**: Multiple methods ensure metrics are always available (see the sketch after this list)
- **Self-Reporting**: Containers know their own state best
- **Real-time**: Direct communication provides immediate updates
- **Standardized**: Each method can provide consistent metric format
- **Resilient**: If one method fails, others still work
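
As an illustration of the fallback chain, the collectors sketched above could be tried in the priority order listed earlier; wiring them together like this is an assumption, not an existing part of management.js:

```javascript
// Try each collection method in priority order and return the first usable result.
// Function names refer to the earlier examples in this document.
const getMetricsWithFallback = async (containerName, containerUrl) => {
  const strategies = [
    () => pollContainerMetrics().then(all => all[containerName]),      // 1. health endpoints
    () => Promise.resolve(containerMetricsCache[containerName]),       // 2. Socket.IO cache
    () => scrapePrometheusMetrics(containerUrl),                       // 3. Prometheus scrape
    () => Promise.resolve(readSharedMetrics()[containerName]),         // 4. shared volume
    () => tailContainerLogs(containerName),                            // 5. log tailing
    () => getLatestMetrics().then(all => all[containerName])           // 6. database
  ];

  for (const attempt of strategies) {
    try {
      const metrics = await attempt();
      if (metrics && !metrics.error) return metrics;
    } catch (e) {
      // Fall through to the next strategy
    }
  }
  return { container: containerName, health: 'unknown' };
};
```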