237 lines
6.5 KiB
Markdown
237 lines
6.5 KiB
Markdown
# Container Health & Metrics Endpoint Implementation
|
|
|
|
## Creative Container Monitoring Solutions
|
|
|
|
### 1. Internal Health/Metrics Endpoints
|
|
|
|
Add these endpoints to each container for self-reporting metrics:
|
|
|
|
#### Backend Container (Node.js Example)
|
|
```javascript
|
|
const express = require('express');
|
|
const os = require('os');
|
|
const fs = require('fs');
|
|
|
|
// Health & Metrics endpoint
|
|
/**
 * GET /health/metrics — self-reported snapshot of this container's state.
 *
 * Responds with JSON describing heap usage, process CPU time, the 1-minute
 * load average, connection/request counters, disk figures and the app version.
 */
app.get('/health/metrics', (req, res) => {
  // Formats a byte count as whole megabytes, e.g. "12MB".
  const toMB = (bytes) => `${Math.round(bytes / 1024 / 1024)}MB`;

  const { heapUsed, heapTotal } = process.memoryUsage();
  const { user, system } = process.cpuUsage();

  const payload = {
    container: process.env.CONTAINER_NAME || 'backend',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    memory: {
      usage: toMB(heapUsed),
      total: toMB(heapTotal),
      percentage: `${Math.round((heapUsed / heapTotal) * 100)}%`,
    },
    cpu: {
      user,
      system,
      // NOTE(review): os.loadavg() is a load average, not a percentage; the
      // '%' suffix is kept only because consumers may already depend on it.
      load: `${os.loadavg()[0].toFixed(2)}%`,
    },
    network: {
      connections: getActiveConnections(),
      requests_per_minute: getRequestRate(),
    },
    disk: {
      logs: getDiskUsage('/var/log'),
      temp: getDiskUsage('/tmp'),
    },
    health: 'healthy',
    version: process.env.APP_VERSION || '1.0.0',
  };

  res.json(payload);
});
|
|
|
|
/**
 * Count established connections as reported by `netstat -an`.
 *
 * The listing is filtered here in JavaScript instead of through a shell
 * pipeline: with `netstat ... | grep ... | wc -l` the pipeline's exit status
 * is that of `wc`, so a container without netstat reported 0 connections
 * instead of 'N/A'. Running netstat on its own lets that failure surface.
 *
 * @param {() => string} [listSockets] - Returns the raw socket listing.
 *   Defaults to running `netstat -an` synchronously; overridable for tests.
 * @returns {number|string} Number of listing lines containing "ESTABLISHED",
 *   or 'N/A' when the listing cannot be obtained.
 */
function getActiveConnections(
  listSockets = () => require('child_process').execSync('netstat -an', { encoding: 'utf8' })
) {
  try {
    return listSockets()
      .split('\n')
      .filter((line) => line.includes('ESTABLISHED'))
      .length;
  } catch (e) {
    // Best effort: a missing or failing netstat must not break the health endpoint.
    return 'N/A';
  }
}
|
|
|
|
/**
 * Report the request counter exposed on `global.requestCounter`.
 *
 * TODO: implement real request counter logic; for now this returns whatever
 * value is stored globally and falls back to 0 when that value is falsy.
 *
 * @returns {*} The stored counter, or 0 when it is unset or falsy.
 */
function getRequestRate() {
  const counter = global.requestCounter;
  return counter ? counter : 0;
}
|
|
|
|
/**
 * Report how much space a file or directory tree occupies, in whole megabytes.
 *
 * `fs.statSync(dir).size` is only the size of the directory entry itself, so
 * directories such as /var/log are walked and the sizes of the files inside
 * them are summed. The walk is synchronous; keep it to small directories.
 *
 * @param {string} path - File or directory to measure.
 * @returns {string} Size formatted like "12MB", or 'N/A' when `path` cannot
 *   be read.
 */
function getDiskUsage(path) {
  // Total bytes under `target`; lstat is used so symlinks are never followed.
  const sizeOf = (target) => {
    const stats = fs.lstatSync(target);
    if (!stats.isDirectory()) {
      return stats.size;
    }

    let total = 0;
    for (const name of fs.readdirSync(target)) {
      try {
        total += sizeOf(`${target}/${name}`);
      } catch (e) {
        // Entry vanished (e.g. log rotation) or is unreadable: leave it out
        // rather than discarding the whole measurement.
      }
    }
    return total;
  };

  try {
    return `${Math.round(sizeOf(path) / 1024 / 1024)}MB`;
  } catch (e) {
    return 'N/A';
  }
}
|
|
```
|
|
|
|
#### Frontend Container (Nginx Example)
|
|
```nginx
|
|
# Add to nginx.conf
|
|
location /health/metrics {
    access_log off;

    # A body sent by `return` is typed by default_type. Using
    # `add_header Content-Type ...` instead would emit a second Content-Type
    # header next to the default one.
    default_type application/json;

    # $connections_active is provided by ngx_http_stub_status_module; nginx
    # must be built with that module for this variable to exist.
    return 200 '{"container":"frontend","status":"healthy","nginx_version":"$nginx_version","connections":"$connections_active","timestamp":"$time_iso8601"}';
}
|
|
```
|
|
|
|
### 2. Prometheus-style Metrics Scraping
|
|
|
|
```javascript
|
|
// In management.js
|
|
/**
 * Fetch `${containerUrl}/metrics` and pull CPU and memory figures out of the
 * Prometheus text exposition format.
 *
 * Samples are matched as `name{labels} value [timestamp]`, so label values
 * containing spaces and trailing timestamps do not shift the value column.
 * Memory is parsed with Number() because sample values may use exponent
 * notation (e.g. 1.048576e+07), which parseInt would truncate to 1.
 * When several samples share a prefix, the last one wins.
 *
 * @param {string} containerUrl - Base URL of the container, without a trailing slash.
 * @returns {Promise<{cpu?: string, memory?: string}|{error: string}>} Extracted
 *   metrics, or `{ error: 'Prometheus metrics unavailable' }` when the request
 *   fails or answers with a non-2xx status.
 */
const scrapePrometheusMetrics = async (containerUrl) => {
  // Metric name, optional {label set}, then the sample value.
  const SAMPLE = /^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{.*\})?\s+(\S+)/;

  try {
    const response = await fetch(`${containerUrl}/metrics`);
    if (!response.ok) {
      // An error page is not a metrics document; do not try to parse it.
      return { error: 'Prometheus metrics unavailable' };
    }
    const metricsText = await response.text();

    const metrics = {};
    for (const rawLine of metricsText.split('\n')) {
      const sample = SAMPLE.exec(rawLine.trim());
      if (!sample) {
        continue; // blank line or "# HELP" / "# TYPE" comment
      }
      const [, name, value] = sample;

      if (name.startsWith('container_cpu_usage')) {
        metrics.cpu = `${value}%`;
      }
      if (name.startsWith('container_memory_usage_bytes')) {
        const bytes = Number(value);
        if (Number.isFinite(bytes)) {
          metrics.memory = `${Math.round(bytes / 1024 / 1024)}MB`;
        }
      }
    }

    return metrics;
  } catch (error) {
    return { error: 'Prometheus metrics unavailable' };
  }
};
|
|
```
|
|
|
|
### 3. Socket.IO Real-time Metrics Broadcasting
|
|
|
|
```javascript
|
|
// Each container broadcasts its metrics via Socket.IO
|
|
const io = require('socket.io-client');
|
|
const socket = io('http://management-backend:3000');
|
|
|
|
// How often this container pushes a metrics sample: every 10 seconds.
const METRICS_INTERVAL_MS = 10000;

/**
 * Sample the current CPU/memory figures and emit them to the management
 * backend as a `container_metrics` event.
 */
const publishMetrics = () => {
  socket.emit('container_metrics', {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now(),
  });
};

setInterval(publishMetrics, METRICS_INTERVAL_MS);
|
|
|
|
// Management backend collects these
|
|
// NOTE: `io` here is the management backend's Socket.IO *server* instance.
// A server receives custom events on each connected socket, not on `io`
// itself, so the listener is registered inside the 'connection' handler.
io.on('connection', (socket) => {
  socket.on('container_metrics', (metrics) => {
    // Keep only the most recent sample per container.
    containerMetricsCache[metrics.container] = metrics;
  });
});
|
|
```
|
|
|
|
### 4. Log File Tailing Approach
|
|
|
|
```javascript
|
|
// Parse container logs for metrics
|
|
/**
 * Read the most recent `METRICS:` line from a container's last 50 log lines
 * and return the JSON payload that follows the marker.
 *
 * `containerName` ends up inside a shell command, so it is checked against the
 * characters Docker allows in container names/IDs before anything is run.
 * Filtering happens in JavaScript, and every failure mode (invalid name,
 * docker error, no metric line, malformed JSON) yields the same error object
 * instead of `undefined`.
 *
 * @param {string} containerName - Docker container name or ID.
 * @returns {Promise<object>} Parsed metrics, or
 *   `{ error: 'Log metrics unavailable' }`.
 */
const tailContainerLogs = async (containerName) => {
  const MARKER = 'METRICS:';
  const SAFE_NAME = /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/;
  const unavailable = () => ({ error: 'Log metrics unavailable' });

  if (typeof containerName !== 'string' || !SAFE_NAME.test(containerName)) {
    // Refuse to interpolate anything that could carry shell metacharacters.
    return unavailable();
  }

  try {
    const { stdout } = await execAsync(`docker logs --tail 50 ${containerName}`);
    const lastMetric = stdout
      .split('\n')
      .filter((line) => line.includes(MARKER))
      .at(-1);

    if (lastMetric === undefined) {
      return unavailable();
    }

    // Slice after the first marker so a payload that itself contains
    // "METRICS:" is not truncated.
    return JSON.parse(lastMetric.slice(lastMetric.indexOf(MARKER) + MARKER.length));
  } catch (error) {
    return unavailable();
  }
};
|
|
|
|
// Containers log metrics in structured format
|
|
// Snapshot of the current figures, written as one structured log line so the
// log-tailing collector can find it by its "METRICS:" prefix.
const metricsSnapshot = {
  cpu: getCurrentCPU(),
  memory: getCurrentMemory(),
  timestamp: new Date().toISOString(),
};

console.log(`METRICS: ${JSON.stringify(metricsSnapshot)}`);
|
|
```
|
|
|
|
### 5. Shared Volume Metrics Files
|
|
|
|
```javascript
|
|
// Each container writes metrics to shared volume
|
|
/**
 * Write this container's current metrics to
 * `/shared/metrics/<CONTAINER_NAME>.json` on the shared volume.
 *
 * The JSON is first written to a temporary sibling file and then renamed into
 * place, so a reader never observes a half-written document. The temporary
 * name does not end in ".json", so directory scans that filter on that suffix
 * skip it.
 *
 * @returns {void}
 */
const writeMetricsToFile = () => {
  const metrics = {
    container: process.env.CONTAINER_NAME,
    cpu: getCurrentCPU(),
    memory: getCurrentMemory(),
    timestamp: Date.now(),
  };

  const target = `/shared/metrics/${process.env.CONTAINER_NAME}.json`;
  const staging = `${target}.${process.pid}.tmp`;

  fs.writeFileSync(staging, JSON.stringify(metrics));
  // Rename within one directory replaces the old file in a single step.
  fs.renameSync(staging, target);
};
|
|
|
|
// Management reads from shared volume
|
|
/**
 * Collect every container's metrics file from the shared volume.
 *
 * Each `<name>.json` file in /shared/metrics becomes one entry keyed by
 * `<name>` (only the trailing ".json" is stripped). A file that cannot be read
 * or parsed is reported as `{ error: 'Shared metrics unavailable' }` for that
 * container instead of aborting the whole scan.
 *
 * @returns {Object<string, object>} Metrics keyed by container name.
 * @throws If the metrics directory itself cannot be listed.
 */
const readSharedMetrics = () => {
  const metricsDir = '/shared/metrics';
  const EXTENSION = '.json';
  const collected = {};

  for (const file of fs.readdirSync(metricsDir)) {
    if (!file.endsWith(EXTENSION)) {
      continue;
    }

    const container = file.slice(0, -EXTENSION.length);
    try {
      collected[container] = JSON.parse(fs.readFileSync(path.join(metricsDir, file), 'utf8'));
    } catch (error) {
      // One unreadable or corrupt file must not hide the other containers.
      collected[container] = { error: 'Shared metrics unavailable' };
    }
  }

  return collected;
};
|
|
```
|
|
|
|
### 6. Database-based Metrics Collection
|
|
|
|
```javascript
|
|
// Containers insert metrics into shared database
|
|
/**
 * Insert one row with this container's current CPU and memory readings into
 * the `container_metrics` table, stamped with the current time.
 *
 * @returns {Promise<void>}
 */
const recordMetrics = async () => {
  const insertSql = `
INSERT INTO container_metrics (container_name, cpu_usage, memory_usage, timestamp)
VALUES (?, ?, ?, ?)
`;
  // Values are bound through placeholders, in column order.
  const values = [process.env.CONTAINER_NAME, getCurrentCPU(), getCurrentMemory(), new Date()];

  await db.query(insertSql, values);
};
|
|
|
|
// Management queries latest metrics
|
|
/**
 * Return the most recent metrics row per container from the last minute.
 *
 * The query orders rows newest-first, so only the first row seen for each
 * container is kept. Overwriting on every row would leave the oldest sample
 * in the window instead of the latest.
 *
 * @returns {Promise<Object<string, {cpu: *, memory: *, lastUpdate: *}>>}
 *   Latest metrics keyed by container name.
 */
const getLatestMetrics = async () => {
  const result = await db.query(`
SELECT container_name, cpu_usage, memory_usage, timestamp
FROM container_metrics
WHERE timestamp > NOW() - INTERVAL 1 MINUTE
ORDER BY timestamp DESC
`);

  const latest = {};
  for (const row of result) {
    if (Object.hasOwn(latest, row.container_name)) {
      continue; // a newer row for this container was already recorded
    }
    latest[row.container_name] = {
      cpu: row.cpu_usage,
      memory: row.memory_usage,
      lastUpdate: row.timestamp,
    };
  }

  return latest;
};
|
|
```
|
|
|
|
## Implementation Priority
|
|
|
|
1. **Health Endpoints** - Most reliable, direct communication
|
|
2. **Socket.IO Broadcasting** - Real-time, low overhead
|
|
3. **Prometheus Metrics** - Industry standard, rich data
|
|
4. **Shared Volume Files** - Simple, filesystem-based
|
|
5. **Log Tailing** - Works with existing logging
|
|
6. **Database Collection** - Persistent, queryable history
|
|
|
|
## Benefits
|
|
|
|
- **Fallback Chain**: Multiple methods ensure metrics are always available
|
|
- **Self-Reporting**: Containers know their own state best
|
|
- **Real-time**: Direct communication provides immediate updates
|
|
- **Standardized**: Each method can provide consistent metric format
|
|
- **Resilient**: If one method fails, others still work
|