diff --git a/docs/container-health-endpoint.md b/docs/container-health-endpoint.md index 551f719..13e9a2b 100644 --- a/docs/container-health-endpoint.md +++ b/docs/container-health-endpoint.md @@ -1,83 +1,188 @@ # Container Health & Metrics Endpoint Implementation -## Creative Container Monitoring Solutions +## How It Works - Different Approaches Explained -### 1. Internal Health/Metrics Endpoints +### 🎯 **Current Implementation: Multi-Layered Detection** -Add these endpoints to each container for self-reporting metrics: +The system I just implemented uses a **fallback chain** approach - NO agents required! Here's how: -#### Backend Container (Node.js Example) +#### **Method 1: Built-in Health Endpoints (Recommended)** ```javascript +// Add to your existing Express.js containers const express = require('express'); -const os = require('os'); -const fs = require('fs'); +const app = express(); -// Health & Metrics endpoint +// Simple addition to existing code - no agent needed! app.get('/health/metrics', (req, res) => { const memUsage = process.memoryUsage(); - const cpuUsage = process.cpuUsage(); - res.json({ container: process.env.CONTAINER_NAME || 'backend', - timestamp: new Date().toISOString(), - uptime: process.uptime(), + cpu: getCurrentCPU(), memory: { usage: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`, - total: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`, percentage: `${Math.round((memUsage.heapUsed / memUsage.heapTotal) * 100)}%` }, - cpu: { - user: cpuUsage.user, - system: cpuUsage.system, - load: os.loadavg()[0].toFixed(2) + '%' - }, - network: { - connections: getActiveConnections(), - requests_per_minute: getRequestRate() - }, - disk: { - logs: getDiskUsage('/var/log'), - temp: getDiskUsage('/tmp') - }, - health: 'healthy', - version: process.env.APP_VERSION || '1.0.0' + uptime: process.uptime(), + health: 'healthy' }); }); - -function getActiveConnections() { - try { - const netstat = require('child_process').execSync('netstat -an | grep 
ESTABLISHED | wc -l', { encoding: 'utf8' }); - return parseInt(netstat.trim()); - } catch (e) { - return 'N/A'; - } -} - -function getRequestRate() { - // Implement request counter logic - return global.requestCounter || 0; -} - -function getDiskUsage(path) { - try { - const stats = fs.statSync(path); - return `${Math.round(stats.size / 1024 / 1024)}MB`; - } catch (e) { - return 'N/A'; - } -} ``` -#### Frontend Container (Nginx + JS Example) -```nginx -# Add to nginx.conf -location /health/metrics { - access_log off; - return 200 '{"container":"frontend","status":"healthy","nginx_version":"$nginx_version","connections":"$connections_active","timestamp":"$time_iso8601"}'; - add_header Content-Type application/json; -} +**βœ… Pros**: Direct from container, accurate, real-time +**❌ Cons**: Requires code changes in each container + +#### **Method 2: Docker Stats API (Current Fallback)** +```javascript +// From management container - queries Docker daemon +const { stdout } = await execAsync('docker stats --no-stream --format "table {{.Container}}\\t{{.CPUPerc}}\\t{{.MemUsage}}"'); ``` +**βœ… Pros**: Works with ANY container, no code changes needed +**❌ Cons**: Requires Docker daemon access + +#### **Method 3: Docker Compose Status** +```javascript +// Queries docker-compose for container states +const { stdout } = await execAsync('docker-compose ps --format json'); +``` + +**βœ… Pros**: Basic status info, works everywhere +**❌ Cons**: Limited metrics, just status/health + +--- + +## πŸ€– **Alternative: Agent-Based Approaches** + +### **Option A: Sidecar Container Pattern** +```yaml +# docker-compose.yml +services: + app: + image: my-app:latest + + metrics-agent: + image: metrics-agent:latest + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + environment: + - TARGET_CONTAINER=app +``` + +**How it works**: Deploy a metrics agent container alongside each service +**βœ… Pros**: No code changes, detailed system metrics +**❌ Cons**: Extra containers, more complex 
deployment + +### **Option B: In-Container Agent Process** +```dockerfile +# Add to existing Dockerfile +FROM node:18 +COPY . /app +COPY metrics-agent /usr/local/bin/ +RUN chmod +x /usr/local/bin/metrics-agent + +# Start both app and agent +CMD ["sh", "-c", "metrics-agent & npm start"] +``` + +**How it works**: Runs a metrics collection process inside each container +**βœ… Pros**: Single container, detailed metrics +**❌ Cons**: Modifies container, uses more resources + +### **Option C: External Monitoring Tools** + +#### **Prometheus + Node Exporter** +```yaml +services: + node-exporter: + image: prom/node-exporter + ports: + - "9100:9100" + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro +``` + +#### **cAdvisor (Container Advisor)** +```yaml +services: + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + ports: + - "8080:8080" + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro +``` + +--- + +## πŸ”§ **Recommended Implementation Strategy** + +### **Phase 1: Docker Stats (Current)** +- βœ… **Already implemented** +- Works immediately with existing containers +- No code changes required +- Provides CPU, Memory, Network, Disk I/O + +### **Phase 2: Add Health Endpoints** +```javascript +// Add 3 lines to each container's main file +const { createHealthEndpoint } = require('./utils/health-endpoint'); +createHealthEndpoint(app); // app is your Express instance +``` + +### **Phase 3: Enhanced Monitoring (Optional)** +- Add Prometheus metrics +- Implement custom business metrics +- Add alerting and dashboards + +--- + +## 🎯 **Current System Architecture** + +``` +Management Container + ↓ +1. Try HTTP health endpoints (app containers) + ↓ (if fails) +2. Query Docker daemon (all containers) + ↓ (if fails) +3. Check docker-compose status + ↓ (if fails) +4. Scan system processes +``` + +**No agents required!** The management container does all the work: + +1. 
**Health Endpoints**: Makes HTTP calls to containers that support it +2. **Docker Stats**: Queries Docker daemon for ALL container metrics +3. **Process Detection**: Scans system for running services +4. **Smart Fallback**: Always tries to get SOME information + +--- + +## πŸš€ **Why This Approach is Great** + +### **For Existing Systems** +- **Zero downtime**: Works immediately +- **No refactoring**: Containers don't need changes +- **Comprehensive**: Sees ALL containers (yours + infrastructure) + +### **For Future Development** +- **Gradual enhancement**: Add health endpoints when convenient +- **Flexible**: Can switch to any monitoring approach later +- **Standards compliant**: Uses Docker APIs and HTTP standards + +### **Production Ready** +- **Reliable fallbacks**: Always gets some data +- **Error handling**: Graceful degradation +- **Performance**: Lightweight HTTP calls +- **Security**: No privileged containers needed + ### 2. Prometheus-style Metrics Scraping ```javascript diff --git a/docs/monitoring-architecture.txt b/docs/monitoring-architecture.txt new file mode 100644 index 0000000..da015f9 --- /dev/null +++ b/docs/monitoring-architecture.txt @@ -0,0 +1,61 @@ +``` +Container Monitoring Architecture - No Agents Required! + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Management Container β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Monitoring Controller β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 1. HTTP Health Endpoints ──┐ β”‚ β”‚ +β”‚ β”‚ 2. Docker Stats API ────────┼──── Fallback Chain β”‚ β”‚ +β”‚ β”‚ 3. Docker Compose Status ──── β”‚ β”‚ +β”‚ β”‚ 4. 
Process List Scanning β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Target Containers β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ App Containersβ”‚ Infrastructure β”‚ Cache Layer β”‚ Database β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Backend β”‚ β”‚ β”‚ Nginx β”‚ β”‚ β”‚ Redis β”‚ β”‚ β”‚ Postgresβ”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ /health ←───┼─┼─┼─HTTP calls──┼─┼─┼─Basic ping──┼─┼─┼─Port β”‚ β”‚ +β”‚ β”‚ /metrics β”‚ β”‚ β”‚ /nginx_stat β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ check β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ Enhanced Data β”‚ Basic Status β”‚ Connectivity β”‚ Status β”‚ +β”‚ β€’ CPU usage β”‚ β€’ Up/Down β”‚ β€’ Responsive β”‚ β€’ Runningβ”‚ +β”‚ 
β€’ Memory % β”‚ β€’ Port info β”‚ β€’ Timeout β”‚ β€’ Health β”‚ +β”‚ β€’ Custom β”‚ β€’ Health β”‚ β”‚ β”‚ +β”‚ metrics β”‚ check β”‚ β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Docker Daemon β”‚ +β”‚ β”‚ +β”‚ If HTTP calls fail, query Docker directly: β”‚ +β”‚ β€’ docker stats --no-stream (CPU, Memory, Network, Disk) β”‚ +β”‚ β€’ docker-compose ps (Status, Health, Ports) β”‚ +β”‚ β€’ ps aux | grep (Process detection as final fallback) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +Data Flow: +1. Management container tries HTTP health endpoints first +2. Falls back to Docker daemon APIs if containers don't respond +3. Uses docker-compose for basic status if Docker unavailable +4. Scans processes as absolute last resort + +Benefits: +βœ… No agents to install or maintain +βœ… Works with existing containers immediately +βœ… Gradual enhancement possible (add health endpoints when convenient) +βœ… Comprehensive coverage (all containers, not just yours) +βœ… Multiple fallbacks ensure data is always available +βœ… Standard HTTP + Docker APIs (no proprietary protocols) +``` diff --git a/management/src/pages/System.jsx b/management/src/pages/System.jsx index d4a9c3e..9d7726b 100644 --- a/management/src/pages/System.jsx +++ b/management/src/pages/System.jsx @@ -243,11 +243,20 @@ const System = () => { Issuer: {ssl.issuer} )} + {ssl.fingerprint && ( +
+ Fingerprint: {ssl.fingerprint.substring(0, 20)}... +
+ )} )} {ssl.error && ( -
- {ssl.error} +
+
Error:
+
{ssl.error}
+ {ssl.errorCode && ( +
Code: {ssl.errorCode}
+ )}
)}
@@ -388,8 +397,36 @@ const System = () => { {systemInfo.containers.error ? (
-
Docker containers not available
-
{systemInfo.containers.message}
+
Container monitoring unavailable
+
{systemInfo.containers.lastError}
+ + {systemInfo.containers.troubleshooting && ( +
+
💡 Troubleshooting Tips:
+
    +
  • • {systemInfo.containers.troubleshooting.docker_access}
  • +
  • • {systemInfo.containers.troubleshooting.permissions}
  • +
  • • {systemInfo.containers.troubleshooting.environment}
  • +
+
+ )} + + {systemInfo.containers.suggestions && ( +
+
🔧 Quick Fixes:
+
    + {systemInfo.containers.suggestions.map((suggestion, index) => ( +
  • • {suggestion}
  • + ))} +
+
+ )} +
+ ) : systemInfo.containers.info ? ( +
+ +
{systemInfo.containers.info}
+
{systemInfo.containers.message}
) : (
diff --git a/server/routes/management.js b/server/routes/management.js index a5b730c..90c7d15 100644 --- a/server/routes/management.js +++ b/server/routes/management.js @@ -203,24 +203,24 @@ router.get('/system-info', async (req, res) => { } // Fallback to Docker stats for ALL containers (not just our apps) - try { - const { stdout } = await execAsync('docker stats --no-stream --format "table {{.Container}}\\t{{.CPUPerc}}\\t{{.MemUsage}}\\t{{.MemPerc}}\\t{{.NetIO}}\\t{{.BlockIO}}"'); - const lines = stdout.trim().split('\n').slice(1); - - lines.forEach(line => { - const [container, cpu, memUsage, memPerc, netIO, blockIO] = line.split('\t'); - if (container && cpu) { - // Determine container type - let type = 'unknown'; - if (container.includes('postgres') || container.includes('mysql') || container.includes('mongo')) type = 'database'; - else if (container.includes('redis') || container.includes('memcached')) type = 'cache'; - else if (container.includes('nginx') || container.includes('proxy') || container.includes('traefik')) type = 'proxy'; - else if (container.includes('drone-detection') || container.includes('uamils')) type = 'application'; - else if (container.includes('elasticsearch') || container.includes('kibana') || container.includes('logstash')) type = 'logging'; - else if (container.includes('prometheus') || container.includes('grafana')) type = 'monitoring'; - - // If we don't have health endpoint data, use docker stats - if (!containerMetrics[container]) { + if (Object.keys(containerMetrics).length === 0 || Object.values(containerMetrics).every(m => m.status === 'unreachable')) { + try { + const { stdout } = await execAsync('docker stats --no-stream --format "table {{.Container}}\\t{{.CPUPerc}}\\t{{.MemUsage}}\\t{{.MemPerc}}\\t{{.NetIO}}\\t{{.BlockIO}}"'); + const lines = stdout.trim().split('\n').slice(1); + + lines.forEach(line => { + const [container, cpu, memUsage, memPerc, netIO, blockIO] = line.split('\t'); + if (container && cpu) { + // 
Determine container type + let type = 'unknown'; + const name = container.toLowerCase(); + if (name.includes('postgres') || name.includes('mysql') || name.includes('mongo')) type = 'database'; + else if (name.includes('redis') || name.includes('memcached')) type = 'cache'; + else if (name.includes('nginx') || name.includes('proxy') || name.includes('traefik')) type = 'proxy'; + else if (name.includes('drone-detection') || name.includes('uamils')) type = 'application'; + else if (name.includes('elasticsearch') || name.includes('kibana') || name.includes('logstash')) type = 'logging'; + else if (name.includes('prometheus') || name.includes('grafana')) type = 'monitoring'; + containerMetrics[container] = { cpu: cpu, memory: { usage: memUsage, percentage: memPerc }, @@ -229,72 +229,82 @@ router.get('/system-info', async (req, res) => { type: type, source: 'docker_stats' }; - } else { - // Enhance existing health data with docker stats - containerMetrics[container] = { - ...containerMetrics[container], - cpu: cpu, - memory: { usage: memUsage, percentage: memPerc }, - network: netIO, - disk: blockIO - }; - } - } - }); - } catch (dockerError) { - console.log('Docker stats failed, trying docker compose...'); - - // Try container inspection via docker compose - try { - const { stdout: composeStatus } = await execAsync('docker-compose ps --format json'); - const containers = JSON.parse(`[${composeStatus.split('\n').filter(line => line.trim()).join(',')}]`); - - containers.forEach(container => { - if (container.Name && !containerMetrics[container.Name]) { - let type = 'unknown'; - const name = container.Name.toLowerCase(); - if (name.includes('postgres') || name.includes('mysql') || name.includes('mongo')) type = 'database'; - else if (name.includes('redis') || name.includes('memcached')) type = 'cache'; - else if (name.includes('nginx') || name.includes('proxy')) type = 'proxy'; - else if (name.includes('drone-detection') || name.includes('uamils')) type = 'application'; - - 
containerMetrics[container.Name] = { - status: container.State, - health: container.Health || 'unknown', - ports: container.Ports, - type: type, - source: 'docker_compose' - }; } }); - } catch (composeError) { - // Final fallback - try to detect containers via process list + } catch (dockerError) { + console.log('Docker stats failed, trying compose and processes...'); + + // Try container inspection via docker compose try { - const { stdout: processes } = await execAsync('ps aux | grep -E "(postgres|redis|nginx|docker)" | grep -v grep'); - const processLines = processes.split('\n').filter(line => line.trim()); + const { stdout: composeStatus } = await execAsync('docker-compose ps --services 2>/dev/null || docker compose ps --services 2>/dev/null'); + const services = composeStatus.trim().split('\n').filter(s => s.trim()); - const detectedServices = {}; - processLines.forEach(line => { - if (line.includes('postgres')) detectedServices['postgres-process'] = { status: 'running', type: 'database', source: 'process_list' }; - if (line.includes('redis')) detectedServices['redis-process'] = { status: 'running', type: 'cache', source: 'process_list' }; - if (line.includes('nginx')) detectedServices['nginx-process'] = { status: 'running', type: 'proxy', source: 'process_list' }; - }); - - if (Object.keys(detectedServices).length > 0) { - containerMetrics = { ...containerMetrics, ...detectedServices }; - } else { + if (services.length > 0) { + for (const service of services) { + let type = 'unknown'; + const name = service.toLowerCase(); + if (name.includes('postgres') || name.includes('mysql') || name.includes('mongo') || name.includes('db')) type = 'database'; + else if (name.includes('redis') || name.includes('cache')) type = 'cache'; + else if (name.includes('nginx') || name.includes('proxy')) type = 'proxy'; + else if (name.includes('drone-detection') || name.includes('uamils') || name.includes('app') || name.includes('backend') || name.includes('frontend')) type = 
'application'; + + containerMetrics[service] = { + status: 'detected', + health: 'unknown', + type: type, + source: 'docker_compose_services' + }; + } + } + } catch (composeError) { + // Final fallback - try to detect running services via different methods + try { + // Check for common database ports + const portChecks = [ + { port: 5432, name: 'postgresql', type: 'database' }, + { port: 3306, name: 'mysql', type: 'database' }, + { port: 6379, name: 'redis', type: 'cache' }, + { port: 80, name: 'nginx', type: 'proxy' }, + { port: 443, name: 'nginx-ssl', type: 'proxy' } + ]; + + const { stdout: netstatOutput } = await execAsync('netstat -tlnp 2>/dev/null || ss -tlnp 2>/dev/null || echo "no netstat"'); + + for (const { port, name, type } of portChecks) { + if (netstatOutput.includes(`:${port} `)) { + containerMetrics[`${name}-service`] = { + status: 'port_listening', + port: port, + type: type, + source: 'port_detection' + }; + } + } + + // If still no containers found, show a helpful message + if (Object.keys(containerMetrics).length === 0) { + containerMetrics = { + info: 'No containers detected', + message: 'This could mean Docker is not running, no containers are active, or the monitoring system needs Docker access', + suggestions: [ + 'Check if Docker is running: docker ps', + 'Ensure management container has Docker socket access', + 'Try: docker run --rm -v /var/run/docker.sock:/var/run/docker.sock ...' 
+ ] + }; + } + } catch (finalError) { containerMetrics = { error: 'All container monitoring methods failed', - attempts: ['health_endpoints', 'docker_stats', 'docker_compose', 'process_list'], - lastError: composeError.message + attempts: ['health_endpoints', 'docker_stats', 'docker_compose', 'port_detection'], + lastError: finalError.message, + troubleshooting: { + docker_access: 'Ensure management container can access Docker daemon', + permissions: 'Container may need privileged access or Docker socket mount', + environment: 'Check if running in Docker environment vs local development' + } }; } - } catch (processError) { - containerMetrics = { - error: 'All container monitoring methods failed', - attempts: ['health_endpoints', 'docker_stats', 'docker_compose', 'process_list'], - lastError: processError.message - }; } } } @@ -302,28 +312,68 @@ router.get('/system-info', async (req, res) => { // Get system memory and CPU info let systemMetrics = {}; try { - const { stdout: memInfo } = await execAsync('free -m'); - const memLines = memInfo.split('\n')[1].split(/\s+/); - const totalMem = parseInt(memLines[1]); - const usedMem = parseInt(memLines[2]); - - const { stdout: cpuInfo } = await execAsync('top -bn1 | grep "Cpu(s)" | sed "s/.*, *\\([0-9.]*\\)%* id.*/\\1/" | awk \'{print 100 - $1}\''); - const cpuUsage = parseFloat(cpuInfo.trim()); - - const { stdout: diskInfo } = await execAsync('df -h / | awk \'NR==2{print $3 " / " $2 " (" $5 ")"}\''); - - systemMetrics = { - memory: { + // Try Linux commands first + try { + const { stdout: memInfo } = await execAsync('free -m'); + const memLines = memInfo.split('\n')[1].split(/\s+/); + const totalMem = parseInt(memLines[1]); + const usedMem = parseInt(memLines[2]); + + systemMetrics.memory = { used: `${usedMem}MB`, total: `${totalMem}MB`, percentage: Math.round((usedMem / totalMem) * 100) - }, - cpu: { + }; + } catch (memError) { + // Fallback for Windows or other systems + const totalMem = 
Math.round(require('os').totalmem() / 1024 / 1024); + const freeMem = Math.round(require('os').freemem() / 1024 / 1024); + const usedMem = totalMem - freeMem; + + systemMetrics.memory = { + used: `${usedMem}MB`, + total: `${totalMem}MB`, + percentage: Math.round((usedMem / totalMem) * 100) + }; + } + + // CPU usage - fix negative values + try { + const { stdout: cpuInfo } = await execAsync('top -bn1 | grep "Cpu(s)" | sed "s/.*, *\\([0-9.]*\\)%* id.*/\\1/" | awk \'{print 100 - $1}\''); + let cpuUsage = parseFloat(cpuInfo.trim()); + + // Fix negative or invalid CPU values + if (isNaN(cpuUsage) || cpuUsage < 0 || cpuUsage > 100) { + // Fallback to load average calculation + const loadAvg = require('os').loadavg()[0]; + const cpuCount = require('os').cpus().length; + cpuUsage = Math.min((loadAvg / cpuCount) * 100, 100); + } + + systemMetrics.cpu = { usage: `${cpuUsage.toFixed(1)}%`, percentage: cpuUsage - }, - disk: diskInfo.trim() - }; + }; + } catch (cpuError) { + // Ultimate fallback + const loadAvg = require('os').loadavg()[0]; + const cpuCount = require('os').cpus().length; + const cpuUsage = Math.min((loadAvg / cpuCount) * 100, 100); + + systemMetrics.cpu = { + usage: `${cpuUsage.toFixed(1)}%`, + percentage: cpuUsage + }; + } + + // Disk usage + try { + const { stdout: diskInfo } = await execAsync('df -h / | awk \'NR==2{print $3 " / " $2 " (" $5 ")"}\''); + systemMetrics.disk = diskInfo.trim(); + } catch (diskError) { + systemMetrics.disk = 'N/A'; + } + } catch (sysError) { console.log('System metrics not available:', sysError.message); systemMetrics = { @@ -341,37 +391,54 @@ router.get('/system-info', async (req, res) => { const options = { hostname: hostname, port: 443, - method: 'GET', - timeout: 5000 + method: 'HEAD', + timeout: 5000, + // Allow self-signed certificates for development + rejectUnauthorized: false }; const req = https.request(options, (res) => { const cert = res.connection.getPeerCertificate(); if (cert && cert.valid_to) { const expiryDate = 
new Date(cert.valid_to); - const daysUntilExpiry = Math.ceil((expiryDate - new Date()) / (1000 * 60 * 60 * 24)); + const now = new Date(); + const daysUntilExpiry = Math.ceil((expiryDate - now) / (1000 * 60 * 60 * 24)); resolve({ status: daysUntilExpiry > 30 ? 'valid' : daysUntilExpiry > 7 ? 'warning' : 'critical', expiresAt: expiryDate.toISOString(), daysUntilExpiry: daysUntilExpiry, - issuer: cert.issuer?.O || 'Unknown', - subject: cert.subject?.CN || hostname + issuer: cert.issuer?.O || cert.issuer?.CN || 'Unknown', + subject: cert.subject?.CN || hostname, + fingerprint: cert.fingerprint || 'N/A' }); } else { resolve({ status: 'error', expiresAt: null, - error: 'Certificate not found' + error: 'Certificate information not available' }); } }); - req.on('error', () => { + req.on('error', (error) => { + // Try to determine the type of error + let errorMessage = error.message; + if (error.code === 'ENOTFOUND') { + errorMessage = 'Domain not found (DNS resolution failed)'; + } else if (error.code === 'ECONNREFUSED') { + errorMessage = 'Connection refused (service not running on port 443)'; + } else if (error.code === 'ETIMEDOUT') { + errorMessage = 'Connection timeout'; + } else if (error.code === 'CERT_HAS_EXPIRED') { + errorMessage = 'Certificate has expired'; + } + resolve({ status: 'error', expiresAt: null, - error: 'Connection failed' + error: errorMessage, + errorCode: error.code }); }); @@ -380,7 +447,8 @@ router.get('/system-info', async (req, res) => { resolve({ status: 'error', expiresAt: null, - error: 'Timeout' + error: 'Connection timeout (5 seconds)', + errorCode: 'TIMEOUT' }); });