386 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Markdown
		
	
	
	
	
	
			
		
		
	
	
			386 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Markdown
		
	
	
	
	
	
| # Phase 3 Monitoring Guide
 | |
| 
 | |
| ## 📊 Real-Time Performance Dashboard
 | |
| 
 | |
| ### Access the Dashboard
 | |
| ```bash
 | |
| # View performance metrics in JSON format
 | |
| curl http://localhost:3000/__perf | jq
 | |
| 
 | |
| # Pretty print with colors
 | |
| curl -s http://localhost:3000/__perf | jq '.' --color-output
 | |
| 
 | |
| # Watch metrics update in real-time
 | |
| watch -n 1 'curl -s http://localhost:3000/__perf | jq .'
 | |
| ```
 | |
| 
 | |
| ### Dashboard Response Structure
 | |
| 
 | |
| ```jsonc
 | |
| {
 | |
|   "performance": {
 | |
|     "uptime": 12345,                    // Server uptime in ms
 | |
|     "requests": {
 | |
|       "total": 500,                     // Total requests
 | |
|       "errors": 2,                      // Failed requests
 | |
|       "errorRate": "0.4%"               // Error percentage
 | |
|     },
 | |
|     "cache": {
 | |
|       "hits": 425,                      // Cache hits
 | |
|       "misses": 75,                     // Cache misses
 | |
|       "hitRate": "85%"                  // Hit rate percentage
 | |
|     },
 | |
|     "retries": {
 | |
|       "meilisearch": 3,                 // Meilisearch retries
 | |
|       "filesystem": 1                   // Filesystem retries
 | |
|     },
 | |
|     "latency": {
 | |
|       "avgMs": 42,                      // Average response time
 | |
|       "p95Ms": 98,                      // 95th percentile
 | |
|       "samples": 500                    // Number of samples
 | |
|     }
 | |
|   },
 | |
|   "cache": {
 | |
|     "size": 8,                          // Current cache entries
 | |
|     "maxItems": 10000,                  // Max cache size
 | |
|     "ttlMs": 300000,                    // Cache TTL (5 min)
 | |
|     "hitRate": 85.0,                    // Hit rate %
 | |
|     "hits": 425,                        // Total hits
 | |
|     "misses": 75,                       // Total misses
 | |
|     "evictions": 0,                     // LRU evictions
 | |
|     "sets": 83                          // Cache sets
 | |
|   },
 | |
|   "circuitBreaker": {
 | |
|     "state": "closed",                  // closed|open|half-open
 | |
|     "failureCount": 0,                  // Consecutive failures
 | |
|     "failureThreshold": 5               // Failure threshold
 | |
|   },
 | |
|   "timestamp": "2025-10-23T14:30:00.000Z"
 | |
| }
 | |
| ```
 | |
| 
 | |
| ## 🎯 Key Metrics to Monitor
 | |
| 
 | |
| ### 1. Cache Hit Rate
 | |
| ```bash
 | |
| # Extract cache hit rate
 | |
| curl -s http://localhost:3000/__perf | jq '.cache.hitRate'
 | |
| # Output: 85.0
 | |
| 
 | |
| # Target: > 80% after 5 minutes of usage
 | |
| # If lower: Check TTL, cache size, or request patterns
 | |
| ```
 | |
| 
 | |
| ### 2. Response Latency
 | |
| ```bash
 | |
| # Check average response time
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.latency'
 | |
| # Output:
 | |
| # {
 | |
| #   "avgMs": 42,
 | |
| #   "p95Ms": 98,
 | |
| #   "samples": 500
 | |
| # }
 | |
| 
 | |
| # Target: 
 | |
| # - Cached: < 20ms average
 | |
| # - Uncached: < 500ms average
 | |
| # - P95: < 200ms
 | |
| ```
 | |
| 
 | |
| ### 3. Error Rate
 | |
| ```bash
 | |
| # Check error rate
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.requests.errorRate'
 | |
| # Output: "0.4%"
 | |
| 
 | |
| # Target: < 1% under normal conditions
 | |
| # If higher: Check Meilisearch or filesystem issues
 | |
| ```
 | |
| 
 | |
| ### 4. Circuit Breaker State
 | |
| ```bash
 | |
| # Check circuit breaker status
 | |
| curl -s http://localhost:3000/__perf | jq '.circuitBreaker'
 | |
| # Output:
 | |
| # {
 | |
| #   "state": "closed",
 | |
| #   "failureCount": 0,
 | |
| #   "failureThreshold": 5
 | |
| # }
 | |
| 
 | |
| # States:
 | |
| # - "closed": Normal operation
 | |
| # - "half-open": Testing recovery
 | |
| # - "open": Failing, requests rejected
 | |
| ```
 | |
| 
 | |
| ### 5. Retry Counts
 | |
| ```bash
 | |
| # Check retry activity
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.retries'
 | |
| # Output:
 | |
| # {
 | |
| #   "meilisearch": 3,
 | |
| #   "filesystem": 1
 | |
| # }
 | |
| 
 | |
| # Indicates transient failures being handled gracefully
 | |
| ```
 | |
| 
 | |
| ## 📈 Monitoring Dashboards
 | |
| 
 | |
| ### Simple Shell Script
 | |
| ```bash
 | |
| #!/bin/bash
 | |
| # monitor-phase3.sh
 | |
| 
 | |
| while true; do
 | |
|   clear
 | |
|   echo "=== ObsiViewer Phase 3 Monitoring ==="
 | |
|   echo "Time: $(date)"
 | |
|   echo ""
 | |
|   
 | |
|   curl -s http://localhost:3000/__perf | jq '{
 | |
|     uptime: .performance.uptime,
 | |
|     requests: .performance.requests,
 | |
|     cache: .performance.cache,
 | |
|     latency: .performance.latency,
 | |
|     circuitBreaker: .circuitBreaker
 | |
|   }'
 | |
|   
 | |
|   echo ""
 | |
|   echo "Refreshing in 5 seconds..."
 | |
|   sleep 5
 | |
| done
 | |
| ```
 | |
| 
 | |
| ### Using jq for Specific Metrics
 | |
| ```bash
 | |
| # Cache hit rate only
 | |
| curl -s http://localhost:3000/__perf | jq '.cache.hitRate'
 | |
| 
 | |
| # Average latency
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.latency.avgMs'
 | |
| 
 | |
| # Error rate
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.requests.errorRate'
 | |
| 
 | |
| # Uptime in seconds
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.uptime / 1000 | floor'
 | |
| ```
 | |
| 
 | |
| ## 🔍 Server Logs Analysis
 | |
| 
 | |
| ### Log Patterns to Look For
 | |
| 
 | |
| #### Cache Hits/Misses
 | |
| ```
 | |
| [/api/vault/metadata] CACHE HIT - 12ms
 | |
| [/api/vault/metadata] CACHE MISS - 245ms
 | |
| ```
 | |
| 
 | |
| #### Meilisearch Indexing
 | |
| ```
 | |
| [Meilisearch] Scheduling background indexing...
 | |
| [Meilisearch] Background indexing completed
 | |
| [Meilisearch] ✅ Background indexing completed
 | |
| ```
 | |
| 
 | |
| #### Retry Activity
 | |
| ```
 | |
| [Meilisearch] Retry attempt 1, delay 100ms: Connection timeout
 | |
| [Filesystem] Retry attempt 1, delay 150ms: ENOENT
 | |
| ```
 | |
| 
 | |
| #### Circuit Breaker
 | |
| ```
 | |
| [Meilisearch] Circuit breaker opened after 5 failures
 | |
| [Meilisearch] Circuit breaker is open (reset in 25000ms)
 | |
| ```
 | |
| 
 | |
| ### Log Filtering
 | |
| ```bash
 | |
| # Show only cache operations
 | |
| npm run start 2>&1 | grep -i cache
 | |
| 
 | |
| # Show only Meilisearch operations
 | |
| npm run start 2>&1 | grep -i meilisearch
 | |
| 
 | |
| # Show only errors
 | |
| npm run start 2>&1 | grep -i error
 | |
| 
 | |
| # Show only retries
 | |
| npm run start 2>&1 | grep -i retry
 | |
| ```
 | |
| 
 | |
| ## 📊 Performance Benchmarks
 | |
| 
 | |
| ### Expected Performance
 | |
| 
 | |
| #### Startup Time
 | |
| ```
 | |
| Before Phase 3: 5-10 seconds (blocked by indexing)
 | |
| After Phase 3:  < 2 seconds (indexing in background)
 | |
| Improvement:    at least 2.5-5x faster ✅
 | |
| ```
 | |
| 
 | |
| #### Metadata Endpoint Response Time
 | |
| ```
 | |
| First request (cache miss):   200-500ms
 | |
| Subsequent requests (hit):    5-15ms
 | |
| Improvement:                  30x faster ✅
 | |
| ```
 | |
| 
 | |
| #### Cache Hit Rate Over Time
 | |
| ```
 | |
| 0-1 min:   0% (warming up)
 | |
| 1-5 min:   50-80% (building cache)
 | |
| 5+ min:    85-95% (stable)
 | |
| ```
 | |
| 
 | |
| #### Memory Usage
 | |
| ```
 | |
| Baseline:  50-100MB
 | |
| With cache: 50-100MB (controlled by LRU)
 | |
| Overhead:  Minimal (< 5MB for cache)
 | |
| ```
 | |
| 
 | |
| ## 🧪 Load Testing
 | |
| 
 | |
| ### Test Cache Behavior
 | |
| ```bash
 | |
| #!/bin/bash
 | |
| # test-cache.sh
 | |
| 
 | |
| echo "Testing cache behavior..."
 | |
| 
 | |
| # Warm up (first request)
 | |
| echo "Request 1 (cache miss):"
 | |
| time curl -s http://localhost:3000/api/vault/metadata > /dev/null
 | |
| 
 | |
| # Should be fast (cache hit)
 | |
| echo "Request 2 (cache hit):"
 | |
| time curl -s http://localhost:3000/api/vault/metadata > /dev/null
 | |
| 
 | |
| # Check metrics
 | |
| echo ""
 | |
| echo "Cache statistics:"
 | |
| curl -s http://localhost:3000/__perf | jq '.cache'
 | |
| ```
 | |
| 
 | |
| ### Test Retry Behavior
 | |
| ```bash
 | |
| #!/bin/bash
 | |
| # test-retry.sh
 | |
| 
 | |
| # Stop Meilisearch to trigger retries
 | |
| echo "Stopping Meilisearch..."
 | |
| docker-compose down
 | |
| 
 | |
| # Make requests (should use filesystem fallback with retries)
 | |
| echo "Making requests with Meilisearch down..."
 | |
| for i in {1..5}; do
 | |
|   echo "Request $i:"
 | |
|   curl -s http://localhost:3000/api/vault/metadata | jq '.items | length'
 | |
|   sleep 1
 | |
| done
 | |
| 
 | |
| # Check retry counts
 | |
| echo ""
 | |
| echo "Retry statistics:"
 | |
| curl -s http://localhost:3000/__perf | jq '.performance.retries'
 | |
| 
 | |
| # Restart Meilisearch
 | |
| echo "Restarting Meilisearch..."
 | |
| docker-compose up -d
 | |
| ```
 | |
| 
 | |
| ### Test Circuit Breaker
 | |
| ```bash
 | |
| #!/bin/bash
 | |
| # test-circuit-breaker.sh
 | |
| 
 | |
| # Make many requests to trigger failures (assumes Meilisearch is already stopped, e.g. via docker-compose down)
 | |
| echo "Triggering circuit breaker..."
 | |
| for i in {1..10}; do
 | |
|   curl -s http://localhost:3000/api/vault/metadata > /dev/null 2>&1 &
 | |
| done
 | |
| 
 | |
| # Check circuit breaker state
 | |
| sleep 2
 | |
| echo "Circuit breaker state:"
 | |
| curl -s http://localhost:3000/__perf | jq '.circuitBreaker'
 | |
| ```
 | |
| 
 | |
| ## 🚨 Alert Thresholds
 | |
| 
 | |
| ### Recommended Alerts
 | |
| 
 | |
| | Metric | Threshold | Action |
 | |
| |--------|-----------|--------|
 | |
| | Cache Hit Rate | < 50% | Check TTL, cache size |
 | |
| | Error Rate | > 5% | Check Meilisearch, filesystem |
 | |
| | P95 Latency | > 500ms | Check server load, cache |
 | |
| | Circuit Breaker | "open" | Restart Meilisearch |
 | |
| | Memory Usage | > 200MB | Check for memory leak |
 | |
| 
 | |
| ### Setting Up Alerts
 | |
| ```bash
 | |
| #!/bin/bash
 | |
| # alert-monitor.sh
 | |
| 
 | |
| while true; do
 | |
|   METRICS=$(curl -s http://localhost:3000/__perf)
 | |
|   
 | |
|   # Check cache hit rate
 | |
|   HIT_RATE=$(echo $METRICS | jq '.cache.hitRate')
 | |
|   if (( $(echo "$HIT_RATE < 50" | bc -l) )); then
 | |
|     echo "⚠️  ALERT: Low cache hit rate: $HIT_RATE%"
 | |
|   fi
 | |
|   
 | |
|   # Check error rate
 | |
|   ERROR_RATE=$(echo $METRICS | jq -r '.performance.requests.errorRate' | tr -d '%')
 | |
|   if (( $(echo "$ERROR_RATE > 5" | bc -l) )); then
 | |
|     echo "⚠️  ALERT: High error rate: $ERROR_RATE%"
 | |
|   fi
 | |
|   
 | |
|   # Check circuit breaker
 | |
|   CB_STATE=$(echo $METRICS | jq -r '.circuitBreaker.state')
 | |
|   if [ "$CB_STATE" = "open" ]; then
 | |
|     echo "⚠️  ALERT: Circuit breaker is open"
 | |
|   fi
 | |
|   
 | |
|   sleep 10
 | |
| done
 | |
| ```
 | |
| 
 | |
| ## 📝 Monitoring Checklist
 | |
| 
 | |
| - [ ] Server starts in < 2 seconds
 | |
| - [ ] `/__perf` endpoint responds with metrics
 | |
| - [ ] Cache hit rate reaches > 80% after 5 minutes
 | |
| - [ ] Average latency for cached requests < 20ms
 | |
| - [ ] Error rate < 1%
 | |
| - [ ] Circuit breaker state is "closed"
 | |
| - [ ] No memory leaks over time
 | |
| - [ ] Meilisearch indexing completes in background
 | |
| - [ ] Filesystem fallback works when Meilisearch down
 | |
| - [ ] Graceful shutdown on SIGINT
 | |
| 
 | |
| ## 🎯 Success Criteria
 | |
| 
 | |
| ✅ Cache hit rate > 80% after 5 minutes
 | |
| ✅ Response time < 20ms for cached requests
 | |
| ✅ Server startup < 2 seconds
 | |
| ✅ Error rate < 1%
 | |
| ✅ Memory usage stable
 | |
| ✅ Circuit breaker protecting against cascading failures
 | |
| ✅ Automatic retry handling transient failures
 | |
| ✅ Graceful fallback to filesystem
 | |
| 
 | |
| ---
 | |
| 
 | |
| **Last Updated**: 2025-10-23
 | |
| **Status**: Production Ready
 |