Status Monitoring¶
The Status API provides information about Kibana server health and operational metrics. This is essential for monitoring, alerting, and troubleshooting your Kibana deployment.
Overview¶
The Status API allows you to:
Check Kibana server health
Monitor service availability
Retrieve operational statistics
Implement health checks for automation
Track resource usage and performance
Checking Kibana Status¶
Basic Status Check¶
from kibana import Kibana
client = Kibana("http://localhost:5601", api_key="your_api_key")
# Get current status
response = client.status.get_status()
status = response.body
# Check overall health
overall_status = status["status"]["overall"]["level"]
print(f"Kibana status: {overall_status}")
client.close()
Status Levels¶
Kibana reports three status levels:
available: All services are operationaldegraded: Some services are experiencing issues but Kibana is still functionalunavailable: Kibana is not operational
Detailed Status Information¶
response = client.status.get_status()
status = response.body
# Overall status
print(f"Overall: {status['status']['overall']['level']}")
print(f"Summary: {status['status']['overall']['summary']}")
# Core services status
for service_name, service_info in status['status']['core'].items():
print(f"{service_name}: {service_info['level']}")
if service_info.get('summary'):
print(f" Summary: {service_info['summary']}")
# Plugin statuses
for plugin_name, plugin_info in status['status'].get('plugins', {}).items():
print(f"Plugin {plugin_name}: {plugin_info['level']}")
Version Information¶
response = client.status.get_status()
status = response.body
# Kibana version
version_info = status['version']
print(f"Kibana version: {version_info['number']}")
print(f"Build number: {version_info['build_number']}")
print(f"Build hash: {version_info['build_hash']}")
Getting Operational Statistics¶
Basic Statistics¶
# Get detailed statistics
response = client.status.get_stats()
stats = response.body
# Process information
process = stats['process']
print(f"Uptime: {process['uptime_in_millis'] / 1000:.2f} seconds")
print(f"Memory used: {process['memory']['heap']['used_bytes'] / (1024**2):.2f} MB")
print(f"Memory total: {process['memory']['heap']['total_bytes'] / (1024**2):.2f} MB")
# OS information
os_info = stats['os']
print(f"Platform: {os_info['platform']}")
print(f"Load average (1m): {os_info['load']['1m']}")
print(f"Load average (5m): {os_info['load']['5m']}")
print(f"Load average (15m): {os_info['load']['15m']}")
Memory Statistics¶
response = client.status.get_stats()
stats = response.body
memory = stats['process']['memory']
heap = memory['heap']
print(f"Heap used: {heap['used_bytes'] / (1024**2):.2f} MB")
print(f"Heap total: {heap['total_bytes'] / (1024**2):.2f} MB")
print(f"Heap limit: {heap['size_limit'] / (1024**2):.2f} MB")
print(f"Heap usage: {(heap['used_bytes'] / heap['size_limit']) * 100:.1f}%")
Request Statistics¶
response = client.status.get_stats()
stats = response.body
# HTTP request statistics
requests = stats.get('requests', {})
print(f"Total requests: {requests.get('total', 0)}")
print(f"Disconnects: {requests.get('disconnects', 0)}")
print(f"Status codes: {requests.get('statusCodes', {})}")
Health Check Patterns¶
Simple Health Check¶
def is_kibana_healthy(client):
"""Check if Kibana is healthy."""
try:
response = client.status.get_status()
status_level = response.body['status']['overall']['level']
return status_level == 'available'
except Exception:
return False
# Usage
if is_kibana_healthy(client):
print("✅ Kibana is healthy")
else:
print("❌ Kibana is unhealthy")
Detailed Health Check¶
def check_kibana_health(client):
"""Perform detailed health check."""
try:
response = client.status.get_status()
status = response.body
overall = status['status']['overall']['level']
result = {
'healthy': overall == 'available',
'status': overall,
'version': status['version']['number'],
'services': {}
}
# Check core services
for service_name, service_info in status['status']['core'].items():
result['services'][service_name] = {
'level': service_info['level'],
'summary': service_info.get('summary', '')
}
return result
except Exception as e:
return {
'healthy': False,
'error': str(e)
}
# Usage
health = check_kibana_health(client)
print(f"Healthy: {health['healthy']}")
print(f"Status: {health.get('status', 'unknown')}")
for service, info in health.get('services', {}).items():
print(f" {service}: {info['level']}")
Monitoring with Alerts¶
import time
def monitor_kibana(client, check_interval=60, alert_threshold=3):
"""Monitor Kibana and alert on issues."""
consecutive_failures = 0
while True:
try:
response = client.status.get_status()
status_level = response.body['status']['overall']['level']
if status_level == 'available':
consecutive_failures = 0
print(f"✅ Kibana is healthy")
elif status_level == 'degraded':
consecutive_failures += 1
print(f"⚠️ Kibana is degraded ({consecutive_failures}/{alert_threshold})")
if consecutive_failures >= alert_threshold:
send_alert("Kibana is degraded")
else: # unavailable
consecutive_failures += 1
print(f"❌ Kibana is unavailable ({consecutive_failures}/{alert_threshold})")
if consecutive_failures >= alert_threshold:
send_alert("Kibana is unavailable")
except Exception as e:
consecutive_failures += 1
print(f"❌ Failed to check status: {e}")
if consecutive_failures >= alert_threshold:
send_alert(f"Cannot connect to Kibana: {e}")
time.sleep(check_interval)
def send_alert(message):
"""Send alert notification."""
print(f"🚨 ALERT: {message}")
# Implement your alerting logic here
# (email, Slack, PagerDuty, etc.)
Performance Monitoring¶
Memory Usage Monitoring¶
def check_memory_usage(client, threshold_percent=80):
"""Check if memory usage exceeds threshold."""
response = client.status.get_stats()
stats = response.body
heap = stats['process']['memory']['heap']
used = heap['used_bytes']
limit = heap['size_limit']
usage_percent = (used / limit) * 100
if usage_percent > threshold_percent:
print(f"⚠️ High memory usage: {usage_percent:.1f}%")
return False
else:
print(f"✅ Memory usage OK: {usage_percent:.1f}%")
return True
Load Average Monitoring¶
def check_load_average(client, threshold=2.0):
"""Check if system load is high."""
response = client.status.get_stats()
stats = response.body
load_1m = stats['os']['load']['1m']
if load_1m > threshold:
print(f"⚠️ High load average: {load_1m}")
return False
else:
print(f"✅ Load average OK: {load_1m}")
return True
Integration with Monitoring Systems¶
Prometheus Metrics¶
def export_prometheus_metrics(client):
"""Export Kibana metrics in Prometheus format."""
response = client.status.get_stats()
stats = response.body
metrics = []
# Memory metrics
heap = stats['process']['memory']['heap']
metrics.append(f'kibana_heap_used_bytes {heap["used_bytes"]}')
metrics.append(f'kibana_heap_total_bytes {heap["total_bytes"]}')
metrics.append(f'kibana_heap_limit_bytes {heap["size_limit"]}')
# Uptime metric
uptime = stats['process']['uptime_in_millis'] / 1000
metrics.append(f'kibana_uptime_seconds {uptime}')
# Load average
load = stats['os']['load']
metrics.append(f'kibana_load_1m {load["1m"]}')
metrics.append(f'kibana_load_5m {load["5m"]}')
metrics.append(f'kibana_load_15m {load["15m"]}')
return '\n'.join(metrics)
# Usage
metrics = export_prometheus_metrics(client)
print(metrics)
Health Check Endpoint¶
from flask import Flask, jsonify
app = Flask(__name__)
@app.route('/health')
def health_check():
"""Health check endpoint for load balancers."""
try:
response = client.status.get_status()
status_level = response.body['status']['overall']['level']
if status_level == 'available':
return jsonify({'status': 'healthy'}), 200
elif status_level == 'degraded':
return jsonify({'status': 'degraded'}), 200
else:
return jsonify({'status': 'unhealthy'}), 503
except Exception as e:
return jsonify({'status': 'error', 'message': str(e)}), 503
@app.route('/metrics')
def metrics():
"""Metrics endpoint for monitoring."""
try:
response = client.status.get_stats()
stats = response.body
return jsonify({
'uptime_seconds': stats['process']['uptime_in_millis'] / 1000,
'memory_used_mb': stats['process']['memory']['heap']['used_bytes'] / (1024**2),
'memory_limit_mb': stats['process']['memory']['heap']['size_limit'] / (1024**2),
'load_1m': stats['os']['load']['1m']
}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
Best Practices¶
1. Implement Regular Health Checks¶
# Check health before critical operations
if not is_kibana_healthy(client):
print("Kibana is unhealthy, skipping operation")
return
# Proceed with operation
result = client.actions.create(...)
2. Monitor Key Metrics¶
def monitor_key_metrics(client):
"""Monitor key Kibana metrics."""
response = client.status.get_stats()
stats = response.body
# Memory usage
heap = stats['process']['memory']['heap']
memory_usage = (heap['used_bytes'] / heap['size_limit']) * 100
# Load average
load_1m = stats['os']['load']['1m']
# Uptime
uptime_hours = stats['process']['uptime_in_millis'] / (1000 * 60 * 60)
return {
'memory_usage_percent': memory_usage,
'load_1m': load_1m,
'uptime_hours': uptime_hours
}
3. Set Up Alerts¶
def check_and_alert(client):
"""Check metrics and send alerts if needed."""
metrics = monitor_key_metrics(client)
if metrics['memory_usage_percent'] > 80:
send_alert(f"High memory usage: {metrics['memory_usage_percent']:.1f}%")
if metrics['load_1m'] > 2.0:
send_alert(f"High load average: {metrics['load_1m']}")
4. Log Status Information¶
import logging
logger = logging.getLogger(__name__)
def log_status(client):
"""Log Kibana status information."""
try:
response = client.status.get_status()
status = response.body
logger.info(
"Kibana status check",
extra={
'status': status['status']['overall']['level'],
'version': status['version']['number'],
'uptime': status.get('metrics', {}).get('process', {}).get('uptime_in_millis')
}
)
except Exception as e:
logger.error(f"Failed to check Kibana status: {e}")
Troubleshooting¶
Connection Issues¶
Problem: Cannot connect to Kibana
Solutions:
Verify Kibana URL is correct
Check network connectivity
Verify authentication credentials
Check firewall rules
Degraded Status¶
Problem: Kibana reports degraded status
Solutions:
Check individual service statuses
Review Kibana server logs
Verify Elasticsearch connectivity
Check resource availability (memory, disk)
High Memory Usage¶
Problem: Memory usage is consistently high
Solutions:
Increase heap size in Kibana configuration
Review and optimize dashboards and visualizations
Check for memory leaks
Consider scaling horizontally
Next Steps¶
Learn about Error Handling for comprehensive error management
Explore Observability for distributed tracing
Check Advanced Usage for performance optimization
See Examples for practical code samples