From 38676e6e348eefb03dc6c01535dcffa81d022c52 Mon Sep 17 00:00:00 2001
From: sascha
Date: Sun, 5 Apr 2026 09:06:37 +0200
Subject: [PATCH] feat: Prometheus /metrics endpoint + webhook notifications (error/stale events)

---
 app.py | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 145 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index 11f0770..90dd7ce 100644
--- a/app.py
+++ b/app.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python3
 """
 Backup Monitor – Backend
-MongoDB-backed backup monitoring with Web UI and Uptime Kuma integration.
+MongoDB-backed backup monitoring with Web UI, Uptime Kuma, Prometheus & Webhook integration.
 """
-from flask import Flask, request, jsonify, render_template, send_from_directory
+from flask import Flask, request, jsonify, render_template, Response
 from pymongo import MongoClient, DESCENDING
 from datetime import datetime, timedelta
-import os, time, requests, logging
+import os, time, requests, logging, threading
 
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -66,6 +66,13 @@ def push():
         except Exception as e:
             log.warning(f"Kuma push failed for {host}: {e}")
 
+    # Webhooks (fall back to the default text when the message is missing or empty)
+    if entry["status"] == "error":
+        _send_webhooks("error", host, entry.get("message") or "Backup fehlgeschlagen")
+
+    # Check for stale hosts
+    _check_stale_hosts()
+
     return jsonify({"ok": True, "host": host})
 
 
@@ -217,6 +224,141 @@ def summary():
     })
 
 
+# ── Prometheus Metrics ──────────────────────────────────────────────────────
+
+@app.route("/metrics")
+def prometheus_metrics():
+    """Render host and daily backup statistics in the Prometheus text format.
+
+    Emits one series per host for age, status, duration, sizes and new-file
+    count (the latter four come from the host's newest history entry), plus
+    two global gauges covering the history entries since midnight UTC.
+    """
+    # Naive UTC on purpose: it is subtracted from the stored last_backup
+    # values, which are presumably naive UTC as well -- TODO confirm.
+    now = datetime.utcnow()
+    hosts = list(db.hosts.find())
+    lines = [
+        "# HELP backup_hosts_total Total number of monitored hosts",
+        "# TYPE backup_hosts_total gauge",
+        f"backup_hosts_total {len([h for h in hosts if h.get('enabled', True)])}",
+        "# HELP backup_host_last_seconds Seconds since last backup",
+        "# TYPE backup_host_last_seconds gauge",
+        "# HELP backup_host_status Backup status (1=ok, 0=error, -1=stale, -2=disabled)",
+        "# TYPE backup_host_status gauge",
+        "# HELP backup_host_duration_seconds Duration of last backup",
+        "# TYPE backup_host_duration_seconds gauge",
+        "# HELP backup_host_size_bytes Original size of last backup",
+        "# TYPE backup_host_size_bytes gauge",
+        "# HELP backup_host_dedup_bytes Deduplicated size of last backup",
+        "# TYPE backup_host_dedup_bytes gauge",
+        "# HELP backup_host_files_new New files in last backup",
+        "# TYPE backup_host_files_new gauge",
+    ]
+    for h in hosts:
+        name = h["name"]
+        # Escape backslash, quote and newline as the exposition format
+        # requires; otherwise such a host name yields an invalid label value.
+        safe = str(name).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+        labels = f'host="{safe}"'
+        age = (now - h["last_backup"]).total_seconds() if h.get("last_backup") else 999999
+
+        if not h.get("enabled", True):
+            status_val = -2
+        elif age > STALE_HOURS * 3600:
+            status_val = -1
+        elif h.get("last_status") == "error":
+            status_val = 0
+        else:
+            status_val = 1
+
+        lines.append(f"backup_host_last_seconds{{{labels}}} {int(age)}")
+        lines.append(f"backup_host_status{{{labels}}} {status_val}")
+
+        last = db.history.find_one({"host": name}, sort=[("timestamp", DESCENDING)])
+        if last:
+            # `or 0` also covers fields stored as null, which would otherwise
+            # be rendered as the unparsable sample value "None".
+            lines.append(f'backup_host_duration_seconds{{{labels}}} {last.get("duration_sec") or 0}')
+            lines.append(f'backup_host_size_bytes{{{labels}}} {last.get("original_size") or 0}')
+            lines.append(f'backup_host_dedup_bytes{{{labels}}} {last.get("deduplicated_size") or 0}')
+            lines.append(f'backup_host_files_new{{{labels}}} {last.get("nfiles_new") or 0}')
+
+    today = now.replace(hour=0, minute=0, second=0, microsecond=0)
+    today_count = db.history.count_documents({"timestamp": {"$gte": today}})
+    # Fetch only the size field; null sizes count as 0 instead of raising.
+    today_rows = db.history.find({"timestamp": {"$gte": today}}, {"original_size": 1})
+    today_size = sum(e.get("original_size") or 0 for e in today_rows)
+    lines += [
+        "# HELP backup_today_total Backups completed today",
+        "# TYPE backup_today_total gauge",
+        f"backup_today_total {today_count}",
+        "# HELP backup_today_bytes Total bytes backed up today",
+        "# TYPE backup_today_bytes gauge",
+        f"backup_today_bytes {today_size}",
+    ]
+    return Response("\n".join(lines) + "\n", mimetype="text/plain; version=0.0.4")
+
+
+# ── Webhooks (Notifications) ──────────────────────────────────────────────
+
+WEBHOOK_URLS = [u.strip() for u in os.environ.get("WEBHOOK_URLS", "").split(",") if u.strip()]
+# Strip each event name so "error, stale" works, like WEBHOOK_URLS above.
+WEBHOOK_EVENTS = [e.strip() for e in os.environ.get("WEBHOOK_EVENTS", "error,stale").split(",") if e.strip()]
+
+
+def _send_webhooks(event, host, message):
+    """POST a JSON notification to every configured webhook URL.
+
+    Does nothing when `event` is not listed in WEBHOOK_EVENTS or no URL is
+    configured. Delivery runs in a daemon thread so the caller is not
+    blocked; failures are logged and never raised.
+
+    Args:
+        event: Event name, e.g. "error" or "stale".
+        host: Name of the host the event refers to.
+        message: Human-readable description included in the payload.
+    """
+    if event not in WEBHOOK_EVENTS or not WEBHOOK_URLS:
+        return
+    payload = {
+        "event": event,
+        "host": host,
+        "message": message,
+        "timestamp": datetime.utcnow().isoformat() + "Z",
+    }
+    def _fire():
+        for url in WEBHOOK_URLS:
+            try:
+                requests.post(url, json=payload, timeout=10)
+            except Exception as e:
+                log.warning(f"Webhook failed ({url}): {e}")
+    threading.Thread(target=_fire, daemon=True).start()
+
+
+# ── Stale Check (runs after each push) ────────────────────────────────────
+
+def _check_stale_hosts():
+    """Send a "stale" webhook once per host whose last backup is too old.
+
+    A host gets the `_stale_notified` flag together with its notification so
+    later calls stay silent; the flag is removed once the host is fresh
+    again. Hosts without a `last_backup` value are skipped.
+    """
+    now = datetime.utcnow()
+    # `$ne: False` also matches hosts lacking an `enabled` field, which the
+    # /metrics endpoint counts as enabled via h.get("enabled", True).
+    for h in db.hosts.find({"enabled": {"$ne": False}}):
+        if not h.get("last_backup"):
+            continue
+        age_h = (now - h["last_backup"]).total_seconds() / 3600
+        if age_h > STALE_HOURS and not h.get("_stale_notified"):
+            _send_webhooks("stale", h["name"], f"Kein Backup seit {int(age_h)}h")
+            db.hosts.update_one({"name": h["name"]}, {"$set": {"_stale_notified": True}})
+        elif age_h <= STALE_HOURS and h.get("_stale_notified"):
+            db.hosts.update_one({"name": h["name"]}, {"$unset": {"_stale_notified": ""}})
+
+
 # ── Web UI ─────────────────────────────────────────────────────────────────
 
 @app.route("/")