#!/bin/bash
#
# cwp-server-health — server health snapshot
#
# Part of CloudWatch Pro (CWP) v1. Runs every 15 minutes on each cPanel server.
# Snapshots disk %, inode %, load average, memory %, key services, mail queue.
# Emits a finding only when a metric crosses a threshold; sends one digest email
# per run when any finding (P1 or P2) is emitted, tagging the subject P1 or P2.
#
# Detect-only. Does NOT restart services, free disk, or take corrective action.
#
# COOLDOWN POLICY (per-metric, configurable):
#   Disk / inode high  → 4 hours  (it's not going anywhere; don't spam)
#   Service down       → 30 min   (you want repeat reminders if it stays down)
#   Mail queue high    → 1 hour
#   Load / memory high → 1 hour
#
# INSTALL:
#   sudo install -d /opt/cwp/agent/modules/server-health \
#                   /etc/cwp \
#                   /var/cwp/state/server-health \
#                   /var/cwp/findings \
#                   /var/log/cwp
#   sudo install -m 0755 cwp-server-health /opt/cwp/agent/modules/server-health/
#   sudo install -m 0644 config.example.conf /etc/cwp/server-health.conf
#   sudo $EDITOR /etc/cwp/server-health.conf   # set ALERT_EMAIL, tune thresholds
#   sudo crontab -l 2>/dev/null | { cat; cat cron.example; } | sudo crontab -
#
# USAGE:
#   cwp-server-health                # normal run
#   cwp-server-health --dry-run      # report only, no findings/email/state
#   cwp-server-health --verbose      # log to stderr
#   cwp-server-health --no-email     # write findings, skip email
#   cwp-server-health --version
#
# EXIT CODES:
#   0 — completed
#   1 — config or environment error
#   2 — usage error (unknown command-line argument)

# Strict mode: abort on unhandled command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

VERSION="0.1.0"
SCRIPT_NAME="cwp-server-health"

# ---- defaults ----
# Everything below is a built-in default. load_config later sources
# /etc/cwp/common.conf and then $CONFIG_FILE, either of which may reassign
# any of these variables.

# Config path; can be overridden via the CWP_SERVER_HEALTH_CONFIG env variable.
CONFIG_FILE="${CWP_SERVER_HEALTH_CONFIG:-/etc/cwp/server-health.conf}"
STATE_DIR="/var/cwp/state/server-health"     # holds the cooldown.<key> timestamp files
FINDINGS_DIR="/var/cwp/findings"             # findings.jsonl is appended here
LOG_FILE="/var/log/cwp/server-health.log"
ALERT_EMAIL="root@localhost"
# Prefer the FQDN; fall back to the plain hostname when `hostname -f` fails.
SERVER_NAME="$(hostname -f 2>/dev/null || hostname)"
SENDMAIL_BIN="/usr/sbin/sendmail"

# Disk thresholds (percent used; crit -> P1 finding, warn -> P2 finding)
DISK_WARN_PCT=85
DISK_CRIT_PCT=95
INODE_WARN_PCT=85
INODE_CRIT_PCT=95
# Filesystems to ignore (loopbacks, snap mounts, /run, etc.)
# Whitespace-separated list, compared against the fstype column of `df -T`.
DISK_SKIP_FSTYPES="tmpfs devtmpfs squashfs overlay nullfs none proc sysfs"

# Load: alert if load5 >= LOAD_FACTOR * cpu_count
LOAD_FACTOR=2

# Memory % (used/total as reported by `free`)
MEM_WARN_PCT=90

# Mail queue (message counts; crit -> P1, warn -> P2, frozen -> P2)
MAIL_QUEUE_WARN=200
MAIL_QUEUE_CRIT=500
MAIL_FROZEN_WARN=50

# Services to check (systemd unit names)
# Whitespace-separated; units not present in `systemctl list-unit-files` are skipped.
WATCHED_SERVICES="lsws exim cphulkd csf"

# Per-metric cooldowns (minutes)
COOLDOWN_DISK_MIN=240        # 4 hours; also used for inode findings
COOLDOWN_SERVICE_MIN=30      # 30 min
COOLDOWN_QUEUE_MIN=60        # mail queue size and frozen-message findings
COOLDOWN_LOAD_MIN=60
COOLDOWN_MEM_MIN=60

# Runtime flags, set to 1 by --dry-run / --verbose / --no-email.
DRY_RUN=0
VERBOSE=0
NO_EMAIL=0

# ---- helpers ----

# log <level> <message...>
# Appends "<timestamp> [<level>] <message>" to $LOG_FILE on a best-effort
# basis and mirrors the same line to stderr when VERBOSE=1 or level is ERROR.
# Globals: LOG_FILE (read), VERBOSE (read)
log() {
  local level="$1"; shift
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  printf -v line '%s [%s] %s' "$ts" "$level" "$*"
  # Redirections are applied left to right, so `>> file 2>/dev/null` does NOT
  # hide a failure to open the file itself. Wrapping the write in a group and
  # silencing the group does; this matters because log is called before the
  # log directory is guaranteed to exist.
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
  if [[ "$VERBOSE" -eq 1 ]] || [[ "$level" == "ERROR" ]]; then
    printf '%s\n' "$line" >&2
  fi
}

# die <message...>
# Logs the message at ERROR level (which log also mirrors to stderr) and
# terminates the script with exit status 1.
die() {
  log "ERROR" "$*"
  exit 1
}
# usage [exit_code]
# Prints the script's leading comment header as help text, then exits with
# the given status (default 0).
usage() {
  # Walk the leading comment block rather than a fixed line range: skip the
  # shebang, strip the "# " prefix from each comment line, and stop at the
  # first non-comment line. A fixed range goes stale whenever the header is
  # edited and can leak code lines into the help output.
  awk '
    NR == 1 && /^#!/ { next }
    /^#/ { sub(/^# ?/, ""); print; next }
    { exit }
  ' "$0"
  exit "${1:-0}"
}

# load_config
# Sources the optional shared CWP config, then the module config named by
# $CONFIG_FILE. A missing module config is not an error: the built-in
# defaults stay in effect and a WARN line is logged. Always returns 0.
load_config() {
  # Shared CWP defaults (ALERT_EMAIL, SERVER_NAME, etc.) come first so the
  # module-specific file can override them.
  if [[ -r /etc/cwp/common.conf ]]; then
    # shellcheck source=/dev/null
    . /etc/cwp/common.conf
  fi

  if [[ ! -r "$CONFIG_FILE" ]]; then
    log "WARN" "no config at $CONFIG_FILE — using built-in defaults"
    return 0
  fi

  # shellcheck source=/dev/null
  . "$CONFIG_FILE"
  log "INFO" "loaded config from $CONFIG_FILE"
  return 0
}

# ensure_dirs
# Creates the state directory, the findings directory and the log file's
# parent directory if they do not exist yet. Calls die (exit 1) when one of
# them cannot be created.
# Globals: STATE_DIR, FINDINGS_DIR, LOG_FILE (read)
ensure_dirs() {
  local dir   # kept local so the loop variable does not leak into global scope
  for dir in "$STATE_DIR" "$FINDINGS_DIR" "$(dirname -- "$LOG_FILE")"; do
    if [[ ! -d "$dir" ]]; then
      # mkdir's own message is suppressed; die reports the failing path.
      mkdir -p -- "$dir" 2>/dev/null || die "cannot create $dir"
    fi
  done
}

# json_escape <string>
# Prints the string escaped for use inside a JSON string literal (without the
# surrounding quotes). Backslash, double quote and every control character
# are escaped, as JSON requires for U+0000 through U+001F.
json_escape() {
  local s="$1"
  # Backslash must go first so the escapes added below are not re-escaped.
  s="${s//\\/\\\\}"; s="${s//\"/\\\"}"
  s="${s//$'\n'/\\n}"; s="${s//$'\t'/\\t}"; s="${s//$'\r'/\\r}"
  s="${s//$'\b'/\\b}"; s="${s//$'\f'/\\f}"

  # Slow path, taken only when some other control character is still present:
  # rewrite each one as \u00XX. Characters already handled above are no
  # longer control characters, so they pass through untouched.
  if [[ "$s" == *[[:cntrl:]]* ]]; then
    local out="" ch esc i
    for (( i = 0; i < ${#s}; i++ )); do
      ch="${s:i:1}"
      if [[ "$ch" == [[:cntrl:]] ]]; then
        # A leading single quote makes printf use the character's code.
        printf -v esc '\\u%04x' "'$ch"
        out+="$esc"
      else
        out+="$ch"
      fi
    done
    s="$out"
  fi
  printf '%s' "$s"
}

# in_cooldown <key> <minutes> -> 0 (suppress) or 1 (alert)
# Reads the epoch timestamp stored in $STATE_DIR/cooldown.<key> and reports
# whether fewer than <minutes> minutes have passed since then.
# A missing, empty or non-numeric state file, a non-numeric <minutes>, or a
# timestamp in the future all count as "not in cooldown": when the state is
# unusable it is safer to alert than to stay silent.
in_cooldown() {
  local key="$1" minutes="$2"
  local f="$STATE_DIR/cooldown.${key}"
  [[ -f "$f" ]] || return 1

  local last="" now diff
  # Group + 2>/dev/null also hides a failure to open the file.
  { IFS= read -r last < "$f"; } 2>/dev/null || true
  last="${last//[[:space:]]/}"
  [[ "$last" =~ ^[0-9]+$ ]] || return 1
  [[ "$minutes" =~ ^[0-9]+$ ]] || return 1

  now="$(date +%s)"
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  diff=$(( now - 10#$last ))
  # diff < 0 means the stored time is ahead of the clock (clock stepped back
  # or corrupt state); do not let that suppress alerts indefinitely.
  if (( diff >= 0 && diff < 10#$minutes * 60 )); then return 0; fi
  return 1
}

# mark_cooldown <key>
# Records the current epoch time in $STATE_DIR/cooldown.<key>.
# The timestamp is written to a temporary file in the same directory and
# then renamed over the target, so a failed write (e.g. disk full) cannot
# leave a truncated, empty state file behind; the previous timestamp stays
# in place instead. Returns non-zero when the state could not be updated.
mark_cooldown() {
  local target="$STATE_DIR/cooldown.${1}"
  local tmp="${target}.tmp.$$"
  if date +%s > "$tmp" && mv -f -- "$tmp" "$target"; then
    return 0
  fi
  rm -f -- "$tmp" 2>/dev/null
  return 1
}

# emit_finding <metric> <severity> <subject> <value> <recommended_action> <cooldown_key> <cooldown_minutes>
# Unless <cooldown_key> is still cooling down, builds one JSON finding,
# appends it to $FINDINGS_DIR/findings.jsonl, starts the cooldown, and queues
# a digest line for the alert email. With DRY_RUN=1 the JSON is printed
# instead and no state is written.
# Globals: SERVER_NAME, FINDINGS_DIR, DRY_RUN (read);
#          DIGEST_LINES (appended), HAS_P1 (set to 1 on a P1 finding)
# Returns: 0 always. Persistence failures are logged but never abort the run.
emit_finding() {
  local metric="$1" sev="$2" subj="$3" val="$4" action="$5" cool_key="$6" cool_min="$7"
  if in_cooldown "$cool_key" "$cool_min"; then
    log "INFO" "$metric $subj $val — in cooldown, suppressing"
    return 0
  fi

  local now_iso now_epoch finding_file id json
  now_epoch="$(date +%s)"
  now_iso="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  id="server-health-${metric}-${SERVER_NAME}-${cool_key}"
  finding_file="$FINDINGS_DIR/findings.jsonl"

  # Every string field is escaped, including server and id: the id embeds the
  # cooldown key, which is derived from mount paths and service names.
  printf -v json '{"ts":"%s","ts_epoch":%d,"module":"%s","server":"%s","severity":"%s","metric":"%s","subject":"%s","value":"%s","id":"%s","recommended_action":"%s"}' \
    "$now_iso" "$now_epoch" "server-health" \
    "$(json_escape "$SERVER_NAME")" "$(json_escape "$sev")" "$(json_escape "$metric")" \
    "$(json_escape "$subj")" "$(json_escape "$val")" "$(json_escape "$id")" \
    "$(json_escape "$action")"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    printf 'DRY-RUN finding: %s\n' "$json"
  else
    # Best-effort persistence. Under `set -e` an unguarded failure here would
    # end the whole run before the alert email is sent, and the likeliest
    # cause of such a failure is the very full-disk condition being reported.
    if ! { printf '%s\n' "$json" >> "$finding_file"; } 2>/dev/null; then
      log "ERROR" "could not append finding to $finding_file — continuing so the alert is still delivered"
    fi
    if ! mark_cooldown "$cool_key" 2>/dev/null; then
      log "ERROR" "could not record cooldown for $cool_key — this finding may repeat on the next run"
    fi
  fi

  DIGEST_LINES+=("[$sev] ${metric}  ${subj}  ${val}")
  if [[ "$sev" == "P1" ]]; then HAS_P1=1; fi
  return 0
}

# -------- portable system inspectors --------

# get_cpu_count
# Prints the number of CPUs: from nproc when it exists, otherwise from
# `sysctl -n hw.ncpu`, otherwise 1.
get_cpu_count() {
  if command -v nproc >/dev/null 2>&1; then
    nproc
    return
  fi

  if ! command -v sysctl >/dev/null 2>&1; then
    echo 1
    return
  fi

  # sysctl exists but may not know hw.ncpu; fall back to a single CPU.
  sysctl -n hw.ncpu 2>/dev/null || echo 1
}

# get_load_avg
# Prints "load1 load5 load15" on one line.
get_load_avg() {
  if [[ ! -r /proc/loadavg ]]; then
    # No readable procfs entry: take the text after "load average(s): " in
    # the uptime output and split it on ", ".
    uptime | awk -F'load average[s]?: ' '{print $2}' | awk -F', ' '{print $1, $2, $3}'
    return
  fi

  # The first three fields of /proc/loadavg are the 1/5/15 minute averages.
  awk '{ printf "%s %s %s\n", $1, $2, $3 }' /proc/loadavg
}

# get_mem_pct
# Prints used memory as an integer percentage of total, computed from the
# "Mem:" row of `free` (column 3 over column 2). Prints nothing when `free`
# is not installed or the reported total is zero.
get_mem_pct() {
  command -v free >/dev/null 2>&1 || return 0
  free | awk '/^Mem:/ && $2 > 0 { printf "%d", $3 * 100 / $2 }'
}

# get_service_states
# Prints one "<service>\t<active|down>" line for every entry of
# WATCHED_SERVICES that systemd has a unit file for. Services without a unit
# file are skipped. Prints nothing when systemctl is not installed.
get_service_states() {
  command -v systemctl >/dev/null 2>&1 || return 0

  local installed svc
  # List the unit files once and keep the names in memory. Matching against a
  # captured list avoids the `producer | grep -q` pattern, where grep exits
  # on its first match, the producer can die of SIGPIPE, and `pipefail` then
  # turns a successful match into a failure.
  installed="$(systemctl list-unit-files --type=service 2>/dev/null | awk '{print $1}')" || true
  # Newline sentinels on both ends so every name can be matched as a whole line.
  installed=$'\n'"${installed}"$'\n'

  for svc in $WATCHED_SERVICES; do
    # Quoted expansions inside a case pattern match literally, so characters
    # such as "." in a service name are not treated as wildcards.
    case "$installed" in
      *$'\n'"${svc}"$'\n'* | *$'\n'"${svc}.service"$'\n'*) ;;
      *) continue ;;
    esac
    if systemctl is-active --quiet "$svc"; then
      printf '%s\tactive\n' "$svc"
    else
      printf '%s\tdown\n' "$svc"
    fi
  done
}

# get_mail_queue_size
# Prints the output of `exim -bpc` (the queue message count) with all
# whitespace removed. Prints nothing when exim is not installed.
get_mail_queue_size() {
  command -v exim >/dev/null 2>&1 || return 0
  exim -bpc 2>/dev/null | tr -d '[:space:]'
}

# get_mail_frozen_count
# Prints the number of queue entries marked "*** frozen ***" in the
# `exim -bpr` listing, always as exactly one integer line. Prints nothing
# when exim is not installed.
get_mail_frozen_count() {
  command -v exim >/dev/null 2>&1 || return 0

  local count
  # grep -c already prints "0" when nothing matches, but exits 1 in that
  # case; a `|| echo 0` fallback would therefore emit a second "0" line.
  # Capture the count and ignore the status instead. -F matches the marker
  # as a fixed string rather than as a regular expression.
  count="$(exim -bpr 2>/dev/null | grep -cF -- '*** frozen ***')" || true
  printf '%s\n' "${count:-0}"
}

# -------- check functions --------

# _df_parse_rows <has_type>
# Reads `df -P` style output on stdin and prints "<fstype>\t<mount>\t<use%>"
# for every data row. <has_type> is 1 when the listing carries a Type column
# (df -T) and 0 otherwise, in which case fstype is printed as "unknown".
_df_parse_rows() {
  awk -v has_type="$1" '
    NR == 1 { next }                      # header row
    {
      # Locate the use% column ("NN%" or "-") by content, starting at the
      # position it has when the device name contains no spaces. Everything
      # after it is the mount point, which may itself contain spaces.
      first = (has_type ? 6 : 5)
      cap = 0
      for (i = first; i < NF; i++) {
        if ($i ~ /^([0-9]+%|-)$/) { cap = i; break }
      }
      if (cap == 0) next
      mount = $(cap + 1)
      for (i = cap + 2; i <= NF; i++) mount = mount " " $i
      fstype = (has_type ? $(cap - 4) : "unknown")
      print fstype "\t" mount "\t" $cap
    }'
}

# _df_usage_rows <flags>
# Runs df with -<flags>T and, when that listing has no "Type" header column,
# falls back to plain -<flags>. The -T listing is captured once and inspected
# in memory, so no `df | head | grep -q` pipeline can fail spuriously under
# `pipefail` when an early-exiting reader closes the pipe.
_df_usage_rows() {
  local flags="$1" listing=""
  # df exits non-zero when a single mount is unreadable but still prints the
  # rest, so the exit status is ignored and only the header decides.
  listing="$(df "-${flags}T" 2>/dev/null)" || true
  if [[ "${listing%%$'\n'*}" == *Type* ]]; then
    _df_parse_rows 1 <<<"$listing"
  else
    df "-${flags}" 2>/dev/null | _df_parse_rows 0
  fi
}

# Returns lines: "<fstype>\t<mount>\t<used_pct>"
# Tries df -PT first (gives fstype); falls back to df -P (no fstype).
get_disk_info() {
  _df_usage_rows "P"
}

# Same line format as get_disk_info, but the percentage is inode usage
# (df -iPT, falling back to df -iP). It is "-" where df reports no figure.
get_inode_info() {
  _df_usage_rows "iP"
}

# should_skip_mount <fstype> <mount_path> -> 0 (skip) / 1 (process)
# A mount is skipped when its filesystem type is listed in DISK_SKIP_FSTYPES
# or when its path is one of the well-known pseudo/system locations below.
should_skip_mount() {
  local fstype="$1" mount="$2" skip_type

  # Filesystem-type filter (only useful when df reported a type).
  for skip_type in $DISK_SKIP_FSTYPES; do
    if [[ "$skip_type" == "$fstype" ]]; then
      return 0
    fi
  done

  # Path filter; covers the case where the type is unknown (BSD/Mac df).
  case "$mount" in
    /dev | /dev/*) return 0 ;;
    /proc | /proc/*) return 0 ;;
    /sys | /sys/*) return 0 ;;
    /run | /run/*) return 0 ;;
    /System/*) return 0 ;;
    /private/var/vm) return 0 ;;
    /Volumes/Recovery) return 0 ;;
  esac

  return 1
}

# check_disk
# Walks the rows from get_disk_info and emits a P1 finding for every mount at
# or above DISK_CRIT_PCT, or a P2 finding at or above DISK_WARN_PCT. Mounts
# rejected by should_skip_mount and rows without a numeric percentage are
# ignored.
# Cooldown keys: "disk.<mount>" for warnings, "disk.crit.<mount>" for
# criticals ("/" in the mount path replaced by "_"). The two are separate on
# purpose: with a shared key, a recent warning would suppress the critical
# alert for the whole cooldown window when usage keeps climbing.
check_disk() {
  # Declared local so the read loop does not leak its fields as globals.
  local fstype mount used_pct pct_num mount_key
  log "INFO" "checking disk usage (warn=${DISK_WARN_PCT}% crit=${DISK_CRIT_PCT}%)"
  while IFS=$'\t' read -r fstype mount used_pct; do
    [[ -z "$mount" ]] && continue
    should_skip_mount "$fstype" "$mount" && continue
    pct_num="${used_pct%\%}"
    [[ "$pct_num" =~ ^[0-9]+$ ]] || continue

    mount_key="${mount//\//_}"
    if (( pct_num >= DISK_CRIT_PCT )); then
      emit_finding "disk_critical" "P1" "$mount" "${used_pct} used" \
        "Disk on ${mount} is at ${used_pct}. Free space immediately. Common culprits: /var/log (rotate or trim logs), /home (find largest accounts via: du -sh /home/*/ | sort -h | tail), /tmp (clear with caution), /var/spool/exim (clear mail queue if mail-anomaly indicates abuse)." \
        "disk.crit.${mount_key}" "$COOLDOWN_DISK_MIN"
    elif (( pct_num >= DISK_WARN_PCT )); then
      emit_finding "disk_warn" "P2" "$mount" "${used_pct} used" \
        "Disk on ${mount} is at ${used_pct}. Investigate growth: du -sh ${mount}/*/ | sort -h | tail. Clean before it hits ${DISK_CRIT_PCT}%." \
        "disk.${mount_key}" "$COOLDOWN_DISK_MIN"
    fi
  done < <(get_disk_info)
  return 0
}

# check_inodes
# Walks the rows from get_inode_info and emits a P1 finding for every mount
# at or above INODE_CRIT_PCT, or a P2 finding at or above INODE_WARN_PCT.
# Skipped mounts and rows whose percentage is "-" or non-numeric are ignored.
# Cooldown keys: "inode.<mount>" for warnings, "inode.crit.<mount>" for
# criticals, kept separate so a recent warning cannot suppress the escalation
# to critical. Both use COOLDOWN_DISK_MIN.
check_inodes() {
  # Declared local so the read loop does not leak its fields as globals.
  local fstype mount used_pct pct_num mount_key
  log "INFO" "checking inode usage (warn=${INODE_WARN_PCT}% crit=${INODE_CRIT_PCT}%)"
  while IFS=$'\t' read -r fstype mount used_pct; do
    [[ -z "$mount" ]] && continue
    should_skip_mount "$fstype" "$mount" && continue
    [[ "$used_pct" == "-" ]] && continue
    pct_num="${used_pct%\%}"
    [[ "$pct_num" =~ ^[0-9]+$ ]] || continue

    mount_key="${mount//\//_}"
    if (( pct_num >= INODE_CRIT_PCT )); then
      emit_finding "inode_critical" "P1" "$mount" "${used_pct} inodes used" \
        "Filesystem ${mount} is nearly out of inodes (${used_pct}). Once they run out, no new files can be created even with free space. Find directories with too many small files: find ${mount} -xdev -type d -exec sh -c 'echo \"\$(ls -A \"\$1\" | wc -l) \$1\"' _ {} \\; | sort -n | tail." \
        "inode.crit.${mount_key}" "$COOLDOWN_DISK_MIN"
    elif (( pct_num >= INODE_WARN_PCT )); then
      emit_finding "inode_warn" "P2" "$mount" "${used_pct} inodes used" \
        "Filesystem ${mount} inode usage at ${used_pct}. Investigate before it hits ${INODE_CRIT_PCT}%." \
        "inode.${mount_key}" "$COOLDOWN_DISK_MIN"
    fi
  done < <(get_inode_info)
  return 0
}

# check_load
# Emits a P2 "load_high" finding when the 5-minute load average is at or
# above LOAD_FACTOR * cpu_count. LOAD_FACTOR may be an integer or a decimal
# (e.g. 1.5); the comparison is done in awk so fractions work.
# The check is skipped with a WARN log line, never a crash, when the load
# average or LOAD_FACTOR cannot be parsed.
check_load() {
  local load1="" load5="" load15="" cpu threshold
  # read returns non-zero on empty input; do not let `set -e` end the run.
  read -r load1 load5 load15 < <(get_load_avg) || true

  cpu="$(get_cpu_count)" || cpu=""
  # An empty or zero CPU count would give a threshold of 0 and a permanent alert.
  [[ "$cpu" =~ ^[1-9][0-9]*$ ]] || cpu=1

  if [[ ! "$load5" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    log "WARN" "load check skipped (could not parse load average: '${load1} ${load5} ${load15}')"
    return 0
  fi
  if [[ ! "$LOAD_FACTOR" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    log "WARN" "load check skipped (LOAD_FACTOR is not a number: '${LOAD_FACTOR}')"
    return 0
  fi

  # %g prints whole numbers without a decimal point (2 * 2 -> "4").
  threshold="$(awk -v c="$cpu" -v f="$LOAD_FACTOR" 'BEGIN { printf "%g", c * f }')"
  log "INFO" "load: 1=${load1} 5=${load5} 15=${load15}, cpu=${cpu}, threshold=${threshold}"

  if awk -v l="$load5" -v c="$cpu" -v f="$LOAD_FACTOR" 'BEGIN { exit !(l + 0 >= c * f) }'; then
    emit_finding "load_high" "P2" "system" "load5=${load5} (cpu=${cpu})" \
      "5-minute load average is ${load5}, threshold ${threshold} (${LOAD_FACTOR}x cpu count). Identify the runaway process: top -b -n 1 | head -20. Check for: stuck cron, runaway PHP, MySQL slow queries (mysqladmin processlist), DDoS hitting LiteSpeed." \
      "load.system" "$COOLDOWN_LOAD_MIN"
  fi
  return 0
}

# check_memory
# Emits a P2 "memory_high" finding when the percentage from get_mem_pct is at
# or above MEM_WARN_PCT. When get_mem_pct prints nothing the check is skipped
# and that fact is logged.
check_memory() {
  local mem_used
  mem_used="$(get_mem_pct)"

  if [[ -n "$mem_used" ]]; then
    log "INFO" "memory: ${mem_used}% used (warn at ${MEM_WARN_PCT}%)"
    if (( mem_used >= MEM_WARN_PCT )); then
      emit_finding "memory_high" "P2" "system" "${mem_used}% used" \
        "Memory is at ${mem_used}%. Server may swap heavily soon. Top consumers: ps aux --sort=-%mem | head. Common cause on cPanel: too many PHP-FPM workers, MySQL buffer pools too large, runaway WordPress process." \
        "memory.system" "$COOLDOWN_MEM_MIN"
    fi
  else
    log "INFO" "memory check skipped (free not available)"
  fi
}

# check_services
# Reads "<service>\t<state>" rows from get_service_states and emits a P1
# "service_down" finding for every row whose state is "down". Rows in any
# other state are only logged as active.
# Cooldown key: "service.<name>", using COOLDOWN_SERVICE_MIN.
check_services() {
  # Declared local so the read loop does not leak its fields as globals.
  local svc state
  while IFS=$'\t' read -r svc state; do
    [[ -z "$svc" ]] && continue
    if [[ "$state" == "down" ]]; then
      emit_finding "service_down" "P1" "$svc" "down" \
        "Service ${svc} is not active. Check status: systemctl status ${svc} -l --no-pager | head -50. Restart: systemctl restart ${svc}. If it won't start, check journalctl -u ${svc} -n 100." \
        "service.${svc}" "$COOLDOWN_SERVICE_MIN"
    else
      log "INFO" "service ${svc}: active"
    fi
  done < <(get_service_states)
  return 0
}

# check_mail_queue
# Emits a P1 finding when the exim queue holds at least MAIL_QUEUE_CRIT
# messages, or a P2 finding at MAIL_QUEUE_WARN or more, plus a separate P2
# finding when at least MAIL_FROZEN_WARN messages are frozen.
# The check is skipped (logged, never fatal) when the queue size is
# unavailable or not a plain number.
# Cooldown keys: "mail_queue.system" (warn), "mail_queue.crit.system"
# (critical) and "mail_frozen.system". Warn and critical are separate so a
# recent warning cannot suppress the escalation to critical.
check_mail_queue() {
  local queue frozen
  # `|| true`: a failing exim call must not end the run under `set -e`.
  queue="$(get_mail_queue_size)" || true
  if [[ -z "$queue" ]]; then
    log "INFO" "mail queue check skipped (exim not available)"
    return 0
  fi
  if [[ ! "$queue" =~ ^[0-9]+$ ]]; then
    log "WARN" "mail queue check skipped (unexpected queue size: '${queue}')"
    return 0
  fi
  log "INFO" "mail queue size: ${queue} (warn=${MAIL_QUEUE_WARN} crit=${MAIL_QUEUE_CRIT})"

  if (( queue >= MAIL_QUEUE_CRIT )); then
    emit_finding "mail_queue_critical" "P1" "system" "${queue} messages" \
      "Mail queue has ${queue} messages (critical >=${MAIL_QUEUE_CRIT}). Likely causes: outbound spam from compromised account (cross-check mail-anomaly findings) OR upstream MX rejecting mail. Inspect queue: exim -bp | head -50. Identify top senders: exim -bpr | grep '<=' | awk '{print \$5}' | sort | uniq -c | sort -rn | head." \
      "mail_queue.crit.system" "$COOLDOWN_QUEUE_MIN"
  elif (( queue >= MAIL_QUEUE_WARN )); then
    emit_finding "mail_queue_warn" "P2" "system" "${queue} messages" \
      "Mail queue elevated (${queue} messages). Check for backlog: exim -bp | head. Cross-check mail-anomaly + rbl-check findings." \
      "mail_queue.system" "$COOLDOWN_QUEUE_MIN"
  fi

  frozen="$(get_mail_frozen_count)" || true
  # Use the first line only and require digits, so multi-line or garbled
  # output cannot reach the arithmetic comparison.
  frozen="${frozen%%$'\n'*}"
  if [[ "$frozen" =~ ^[0-9]+$ ]] && (( frozen >= MAIL_FROZEN_WARN )); then
    emit_finding "mail_frozen" "P2" "system" "${frozen} frozen messages" \
      "Mail has ${frozen} frozen messages. Inspect: exim -bp | grep frozen. Drop them after triage: exim -Mrm \$(exim -bpr | grep frozen | awk '{print \$3}')." \
      "mail_frozen.system" "$COOLDOWN_QUEUE_MIN"
  fi
  return 0
}

# -------- email --------

# send_alert_email
# Sends one digest email covering every finding queued in DIGEST_LINES.
# Nothing is sent when there are no findings or when NO_EMAIL=1. With
# DRY_RUN=1 the message is printed to stdout instead. The subject is tagged
# P1 when HAS_P1=1 and P2 otherwise.
# Globals: DIGEST_LINES, HAS_P1, NO_EMAIL, DRY_RUN, ALERT_EMAIL, SERVER_NAME,
#          SENDMAIL_BIN, FINDINGS_DIR, LOG_FILE (all read)
# Returns: 0 when the mail was sent or intentionally skipped;
#          1 when the sendmail command failed (logged as ERROR).
send_alert_email() {
  local count="${#DIGEST_LINES[@]}"
  if (( count == 0 )); then
    log "INFO" "no findings — email NOT sent"
    return 0
  fi
  if [[ "$NO_EMAIL" -eq 1 ]]; then
    log "INFO" "$count finding(s), --no-email set — email skipped"
    return 0
  fi

  local subject body now_str findings_block sev_tag
  now_str="$(date '+%Y-%m-%d %H:%M:%S %Z')"
  findings_block="$(printf '%s\n' "${DIGEST_LINES[@]}")"
  sev_tag="P2"
  if [[ "$HAS_P1" -eq 1 ]]; then sev_tag="P1"; fi
  subject="[CWP ${sev_tag}] server-health: ${count} finding(s) on ${SERVER_NAME}"

  # read -d '' returns non-zero at end of input, hence the `|| true`.
  # The file locations come from FINDINGS_DIR / LOG_FILE so the mail stays
  # accurate when a config file overrides the defaults.
  IFS='' read -r -d '' body <<EOF || true
CloudWatch Pro — Server Health Snapshot

Server:    ${SERVER_NAME}
Run time:  ${now_str}
Findings:  ${count}

----- findings -----

${findings_block}

----- per-finding actions -----

Each finding above includes a recommended_action field in ${FINDINGS_DIR}/findings.jsonl.
Common starting commands:

  Disk:        df -hT  |  du -sh /home/*/ | sort -h | tail
  Inode:       df -i
  Load:        top -b -n 1 | head -20
  Memory:      free -h  |  ps aux --sort=-%mem | head
  Service:     systemctl status <svc> -l --no-pager | head -50
  Mail queue:  exim -bp | head -50  |  exim -bpr | grep '*** frozen ***' | head

Cooldowns are applied per metric to avoid inbox spam from a sustained issue.

Findings file: ${FINDINGS_DIR}/findings.jsonl
Health log:    ${LOG_FILE}
EOF

  if [[ "$DRY_RUN" -eq 1 ]]; then
    printf 'DRY-RUN email to %s:\n  Subject: %s\n%s\n' "$ALERT_EMAIL" "$subject" "$body"
    return 0
  fi
  if [[ ! -x "$SENDMAIL_BIN" ]]; then
    log "WARN" "sendmail not available; alert email NOT sent"
    return 0
  fi

  # MIME-Version is required for the Content-Type header to be honoured; the
  # body contains non-ASCII UTF-8, so it is declared as 8bit.
  if ! {
    printf 'To: %s\n' "$ALERT_EMAIL"
    printf 'From: cwp-agent@%s\n' "$SERVER_NAME"
    printf 'Subject: %s\n' "$subject"
    printf 'X-CWP-Module: server-health\n'
    printf 'X-CWP-Severity: %s\n' "$sev_tag"
    printf 'MIME-Version: 1.0\n'
    printf 'Content-Type: text/plain; charset=utf-8\n'
    printf 'Content-Transfer-Encoding: 8bit\n'
    printf '\n%s\n' "$body"
  } | "$SENDMAIL_BIN" -t -i; then
    # Report the failure instead of letting `set -e` end the run silently.
    log "ERROR" "sendmail failed; alert email to $ALERT_EMAIL NOT sent"
    return 1
  fi

  log "INFO" "alert email sent to $ALERT_EMAIL with $count finding(s)"
}

# ---- argument parsing ----
# Flags only flip the DRY_RUN / VERBOSE / NO_EMAIL globals. --version and
# --help exit immediately; an unknown argument prints the usage text and
# exits with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)  DRY_RUN=1; shift ;;
    --verbose)  VERBOSE=1; shift ;;
    --no-email) NO_EMAIL=1; shift ;;
    --version)  printf '%s %s\n' "$SCRIPT_NAME" "$VERSION"; exit 0 ;;
    -h|--help)  usage 0 ;;
    *)          printf 'unknown argument: %s\n' "$1" >&2; usage 2 ;;
  esac
done

# ---- main ----
# NOTE(review): config is sourced after the flags are parsed, so a config file
# that assigns DRY_RUN, VERBOSE or NO_EMAIL overrides the command line —
# confirm that this precedence is intended.
load_config
# Directories are created after load_config because the config may relocate
# STATE_DIR, FINDINGS_DIR or LOG_FILE.
ensure_dirs

# Run-wide accumulators filled by emit_finding and read by send_alert_email.
DIGEST_LINES=()
HAS_P1=0

log "INFO" "$SCRIPT_NAME v$VERSION starting (server=$SERVER_NAME, dry_run=$DRY_RUN)"

# Each check emits zero or more findings.
check_disk
check_inodes
check_load
check_memory
check_services
check_mail_queue

# One digest email per run, covering everything emitted above.
send_alert_email

log "INFO" "$SCRIPT_NAME complete: ${#DIGEST_LINES[@]} finding(s) emitted"
exit 0
