#!/bin/bash
#
# cwp-sync — push agent findings to the central collector on the primary.
#
# Part of CloudWatch Pro (CWP) v1. Runs on EVERY cPanel server (including
# the primary itself, which sends to its own ingest endpoint via 127.0.0.1
# or its public hostname). Cron: every 15 min.
#
# Reads new bytes from /var/cwp/findings/findings.jsonl since last successful
# push, signs body with HMAC-SHA256, POSTs to primary's /ingest.php, advances
# byte offset only on HTTP 200. Empty body = heartbeat (so primary always
# knows we're alive even when nothing new fired).
#
# CONSECUTIVE FAILURE ALERTING:
#   If the push fails N times in a row (default 5 = ~75 min of cron runs),
#   sends a plain email via local sendmail to FALLBACK_ALERT_EMAIL. This is
#   the lifeline — without sync, all detectors are flying blind from the
#   operator's perspective.
#
# INSTALL:
#   sudo install -d /opt/cwp/agent/sync /etc/cwp /var/cwp/state/sync /var/log/cwp
#   sudo install -m 0755 cwp-sync /opt/cwp/agent/sync/cwp-sync
#   sudo install -m 0640 sync.example.conf /etc/cwp/sync.conf
#   sudo $EDITOR /etc/cwp/sync.conf   # set PRIMARY_URL, HMAC_SECRET, SERVER_ID
#   sudo crontab -l 2>/dev/null | { cat; cat sync.cron.example; } | sudo crontab -
#
# USAGE:
#   cwp-sync                # normal run
#   cwp-sync --dry-run      # build + sign request, print what would be sent, no actual POST
#   cwp-sync --verbose      # log to stderr too
#   cwp-sync --reset-offset # snap offset to current EOF (skip backlog after install)
#   cwp-sync --version

# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Identity printed by --version and in the start-up log line.
VERSION="0.1.0"
SCRIPT_NAME="cwp-sync"

# Paths and tools. CONFIG_FILE is the only one that honours the environment
# (CWP_SYNC_CONFIG); the others are plain defaults. load_config sources the
# config files after these assignments, so a value assigned there replaces the
# default given here.
CONFIG_FILE="${CWP_SYNC_CONFIG:-/etc/cwp/sync.conf}"
STATE_DIR="/var/cwp/state/sync"
LOG_FILE="/var/log/cwp/sync.log"
FINDINGS_FILE="/var/cwp/findings/findings.jsonl"
SENDMAIL_BIN="/usr/sbin/sendmail"
CURL_BIN="/usr/bin/curl"
# Fully-qualified name when `hostname -f` works, otherwise the plain hostname.
SERVER_NAME="$(hostname -f 2>/dev/null || hostname)"

# Settings expected from the config file. preflight refuses to continue while
# SERVER_ID is empty, and (unless LOCAL_INGEST is set) while PRIMARY_URL or
# HMAC_SECRET is empty.
PRIMARY_URL=""
HMAC_SECRET=""
SERVER_ID=""
FALLBACK_ALERT_EMAIL=""
# Handed to curl as --max-time.
HTTP_TIMEOUT=15
# Number of failed runs in a row at which the fallback email is attempted.
MAX_CONSECUTIVE_FAILURES=5
# Optional: pin the collector hostname to an IP for this POST (curl --resolve).
# Lets the collector's OWN sync reach https://cwp.<fqdn>/ingest.php over loopback
# (RESOLVE_TO="127.0.0.1") before public DNS propagates. The TLS cert is still
# validated against the real hostname, so this is safe once AutoSSL is live.
RESOLVE_TO=""

# LOCAL INGEST (collector self-ingest). When set, this box writes its own findings
# STRAIGHT into the local SQLite DB via ingest-local.php — no HTTP/DNS/TLS/HMAC.
# The collector sets these; remote agents leave them empty and POST over HTTPS.
LOCAL_INGEST=""                                      # path to ingest-local.php
LOCAL_INGEST_PHP="/usr/local/cpanel/3rdparty/bin/php" # php CLI to run it
LOCAL_INGEST_USER=""                                 # run the CLI as this user (e.g. cwp)

# Command-line switches (0 = off, 1 = on); flipped by the argument parser.
DRY_RUN=0
VERBOSE=0
RESET_OFFSET=0

# log <level> <message...>
# Appends "<timestamp> [<level>] <message>" to $LOG_FILE and mirrors the line
# to stderr when --verbose is on or the level is ERROR.
# Never fails: an unwritable log file is ignored (best effort) and must not
# abort the run or add noise of its own.
log() {
  local level="$1"; shift
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  printf -v line '%s [%s] %s' "$ts" "$level" "$*"

  # The stderr redirect sits on the group so that it is already in place when
  # the append redirect is opened; otherwise bash reports a missing log
  # directory on the real stderr (this function is called before the log
  # directory is guaranteed to exist).
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true

  if [[ "$VERBOSE" -eq 1 || "$level" == "ERROR" ]]; then
    printf '%s\n' "$line" >&2
  fi
}
# die <message...>
# Records the message at ERROR level, then terminates the script with status 1.
die() {
  log "ERROR" "$*"
  exit 1
}
# usage [exit_code]
# Prints the comment header found at the top of this script (leading "# "
# removed) and exits with the given status, 0 when omitted.
# The shebang is skipped and printing stops at the first line that is not a
# comment, so the help text can never spill into executable code regardless of
# how long the header is.
usage() {
  awk '
    NR == 1 && /^#!/ { next }
    /^#/             { sub(/^# ?/, ""); print; next }
                     { exit }
  ' "$0" || true
  exit "${1:-0}"
}

# load_config
# Sources /etc/cwp/common.conf when it is readable, then the per-script
# config at $CONFIG_FILE. The second file is mandatory: without it the run is
# aborted through die. Both files are sourced into the current shell, so
# anything they assign replaces the built-in defaults.
load_config() {
  # Fleet-wide defaults go first so the per-script config can override them.
  # shellcheck source=/dev/null
  [[ ! -r /etc/cwp/common.conf ]] || . /etc/cwp/common.conf

  [[ -r "$CONFIG_FILE" ]] \
    || die "no config at $CONFIG_FILE — sync cannot run (PRIMARY_URL/HMAC_SECRET/SERVER_ID required)"

  # shellcheck source=/dev/null
  . "$CONFIG_FILE"
  log "INFO" "loaded config from $CONFIG_FILE"
  return 0
}

# ensure_dirs
# Makes sure the state directory and the directory holding the log file
# exist, creating missing parents as needed. Aborts through die when a
# directory cannot be created (for example the path is occupied by a file).
ensure_dirs() {
  # Keep the loop variable out of the global namespace.
  local dir
  for dir in "$STATE_DIR" "$(dirname "$LOG_FILE")"; do
    # mkdir -p is a no-op on an existing directory, so no pre-check is needed.
    mkdir -p -- "$dir" 2>/dev/null || die "cannot create $dir"
  done
}

# preflight
# Verifies that the loaded configuration is usable and aborts through die on
# the first problem found.
#   - SERVER_ID is required in every mode.
#   - Local-ingest mode (LOCAL_INGEST set): the ingest script must exist and
#     the PHP interpreter must be an executable path or resolvable by name.
#   - HTTPS mode: PRIMARY_URL and HMAC_SECRET must be set, and openssl and
#     curl must be available.
preflight() {
  if [[ -z "$SERVER_ID" ]]; then
    die "SERVER_ID not set in $CONFIG_FILE"
  fi

  if [[ -n "$LOCAL_INGEST" ]]; then
    # Self-ingest on the collector needs neither network nor signing.
    if [[ ! -f "$LOCAL_INGEST" ]]; then
      die "LOCAL_INGEST=$LOCAL_INGEST not found"
    fi
    if [[ ! -x "$LOCAL_INGEST_PHP" ]] && ! command -v "$LOCAL_INGEST_PHP" >/dev/null; then
      die "LOCAL_INGEST_PHP=$LOCAL_INGEST_PHP not executable"
    fi
    return 0
  fi

  if [[ -z "$PRIMARY_URL" ]]; then
    die "PRIMARY_URL not set in $CONFIG_FILE"
  fi
  if [[ -z "$HMAC_SECRET" ]]; then
    die "HMAC_SECRET not set in $CONFIG_FILE"
  fi
  if ! command -v openssl >/dev/null; then
    die "openssl not found"
  fi
  if [[ ! -x "$CURL_BIN" ]] && ! command -v curl >/dev/null; then
    die "curl not found"
  fi
}

# file_size_portable <path>
# Prints the size reported by stat for <path>. The `-c %s` spelling is tried
# first and the `-f %z` spelling second; when neither call succeeds, 0 is
# printed instead.
file_size_portable() {
  local target="$1"

  if stat -c %s "$target" 2>/dev/null; then
    return 0
  fi
  if stat -f %z "$target" 2>/dev/null; then
    return 0
  fi
  echo 0
}

# read_offset
# Prints the byte offset stored in $STATE_DIR/offset as a plain decimal
# number. Prints 0 when the file is missing, unreadable, empty, or holds
# anything other than digits, so callers can use the result in arithmetic
# without further checks.
read_offset() {
  local f="$STATE_DIR/offset"
  local raw=""

  if [[ -f "$f" ]]; then
    raw="$(cat "$f" 2>/dev/null)" || raw=""
  fi

  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value with leading zeros is not read as octal.
    printf '%s\n' "$((10#$raw))"
  else
    echo 0
  fi
}
# write_offset <offset>
# Stores <offset> in $STATE_DIR/offset.
# The value is written to a temporary file in the same directory and then
# renamed over the target, so an interrupted run leaves either the previous
# offset or the new one on disk, never an empty or half-written file.
# Returns non-zero when the value cannot be stored.
write_offset() {
  local dest="$STATE_DIR/offset"
  local tmp="${dest}.tmp.$$"

  if ! printf '%s' "$1" > "$tmp"; then
    rm -f -- "$tmp" 2>/dev/null || true
    return 1
  fi
  if ! mv -f -- "$tmp" "$dest"; then
    rm -f -- "$tmp" 2>/dev/null || true
    return 1
  fi
}

# read_failures
# Prints the consecutive-failure count stored in
# $STATE_DIR/consecutive_failures as a plain decimal number. Prints 0 when
# the file is missing, unreadable, empty, or holds anything other than
# digits, so the caller's increment cannot choke on a damaged state file.
read_failures() {
  local f="$STATE_DIR/consecutive_failures"
  local raw=""

  if [[ -f "$f" ]]; then
    raw="$(cat "$f" 2>/dev/null)" || raw=""
  fi

  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value with leading zeros is not read as octal.
    printf '%s\n' "$((10#$raw))"
  else
    echo 0
  fi
}
# write_failures <count>
# Overwrites $STATE_DIR/consecutive_failures with <count> (no trailing
# newline).
write_failures() {
  local counter_file="${STATE_DIR}/consecutive_failures"
  printf '%s' "$1" >"$counter_file"
}
# clear_failures
# Ends the current failure streak: removes the failure counter and the
# "alert already sent" marker from $STATE_DIR. Files that do not exist are
# not an error.
clear_failures() {
  local leftover
  for leftover in consecutive_failures fallback_alert_sent; do
    rm -f "$STATE_DIR/$leftover" 2>/dev/null
  done
}

# send_fallback_alert <reason>
# Emails FALLBACK_ALERT_EMAIL through the local sendmail binary to say that
# sync keeps failing. Called once the failure streak has reached
# MAX_CONSECUTIVE_FAILURES.
#
# A marker file in $STATE_DIR makes sure the email goes out only ONCE per
# failure streak; clear_failures removes the marker after the next success.
# The marker is written only after sendmail accepted the message, so a
# delivery problem is logged and retried on the next failing run instead of
# silently disarming the alert.
#
# Best effort: always returns 0 so the caller can finish its own error path.
send_fallback_alert() {
  local reason="$1"
  local marker="$STATE_DIR/fallback_alert_sent"

  if [[ -f "$marker" ]]; then
    return 0  # already sent for this streak
  fi
  if [[ -z "$FALLBACK_ALERT_EMAIL" ]]; then
    log "WARN" "consecutive sync failures, but no FALLBACK_ALERT_EMAIL configured"
    return 0
  fi
  if [[ ! -x "$SENDMAIL_BIN" ]]; then
    log "WARN" "sendmail not available — cannot send fallback alert"
    return 0
  fi

  local subject body now_str
  now_str="$(date '+%Y-%m-%d %H:%M:%S %Z')"
  subject="[CWP P1] sync FAILING on ${SERVER_NAME} — primary not receiving findings"

  # `read -d ''` slurps the whole here-document; it reports failure at EOF,
  # hence the `|| true`.
  IFS='' read -r -d '' body <<EOF || true
CloudWatch Pro — sync to primary has failed repeatedly.

Server:    ${SERVER_NAME}
Primary:   ${PRIMARY_URL}
Time:      ${now_str}
Failures:  $(read_failures) consecutive
Reason:    ${reason}

Findings collected by this server's detectors are NOT reaching the central
dashboard. Local logs and emails still work, but the fleet view is partial.

Common causes:
  1. Primary server unreachable (DNS, firewall, OVH null-route)
  2. Primary's cwp account suspended or AutoSSL cert expired
  3. HMAC_SECRET on this server doesn't match primary.conf entry
  4. SERVER_ID on this server isn't in primary.conf 'secrets' map
  5. Primary's /home/cwp filesystem out of space

Diagnose:
  curl -i ${PRIMARY_URL}                         # expect 405 Method Not Allowed
  tail -50 /var/log/cwp/sync.log                 # see last failure detail
  tail -200 ${PRIMARY_URL}/../../var/log/cwp/...  # primary-side ingest log

This alert sent ONCE per failure streak. Will reset and re-arm after the
next successful sync.
EOF

  # The text is UTF-8 with non-ASCII punctuation, so the message is declared
  # as MIME with an 8bit transfer encoding alongside the Content-Type.
  if ! {
    printf 'To: %s\n' "$FALLBACK_ALERT_EMAIL"
    printf 'From: cwp-agent@%s\n' "$SERVER_NAME"
    printf 'Subject: %s\n' "$subject"
    printf 'X-CWP-Module: sync\n'
    printf 'X-CWP-Severity: P1\n'
    printf 'MIME-Version: 1.0\n'
    printf 'Content-Type: text/plain; charset=utf-8\n'
    printf 'Content-Transfer-Encoding: 8bit\n'
    printf '\n%s\n' "$body"
  } | "$SENDMAIL_BIN" -t -i; then
    log "ERROR" "fallback alert to $FALLBACK_ALERT_EMAIL could not be handed to $SENDMAIL_BIN — will retry on next failure"
    return 0
  fi

  touch "$marker" 2>/dev/null \
    || log "WARN" "fallback alert sent but marker $marker could not be written — alert may repeat"
  log "INFO" "fallback alert email sent to $FALLBACK_ALERT_EMAIL"
  return 0
}

# compute_hmac <timestamp> <body>
# Prints the hex HMAC-SHA256, keyed with $HMAC_SECRET, of the byte sequence
# "<timestamp>\n<body>". Nothing is appended after the body, so the digest
# covers exactly the bytes given.
# NOTE(review): -hmac puts the secret on openssl's command line, where it can
# be seen in the process list while the command runs — confirm this is
# acceptable on shared hosts.
compute_hmac() {
  local stamp="$1" payload="$2"

  {
    printf '%s\n' "$stamp"
    printf '%s' "$payload"
  } \
    | openssl dgst -sha256 -hmac "$HMAC_SECRET" -hex \
    | awk '{ print $NF }'   # keep only the digest, drop the "(stdin)= " label
}

# do_post <body>
# Delivers <body> to the collector and prints the outcome as a status code.
#
# Output contract: stdout carries the status code and NOTHING else, because
# callers capture it with $(...). Everything informational goes to the log or
# to stderr.
#   200  delivered (collector answered 200, local ingest exited 0, or dry run)
#   500  local ingest command exited non-zero
#   000  no HTTP answer: curl failed, signing failed, or no temp file
#   ...  any other HTTP status the collector answered with
#
# Returns: always 0. A non-zero return would abort a caller that captures the
# output under `set -e` before it could record the failure.
#
# Modes, checked in this order:
#   dry run       ($DRY_RUN = 1)     show what would be sent, send nothing
#   local ingest  ($LOCAL_INGEST)    pipe the body into the ingest CLI
#   HTTPS POST                       signed POST to $PRIMARY_URL via curl
do_post() {
  local body="$1"
  local ts sig="" body_bytes

  ts="$(date +%s)"

  # Only the HTTPS path is signed. Local ingest needs neither openssl nor a
  # secret, so the signature is not computed there.
  if [[ -z "$LOCAL_INGEST" ]]; then
    sig="$(compute_hmac "$ts" "$body")" || sig=""
    if [[ -z "$sig" ]]; then
      log "ERROR" "could not compute HMAC signature — request not sent"
      echo "000"
      return 0
    fi
  fi

  # Length in bytes, not characters: switch to the C locale inside the
  # subshell before measuring.
  body_bytes="$(LC_ALL=C; printf '%s' "${#body}")"
  log "INFO" "POSTing ${body_bytes} bytes (sig=${sig:0:16}…) to $PRIMARY_URL"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    # The preview goes to stderr so that stdout stays a bare status code.
    printf 'DRY-RUN POST:\n  URL: %s\n  X-CWP-Server-ID: %s\n  X-CWP-Timestamp: %s\n  X-CWP-Signature: %s\n  body bytes: %s\n  body preview:\n%s\n' \
      "$PRIMARY_URL" "$SERVER_ID" "$ts" "$sig" "$body_bytes" \
      "$(printf '%s' "$body" | head -c 500)" >&2
    echo "200"
    return 0
  fi

  # Collector self-ingest: pipe straight into the local SQLite DB, no network.
  if [[ -n "$LOCAL_INGEST" ]]; then
    local li=("$LOCAL_INGEST_PHP" "$LOCAL_INGEST" "$SERVER_ID")
    if [[ -n "$LOCAL_INGEST_USER" ]]; then
      li=(sudo -u "$LOCAL_INGEST_USER" -- "${li[@]}")
    fi
    local out rc=0
    out="$(printf '%s' "$body" | "${li[@]}" 2>&1)" || rc=$?
    if [[ "$rc" -eq 0 ]]; then
      log "INFO" "local ingest ok: $out"
      echo "200"
      return 0
    fi
    log "ERROR" "local ingest failed (rc=$rc): $out"
    echo "500"
    return 0
  fi

  # Use the configured absolute path when it is executable, otherwise fall
  # back to whatever `curl` resolves to on PATH.
  local curl_cmd="curl"
  if [[ -x "${CURL_BIN:-}" ]]; then
    curl_cmd="$CURL_BIN"
  fi

  local response_file
  if ! response_file="$(mktemp -t cwp-sync-resp.XXXX)"; then
    log "ERROR" "mktemp failed — cannot buffer the collector response"
    echo "000"
    return 0
  fi
  # Remove the temp file even if the shell exits in the middle of the request.
  trap "rm -f -- '$response_file'" EXIT

  # Optional loopback/DNS override for the collector's own sync (see RESOLVE_TO).
  local resolve_args=()
  if [[ -n "$RESOLVE_TO" ]]; then
    local h="${PRIMARY_URL#*://}"; h="${h%%/*}"
    local port=443
    if [[ "$PRIMARY_URL" == http://* ]]; then
      port=80
    fi
    case "$h" in
      *:*) port="${h##*:}"; h="${h%%:*}" ;;
    esac
    resolve_args=(--resolve "${h}:${port}:${RESOLVE_TO}")
  fi

  local status
  # IMPORTANT: pipe with `printf '%s'` (no trailing newline). The here-string
  # form `<<< "$body"` appends a newline, which would diverge from the bytes
  # we just signed and produce HMAC mismatches at the receiver.
  status="$(printf '%s' "$body" | "$curl_cmd" -sS \
    --max-time "$HTTP_TIMEOUT" \
    ${resolve_args[@]+"${resolve_args[@]}"} \
    -X POST \
    -H "X-CWP-Server-ID: $SERVER_ID" \
    -H "X-CWP-Timestamp: $ts" \
    -H "X-CWP-Signature: $sig" \
    -H 'Content-Type: application/x-ndjson' \
    --data-binary "@-" \
    -o "$response_file" \
    -w '%{http_code}' \
    "$PRIMARY_URL" 2>/dev/null)" || status="000"

  # Only the first 500 bytes of the answer are kept for the log.
  local response
  response="$(head -c 500 "$response_file" 2>/dev/null)" || response=""
  rm -f -- "$response_file"
  trap - EXIT

  log "INFO" "response: status=$status body=$response"
  printf '%s\n' "$status"
  return 0
}

# ---- argument parsing ----
# Flags are consumed from left to right, one per iteration. --version, --help
# and an unrecognised argument end the script on the spot; the remaining
# flags only flip their switch.
until (( $# == 0 )); do
  case "$1" in
    --dry-run)
      DRY_RUN=1
      ;;
    --verbose)
      VERBOSE=1
      ;;
    --reset-offset)
      RESET_OFFSET=1
      ;;
    --version)
      printf '%s %s\n' "$SCRIPT_NAME" "$VERSION"
      exit 0
      ;;
    -h | --help)
      usage 0
      ;;
    *)
      printf 'unknown argument: %s\n' "$1" >&2
      usage 2
      ;;
  esac
  shift
done

# ---- main ----
# Flow: load config -> make sure state/log dirs exist -> validate -> work out
# which bytes of the findings file are new -> deliver them (or a heartbeat
# when there is nothing new) -> record the outcome.
load_config
ensure_dirs
preflight

log "INFO" "$SCRIPT_NAME v$VERSION starting (server=$SERVER_NAME, server_id=$SERVER_ID, dry_run=$DRY_RUN)"

# --reset-offset mode: snap offset to current EOF and exit
if [[ "$RESET_OFFSET" -eq 1 ]]; then
  if [[ -f "$FINDINGS_FILE" ]]; then
    sz="$(file_size_portable "$FINDINGS_FILE")"
    write_offset "$sz"
    log "INFO" "offset reset to $sz (skipping backlog)"
  else
    write_offset 0
    log "INFO" "offset reset to 0 (no findings file yet)"
  fi
  exit 0
fi

# Determine body to send
BODY=""
SEND_OFFSET=""   # offset to store after a successful send; empty = leave the stored offset alone
if [[ -f "$FINDINGS_FILE" ]]; then
  cur_size="$(file_size_portable "$FINDINGS_FILE")"
  if [[ ! "$cur_size" =~ ^[0-9]+$ ]]; then
    cur_size=0
  fi

  # Never feed an unchecked state value into arithmetic: anything that is not
  # a plain number means "start over".
  last_off="$(read_offset)"
  if [[ "$last_off" =~ ^[0-9]+$ ]]; then
    last_off=$((10#$last_off))
  else
    log "WARN" "stored offset '$last_off' is not a number — restarting from 0"
    last_off=0
  fi

  SEND_OFFSET="$cur_size"

  if (( cur_size < last_off )); then
    log "INFO" "findings file rotated/truncated (size $cur_size < offset $last_off) — resetting"
    last_off=0
  fi

  if (( cur_size > last_off )); then
    bytes_to_read=$(( cur_size - last_off ))
    log "INFO" "have $bytes_to_read bytes new since last sync"

    # Read exactly the byte range [last_off, cur_size).
    #  - `head -c` stops at the size measured above, so anything appended in
    #    the meantime is left for the next run.
    #  - `tail` may be cut off by head closing the pipe early; the `|| true`
    #    keeps pipefail from turning that into a fatal error.
    #  - The trailing sentinel keeps command substitution from stripping the
    #    final newline, which is needed to tell whether the last line is
    #    complete.
    chunk="$(
      { tail -c "+$((last_off + 1))" "$FINDINGS_FILE" 2>/dev/null || true; } \
        | head -c "$bytes_to_read"
      printf 'x'
    )"
    chunk="${chunk%x}"

    if [[ -z "$chunk" ]]; then
      # Bytes were expected but none could be read. Do not move the offset, or
      # the unread findings would be skipped for good.
      log "WARN" "could not read new bytes from $FINDINGS_FILE — sending heartbeat, offset unchanged"
      SEND_OFFSET=""
    else
      if [[ "$chunk" != *$'\n' ]]; then
        # A writer is in the middle of a line. Send only whole lines now and
        # pick the fragment up once its newline has arrived; sending it in two
        # pieces would hand the collector two broken records.
        partial="${chunk##*$'\n'}"
        held_back="$(printf '%s' "$partial" | wc -c)"
        held_back="${held_back//[!0-9]/}"
        chunk="${chunk%"$partial"}"
        SEND_OFFSET=$(( cur_size - held_back ))
        log "INFO" "holding back $held_back bytes of an unterminated trailing line"
      fi

      # The payload is newline-separated records without a final terminator;
      # the offset above still accounts for the terminator bytes.
      BODY="$chunk"
      while [[ "$BODY" == *$'\n' ]]; do
        BODY="${BODY%$'\n'}"
      done
    fi
  else
    log "INFO" "no new findings — sending heartbeat"
  fi
else
  log "INFO" "findings file missing — sending heartbeat"
fi

# Send the request. The status code is the LAST line of do_post's output; a
# non-zero return is tolerated so that a failed delivery always reaches the
# failure accounting below instead of aborting under `set -e`. Any lines
# printed before the status are passed on to stderr.
post_output="$(do_post "$BODY")" || true
status="${post_output##*$'\n'}"
if [[ "$post_output" == *$'\n'* ]]; then
  printf '%s\n' "${post_output%$'\n'*}" >&2
fi

if [[ "$status" == "200" ]]; then
  # Advance offset only on confirmed success, and only to the position we
  # actually covered (SEND_OFFSET), NOT a fresh stat — anything appended during
  # the POST must be picked up by the next run, not silently skipped.
  # A dry run sent nothing, so it must not touch any state.
  if [[ "$DRY_RUN" -eq 0 ]]; then
    if [[ -n "$SEND_OFFSET" ]]; then
      write_offset "$SEND_OFFSET" || die "delivered, but could not store offset $SEND_OFFSET"
    fi
    clear_failures || log "WARN" "could not clear failure state in $STATE_DIR"
  fi
  log "INFO" "$SCRIPT_NAME complete: success"
  exit 0
else
  prev_fails="$(read_failures)"
  if [[ ! "$prev_fails" =~ ^[0-9]+$ ]]; then
    prev_fails=0
  fi
  fails=$(( 10#$prev_fails + 1 ))
  if [[ "$DRY_RUN" -eq 0 ]]; then
    write_failures "$fails"
  fi
  log "ERROR" "$SCRIPT_NAME failed: status=$status (consecutive failures: $fails)"

  if [[ "$DRY_RUN" -eq 0 ]] && (( fails >= MAX_CONSECUTIVE_FAILURES )); then
    send_fallback_alert "HTTP $status from $PRIMARY_URL after $fails consecutive failures"
  fi
  exit 1
fi
