#!/bin/bash
#
# cwp-phishing-sweep — daily phishing-kit + webshell sweep
#
# Part of CloudWatch Pro (CWP) v1. Runs once a day on each cPanel server.
# Walks every configured scan root (default: all of /home; see SCAN_ROOTS),
# matches each file with a scanned extension against the webshell + phishing
# signature libraries, emits findings to the CWP queue (findings.jsonl), and
# sends one consolidated email digest per run.
#
# Detect-only. Does NOT quarantine, delete, or modify any file.
#
# INSTALL (on each cPanel server, after Week 0 is done):
#   sudo install -d /opt/cwp/agent/modules/phishing-sweep \
#                   /opt/cwp/signatures/webshell \
#                   /opt/cwp/signatures/phishing \
#                   /etc/cwp \
#                   /var/cwp/state/phishing-sweep \
#                   /var/cwp/findings \
#                   /var/log/cwp
#   sudo install -m 0755 cwp-phishing-sweep /opt/cwp/agent/modules/phishing-sweep/
#   sudo install -m 0644 config.example.conf /etc/cwp/phishing-sweep.conf
#   sudo install -m 0644 ../../../signatures/webshell/webshell-v1.patterns /opt/cwp/signatures/webshell/
#   sudo install -m 0644 ../../../signatures/phishing/phishing-v1.patterns /opt/cwp/signatures/phishing/
#   sudo $EDITOR /etc/cwp/phishing-sweep.conf   # set ALERT_EMAIL, SCAN_ROOTS, SERVER_NAME
#   sudo crontab -l 2>/dev/null | { cat; cat cron.example; } | sudo crontab -
#
# USAGE:
#   cwp-phishing-sweep                       # normal scan
#   cwp-phishing-sweep --dry-run             # report only, no findings file write, no email
#   cwp-phishing-sweep --verbose             # log to stderr too
#   cwp-phishing-sweep --root /home/x        # scan only this root (testing)
#   cwp-phishing-sweep --no-email            # write findings, skip the digest email
#   cwp-phishing-sweep --version
#
# EXIT CODES:
#   0 — scan completed (findings may or may not have been emitted)
#   1 — config or environment error (see /var/log/cwp/phishing-sweep.log)
#   2 — invalid arguments

set -euo pipefail
# Strict mode (above): abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.

VERSION="0.1.0"
SCRIPT_NAME="cwp-phishing-sweep"

# ---- defaults (overridden by /etc/cwp/phishing-sweep.conf) ----
# The config path itself can be overridden through the
# CWP_PHISHING_SWEEP_CONFIG environment variable.
CONFIG_FILE="${CWP_PHISHING_SWEEP_CONFIG:-/etc/cwp/phishing-sweep.conf}"
# Directories handed to find(1) as starting points. This is a bash array.
# NOTE(review): a config file presumably has to assign it with array syntax,
# e.g. SCAN_ROOTS=("/home" "/var/www") — confirm against config.example.conf.
SCAN_ROOTS=("/home")
# Signature libraries; both must be readable or preflight aborts the run.
WEBSHELL_PATTERNS_FILE="/opt/cwp/signatures/webshell/webshell-v1.patterns"
PHISHING_PATTERNS_FILE="/opt/cwp/signatures/phishing/phishing-v1.patterns"
# Holds one "seen.<sha256>" file per already-reported file hash.
STATE_DIR="/var/cwp/state/phishing-sweep"
# Findings are appended to $FINDINGS_DIR/findings.jsonl.
FINDINGS_DIR="/var/cwp/findings"
LOG_FILE="/var/log/cwp/phishing-sweep.log"
# Recipient of the per-run digest email.
ALERT_EMAIL="root@localhost"
# Used in finding ids, the email subject and the From: address.
SERVER_NAME="$(hostname -f 2>/dev/null || hostname)"
MAX_FILE_SIZE_BYTES=$((5 * 1024 * 1024))      # skip files > 5MB
# Whitespace-separated list; "htaccess" is special-cased to the file name
# ".htaccess", every other word becomes a "*.<ext>" name match.
SCAN_EXTENSIONS="php phtml html htm js htaccess"
SEEN_TTL_DAYS=7                                # don't re-alert on same hash within N days
SENDMAIL_BIN="/usr/sbin/sendmail"
# Set by --root (or by a config file); when non-empty after the config has
# been loaded it replaces SCAN_ROOTS with this single path. Usually empty.
SCAN_PATH_LIMIT=""                             # override --root via config; usually empty

# Runtime flags, switched on by --dry-run / --verbose / --no-email.
DRY_RUN=0
VERBOSE=0
NO_EMAIL=0

# ---- helpers ----

# log <level> <message...>
# Appends "<timestamp> [<level>] <message>" to $LOG_FILE and mirrors the line
# to stderr when VERBOSE=1 or the level is ERROR.
# Writing to the log file is best-effort: a missing or unwritable log file
# never fails the caller and never prints a shell error.
# Always returns 0.
log() {
  local level="$1"; shift
  local msg="$*"
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  printf -v line '%s [%s] %s' "$ts" "$level" "$msg"
  # Redirections are processed left to right, so stderr must be silenced on
  # the enclosing group: otherwise the shell's own "No such file or directory"
  # message for a failed open of $LOG_FILE still reaches the terminal / cron.
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
  if [[ "${VERBOSE:-0}" -eq 1 || "$level" == "ERROR" ]]; then
    printf '%s\n' "$line" >&2
  fi
  return 0
}

# die <message...>
# Logs the message at ERROR level (which log also mirrors to stderr) and
# terminates the script with exit status 1.
die() {
  local reason="$*"
  log "ERROR" "$reason"
  exit 1
}

# usage [exit_code]
# Prints the comment header at the top of this script as help text, then
# exits with the given status (default 0).
# The header is located structurally (an optional shebang, then every leading
# comment line) instead of by a fixed line count, so the shebang and the first
# code line are never printed as help and the header can grow or shrink.
# Help requested by the user goes to stdout; help shown because of a usage
# error (non-zero status) goes to stderr.
usage() {
  local status="${1:-0}"
  local out_fd=1
  if [[ "$status" != "0" ]]; then
    out_fd=2
  fi
  # Best-effort: a failure to read the script must not change the exit status.
  awk 'NR == 1 && /^#!/ { next }
       /^#/ { sub(/^# ?/, ""); print; next }
       { exit }' "$0" >&"$out_fd" || true
  exit "$status"
}

# load_config
# Sources /etc/cwp/common.conf (shared CWP defaults) and then $CONFIG_FILE,
# if readable, so the per-module file wins over the shared one.
# A scan root given on the command line (--root, stored in SCAN_PATH_LIMIT
# before this function runs) takes precedence over any SCAN_PATH_LIMIT
# assigned by a config file; otherwise a config file that sets
# SCAN_PATH_LIMIT (even to "") would silently discard --root.
# Globals: reads CONFIG_FILE, SCAN_PATH_LIMIT; writes SCAN_PATH_LIMIT,
#          SCAN_ROOTS, plus whatever the sourced files assign.
# Always returns 0.
load_config() {
  # Unusual name on purpose: sourced files run inside this function's scope.
  local _cwp_cli_root="${SCAN_PATH_LIMIT:-}"

  # Shared CWP defaults (ALERT_EMAIL, SERVER_NAME, etc.) sourced first.
  if [[ -r /etc/cwp/common.conf ]]; then
    # shellcheck source=/dev/null
    . /etc/cwp/common.conf
  fi
  if [[ -r "$CONFIG_FILE" ]]; then
    # shellcheck source=/dev/null
    . "$CONFIG_FILE"
    log "INFO" "loaded config from $CONFIG_FILE"
  else
    log "WARN" "no config at $CONFIG_FILE — using built-in defaults"
  fi

  if [[ -n "$_cwp_cli_root" ]]; then
    SCAN_PATH_LIMIT="$_cwp_cli_root"
  fi
  if [[ -n "${SCAN_PATH_LIMIT:-}" ]]; then
    SCAN_ROOTS=("$SCAN_PATH_LIMIT")
  fi
  return 0
}

# ensure_dirs
# Makes sure the state directory, the findings directory and the directory
# that holds the log file exist, creating missing ones (with parents).
# Dies (exit 1) if a directory cannot be created.
# Globals: reads STATE_DIR, FINDINGS_DIR, LOG_FILE.
ensure_dirs() {
  local d   # keep the loop variable out of the global namespace
  for d in "$STATE_DIR" "$FINDINGS_DIR" "$(dirname "$LOG_FILE")"; do
    if [[ -d "$d" ]]; then
      continue
    fi
    mkdir -p "$d" 2>/dev/null || die "cannot create $d (run as root or pre-create)"
  done
  return 0
}

# preflight
# Verifies that the run can succeed before any scanning starts:
#   - both signature files are readable,
#   - the external tools the scan relies on are installed,
#   - a SHA-256 tool is available (sha256sum, or shasum as used by the
#     sha256_of fallback).
# Dies (exit 1) on any of those. A missing sendmail binary only produces a
# warning, because the scan and the findings file do not depend on it.
# Globals: reads WEBSHELL_PATTERNS_FILE, PHISHING_PATTERNS_FILE, NO_EMAIL,
#          SENDMAIL_BIN.
preflight() {
  local cmd   # keep the loop variable out of the global namespace

  [[ -r "$WEBSHELL_PATTERNS_FILE" ]] || die "missing webshell patterns: $WEBSHELL_PATTERNS_FILE"
  [[ -r "$PHISHING_PATTERNS_FILE" ]] || die "missing phishing patterns: $PHISHING_PATTERNS_FILE"

  for cmd in find grep awk sed; do
    command -v "$cmd" >/dev/null 2>&1 || die "$cmd not found"
  done

  # Either hashing tool is acceptable; requiring sha256sum alone would make
  # the shasum fallback in sha256_of unreachable.
  if ! command -v sha256sum >/dev/null 2>&1 && ! command -v shasum >/dev/null 2>&1; then
    die "sha256sum not found (and no shasum fallback available)"
  fi

  if [[ "$NO_EMAIL" -eq 0 && ! -x "$SENDMAIL_BIN" ]]; then
    log "WARN" "$SENDMAIL_BIN not found — email digest will be skipped"
  fi
  return 0
}

# load_patterns_file <patterns_file>
# Reads a signature file and appends its entries to the global arrays
# CONTENT_PATTERNS and PATH_PATTERNS (both must already be declared).
#
# Input line format:  id<TAB>severity<TAB>scope<TAB>regex<TAB>desc
#   scope is "content" or "path"; blank lines and lines whose first field
#   starts with "#" are ignored; any other scope is logged and skipped.
# Stored entry format: "id<TAB>severity<TAB>regex<TAB>desc" (scope dropped).
#
# A final line without a trailing newline is still loaded, and a trailing
# carriage return (CRLF-edited file) is stripped so it cannot end up inside
# the description or the regex.
# (No nameref — works on bash 3.2 too.)
load_patterns_file() {
  local file="$1"
  local before_c=${#CONTENT_PATTERNS[@]}
  local before_p=${#PATH_PATTERNS[@]}
  local line id sev scope pat desc

  # "|| [[ -n $line ]]": read returns non-zero on an unterminated last line
  # even though it filled the variable; without this the last signature in
  # the file would be dropped silently.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}
    IFS=$'\t' read -r id sev scope pat desc <<< "$line"
    [[ -z "$id" ]] && continue
    [[ "$id" =~ ^[[:space:]]*# ]] && continue
    case "$scope" in
      content) CONTENT_PATTERNS+=("$id"$'\t'"$sev"$'\t'"$pat"$'\t'"$desc") ;;
      path)    PATH_PATTERNS+=("$id"$'\t'"$sev"$'\t'"$pat"$'\t'"$desc") ;;
      *)       log "WARN" "unknown scope '$scope' in $file (id=$id), skipping" ;;
    esac
  done < "$file"

  local added_c=$(( ${#CONTENT_PATTERNS[@]} - before_c ))
  local added_p=$(( ${#PATH_PATTERNS[@]} - before_p ))
  log "INFO" "loaded $((added_c + added_p)) patterns from $(basename "$file") ($added_c content, $added_p path)"
}

# file_size <path> -> prints the size of <path> in bytes
# Portable: GNU stat syntax is tried first, then the BSD syntax; if neither
# can report a size, "0" is printed. Always returns 0.
file_size() {
  local target="$1"
  if ! stat -c %s "$target" 2>/dev/null; then
    stat -f %z "$target" 2>/dev/null || echo 0
  fi
}

# build_find_extensions_args -> populates global FIND_EXT_ARGS array.
# Turns the whitespace-separated SCAN_EXTENSIONS list into an OR-chain of
# find(1) name tests: "htaccess" becomes `-name .htaccess`, any other word
# becomes `-name *.<word>`, joined with `-o`.
# Using an array preserves word boundaries and avoids the shell glob-expanding
# patterns like *.php against the current directory before find sees them.
# The list itself is split with `read -a`, which (unlike an unquoted
# expansion) never performs pathname expansion, so a stray "*" or "?" in
# SCAN_EXTENSIONS cannot be replaced by file names from the working directory.
# An empty SCAN_EXTENSIONS leaves FIND_EXT_ARGS empty.
build_find_extensions_args() {
  FIND_EXT_ARGS=()
  local -a exts=()
  local ext
  # -d '' reads to end of input, so a multi-line value is handled as well;
  # read reports "EOF" (non-zero) in that mode, hence the `|| true`.
  IFS=$' \t\n' read -r -d '' -a exts <<< "${SCAN_EXTENSIONS:-}" || true

  # ${arr[@]+...} keeps `set -u` quiet for an empty array on bash < 4.4.
  for ext in ${exts[@]+"${exts[@]}"}; do
    if (( ${#FIND_EXT_ARGS[@]} > 0 )); then
      FIND_EXT_ARGS+=("-o")
    fi
    if [[ "$ext" == "htaccess" ]]; then
      FIND_EXT_ARGS+=("-name" ".htaccess")
    else
      FIND_EXT_ARGS+=("-name" "*.$ext")
    fi
  done
  return 0
}

# json_escape <string> -> prints <string> escaped for a JSON string literal
# Escapes backslash, double quote, newline, tab and carriage return with their
# short forms, and every remaining control character as \uXXXX. JSON forbids
# raw control characters inside strings, and file names (which an attacker
# chooses) may legally contain them.
# The surrounding double quotes are NOT added.
json_escape() {
  local s="$1"
  # Backslash must go first so the escapes added below are not re-escaped.
  s="${s//\\/\\\\}"
  s="${s//\"/\\\"}"
  s="${s//$'\n'/\\n}"
  s="${s//$'\t'/\\t}"
  s="${s//$'\r'/\\r}"

  # Slow per-character pass only when a control character is actually left.
  if [[ "$s" == *[[:cntrl:]]* ]]; then
    local out="" ch code i
    for (( i = 0; i < ${#s}; i++ )); do
      ch="${s:i:1}"
      if [[ "$ch" == [[:cntrl:]] ]]; then
        printf -v code '%d' "'$ch"      # numeric code of the character
        printf -v ch '\\u%04x' "$code"
      fi
      out+="$ch"
    done
    s="$out"
  fi
  printf '%s' "$s"
}

# extract_account <file_path> -> account name (first segment after /home/)
extract_account() {
  local p="$1"
  awk -F/ '$2 == "home" { print $3 }' <<< "$p"
}

# is_seen_recently <hash> -> 0 if seen within SEEN_TTL_DAYS, 1 otherwise
# Looks up $STATE_DIR/seen.<hash>, which holds the epoch second at which the
# hash was last reported.
# A missing, unreadable, empty or non-numeric state file counts as "not seen":
# re-alerting is the safe direction, and feeding such content to shell
# arithmetic would raise an expansion error that aborts the scan (the content
# would also be evaluated as an arithmetic expression).
is_seen_recently() {
  local hash="$1"
  local seen_file="$STATE_DIR/seen.${hash}"
  [[ -f "$seen_file" ]] || return 1

  local last_ts="" now
  # Braces + outer 2>/dev/null also silence the shell's own error message if
  # the file cannot be opened.
  { IFS= read -r last_ts < "$seen_file"; } 2>/dev/null || true
  [[ "$last_ts" =~ ^[0-9]+$ ]] || return 1

  now="$(date +%s)"
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if (( now - 10#$last_ts < SEEN_TTL_DAYS * 86400 )); then
    return 0
  fi
  return 1
}

# mark_seen <hash>
# Records "now" (epoch seconds) in $STATE_DIR/seen.<hash> so the same file
# content is not reported again within SEEN_TTL_DAYS.
# Does nothing in dry-run mode: a dry run is report-only, and persisting the
# hash there would suppress the real alert on the next regular run.
mark_seen() {
  local hash="$1"
  if [[ "${DRY_RUN:-0}" -eq 1 ]]; then
    return 0
  fi
  date +%s > "$STATE_DIR/seen.${hash}"
}

# emit_finding <id> <severity> <description> <file_path> <hash>
# Builds one JSON finding and appends it as a single line to
# $FINDINGS_DIR/findings.jsonl (or prints it with a "DRY-RUN finding:" prefix
# when DRY_RUN=1), then adds a human-readable line to the global DIGEST_LINES
# array used for the end-of-run email.
#
# Every string placed in the JSON goes through json_escape, including the
# server name and the signature fields, so one odd value cannot corrupt the
# line. The file path is shell-quoted (%q) inside the recommended command
# text: file names are chosen by the attacker and the text is meant to be
# copy-pasted into a root shell. Ordinary paths are unchanged by %q.
# Globals: reads SERVER_NAME, FINDINGS_DIR, DRY_RUN; appends to DIGEST_LINES.
emit_finding() {
  local sig_id="$1" sev="$2" desc="$3" file_path="$4" hash="$5"
  local now_iso now_epoch finding_file account uid
  now_epoch="$(date +%s)"
  now_iso="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  account="$(extract_account "$file_path")"
  uid="phishing-sweep-${SERVER_NAME}-${hash:0:16}"
  finding_file="$FINDINGS_DIR/findings.jsonl"

  local path_q
  printf -v path_q '%q' "$file_path"
  local action="Inspect ${path_q}. If malicious: move to /var/cwp/vault/manual/ and inform client. Audit /home/${account}/public_html for related backdoors using: find /home/${account}/public_html -type f -newer ${path_q} -ls."

  local file_path_esc desc_esc action_esc account_esc
  local server_esc sev_esc sig_esc uid_esc hash_esc
  file_path_esc="$(json_escape "$file_path")"
  desc_esc="$(json_escape "$desc")"
  action_esc="$(json_escape "$action")"
  account_esc="$(json_escape "${account:-unknown}")"
  server_esc="$(json_escape "$SERVER_NAME")"
  sev_esc="$(json_escape "$sev")"
  sig_esc="$(json_escape "$sig_id")"
  uid_esc="$(json_escape "$uid")"
  hash_esc="$(json_escape "$hash")"

  local json
  printf -v json '{"ts":"%s","ts_epoch":%d,"module":"%s","server":"%s","severity":"%s","account":"%s","metric":"%s","value":1,"id":"%s","signature_id":"%s","description":"%s","file_path":"%s","file_sha256":"%s","recommended_action":"%s"}' \
    "$now_iso" "$now_epoch" "phishing-sweep" "$server_esc" "$sev_esc" "$account_esc" \
    "phishing_or_webshell_match" "$uid_esc" "$sig_esc" "$desc_esc" "$file_path_esc" "$hash_esc" "$action_esc"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    printf 'DRY-RUN finding: %s\n' "$json"
  else
    printf '%s\n' "$json" >> "$finding_file"
  fi

  # Add to in-memory digest list (used for email at end of run)
  DIGEST_LINES+=("[$sev] $sig_id  $file_path  ($desc)")
}

# sha256_of <path> -> prints the lowercase hex SHA-256 of the file's content
# Portable wrapper (sha256sum on Linux, shasum -a 256 on BSD/Mac).
# The file is fed on stdin instead of being passed by name: when a file name
# contains a backslash or newline, GNU sha256sum prefixes the output line
# with "\" and the "digest" would come out as "\<hex>". Names are
# attacker-controlled here.
# Prints nothing and still returns 0 when the file cannot be hashed, so a
# caller running under `set -e` / `pipefail` can simply test for an empty
# result.
sha256_of() {
  local f="$1" digest=""
  if command -v sha256sum >/dev/null 2>&1; then
    digest="$({ sha256sum < "$f"; } 2>/dev/null)" || digest=""
  else
    digest="$({ shasum -a 256 < "$f"; } 2>/dev/null)" || digest=""
  fi
  digest="${digest%% *}"   # output is "<hex>  -"
  if [[ "$digest" =~ ^[0-9a-fA-F]{64}$ ]]; then
    printf '%s\n' "$digest"
  fi
  return 0
}

# run_scan
# Walks the scan roots, runs the signatures, and emits a finding for every
# new (not recently seen) match. At most one finding is emitted per file.
#
# Per file returned by find (regular files, same filesystem as the root,
# size <= MAX_FILE_SIZE_BYTES, name matching FIND_EXT_ARGS):
#   1. path signatures are tried first (cheap, no file read); the first one
#      that matches and can be hashed settles the file;
#   2. otherwise, non-empty files are grepped against each content signature
#      until one matches.
# A match whose hash was reported within SEEN_TTL_DAYS counts as a hit but
# produces no new finding.
#
# Globals: resets CONTENT_PATTERNS, PATH_PATTERNS and FIND_EXT_ARGS; reads
#          SCAN_ROOTS, SCAN_EXTENSIONS, MAX_FILE_SIZE_BYTES and the two
#          *_PATTERNS_FILE paths; writes TOTAL_FILES_SCANNED, TOTAL_HITS,
#          TOTAL_NEW_FINDINGS.
run_scan() {
  CONTENT_PATTERNS=()
  PATH_PATTERNS=()
  load_patterns_file "$WEBSHELL_PATTERNS_FILE"
  load_patterns_file "$PHISHING_PATTERNS_FILE"

  build_find_extensions_args

  local roots_count=${#SCAN_ROOTS[@]}
  log "INFO" "scanning ${roots_count} root(s): ${SCAN_ROOTS[*]}"
  log "INFO" "extensions: $SCAN_EXTENSIONS, max file size: $MAX_FILE_SIZE_BYTES bytes"

  local files_scanned=0 hits=0 new_hits=0
  # All per-file / per-signature scratch variables are local so nothing leaks
  # into (or is clobbered in) the global namespace.
  local file hash size handled
  local entry sig_id sig_sev sig_pat sig_desc

  if (( ${#FIND_EXT_ARGS[@]} == 0 )); then
    # `find ... \( \)` is an invalid expression; its error would be hidden by
    # 2>/dev/null and the run would look like a clean scan of zero files.
    log "WARN" "SCAN_EXTENSIONS is empty — nothing to scan"
  else
    while IFS= read -r -d '' file; do
      files_scanned=$(( files_scanned + 1 ))
      handled=0

      # path-based checks (cheap, run first)
      # ${arr[@]+...} keeps `set -u` quiet for an empty array on bash < 4.4.
      for entry in ${PATH_PATTERNS[@]+"${PATH_PATTERNS[@]}"}; do
        IFS=$'\t' read -r sig_id sig_sev sig_pat sig_desc <<< "$entry"
        [[ "$file" =~ $sig_pat ]] || continue
        hits=$(( hits + 1 ))
        # `|| hash=""`: a file that vanished or became unreadable must not
        # abort the whole sweep through set -e.
        hash="$(sha256_of "$file")" || hash=""
        [[ -z "$hash" ]] && continue
        handled=1
        if ! is_seen_recently "$hash"; then
          emit_finding "$sig_id" "$sig_sev" "$sig_desc" "$file" "$hash"
          mark_seen "$hash"
          new_hits=$(( new_hits + 1 ))
        fi
        break
      done
      [[ "$handled" -eq 1 ]] && continue

      # content-based checks (only for non-empty files under the size limit)
      size="$(file_size "$file")"
      [[ "$size" =~ ^[0-9]+$ ]] || continue
      (( size > 0 && size <= MAX_FILE_SIZE_BYTES )) || continue

      for entry in ${CONTENT_PATTERNS[@]+"${CONTENT_PATTERNS[@]}"}; do
        IFS=$'\t' read -r sig_id sig_sev sig_pat sig_desc <<< "$entry"
        # -e / -- keep a signature or path that starts with "-" from being
        # parsed as a grep option.
        grep -E -l -m 1 -I --binary-files=without-match -e "$sig_pat" -- "$file" >/dev/null 2>&1 || continue
        hits=$(( hits + 1 ))
        hash="$(sha256_of "$file")" || hash=""
        [[ -z "$hash" ]] && continue
        if ! is_seen_recently "$hash"; then
          emit_finding "$sig_id" "$sig_sev" "$sig_desc" "$file" "$hash"
          mark_seen "$hash"
          new_hits=$(( new_hits + 1 ))
        fi
        break  # one finding per file per run
      done

    done < <(find "${SCAN_ROOTS[@]}" \
                -xdev \
                -type f \
                -size -"$((MAX_FILE_SIZE_BYTES + 1))"c \
                \( "${FIND_EXT_ARGS[@]}" \) \
                -print0 2>/dev/null)
  fi

  log "INFO" "scan complete: files_scanned=$files_scanned hits=$hits new_findings=$new_hits"
  TOTAL_FILES_SCANNED="$files_scanned"
  TOTAL_HITS="$hits"
  TOTAL_NEW_FINDINGS="$new_hits"
}

# send_digest_email: one email per run, summarising new findings
# Does nothing (apart from logging) when there are no new findings, when
# --no-email was given, or when the sendmail binary is missing. With
# DRY_RUN=1 the message is printed to stdout instead of being sent.
#
# The subject carries a per-severity count for severities P1..P3; it is
# omitted when no digest line carries one of those tags. That summary is
# computed so that "no match" cannot fail the pipeline: under
# `set -e -o pipefail` a non-matching grep would otherwise terminate the
# script before the email is sent.
# Returns 0 when the mail was handed to sendmail or intentionally skipped,
# 1 (after logging an ERROR) when sendmail itself failed.
# Globals: reads DIGEST_LINES, NO_EMAIL, DRY_RUN, SERVER_NAME, ALERT_EMAIL,
#          SENDMAIL_BIN, TOTAL_FILES_SCANNED, TOTAL_HITS, SEEN_TTL_DAYS,
#          FINDINGS_DIR, LOG_FILE.
send_digest_email() {
  local count="${#DIGEST_LINES[@]}"
  if (( count == 0 )); then
    log "INFO" "no new findings — digest email NOT sent"
    return 0
  fi

  if [[ "$NO_EMAIL" -eq 1 ]]; then
    log "INFO" "$count new findings, but --no-email set — digest skipped"
    return 0
  fi

  local subject body now_str sev_summary
  now_str="$(date '+%Y-%m-%d %H:%M:%S %Z')"
  # e.g. "2 [P1],1 [P2]"
  sev_summary="$(printf '%s\n' "${DIGEST_LINES[@]}" \
    | { grep -oE '^\[P[123]\]' || true; } \
    | sort | uniq -c | tr -s ' ' | sed 's/^ //' | tr '\n' ',' | sed 's/,$//')" || sev_summary=""

  subject="[CWP] phishing-sweep: ${count} new finding(s) on ${SERVER_NAME}"
  if [[ -n "$sev_summary" ]]; then
    subject+=" (${sev_summary})"
  fi

  # Paths in the footer come from the live configuration so they stay correct
  # when FINDINGS_DIR / LOG_FILE are overridden.
  body=$(cat <<EOF
CloudWatch Pro — Phishing-Kit + Webshell Daily Sweep

Server:      ${SERVER_NAME}
Run time:    ${now_str}
Files scanned: ${TOTAL_FILES_SCANNED}
Total signature hits: ${TOTAL_HITS}
New findings (this run): ${count}

----- new findings -----

$(printf '%s\n' "${DIGEST_LINES[@]}")

----- next steps -----

For each P1 finding:
  1. Inspect the file:
       less <file_path>
  2. Verify with the account owner whether the file is legitimate.
  3. If malicious: move to /var/cwp/vault/manual/ (preserves forensic copy)
       mkdir -p /var/cwp/vault/manual
       mv <file_path> /var/cwp/vault/manual/
  4. Audit the same account for related backdoors:
       find /home/<account>/public_html -type f -newer <file_path> -ls
  5. If compromise confirmed: rotate cPanel password + force WP admin password reset.

This module is detect-only. CWP did NOT move, delete, or modify any file.

Findings file:  ${FINDINGS_DIR}/findings.jsonl
Sweep log:      ${LOG_FILE}
Each finding will not re-alert for ${SEEN_TTL_DAYS} days unless the file content changes.
EOF
  )

  if [[ "$DRY_RUN" -eq 1 ]]; then
    printf 'DRY-RUN digest email to %s:\n  Subject: %s\n%s\n' "$ALERT_EMAIL" "$subject" "$body"
    return 0
  fi

  if [[ ! -x "$SENDMAIL_BIN" ]]; then
    log "WARN" "sendmail not available; digest email NOT sent"
    return 0
  fi

  # Explicit check: otherwise a sendmail failure ends the script through
  # set -e with nothing in the log.
  if ! {
    printf 'To: %s\n' "$ALERT_EMAIL"
    printf 'From: cwp-agent@%s\n' "$SERVER_NAME"
    printf 'Subject: %s\n' "$subject"
    printf 'X-CWP-Module: phishing-sweep\n'
    printf 'Content-Type: text/plain; charset=utf-8\n'
    printf '\n%s\n' "$body"
  } | "$SENDMAIL_BIN" -t -i; then
    log "ERROR" "$SENDMAIL_BIN failed; digest email NOT sent ($count finding(s) remain in ${FINDINGS_DIR}/findings.jsonl)"
    return 1
  fi

  log "INFO" "digest email sent to $ALERT_EMAIL with $count finding(s)"
}

# ---- argument parsing ----
# parse_args <argv...>
# Applies the command-line flags to the global run settings:
#   --dry-run   -> DRY_RUN=1        --verbose -> VERBOSE=1
#   --no-email  -> NO_EMAIL=1       --root P  -> SCAN_PATH_LIMIT=P
#   --version   prints "<name> <version>" and exits 0
#   -h|--help   prints usage and exits 0
# Any unknown argument, or --root without a path, prints a message to stderr
# followed by the usage text and exits 2 (the documented "invalid arguments"
# status). Without the explicit check, a trailing --root would trip `set -u`
# on "$2" and die with an "unbound variable" error and status 1.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)  DRY_RUN=1; shift ;;
      --verbose)  VERBOSE=1; shift ;;
      --no-email) NO_EMAIL=1; shift ;;
      --root)
        if [[ $# -lt 2 || -z "${2:-}" ]]; then
          printf '%s\n' '--root requires a path argument' >&2
          usage 2
        fi
        SCAN_PATH_LIMIT="$2"
        shift 2
        ;;
      --version)  printf '%s %s\n' "$SCRIPT_NAME" "$VERSION"; exit 0 ;;
      -h|--help)  usage 0 ;;
      *)          printf 'unknown argument: %s\n' "$1" >&2; usage 2 ;;
    esac
  done
  return 0
}

parse_args "$@"

# ---- main ----
# main
# Runs one complete sweep: load configuration, make sure the working
# directories exist, verify prerequisites, reset the per-run accumulators,
# scan, send the digest, and exit 0.
main() {
  load_config
  ensure_dirs
  preflight

  # Per-run accumulators, filled in by run_scan / emit_finding.
  DIGEST_LINES=()
  TOTAL_FILES_SCANNED=0
  TOTAL_HITS=0
  TOTAL_NEW_FINDINGS=0

  log "INFO" "$SCRIPT_NAME v$VERSION starting (server=$SERVER_NAME, dry_run=$DRY_RUN)"

  run_scan
  send_digest_email

  log "INFO" "$SCRIPT_NAME complete: ${TOTAL_NEW_FINDINGS} new finding(s)"
  exit 0
}

main
