#!/usr/bin/env bash
# curogram-mcp supervisor.
#
# Streamable HTTP MCP server on port 18817 (Curogram patient messaging, v1).
# Bun + restart-on-crash. Locked with flock so multiple invocations are safe.
#
# Env file (~/.config/curogram-mcp.env, mode 0600) must define:
#   CUROGRAM_AGENT_USERNAME   service-account email (agent@exulthealthcare.com)
#   CUROGRAM_AGENT_PASSWORD   service-account password
#   MCP_BEARER_TOKEN          gates the HTTP transport (same token as the
#                             rendered .mcp.json Authorization header)
# Optional:
#   CUROGRAM_PRACTICE_ID      pin the Exult tenant per session (multi-practice)
#   CUROGRAM_COOKIE / CUROGRAM_XSRF_TOKEN   CDP-harvest fallback if MFA-gated
set -euo pipefail

# Restart tuning: base delay between restarts, and how many seconds a run
# must exceed before it is treated as stable rather than as a crash.
RESTART_DELAY=3
STABILITY_THRESHOLD=60

# Runtime state (lock, log, health snapshot) lives in a per-user directory
# under XDG_STATE_HOME (falling back to ~/.local/state) instead of the
# world-writable /tmp, where another local user could pre-create, symlink or
# clobber these files. The directory is forced to mode 0700 on every start.
STATE_DIR="${XDG_STATE_HOME:-$HOME/.local/state}/curogram-mcp"
mkdir -p "$STATE_DIR"
chmod 700 "$STATE_DIR"
LOCK_FILE="${STATE_DIR}/curogram-mcp.lock"
LOG_FILE="${STATE_DIR}/curogram-mcp.log"
HEALTH_FILE="${STATE_DIR}/curogram-mcp-health.json"

# Credentials / bearer token file (see the header for the expected keys).
ENV_FILE="${HOME}/.config/curogram-mcp.env"

# The server directory is resolved relative to wherever this script lives,
# so the supervisor keeps working if the checkout is moved.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVER_DIR="${SCRIPT_DIR}/../tools/curogram-mcp"

# Single-instance guard via a real flock advisory lock. fd 9 is opened on the
# lock file and stays open for the rest of the script, so no cleanup trap or
# PID file (with its check-then-write race) is needed.
#
# A missing flock binary or a genuine flock error must not be mistaken for
# "another instance is running": that would make the supervisor exit 0
# without ever starting the server. Only the documented conflict status (1,
# the default for `flock -n`) is treated as a benign duplicate invocation.
if ! command -v flock >/dev/null 2>&1; then
  echo "[$(date)] ERROR: flock not found in PATH; cannot take the single-instance lock." >&2
  exit 1
fi
exec 9>"$LOCK_FILE"
lock_rc=0
flock -n 9 || lock_rc=$?
case "$lock_rc" in
  0) ;;
  1)
    echo "[$(date)] another instance holds the lock. Exiting." >&2
    exit 0
    ;;
  *)
    echo "[$(date)] ERROR: flock failed on $LOCK_FILE (status=$lock_rc)." >&2
    exit 1
    ;;
esac
unset lock_rc

# Print one timestamped line to stdout and append the same line to $LOG_FILE.
# Arguments: $1 - message text
log_msg() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%dT%H:%M:%S%z')" "$1" \
    | tee -a "$LOG_FILE"
}

# Locate the bun executable: whatever PATH resolves first, otherwise the
# default per-user install location under ~/.bun.
if ! BUN_BIN="$(command -v bun)"; then
  BUN_BIN="$HOME/.bun/bin/bun"
fi
if [ ! -x "$BUN_BIN" ]; then
  log_msg "ERROR: bun binary not found"
  exit 1
fi

# The server sources must exist before we enter the restart loop.
if [ ! -d "$SERVER_DIR" ]; then
  log_msg "ERROR: server dir missing: $SERVER_DIR"
  exit 1
fi

# Pull the CUROGRAM credentials and MCP_BEARER_TOKEN in from the env file.
# allexport is switched on only around the source step, so every assignment
# made by the file is marked for export to child processes.
if [ ! -f "$ENV_FILE" ]; then
  log_msg "ERROR: env file missing: $ENV_FILE"
  exit 1
fi
set -o allexport
# shellcheck disable=SC1090
. "$ENV_FILE"
set +o allexport

# Required env sanity. MCP_BEARER_TOKEN is mandatory. For Curogram auth,
# either the programmatic credentials (username + password) OR a harvested
# cookie pair (cookie + XSRF token) must be fully present.
if [ -z "${MCP_BEARER_TOKEN:-}" ]; then
  log_msg "ERROR: MCP_BEARER_TOKEN not set in $ENV_FILE"
  exit 1
fi
if [ -z "${CUROGRAM_AGENT_USERNAME:-}" ] || [ -z "${CUROGRAM_AGENT_PASSWORD:-}" ]; then
  if [ -z "${CUROGRAM_COOKIE:-}" ] || [ -z "${CUROGRAM_XSRF_TOKEN:-}" ]; then
    log_msg "ERROR: need CUROGRAM_AGENT_USERNAME+CUROGRAM_AGENT_PASSWORD or CUROGRAM_COOKIE+CUROGRAM_XSRF_TOKEN in $ENV_FILE"
    exit 1
  fi
fi
export MCP_BEARER_TOKEN
# Normalise the optional variables so the child sees each one either with a
# real value or not at all (an empty string would shadow a
# `process.env.X === undefined` check in the server).
# The env file is sourced under `set -a`, so a line such as `CUROGRAM_COOKIE=`
# has ALREADY been exported as an empty string by the time we get here;
# merely skipping the export is not enough, the variable has to be unset.
# Written as if/else because a bare `[ ... ] && export` would trip `set -e`
# when the test is false.
for _var in CUROGRAM_AGENT_USERNAME CUROGRAM_AGENT_PASSWORD \
  CUROGRAM_PRACTICE_ID CUROGRAM_COOKIE CUROGRAM_XSRF_TOKEN; do
  if [ -n "${!_var:-}" ]; then
    export "${_var?}"
  else
    # Removes both the shell variable and its environment entry; a no-op
    # when the variable was never set.
    unset -v "$_var"
  fi
done
unset _var

# Restart bookkeeping, all starting from zero.
# total_restarts counts restarts since this supervisor process started; it is
# not a calendar-day counter.
total_restarts=0
# Epoch seconds at which the most recent short-lived run ended (0 = none yet).
last_failure_ts=0
# Length of the current streak of short-lived runs; drives the backoff delay.
consecutive_failures=0

# Print the number of seconds to wait before the next restart.
# Globals: consecutive_failures (read), RESTART_DELAY (read)
# Outputs: 60 once the failure streak reaches 5, 15 once it reaches 3,
#          otherwise $RESTART_DELAY.
backoff() {
  local wait_s="$RESTART_DELAY"
  # Thresholds are checked in ascending order so the highest match wins.
  if [ "$consecutive_failures" -ge 3 ]; then
    wait_s=15
  fi
  if [ "$consecutive_failures" -ge 5 ]; then
    wait_s=60
  fi
  printf '%s\n' "$wait_s"
}

# Supervision loop. Runs the server in the foreground; every time it exits,
# records how long it lived and with what status, updates the failure streak
# and the health file, waits for the backoff delay, and starts it again.
# This loop never terminates on its own.
while true; do
  start_ts=$(date +%s)
  log_msg "starting bun server (port=18817 failures=$consecutive_failures)"

  # Capture the exit status rather than discarding it. The `|| server_rc=$?`
  # form still keeps `set -e` from killing the supervisor when the server
  # fails, but lets us report why it went away (a status above 128 means the
  # process was terminated by signal status-128).
  server_rc=0
  "$BUN_BIN" "$SERVER_DIR/server.ts" >>"$LOG_FILE" 2>&1 || server_rc=$?

  end_ts=$(date +%s)
  dur=$((end_ts - start_ts))
  log_msg "bun server exited (status=$server_rc)"

  if [ "$dur" -gt "$STABILITY_THRESHOLD" ]; then
    # Lived longer than the threshold: not a crash loop, clear the streak.
    consecutive_failures=0
    log_msg "session ran ${dur}s (stable)"
  else
    # A short run more than 300s after the previous short run starts a fresh
    # streak instead of extending the old one.
    if [ "$last_failure_ts" -gt 0 ] && [ $((end_ts - last_failure_ts)) -gt 300 ]; then
      consecutive_failures=1
    else
      consecutive_failures=$((consecutive_failures + 1))
    fi
    last_failure_ts="$end_ts"
    log_msg "session ran ${dur}s (consecutive failure #$consecutive_failures)"
  fi

  total_restarts=$((total_restarts + 1))
  # Atomic write so a monitor reading concurrently never sees partial JSON:
  # the snapshot is written to a sibling temp file and renamed into place.
  # last_exit_status is an additional key; the pre-existing keys are unchanged.
  health_tmp="${HEALTH_FILE}.tmp"
  cat >"$health_tmp" <<EOF
{"total_restarts":$total_restarts,"last_restart":"$(date -u +%FT%TZ)","last_duration_seconds":$dur,"consecutive_failures":$consecutive_failures,"last_exit_status":$server_rc}
EOF
  mv -f "$health_tmp" "$HEALTH_FILE"

  delay=$(backoff)
  log_msg "restart in ${delay}s"
  sleep "$delay"
done
