#!/usr/bin/env bash
# Auto-restarting Sendblue channel session for Claude Code.
# Runs as a launchd agent (always-on) or in a tmux pane for dev.
# Uses raw binary path — no shell function wrapping needed.

set -euo pipefail

# ── Single-instance lock ─────────────────────────────────
# Only one run-channel.sh may run at a time. If another instance holds the
# lock, this one exits immediately to prevent triple-spawn.
LOCK_FILE="/tmp/sendblue-channel.lock"

# cleanup_lock: remove the lock file. Best-effort; errors are discarded.
cleanup_lock() {
  rm -f "$LOCK_FILE" 2>/dev/null
}

# try_acquire_lock: create the lock file containing this script's PID.
# Returns 0 when the file was created, non-zero when it already exists or
# cannot be written. noclobber makes the redirection refuse to overwrite an
# existing file, so existence check and creation happen in one step. It is
# enabled inside a subshell so the option does not leak into the rest of the
# script ($$ still expands to the main script's PID there).
try_acquire_lock() {
  ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null
}

if ! try_acquire_lock; then
  # The lock may disappear between the failed create and this read; "|| true"
  # keeps set -e from aborting on that.
  other_pid=$(cat "$LOCK_FILE" 2>/dev/null || true)
  if [ -n "$other_pid" ] && kill -0 "$other_pid" 2>/dev/null; then
    echo "[$(date)] run-channel.sh: another instance running (pid $other_pid). Exiting."
    exit 0
  fi
  # Stale lock — previous instance died without cleanup
  rm -f "$LOCK_FILE"
  if ! try_acquire_lock; then
    if [ -e "$LOCK_FILE" ]; then
      # Someone else recreated the lock between our rm and our retry.
      echo "[$(date)] run-channel.sh: another instance took the lock first. Exiting."
      exit 0
    fi
    echo "[$(date)] run-channel.sh: cannot create lock file $LOCK_FILE. Exiting."
    exit 1
  fi
fi
trap cleanup_lock EXIT

# Clean up stale FIFOs from previous crashed runs.
#
# cleanup_stale_fifos [ROOT]
#   Deletes entries named "sendblue-stdin-*" owned by the current user, at
#   most four levels below ROOT (default: /var/folders).
#   The search is capped at 5 seconds when a `timeout` command is on PATH.
#   `timeout` is not guaranteed to exist (it is a GNU coreutils tool), so
#   when it is missing the search runs uncapped instead of being skipped.
#   The owner is matched by numeric uid so no user-name lookup is needed.
cleanup_stale_fifos() {
  local root="${1:-/var/folders}"
  local -a find_cmd=(
    find "$root" -maxdepth 4 -name "sendblue-stdin-*" -user "$(id -u)" -delete
  )
  if command -v timeout >/dev/null 2>&1; then
    timeout 5 "${find_cmd[@]}"
  else
    "${find_cmd[@]}"
  fi
}

# Best-effort: unreadable directories and a missing root are expected.
cleanup_stale_fifos 2>/dev/null || true

# Files this supervisor writes: the append-only log and the health snapshot.
LOG_FILE="/tmp/sendblue-channel.log"
HEALTH_FILE="/tmp/sendblue-channel-health.json"

# Timing knobs, in seconds.
RESTART_DELAY=5          # pause before a restart while failures are few
STABILITY_THRESHOLD=120  # a session that outlives this resets the failure counter

# Crash-tracking state, updated by the restart loop.
last_failure_ts=0
consecutive_failures=0

# log_msg MESSAGE
#   Prefixes MESSAGE with a local ISO-8601 timestamp (with UTC offset) in
#   square brackets, prints the line to stdout and appends it to $LOG_FILE.
log_msg() {
  local stamp line
  # A failing `date` must not abort the caller under set -e.
  stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')" || true
  line="[${stamp}] $1"
  printf '%s\n' "$line"
  printf '%s\n' "$line" >> "$LOG_FILE"
}

# write_health RESTARTS_TODAY STABLE_SECONDS
#   Overwrites $HEALTH_FILE with a small JSON object holding:
#     restarts_today               - $1, written unquoted
#     last_restart                 - current UTC time, YYYY-MM-DDTHH:MM:SSZ
#     last_stable_duration_seconds - $2, written unquoted
#     consecutive_failures         - value of the global of the same name
#   Values are inserted as-is; callers are expected to pass integers.
write_health() {
  local count="$1"
  local stamp
  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  local duration="$2"

  printf '{\n  "restarts_today": %s,\n  "last_restart": "%s",\n  "last_stable_duration_seconds": %s,\n  "consecutive_failures": %s\n}\n' \
    "$count" "$stamp" "$duration" "$consecutive_failures" > "$HEALTH_FILE"
}

# Load existing restart count for today (if health file exists and date matches)
#
# load_restarts_today HEALTH_PATH
#   Prints the "restarts_today" value stored in HEALTH_PATH when the date part
#   of its "last_restart" field equals today's UTC date; prints 0 otherwise.
#   0 is also printed when the file is missing or unparsable, python3 is
#   unavailable, or the stored value is not a plain non-negative integer.
#   The validation matters because the result is later used in shell
#   arithmetic, where text such as "None" would be an error under set -u.
#   The path and date are passed as arguments rather than spliced into the
#   Python source, so quotes in the path cannot break the snippet.
load_restarts_today() {
  local health_path="$1"
  local today count
  today=$(date -u '+%Y-%m-%d')
  count=""
  if [ -f "$health_path" ]; then
    count=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        data = json.load(fh)
    if str(data.get("last_restart", ""))[:10] == sys.argv[2]:
        print(data.get("restarts_today", 0))
except Exception:
    pass
' "$health_path" "$today" 2>/dev/null || true)
  fi
  # Anything that is not purely digits (empty, "None", "1.5", "abc") → 0.
  case "$count" in
    '' | *[!0-9]*) count=0 ;;
  esac
  printf '%s\n' "$count"
}

restarts_today=$(load_restarts_today "$HEALTH_FILE")

# get_backoff_delay
#   Prints the number of seconds to wait before the next restart, chosen from
#   the global consecutive_failures counter:
#     5 or more → 60,  3 or 4 → 30,  otherwise → $RESTART_DELAY.
get_backoff_delay() {
  local wait_secs="$RESTART_DELAY"
  # Checks run from the lower tier to the higher one so the last match wins.
  if [ "$consecutive_failures" -ge 3 ]; then
    wait_secs=30
  fi
  if [ "$consecutive_failures" -ge 5 ]; then
    wait_secs=60
  fi
  echo "$wait_secs"
}

# Resolve the real Claude binary (Mach-O), bypassing shell wrappers.
# The ~/.local/bin/claude wrapper runs an account-switcher that needs fzf/TTY,
# which fails under FIFO stdin in headless/launchd environments.
#
# resolve_claude_bin VERSIONS_DIR
#   Prints the path of the highest-versioned executable regular file directly
#   inside VERSIONS_DIR; returns 1 (printing nothing) when there is none.
#   Plain glob order is lexical and would rank "1.0.9" above "1.0.10", so the
#   candidates are ordered with `sort -V`. If this system's sort lacks -V the
#   last entry in glob order is used instead.
#   Directories are skipped: -x alone is also true for a searchable directory.
resolve_claude_bin() {
  local versions_dir="$1"
  local candidate best=""
  local -a candidates=()

  for candidate in "$versions_dir"/*; do
    if [ -f "$candidate" ] && [ -x "$candidate" ]; then
      candidates+=("$candidate")
    fi
  done
  [ "${#candidates[@]}" -gt 0 ] || return 1

  best=$(printf '%s\n' "${candidates[@]}" | sort -V 2>/dev/null | tail -n 1) || best=""
  if [ -z "$best" ]; then
    best="${candidates[${#candidates[@]}-1]}"
  fi
  printf '%s\n' "$best"
}

CLAUDE_BIN=$(resolve_claude_bin "$HOME/.local/share/claude/versions") || CLAUDE_BIN=""
if [ -z "$CLAUDE_BIN" ]; then
  echo "[sendblue-channel] ERROR: no claude binary found in ~/.local/share/claude/versions/"
  exit 1
fi

# Pick the launch mode from stdin: a terminal on fd 0 (e.g. a tmux pane) means
# Claude can be started directly; anything else (launchd gives /dev/null)
# means stdin has to be supplied through a FIFO.
if [ -t 0 ]; then
  use_fifo=false
else
  use_fifo=true
fi

# Sessions are started from the repository root so the agent definition there
# supplies the orchestrator instructions (tools/sendblue-channel/CLAUDE.md is
# only a fallback and would otherwise conflict).
# A non-empty SENDBLUE_REPO_ROOT from the environment wins; otherwise the root
# is taken to be two directories above this script
# (tools/sendblue-channel/run-channel.sh → ../..).
if [ -z "${SENDBLUE_REPO_ROOT:-}" ]; then
  SENDBLUE_REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
fi
cd "$SENDBLUE_REPO_ROOT" || exit 1

# Initialize FIFO path (set before trap so set -u doesn't trip)
_FIFO=""
_AUTOCONFIRM_PID=""

# cleanup_on_exit: the single EXIT handler for this script.
# Bash keeps only one handler per signal, so installing a handler here
# replaces any EXIT trap set earlier in the file. This handler therefore
# removes the lock file as well as the FIFO; otherwise the lock would be
# left behind on every exit.
#   - Removes the FIFO at $_FIFO when one is set.
#   - Removes $LOCK_FILE only when it contains this script's PID, so a lock
#     belonging to a different process is never deleted.
# Always returns 0 so cleanup problems cannot change the exit path.
cleanup_on_exit() {
  if [ -n "${_FIFO:-}" ]; then
    rm -f "$_FIFO" 2>/dev/null || true
  fi
  if [ -n "${LOCK_FILE:-}" ] && [ "$(cat "$LOCK_FILE" 2>/dev/null)" = "$$" ]; then
    rm -f "$LOCK_FILE" 2>/dev/null || true
  fi
  return 0
}
trap cleanup_on_exit EXIT

# Exported so the Claude process started below inherits it: a higher
# autocompact threshold keeps rapid compaction from killing orchestrator
# sessions.
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=65
export CLAUDE_AUTOCOMPACT_PCT_OVERRIDE

# ── Supervisor loop ──────────────────────────────────────
# Starts a Claude session, waits for it to end, classifies the run as stable
# or failed, records health, sleeps for the backoff delay, and repeats forever.

# Flags passed to the Claude binary in both launch modes (FIFO and tmux).
CLAUDE_ARGS=(
  --agent orchestrator
  --brief
  --dangerously-load-development-channels server:sendblue-channel
  --permission-mode bypassPermissions
  --name main-iMessage-agent
)

# UTC day that restarts_today currently counts for. The loop can run across
# midnight, so the counter is reset when this day changes.
restart_day=$(date -u '+%Y-%m-%d')

# Backoff slept before the session that is currently running. It is
# subtracted when judging whether two failures are "consecutive"; otherwise
# the script's own 30s/60s sleeps would make every failure look isolated and
# reset the counter as soon as the backoff escalated.
last_delay=0

while true; do
  session_start=$(date +%s)
  log_msg "Starting session (binary: $CLAUDE_BIN, fifo: $use_fifo, cwd: $(pwd), consecutive_failures: $consecutive_failures)"
  if [ "$use_fifo" = true ]; then
    # Headless: FIFO stdin trick to prevent --print mode
    # Remove stale FIFO from previous iteration before creating a new one
    if [ -n "$_FIFO" ]; then
      rm -f "$_FIFO" 2>/dev/null || true
    fi
    # mktemp reserves a unique name; the plain file is then swapped for a FIFO.
    _FIFO=$(mktemp -t sendblue-stdin-XXXXXX)
    rm -f "$_FIFO" && mkfifo "$_FIFO"
    # Use read-write mode (<>) to avoid blocking — opening write-only on a
    # FIFO blocks until a reader opens it, but claude hasn't started yet.
    exec 9<>"$_FIFO"
    # Seed the FIFO so Claude sees stdin data and enters interactive mode
    # instead of falling into --print mode.
    echo "Initialize sendblue channel orchestrator session. Load CLAUDE.md and begin startup sequence." >&9
    # The exit status is deliberately ignored: every exit leads to a restart.
    "$CLAUDE_BIN" "${CLAUDE_ARGS[@]}" < "$_FIFO" || true
    exec 9>&-
    rm -f "$_FIFO"
  else
    # tmux pane: real TTY, no FIFO needed
    # Auto-confirm the development channels warning prompt by sending
    # Enter to the current tmux pane after a short delay.
    if [ -n "${TMUX:-}" ]; then
      _PANE_ID="$(tmux display-message -p '#{pane_id}')"
      (sleep 3 && tmux send-keys -t "$_PANE_ID" Enter) &
      _AUTOCONFIRM_PID=$!
    fi
    "$CLAUDE_BIN" "${CLAUDE_ARGS[@]}" || true
    # Clean up the auto-confirm background job if still running
    if [ -n "${_AUTOCONFIRM_PID:-}" ]; then
      kill "$_AUTOCONFIRM_PID" 2>/dev/null || true
    fi
    _AUTOCONFIRM_PID=""
  fi
  session_end=$(date +%s)
  session_duration=$((session_end - session_start))

  # Determine if this was a stable run or a failure
  if [ "$session_duration" -gt "$STABILITY_THRESHOLD" ]; then
    # Stable session — reset failure counter
    consecutive_failures=0
    log_msg "Session ran for ${session_duration}s (stable). Resetting failure counter."
  else
    # Short-lived session — check if it's consecutive
    time_since_last=$((session_end - last_failure_ts))
    # Same gap with our own backoff sleep removed (see last_delay above).
    gap_without_backoff=$((time_since_last - last_delay))
    if [ "$last_failure_ts" -gt 0 ] && [ "$gap_without_backoff" -gt 60 ]; then
      # More than 60s since last failure — not consecutive, reset
      consecutive_failures=1
      log_msg "Session ran for ${session_duration}s. Previous failure was ${time_since_last}s ago — resetting counter to 1."
    else
      consecutive_failures=$((consecutive_failures + 1))
      log_msg "Session ran for ${session_duration}s. Consecutive failure #${consecutive_failures}."
    fi
    last_failure_ts="$session_end"
  fi

  # Start a fresh daily count when the UTC date has changed since the last
  # restart was recorded.
  current_day=$(date -u '+%Y-%m-%d')
  if [ "$current_day" != "$restart_day" ]; then
    restarts_today=0
    restart_day="$current_day"
  fi

  # Update health file
  restarts_today=$((restarts_today + 1))
  write_health "$restarts_today" "$session_duration"

  # Calculate backoff delay
  delay=$(get_backoff_delay)
  log_msg "Restarting in ${delay}s (consecutive_failures: $consecutive_failures, restarts_today: $restarts_today)"
  last_delay="$delay"
  sleep "$delay"
done
