#!/usr/bin/env bash
# Rerun the single-shot repro (repro.rb) many times.
# Stop on the first run that exits non-zero (e.g. killed by a signal) or that
# is still running when the per-run timeout elapses (treated as a deadlock).
#
# Knobs, all overridable from the environment:
#   RUNS             number of runs to attempt              (default 100)
#   PER_RUN_TIMEOUT  seconds before a run counts as hung    (default 20)
#   RUBY             interpreter used to launch the repro   (default ruby)
set -u

# Everything below uses paths relative to the script's own directory
# (repro.rb, lib/, ext/gvltools, the log files), so refuse to continue if we
# cannot get there instead of running from an arbitrary working directory.
cd -- "$(dirname -- "$0")" || {
  printf 'cannot cd to script directory: %s\n' "$(dirname -- "$0")" >&2
  exit 1
}

RUNS=${RUNS:-100}
PER_RUN_TIMEOUT=${PER_RUN_TIMEOUT:-20}
RUBY=${RUBY:-ruby}

# RUBYOPT is exported exactly as inherited (empty when unset). Nothing here
# adds a YJIT flag, so YJIT stays off unless the caller's RUBYOPT (or the Ruby
# build's own default) turns it on.
export RUBYOPT="${RUBYOPT:-}"

# Tiny malloc limit -> frequent malloc-triggered GC
# (handle_malloc_failure -> garbage_collect), to coincide with the resume storm.
export RUBY_GC_MALLOC_LIMIT=${RUBY_GC_MALLOC_LIMIT:-16384}
export RUBY_GC_MALLOC_LIMIT_MAX=${RUBY_GC_MALLOC_LIMIT_MAX:-16384}

# Workload parameters exported to the child process (presumably read by
# repro.rb -- confirm there for their exact meaning).
export REACTORS=${REACTORS:-8} FIBERS=${FIBERS:-32} PRODUCERS=${PRODUCERS:-16} ALLOC=${ALLOC:-400} MAXTIME=${MAXTIME:-5}

# Banner: interpreter version plus the effective settings for this session.
# "$RUBY" is quoted so a path containing spaces works here the same way it
# does where the repro itself is launched.
echo "ruby: $("$RUBY" -v | head -n 1)"
echo "RUNS=$RUNS REACTORS=$REACTORS FIBERS=$FIBERS PRODUCERS=$PRODUCERS ALLOC=$ALLOC MAXTIME=$MAXTIME malloc_limit=$RUBY_GC_MALLOC_LIMIT timeout=${PER_RUN_TIMEOUT}s"

# Emit one line on stdout: all arguments joined by single spaces (default
# IFS), followed by a newline. printf with a fixed '%s' format keeps
# arguments such as "-n" or "%s" literal.
log_line() {
  local message="$*"
  printf '%s\n' "$message"
}

# Main driver: launch the repro up to RUNS times, one run at a time, and stop
# at the first run that either hangs past PER_RUN_TIMEOUT or exits non-zero.
# Exit status: 0 if every run exited 0, 1 on a hang or a failing run, 2 if a
# numeric knob is malformed.

# Validate the numeric knobs up front. A malformed RUNS would otherwise make
# the loop silently run zero times and report "No crash".
for knob in RUNS PER_RUN_TIMEOUT; do
  case ${!knob} in
    '' | *[!0-9]*)
      printf '%s must be a non-negative integer, got: %s\n' "$knob" "${!knob}" >&2
      exit 2
      ;;
  esac
done
# Force base 10 so a value with a leading zero is not parsed as octal inside
# arithmetic expressions.
total_runs=$((10#$RUNS))
timeout_s=$((10#$PER_RUN_TIMEOUT))

# Every run writes to the same log file; it is overwritten on each iteration
# and copied to repro.crash.log only when a run fails.
log="repro.run.log"

# A counting loop instead of `seq`: BSD/macOS seq counts DOWN when the first
# operand exceeds the last, so RUNS=0 would have produced two bogus runs.
for ((i = 1; i <= total_runs; i++)); do
  log_line "run $i/$RUNS: starting (timeout ${PER_RUN_TIMEOUT}s)..."
  start=$SECONDS

  "$RUBY" -Ilib -Iext/gvltools repro.rb >"$log" 2>&1 &
  pid=$!

  # Poll until the run exits on its own or the timeout elapses. Unlike a
  # kill-on-timeout watchdog, this leaves a hung process ALIVE so a debugger
  # can attach to it.
  # Liveness is probed first on every pass (kill -0 sends no signal), so a
  # run is only declared deadlocked if it is still alive at the moment the
  # deadline has passed -- a run that finishes during the final sleep is
  # handled as a normal exit rather than a false DEADLOCK.
  deadlocked=0
  while kill -0 "$pid" 2>/dev/null; do
    if ((SECONDS - start >= timeout_s)); then
      deadlocked=1
      break
    fi
    sleep 0.5
  done

  if [ "$deadlocked" -eq 1 ]; then
    elapsed=$((SECONDS - start))
    log_line ""
    log_line "############################################################"
    log_line "### run $i/$RUNS: DEADLOCK after ${elapsed}s (>${PER_RUN_TIMEOUT}s)"
    log_line "### PID = $pid   (left ALIVE -- attach your debugger now)"
    log_line "###   lldb -p $pid -o 'thread backtrace all'"
    log_line "###   sample $pid 5"
    log_line "############################################################"
    # Block here so the hung process stays attachable. When you're done
    # debugging, press Enter to SIGKILL it and stop the loop.
    # The prompt reads from /dev/tty via fd 3 so it works even when the
    # script's stdin is redirected; if no tty can be opened, the process is
    # left running.
    if { exec 3</dev/tty; } 2>/dev/null; then
      read -r -p "Press Enter to SIGKILL pid $pid and exit... " _ <&3
      exec 3<&-
      kill -KILL "$pid" 2>/dev/null
    else
      log_line "(no controlling tty: leaving pid $pid running -- kill it yourself when done)"
    fi
    exit 1
  fi

  # The run has exited; collect its exit status.
  wait "$pid"
  code=$?
  elapsed=$((SECONDS - start))

  if [ "$code" -eq 0 ]; then
    log_line "run $i/$RUNS: survived in ${elapsed}s"
    continue
  fi

  # Exited on its own with a non-zero status. Only a status above 128 that
  # maps to a real signal is a death-by-signal (e.g. SIGSEGV=139); anything
  # else is an ordinary failing exit and must not be labelled with a bogus
  # (possibly negative) signal number.
  if [ "$code" -gt 128 ] && signame=$(kill -l "$code" 2>/dev/null); then
    sig=$((code - 128))
    log_line "=== run $i/$RUNS: CRASH after ${elapsed}s, exit=$code (signal $sig), SIG$signame ==="
  else
    log_line "=== run $i/$RUNS: FAILED after ${elapsed}s, exit=$code (not killed by a signal) ==="
  fi
  log_line "----- repro.run.log -----"
  cat "$log"
  cp "$log" "repro.crash.log"
  log_line "----- saved to repro.crash.log -----"
  exit 1
done
log_line "No crash across $RUNS runs."
