From 54d5ad1a7225e356cf9bd53c62334b140582de57 Mon Sep 17 00:00:00 2001
From: Wyatt Walter <wyattwalter@gmail.com>
Date: Thu, 14 May 2026 12:56:33 -0500
Subject: [PATCH 1/4] feat(diagnostics): add memory-analysis.sh for sizing
 diagnostics

Self-contained shell script that prints a single-page memory report
from inside the Appsmith container: system/cgroup RAM, per-role RSS,
JVM flag origins (ergonomic vs. explicit), MongoDB WiredTiger cache
state + data-dir size, live usage vs. configured budget, and a verdict
(OK / TIGHT / AT RISK).

Wired into diagnostics.sh so every support tarball includes the report
alongside the existing heap/thread dumps.

Optional --threads flag adds a single-snapshot thread breakdown
(state histogram, name-prefix groups, hot stacks, deadlock check).
--threads-sample takes two snapshots N seconds apart to surface
threads that stayed RUNNABLE on the same frame.
---
 deploy/docker/fs/opt/appsmith/diagnostics.sh  |   3 +
 .../docker/fs/opt/appsmith/memory-analysis.sh | 638 ++++++++++++++++++
 2 files changed, 641 insertions(+)
 create mode 100755 deploy/docker/fs/opt/appsmith/memory-analysis.sh

diff --git a/deploy/docker/fs/opt/appsmith/diagnostics.sh b/deploy/docker/fs/opt/appsmith/diagnostics.sh
index 597e6f43da7c..fc3200881eb0 100644
--- a/deploy/docker/fs/opt/appsmith/diagnostics.sh
+++ b/deploy/docker/fs/opt/appsmith/diagnostics.sh
@@ -47,6 +47,9 @@ cp /tmp/appsmith/infra.json "$tmpdir/infra-info.json"
 # gather the healthcheck
 /opt/appsmith/healthcheck.sh > "$tmpdir/healthcheck.txt"
 
+# gather the memory-analysis report (sizing + live usage + verdict)
+/opt/appsmith/memory-analysis.sh --threads > "$tmpdir/memory-analysis.txt" 2>&1 || true
+
 #
 # Java info
 #
diff --git a/deploy/docker/fs/opt/appsmith/memory-analysis.sh b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
new file mode 100755
index 000000000000..fd6f635f205d
--- /dev/null
+++ b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
@@ -0,0 +1,638 @@
+#!/usr/bin/env bash
+#
+# memory-analysis.sh — Appsmith memory snapshot + sizing verdict.
+#
+# Runs inside the Appsmith container and prints a single-page report:
+#   1. System RAM / cgroup limit
+#   2. Per-process RSS for known Appsmith components
+#   3. JVM detail (heap committed/used, MaxHeapSize ceiling, metaspace, NMT)
+#   4. Threads — total count by default; --threads adds state/pool/stack breakdown
+#   5. MongoDB WiredTiger cache (configured ceiling + current usage) and data-dir size
+#   6. Memory currently in use — live RSS roll-up vs. available memory
+#   7. Memory budget — configured ceilings vs. available memory
+#   8. Verdict (OK / TIGHT / AT RISK)
+#
+# Output is plain text, safe to paste into a support ticket. No heap dumps,
+# no tarballs — see diagnostics.sh for the heavy capture.
+#
+# Usage:
+#   bash memory-analysis.sh                       # full report
+#   bash memory-analysis.sh --no-mongo            # skip mongo probe
+#   bash memory-analysis.sh --threads             # add thread breakdown (single snapshot)
+#   bash memory-analysis.sh --threads-sample      # add stuck-thread sampling (10s window)
+#   bash memory-analysis.sh --threads-sample=20   # sampling window in seconds (3-60)
+#
+# Exit codes: always 0 on a captured snapshot. Verdict severity is in the
+# text, not the exit code, so callers can tee/share without surprises.
+
+set -u
+
+SKIP_MONGO=0
+WITH_THREADS=0
+SAMPLE_SECS=0
+for arg in "$@"; do
+  case "$arg" in
+    --no-mongo)            SKIP_MONGO=1 ;;
+    --threads)             WITH_THREADS=1 ;;
+    --threads-sample)      WITH_THREADS=1; SAMPLE_SECS=10 ;;
+    --threads-sample=*)    WITH_THREADS=1; SAMPLE_SECS="${arg#*=}" ;;
+    -h|--help)
+      sed -n '2,24p' "$0"; exit 0 ;;
+  esac
+done
+# Clamp sample window to a sane range.
+if [[ "$SAMPLE_SECS" =~ ^[0-9]+$ ]]; then
+  [[ "$SAMPLE_SECS" -gt 0 && "$SAMPLE_SECS" -lt 3 ]] && SAMPLE_SECS=3
+  [[ "$SAMPLE_SECS" -gt 60 ]] && SAMPLE_SECS=60
+else
+  SAMPLE_SECS=10
+fi
+
+# ---------- helpers ----------------------------------------------------------
+
+hr() { printf -- '----------------------------------------------------------\n'; }
+hdr() { printf '\n== %s ==\n' "$1"; }
+
+human_kb() {  # $1 = KB → "1.23 GiB" / "456.0 MiB" / "12 KiB"; prints "-" when empty or 0
+  awk -v k="$1" 'BEGIN{
+    if (k=="" || k==0) {print "-"; exit}
+    if (k >= 1048576) printf "%.2f GiB\n", k/1048576;
+    else if (k >= 1024) printf "%.1f MiB\n", k/1024;
+    else printf "%d KiB\n", k;
+  }'
+}
+human_bytes() {  # $1 = bytes → GiB/MiB/KiB/B; "-" when empty or 0, "(uncapped)" above 100 TiB
+  awk -v b="$1" 'BEGIN{
+    if (b=="" || b==0) {print "-"; exit}
+    # Some JVM flags report effectively-uncapped as ~uint64_t::max (~16 EiB).
+    # Anything beyond a sane physical scale is "unlimited" for our purposes.
+    if (b > 1099511627776 * 100) {print "(uncapped)"; exit}
+    if (b >= 1073741824) printf "%.2f GiB\n", b/1073741824;
+    else if (b >= 1048576) printf "%.1f MiB\n", b/1048576;
+    else if (b >= 1024) printf "%.1f KiB\n", b/1024;
+    else printf "%d B\n", b;
+  }'
+}
+
+# Total RSS (KB) of processes whose ps line matches ERE $1. Pattern goes via env, not argv, so awk never matches itself.
+sum_rss_kb() {
+  local pattern="$1"
+  ps -eo pid,rss,args --no-headers 2>/dev/null \
+    | ROLE_RE="$pattern" awk 'BEGIN{IGNORECASE=1} $0 ~ ENVIRON["ROLE_RE"] {sum+=$2} END{printf "%d", sum+0}'
+}
+count_procs() {  # number of processes whose ps line matches ERE $1 (pattern passed via env, as in sum_rss_kb)
+  local pattern="$1"
+  ps -eo pid,args --no-headers 2>/dev/null \
+    | ROLE_RE="$pattern" awk 'BEGIN{IGNORECASE=1} $0 ~ ENVIRON["ROLE_RE"] {n++} END{printf "%d", n+0}'
+}
+
+# ---------- 1. system memory -------------------------------------------------
+
+hdr "System"
+echo "Host:        $(hostname)"
+echo "Kernel:      $(uname -srm)"
+echo "Date (UTC):  $(date -u '+%Y-%m-%d %H:%M:%S')"
+if [[ -f /opt/appsmith/info.json ]]; then
+  ver=$(grep -oE '"version"[^,}]*' /opt/appsmith/info.json | head -1 | tr -d '"' | cut -d: -f2 | xargs)
+  # No "edition" key in info.json — infer from the commit URL (CE vs EE repo).
+  commit_url=$(grep -oE '"commitUrl"[^,}]*' /opt/appsmith/info.json | head -1 | tr -d '"' | cut -d: -f2- | xargs)
+  if [[ "$commit_url" == *appsmith-ee* ]]; then
+    edition=EE
+  elif [[ "$commit_url" == *appsmithorg/appsmith* ]]; then
+    edition=CE
+  else
+    edition=""
+  fi
+  echo "Appsmith:    ${edition:+$edition }${ver:-unknown}"
+fi
+
+mem_total_kb=$(awk '/^MemTotal:/{print $2}' /proc/meminfo)
+mem_avail_kb=$(awk '/^MemAvailable:/{print $2}' /proc/meminfo)
+swap_total_kb=$(awk '/^SwapTotal:/{print $2}' /proc/meminfo)
+
+# Cgroup detection (v2 first, then v1)
+cgroup_limit_bytes=""
+cgroup_current_bytes=""
+if [[ -f /sys/fs/cgroup/memory.max ]]; then
+  v=$(cat /sys/fs/cgroup/memory.max 2>/dev/null)
+  [[ "$v" != "max" && -n "$v" ]] && cgroup_limit_bytes="$v"
+  [[ -f /sys/fs/cgroup/memory.current ]] && cgroup_current_bytes=$(cat /sys/fs/cgroup/memory.current 2>/dev/null)
+elif [[ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]]; then
+  v=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes 2>/dev/null)
+  # cgroup v1 "unlimited" sentinel is a giant number; treat anything >host as unlimited
+  if [[ -n "$v" && "$v" -lt $((mem_total_kb * 1024 * 2)) ]]; then
+    cgroup_limit_bytes="$v"
+  fi
+  [[ -f /sys/fs/cgroup/memory/memory.usage_in_bytes ]] && \
+    cgroup_current_bytes=$(cat /sys/fs/cgroup/memory/memory.usage_in_bytes 2>/dev/null)
+fi
+
+echo "Host RAM:    $(human_kb "$mem_total_kb")  (available: $(human_kb "$mem_avail_kb"))"
+echo "Host swap:   $(human_kb "$swap_total_kb")"
+if [[ -n "$cgroup_limit_bytes" ]]; then
+  echo "cgroup cap:  $(human_bytes "$cgroup_limit_bytes")  (current: $(human_bytes "${cgroup_current_bytes:-0}"))"
+  # Convert to KB for the budget math
+  budget_total_kb=$(( cgroup_limit_bytes / 1024 ))
+  budget_source="container cgroup limit"
+else
+  echo "cgroup cap:  (none — container can use full host RAM)"
+  budget_total_kb=$mem_total_kb
+  budget_source="host RAM"
+fi
+
+# ---------- 2. per-process RSS ----------------------------------------------
+
+hdr "Processes (RSS)"
+printf '  %-12s  %10s  %s\n' "ROLE" "RSS" "MATCH"
+hr
+
+declare -A role_kb
+declare -A role_n
+declare -a role_order=(java rts mongod postgres redis keycloak temporal caddy supervisord)
+
+# Note on the prefix class: ps -eo pid,rss,args prints args with the executable
+# name preceded by whitespace (after the rss column), so patterns must accept
+# space, slash, OR line start. (^|/)name fails to match the common "  170752 mongod ..."
+# layout. Same idea for the suffix.
+role_pattern_java="(server\.jar|java .* -jar)"
+role_pattern_rts="(rts.*server|node .*server\.bundle\.js|node .*server\.js)"
+role_pattern_mongod="( |/|^)mongod( |$)"
+# Postgres forks one backend per connection; each backend's "RSS" includes the
+# shared_buffers segment, so summing them overcounts dramatically. The postmaster
+# (started with -D /path) owns the shared region — its RSS is a much better
+# proxy for actual unique memory than sum-across-all-backends.
+role_pattern_postgres="( |/|^)postgres .* -D "
+role_pattern_redis="( |/|^)redis-server( |$)"
+role_pattern_keycloak="(keycloak|quarkus|kc\.sh|java .*keycloak)"
+role_pattern_temporal="(temporal-server|temporalio|temporal start|temporal.*frontend)"
+role_pattern_caddy="( |/|^)caddy( |$)"
+role_pattern_supervisord="( |/|^)supervisord( |$)"
+
+total_other_kb=0
+total_java_kb=0
+total_mongod_kb=0
+for role in "${role_order[@]}"; do
+  var="role_pattern_${role}"
+  pat="${!var}"
+  kb=$(sum_rss_kb "$pat")
+  n=$(count_procs "$pat")
+  role_kb[$role]=$kb
+  role_n[$role]=$n
+  if [[ "$kb" -gt 0 ]]; then
+    printf '  %-12s  %10s  %s (n=%d)\n' "$role" "$(human_kb "$kb")" "$pat" "$n"
+    case "$role" in
+      java)    total_java_kb=$kb ;;
+      mongod)  total_mongod_kb=$kb ;;
+      *)       total_other_kb=$(( total_other_kb + kb )) ;;
+    esac
+  fi
+done
+
+# Anything not matched but worth showing? top non-Appsmith RSS hogs.
+hr
+echo "  (top 5 RSS processes overall, for sanity:)"
+ps -eo pid,rss,comm --sort=-rss --no-headers 2>/dev/null | head -5 \
+  | awk '{printf "    pid %-7d  %8.1f MiB  %s\n", $1, $2/1024, $3}'
+
+# ---------- 3. JVM ----------------------------------------------------------
+
+hdr "JVM (Appsmith server)"
+java_pid=$(pgrep -f -- "-jar\s*server\.jar" | head -1 || true)
+if [[ -z "${java_pid:-}" ]]; then
+  echo "  (no server.jar process found — Appsmith backend not running)"
+else
+  java_rss_kb=$(awk '/^VmRSS:/{print $2}' /proc/$java_pid/status 2>/dev/null)
+  java_threads=$(awk '/^Threads:/{print $2}' /proc/$java_pid/status 2>/dev/null)
+  java_uptime=$(ps -o etime= -p "$java_pid" 2>/dev/null | xargs)
+  echo "  pid:        $java_pid   uptime: ${java_uptime:-?}"
+  echo "  RSS:        $(human_kb "${java_rss_kb:-0}")"
+  echo "  Threads:    ${java_threads:-?}   (rerun with --threads for state breakdown + deadlock check)"
+
+  if command -v jcmd >/dev/null 2>&1; then
+    # VM.flags -all prints every flag with value + origin (default / ergonomic /
+    # command line / etc). That's the only reliable source for MaxHeapSize when
+    # the customer hasn't set -Xmx explicitly: the JVM picks one from
+    # MaxRAMPercentage and reports it as "ergonomic".
+    flags_all=$(jcmd "$java_pid" VM.flags -all 2>/dev/null || true)
+
+    flag_value() {  # $1 = flag name → value column (or empty). Type column may contain digits (uint64_t).
+      grep -E "^[[:space:]]*[a-z0-9_]+[[:space:]]+$1[[:space:]]*=" <<<"$flags_all" \
+        | head -1 | awk -F'=' '{print $2}' | awk '{print $1}'
+    }
+    flag_origin() {  # $1 = flag name → last origin token in braces (e.g. "ergonomic"), or empty
+      grep -E "^[[:space:]]*[a-z0-9_]+[[:space:]]+$1[[:space:]]*=" <<<"$flags_all" \
+        | head -1 | grep -oE '\{[^}]+\}' | tail -1 | tr -d '{}'
+    }
+
+    max_heap=$(flag_value MaxHeapSize)
+    max_heap_origin=$(flag_origin MaxHeapSize)
+    init_heap=$(flag_value InitialHeapSize)
+    metaspace_cap=$(flag_value MaxMetaspaceSize)
+    direct_cap=$(flag_value MaxDirectMemorySize)
+    ram_pct=$(flag_value MaxRAMPercentage)
+    init_ram_pct=$(flag_value InitialRAMPercentage)
+
+    # Active GC — match only real GC selector flags (not unrelated *GC tuning
+    # flags like UseMaximumCompactionOnSystemGC). JDK 17+ defaults to G1.
+    gc_selectors='UseG1GC|UseZGC|UseShenandoahGC|UseParallelGC|UseSerialGC|UseConcMarkSweepGC|UseEpsilonGC'
+    gc=$(grep -E "($gc_selectors)[[:space:]]*=[[:space:]]*true" <<<"$flags_all" \
+      | grep -oE "($gc_selectors)" | head -1)
+
+    case "$max_heap_origin" in
+      *"command line"*) origin_note="explicit (-Xmx on command line)" ;;
+      *ergonomic*)      origin_note="ergonomic — JVM picked this from MaxRAMPercentage; no -Xmx set" ;;
+      *default*)        origin_note="default" ;;
+      "")               origin_note="unknown" ;;
+      *)                origin_note="$max_heap_origin" ;;
+    esac
+
+    echo "  GC:         ${gc:-?}"
+    echo "  -Xmx:       $(human_bytes "${max_heap:-0}")   [$origin_note]"
+    echo "  -Xms:       $(human_bytes "${init_heap:-0}")"
+    [[ -n "$ram_pct" ]]      && echo "  MaxRAM%:    ${ram_pct} (of $([[ -n "$cgroup_limit_bytes" ]] && echo cgroup || echo host) RAM)"
+    [[ -n "$init_ram_pct" ]] && echo "  InitRAM%:   ${init_ram_pct}"
+    echo "  Metaspace:  cap=$(human_bytes "${metaspace_cap:-0}")"
+    echo "  DirectMem:  cap=$(human_bytes "${direct_cap:-0}")"
+
+    # heap_info format varies across GCs (G1 uses K, ZGC/Shenandoah use M, fields
+    # differ). Print the first ~6 lines verbatim — readable and parse-free.
+    heap_info=$(jcmd "$java_pid" GC.heap_info 2>/dev/null | sed 1d || true)
+    if [[ -n "$heap_info" ]]; then
+      echo "  Heap info:"
+      sed -n '1,8p' <<<"$heap_info" | sed 's/^/    /'
+      # Parse first "used N(K|M)" we see — first occurrence is the whole heap
+      # for G1/Z/Shenandoah. (Parallel/Serial report per-generation; we accept
+      # young-gen as a partial answer there rather than failing.)
+      heap_used_k=$(grep -oE 'used [0-9]+K' <<<"$heap_info" | head -1 | grep -oE '[0-9]+')
+      if [[ -z "$heap_used_k" ]]; then
+        heap_used_m=$(grep -oE 'used [0-9]+M' <<<"$heap_info" | head -1 | grep -oE '[0-9]+')
+        [[ -n "$heap_used_m" ]] && heap_used_k=$(( heap_used_m * 1024 ))
+      fi
+    fi
+
+    nmt=$(jcmd "$java_pid" VM.native_memory summary 2>&1 || true)
+    if grep -qi "native memory tracking is not" <<<"$nmt" \
+       || grep -qi "NMT is not enabled" <<<"$nmt"; then
+      echo "  NMT:        disabled  (enable with APPSMITH_JAVA_ARGS='-XX:NativeMemoryTracking=summary' + restart for deeper attribution)"
+    elif [[ -n "$nmt" ]]; then
+      reserved=$(awk '/Total: reserved=/{print; exit}' <<<"$nmt")
+      echo "  NMT:        ${reserved:-(present)}"
+    fi
+  else
+    echo "  jcmd not on PATH — can't introspect JVM. Install JDK tools in the image to enable."
+  fi
+fi
+
+# ---------- 4. Threads (optional, --threads) --------------------------------
+
+if [[ "$WITH_THREADS" -eq 1 && -n "${java_pid:-}" ]] && command -v jcmd >/dev/null 2>&1; then
+  hdr "Threads (single snapshot — signals, not diagnoses)"
+  threads_out=$(jcmd "$java_pid" Thread.print 2>/dev/null || true)
+  if [[ -z "$threads_out" ]]; then
+    echo "  (Thread.print produced no output)"
+  else
+    total=$(grep -cE '^"' <<<"$threads_out")
+    echo "  Total threads: $total  (OS thread count: ${java_threads:-?})"
+
+    # Deadlock check — the JVM appends a "Found one Java-level deadlock" block
+    # at the bottom of Thread.print when it detects any. Clear yes/no.
+    if grep -q 'Found .* Java-level deadlock' <<<"$threads_out"; then
+      echo ""
+      echo "  !! DEADLOCK DETECTED — see relevant section of Thread.print:"
+      sed -n '/Found .* Java-level deadlock/,/^Java stack information/p' <<<"$threads_out" \
+        | head -40 | sed 's/^/    /'
+    else
+      echo "  Deadlocks:     none reported"
+    fi
+
+    # State histogram. Threads not currently parked on Java code may lack a
+    # Thread.State line (e.g. native JVM threads) — those are counted as "other".
+    echo ""
+    echo "  State breakdown:"
+    grep -oE 'java\.lang\.Thread\.State: [A-Z_]+' <<<"$threads_out" \
+      | awk '{print $NF}' | sort | uniq -c | sort -rn \
+      | awk '{printf "    %5d  %s\n", $1, $2}'
+    with_state=$(grep -cE 'java\.lang\.Thread\.State:' <<<"$threads_out")
+    other=$(( total - with_state ))
+    if [[ "$other" -gt 0 ]]; then
+      printf '    %5d  (native / no Thread.State line)\n' "$other"
+    fi
+
+    # Group by thread name prefix. Collapse "-N-thread-M" (executor pools) first: the
+    # generic trailing -N strip would otherwise eat -M and that rule could never match.
+    echo ""
+    echo "  Top thread name groups (after stripping trailing -N suffixes):"
+    grep -oE '^"[^"]+"' <<<"$threads_out" \
+      | tr -d '"' \
+      | sed -E 's/-[0-9]+-thread-[0-9]+$/-thread/; s/(-[0-9]+)+$//' \
+      | sort | uniq -c | sort -rn | head -10 \
+      | awk '{n=$1; $1=""; sub(/^ /,""); printf "    %5d  %s\n", n, $0}'
+
+    # Top frames — the first "at ..." line after each thread header. A pile of
+    # threads parked on the same frame is a strong saturation signal.
+    echo ""
+    echo "  Top 5 hot frames (first stack frame per thread):"
+    awk '/^"/{flag=1; next} flag && /^[[:space:]]+at /{print; flag=0}' <<<"$threads_out" \
+      | sed -E 's/^[[:space:]]+at /at /' \
+      | sort | uniq -c | sort -rn | head -5 \
+      | awk '{n=$1; $1=""; sub(/^ /,""); printf "    %5d  %s\n", n, $0}'
+
+    if [[ "$SAMPLE_SECS" -eq 0 ]]; then
+      echo ""
+      echo "  Note: a single snapshot can spot deadlocks and pool saturation but"
+      echo "  cannot prove a thread is 'stuck' — rerun with --threads-sample for that."
+    fi
+  fi
+fi
+
+# ---------- 4b. Stuck-thread sampling (optional, --threads-sample) ----------
+
+if [[ "$WITH_THREADS" -eq 1 && "$SAMPLE_SECS" -gt 0 && -n "${java_pid:-}" ]] \
+   && command -v jcmd >/dev/null 2>&1; then
+  hdr "Stuck thread sampling (${SAMPLE_SECS}s window)"
+
+  # Awk parser: reads Thread.print text on stdin, emits name<TAB>state<TAB>topframe per thread; threads lacking a state or frame are skipped.
+  parse_threads() {
+    awk '
+      function flush() {
+        if (name != "" && state != "" && topframe != "")
+          print name "\t" state "\t" topframe
+      }
+      /^"/ {
+        flush()
+        name = $0; sub(/^"/, "", name); sub(/".*$/, "", name)
+        state = ""; topframe = ""; next
+      }
+      /java\.lang\.Thread\.State:/ {
+        if (match($0, /State: [A-Z_]+/))
+          state = substr($0, RSTART+7, RLENGTH-7)
+        next
+      }
+      /^[[:space:]]+at / {
+        if (topframe == "") {
+          line = $0; sub(/^[[:space:]]+at /, "", line)
+          topframe = line
+        }
+        next
+      }
+      END { flush() }
+    '
+  }
+
+  # Reuse the snapshot the basic --threads section already captured as snap1
+  # when possible, which saves one Thread.print call. The full SAMPLE_SECS sleep
+  # still runs below, so the effective window is at least SAMPLE_SECS.
+  if [[ -n "${threads_out:-}" ]]; then
+    snap1="$threads_out"
+    echo "  Snapshot 1: reused from threads section above"
+  else
+    echo "  Capturing snapshot 1..."
+    snap1=$(jcmd "$java_pid" Thread.print 2>/dev/null || true)
+  fi
+  echo "  Sleeping ${SAMPLE_SECS}s..."
+  sleep "$SAMPLE_SECS"
+  echo "  Capturing snapshot 2..."
+  snap2=$(jcmd "$java_pid" Thread.print 2>/dev/null || true)
+
+  if [[ -z "$snap1" || -z "$snap2" ]]; then
+    echo "  (one or both snapshots were empty — cannot compare)"
+  else
+    t1=$(mktemp); t2=$(mktemp)
+    parse_threads <<<"$snap1" | sort -t$'\t' -k1,1 > "$t1"
+    parse_threads <<<"$snap2" | sort -t$'\t' -k1,1 > "$t2"
+
+    # Join on thread name; flag rows where both snapshots show RUNNABLE on the
+    # same top frame.
+    stuck=$(join -t $'\t' "$t1" "$t2" \
+      | awk -F'\t' '$2=="RUNNABLE" && $4=="RUNNABLE" && $3==$5 {print $1 "\t" $3}')
+    rm -f "$t1" "$t2"
+
+    if [[ -z "$stuck" ]]; then
+      stuck_count=0
+    else
+      stuck_count=$(wc -l <<<"$stuck" | tr -d ' ')
+    fi
+
+    if [[ "$stuck_count" -eq 0 ]]; then
+      echo ""
+      echo "  No threads stayed RUNNABLE on the same top frame across both snapshots."
+    else
+      echo ""
+      echo "  $stuck_count thread(s) RUNNABLE on the same top frame in both snapshots:"
+      # Group identical (name-prefix, frame) pairs so pool members collapse.
+      echo "$stuck" \
+        | sed -E 's/^([^\t]+)(-[0-9]+(-[0-9]+)*)$/\1/' \
+        | awk -F'\t' '{key=$1"\t"$2; count[key]++} END {for (k in count) print count[k]"\t"k}' \
+        | sort -t$'\t' -k1,1 -rn \
+        | head -20 \
+        | awk -F'\t' '{printf "    [%3d]  %-32s  %s\n", $1, $2, $3}'
+    fi
+
+    echo ""
+    echo "  How to read this:"
+    echo "  - RUNNABLE on the same frame for ${SAMPLE_SECS}s = the JVM scheduler ran"
+    echo "    this thread but it didn't advance past that frame. Strong signal."
+    echo "  - Benign frames to ignore: Selector.select, EPoll.wait, KQueue.poll,"
+    echo "    LockSupport.park (these *should* be in non-RUNNABLE states, but"
+    echo "    short native polls can appear RUNNABLE transiently)."
+    echo "  - Suspicious: application package frames (com.appsmith.*) or library"
+    echo "    code doing work (regex, JSON parsing, DB driver internals)."
+  fi
+fi
+
+# ---------- 5. MongoDB ------------------------------------------------------
+
+hdr "MongoDB"
+mongo_rss_kb=${role_kb[mongod]:-0}
+mongo_is_local=0
+[[ "${mongo_rss_kb:-0}" -gt 0 ]] && mongo_is_local=1
+if [[ "$mongo_is_local" -eq 1 ]]; then
+  echo "  Topology:   embedded (mongod runs in this container)"
+  echo "  RSS:        $(human_kb "$mongo_rss_kb")"
+
+  # Parse --dbpath from the running mongod's args; fall back to image default.
+  mongo_dbpath=$(ps -eo args --no-headers 2>/dev/null \
+    | awk '/[m]ongod / && /--dbpath/{
+        for (i=1;i<=NF;i++) if ($i=="--dbpath") {print $(i+1); exit}
+      }')
+  mongo_dbpath="${mongo_dbpath:-/appsmith-stacks/data/mongodb}"
+  if [[ -d "$mongo_dbpath" ]]; then
+    total_size=$(du -sh "$mongo_dbpath" 2>/dev/null | awk '{print $1}')
+    echo "  Data dir:   $mongo_dbpath  (on disk: ${total_size:-?})"
+    # Show journal subdir if present — it's a common growth path under heavy writes.
+    if [[ -d "$mongo_dbpath/journal" ]]; then
+      journal_size=$(du -sh "$mongo_dbpath/journal" 2>/dev/null | awk '{print $1}')
+      echo "              journal: ${journal_size:-?}"
+    fi
+    echo "              (larger data on disk → more pages WiredTiger may pull into cache)"
+  fi
+else
+  echo "  Topology:   external (no mongod RSS in this container)"
+fi
+
+wt_max_bytes=0
+wt_used_bytes=0
+if [[ "$SKIP_MONGO" -eq 1 ]]; then
+  echo "  (--no-mongo passed; skipping live probe)"
+elif ! command -v mongosh >/dev/null 2>&1 && ! command -v mongo >/dev/null 2>&1; then
+  echo "  (no mongosh/mongo client — skipping WT cache probe)"
+else
+  mclient=mongosh
+  command -v mongosh >/dev/null 2>&1 || mclient=mongo
+  # `docker exec` doesn't inherit APPSMITH_DB_URL from the container's
+  # supervisord env. Read it from the running JVM's /proc/<pid>/environ so we
+  # use the same authenticated URL the running app uses.
+  url="${APPSMITH_DB_URL:-}"
+  if [[ -z "$url" && -n "${java_pid:-}" && -r "/proc/$java_pid/environ" ]]; then
+    url=$(tr '\0' '\n' < "/proc/$java_pid/environ" 2>/dev/null \
+      | awk -F= '/^APPSMITH_DB_URL=/{sub(/^APPSMITH_DB_URL=/,""); print; exit}')
+  fi
+  url="${url:-mongodb://localhost:27017/appsmith?replicaSet=mr1}"
+  # Hide creds when echoing
+  safe_url=$(sed -E 's#(mongodb(\+srv)?://)[^@/]+@#\1***:***@#' <<<"$url")
+  echo "  URL:        $safe_url  (via $mclient)"
+  js='var s=db.serverStatus().wiredTiger.cache;
+print("WT_MAX="+s["maximum bytes configured"]);
+print("WT_USED="+s["bytes currently in the cache"]);'
+  out=$("$mclient" --quiet "$url" --eval "$js" 2>&1 || true)
+  wt_max_bytes=$(grep -oE 'WT_MAX=[0-9]+' <<<"$out" | head -1 | cut -d= -f2)
+  wt_used_bytes=$(grep -oE 'WT_USED=[0-9]+' <<<"$out" | head -1 | cut -d= -f2)
+  if [[ -n "${wt_max_bytes:-}" ]]; then
+    echo "  WT cache:   configured ceiling=$(human_bytes "$wt_max_bytes")  in use=$(human_bytes "${wt_used_bytes:-0}")"
+    echo "              (live from db.serverStatus().wiredTiger.cache —"
+    echo "               'maximum bytes configured' and 'bytes currently in the cache')"
+    if [[ "$mongo_is_local" -eq 0 ]]; then
+      echo "              ^ on the external Mongo host — does NOT consume memory in this container."
+      echo "                (Still worth right-sizing on that host if it's > available RAM there.)"
+    fi
+  else
+    echo "  WT cache:   (probe failed) — $(head -3 <<<"$out" | tr '\n' ' ')"
+    wt_max_bytes=0
+  fi
+fi
+
+# Fallback estimate (only for embedded mongo, only when probe failed):
+# Mongo default = max((RAM - 1 GiB)/2, 256 MiB); 256 MiB is 0.25 GiB, not 0.256.
+if [[ "$mongo_is_local" -eq 1 && "${wt_max_bytes:-0}" -eq 0 ]]; then
+  est_cache_bytes=$(awk -v kb="$budget_total_kb" 'BEGIN{
+    gb = kb/1048576;
+    cap = (gb-1)/2;
+    if (cap < 0.25) cap = 0.25;
+    printf "%d", cap*1073741824;
+  }')
+  echo "  WT cache:   (estimated default ceiling on this RAM) ~$(human_bytes "$est_cache_bytes")"
+  wt_max_bytes=$est_cache_bytes
+fi
+
+# ---------- 6. Current usage (mirrors the budget structure) -----------------
+
+hdr "Memory currently in use"
+budget_total_bytes=$(( budget_total_kb * 1024 ))
+
+# Total in use: prefer cgroup.current (the most authoritative for a container),
+# fall back to (MemTotal - MemAvailable) when no cgroup usage counter was read.
+if [[ -n "${cgroup_current_bytes:-}" ]]; then
+  in_use_bytes=$cgroup_current_bytes
+  in_use_source="cgroup.current"
+else
+  in_use_bytes=$(( (mem_total_kb - mem_avail_kb) * 1024 ))
+  in_use_source="MemTotal - MemAvailable"
+fi
+
+java_rss_bytes=$(( total_java_kb * 1024 ))
+mongo_rss_bytes=$(( total_mongod_kb * 1024 ))
+other_rss_bytes=$(( total_other_kb * 1024 ))
+heap_used_bytes=$(( ${heap_used_k:-0} * 1024 ))
+tracked_sum_bytes=$(( java_rss_bytes + mongo_rss_bytes + other_rss_bytes ))
+
+# Unaccounted = whatever the OS says is in use beyond the procs we recognize.
+# Includes kernel page cache (which cgroup.current counts), shared anon, etc.
+unaccounted_bytes=$(( in_use_bytes - tracked_sum_bytes ))
+[[ "$unaccounted_bytes" -lt 0 ]] && unaccounted_bytes=0
+free_bytes=$(( budget_total_bytes - in_use_bytes ))
+[[ "$free_bytes" -lt 0 ]] && free_bytes=0
+
+printf '  source:                    %s\n' "$budget_source"
+printf '  total available:           %s\n' "$(human_bytes "$budget_total_bytes")"
+printf '  - JVM RSS:                 %s\n' "$(human_bytes "$java_rss_bytes")"
+if [[ -n "${heap_used_k:-}" ]]; then
+  printf '      heap used:             %s\n' "$(human_bytes "$heap_used_bytes")"
+fi
+if [[ "$mongo_is_local" -eq 1 ]]; then
+  printf '  - Mongo RSS:               %s\n' "$(human_bytes "$mongo_rss_bytes")"
+  if [[ -n "${wt_used_bytes:-}" && "${wt_used_bytes:-0}" -gt 0 ]]; then
+    printf '      WT cache in use:       %s\n' "$(human_bytes "$wt_used_bytes")"
+  fi
+else
+  printf '  - Mongo RSS:               (external — not counted here)\n'
+fi
+printf '  - Other Appsmith procs:    %s\n' "$(human_bytes "$other_rss_bytes")"
+printf '  - Other / kernel buffers:  %s   (in-use beyond tracked procs)\n' "$(human_bytes "$unaccounted_bytes")"
+printf '  = total in use:            %s   (from %s)\n' "$(human_bytes "$in_use_bytes")" "$in_use_source"
+printf '  + free:                    %s\n' "$(human_bytes "$free_bytes")"
+
+# ---------- 7. Budget reconciliation ----------------------------------------
+
+hdr "Memory budget"
+
+# JVM "worst case" footprint: Xmx + estimated non-heap overhead.
+jvm_max_heap=${max_heap:-0}
+jvm_nonheap_estimate=$(( 500 * 1024 * 1024 ))  # 500 MiB rough overhead from Notion analysis
+jvm_ceiling_bytes=$(( jvm_max_heap + jvm_nonheap_estimate ))
+
+other_bytes=$(( total_other_kb * 1024 ))
+headroom_bytes=$(( 1024 * 1024 * 1024 ))  # 1 GiB safety for spikes (refactor APIs etc)
+
+# Only count WT cache against this container's budget when mongod is local.
+if [[ "$mongo_is_local" -eq 1 ]]; then
+  wt_in_budget=$wt_max_bytes
+else
+  wt_in_budget=0
+fi
+ceiling_sum=$(( jvm_ceiling_bytes + wt_in_budget + other_bytes + headroom_bytes ))
+
+printf '  source:                    %s\n' "$budget_source"
+printf '  total available:           %s\n' "$(human_bytes "$budget_total_bytes")"
+printf '  - JVM Xmx ceiling:         %s\n' "$(human_bytes "$jvm_max_heap")"
+printf '  - JVM non-heap (est):      %s\n' "$(human_bytes "$jvm_nonheap_estimate")"
+if [[ "$mongo_is_local" -eq 1 ]]; then
+  printf '  - Mongo WT cache ceiling:  %s\n' "$(human_bytes "$wt_in_budget")"
+else
+  printf '  - Mongo WT cache ceiling:  (external Mongo — not counted)\n'
+fi
+printf '  - Other Appsmith procs:    %s   (observed RSS: rts+keycloak+temporal+caddy+supervisord+redis+postgres)\n' "$(human_bytes "$other_bytes")"
+printf '  - Reserved spike headroom: %s\n' "$(human_bytes "$headroom_bytes")"
+printf '  = sum of ceilings:         %s\n' "$(human_bytes "$ceiling_sum")"
+
+# ---------- 8. Verdict ------------------------------------------------------
+
+hdr "Verdict"
+
+gb_avail=$(awk -v b="$budget_total_bytes" 'BEGIN{printf "%.1f", b/1073741824}')
+overshoot_bytes=$(( ceiling_sum - budget_total_bytes ))
+margin=$(( budget_total_bytes - ceiling_sum ))
+tight_threshold=$(( 512 * 1024 * 1024 ))  # < 512 MiB free is uncomfortably close
+
+echo "  Available:  $gb_avail GiB (source: $budget_source)"
+echo ""
+if [[ "$overshoot_bytes" -gt 0 ]]; then
+  echo "  STATUS: AT RISK — configured ceilings exceed available memory by $(human_bytes "$overshoot_bytes")."
+  echo "          Under load (e.g. refactor / large widget rename) the JVM heap can grow"
+  echo "          toward -Xmx while WiredTiger holds its cache, forcing OS-level OOM."
+elif [[ "$margin" -lt "$tight_threshold" ]]; then
+  echo "  STATUS: TIGHT — ceilings fit, but only $(human_bytes "$margin") headroom remains."
+  echo "          A single large operation (refactor, wide query) could push the container"
+  echo "          over its limit."
+else
+  echo "  STATUS: OK — ceilings fit within available memory ($(human_bytes "$margin") headroom)."
+fi
+
+# Flag non-default deployments — informational, not prescriptive.
+if [[ -n "${java_pid:-}" ]] && pgrep -f -- "dynatrace|oneagent|liboneagent" >/dev/null 2>&1; then
+  echo ""
+  echo "  NOTE: Dynatrace OneAgent detected — typically adds ~200-400 MiB to JVM RSS."
+fi
+
+hr
+echo "Snapshot captured at $(date -u '+%Y-%m-%dT%H:%M:%SZ')."
+echo "Run again under load (during a slow refactor / large action) for peak numbers."

From 6b9815d50aa37f469f916457ad854de1fdd96cae Mon Sep 17 00:00:00 2001
From: Wyatt Walter <wyattwalter@gmail.com>
Date: Thu, 14 May 2026 13:52:10 -0500
Subject: [PATCH 2/4] fix: anchor stuck-thread sed on \t, not end-of-line

The pattern `(-[0-9]+(-[0-9]+)*)$` never matched because the suffix it's
stripping lives between the thread name and the top frame (separated by
a tab), not at end-of-line. Result: pool members like boundedElastic-1,
boundedElastic-2 never collapsed into a single grouping row.

Anchor on \t instead so the trailing -N(-M)* is stripped from the name
field before the tab.
---
 deploy/docker/fs/opt/appsmith/memory-analysis.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deploy/docker/fs/opt/appsmith/memory-analysis.sh b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
index fd6f635f205d..f2799d540b7c 100755
--- a/deploy/docker/fs/opt/appsmith/memory-analysis.sh
+++ b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
@@ -420,8 +420,10 @@ if [[ "$WITH_THREADS" -eq 1 && "$SAMPLE_SECS" -gt 0 && -n "${java_pid:-}" ]] \
       echo ""
       echo "  $stuck_count thread(s) RUNNABLE on the same top frame in both snapshots:"
       # Group identical (name-prefix, frame) pairs so pool members collapse.
+      # The suffix to strip lives before the TAB separating name and frame —
+      # not at end-of-line — so anchor on \t rather than $.
       echo "$stuck" \
-        | sed -E 's/^([^\t]+)(-[0-9]+(-[0-9]+)*)$/\1/' \
+        | sed -E 's/-[0-9]+(-[0-9]+)*\t/\t/' \
         | awk -F'\t' '{key=$1"\t"$2; count[key]++} END {for (k in count) print count[k]"\t"k}' \
         | sort -t$'\t' -k1,1 -rn \
         | head -20 \

From 2fd6ea610649497433e3313e84de60d69d1a84f9 Mon Sep 17 00:00:00 2001
From: Wyatt Walter <wyattwalter@gmail.com>
Date: Thu, 14 May 2026 15:04:29 -0500
Subject: [PATCH 3/4] fix: clamp --threads-sample=0 up to the 3s minimum

Previous clamp only nudged 1/2 up to 3, leaving 0 untouched. A user
passing --threads-sample=0 then got WITH_THREADS=1 with SAMPLE_SECS=0,
which silently skipped the sampling section while still printing the
basic threads view. Honor the flag's intent by clamping any in-range
integer below 3 up to 3.
---
 deploy/docker/fs/opt/appsmith/memory-analysis.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/docker/fs/opt/appsmith/memory-analysis.sh b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
index f2799d540b7c..eff363d8f7bb 100755
--- a/deploy/docker/fs/opt/appsmith/memory-analysis.sh
+++ b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
@@ -42,7 +42,7 @@ for arg in "$@"; do
 done
 # Clamp sample window to a sane range.
 if [[ "$SAMPLE_SECS" =~ ^[0-9]+$ ]]; then
-  [[ "$SAMPLE_SECS" -gt 0 && "$SAMPLE_SECS" -lt 3 ]] && SAMPLE_SECS=3
+  [[ "$SAMPLE_SECS" -lt 3 ]] && SAMPLE_SECS=3
   [[ "$SAMPLE_SECS" -gt 60 ]] && SAMPLE_SECS=60
 else
   SAMPLE_SECS=10

From 4055d635762a8dc1f583f525429f44147e97291a Mon Sep 17 00:00:00 2001
From: Wyatt Walter <wyattwalter@gmail.com>
Date: Thu, 14 May 2026 15:13:34 -0500
Subject: [PATCH 4/4] fix: separate --threads from --threads-sample; redact
 mongo client errors

Two issues caught by review:

1. Last commit's unconditional <3 clamp regressed --threads by clamping
   the default SAMPLE_SECS=0 up to 3, which then triggered the stuck-
   thread sampling section. Introduce WITH_THREAD_SAMPLE to distinguish
   "sampling explicitly requested" from "default value," and gate the
   clamp on it. Plain --threads now correctly leaves SAMPLE_SECS=0 and
   skips sampling.

2. The Mongo probe-failure line printed raw mongosh/mongo error output,
   which can include the connection string with credentials. Apply the
   same URI-redaction sed to a safe_out variable used only for display.
---
 .../docker/fs/opt/appsmith/memory-analysis.sh | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/deploy/docker/fs/opt/appsmith/memory-analysis.sh b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
index eff363d8f7bb..67f8c4a0d1e4 100755
--- a/deploy/docker/fs/opt/appsmith/memory-analysis.sh
+++ b/deploy/docker/fs/opt/appsmith/memory-analysis.sh
@@ -29,23 +29,28 @@ set -u
 
 SKIP_MONGO=0
 WITH_THREADS=0
+WITH_THREAD_SAMPLE=0
 SAMPLE_SECS=0
 for arg in "$@"; do
   case "$arg" in
     --no-mongo)            SKIP_MONGO=1 ;;
     --threads)             WITH_THREADS=1 ;;
-    --threads-sample)      WITH_THREADS=1; SAMPLE_SECS=10 ;;
-    --threads-sample=*)    WITH_THREADS=1; SAMPLE_SECS="${arg#*=}" ;;
+    --threads-sample)      WITH_THREADS=1; WITH_THREAD_SAMPLE=1; SAMPLE_SECS=10 ;;
+    --threads-sample=*)    WITH_THREADS=1; WITH_THREAD_SAMPLE=1; SAMPLE_SECS="${arg#*=}" ;;
     -h|--help)
       sed -n '2,24p' "$0"; exit 0 ;;
   esac
 done
-# Clamp sample window to a sane range.
-if [[ "$SAMPLE_SECS" =~ ^[0-9]+$ ]]; then
-  [[ "$SAMPLE_SECS" -lt 3 ]] && SAMPLE_SECS=3
-  [[ "$SAMPLE_SECS" -gt 60 ]] && SAMPLE_SECS=60
-else
-  SAMPLE_SECS=10
+# Clamp sample window to [3, 60] seconds — only when sampling was explicitly
+# requested. Plain --threads leaves SAMPLE_SECS at the 0 default so the
+# stuck-thread section is correctly skipped. Non-numeric input falls back to 10.
+if [[ "$WITH_THREAD_SAMPLE" -eq 1 ]]; then
+  if [[ "$SAMPLE_SECS" =~ ^[0-9]+$ ]]; then
+    # 10# forces base 10: zero-padded input ("08") is otherwise invalid octal.
+    SAMPLE_SECS=$(( 10#$SAMPLE_SECS < 3 ? 3 : (10#$SAMPLE_SECS > 60 ? 60 : 10#$SAMPLE_SECS) ))
+  else
+    SAMPLE_SECS=10
+  fi
 fi
 
 # ---------- helpers ----------------------------------------------------------
@@ -497,6 +502,9 @@ else
 print("WT_MAX="+s["maximum bytes configured"]);
 print("WT_USED="+s["bytes currently in the cache"]);'
   out=$("$mclient" --quiet "$url" --eval "$js" 2>&1 || true)
+  # Drivers sometimes echo the connection string in auth/parse errors; redact
+  # every URI's userinfo (up to its last '@') before it reaches the tarball.
+  safe_out=$(sed -E 's#(mongodb(\+srv)?://)[^/[:space:]]+@#\1***:***@#g' <<<"$out")
   wt_max_bytes=$(grep -oE 'WT_MAX=[0-9]+' <<<"$out" | head -1 | cut -d= -f2)
   wt_used_bytes=$(grep -oE 'WT_USED=[0-9]+' <<<"$out" | head -1 | cut -d= -f2)
   if [[ -n "${wt_max_bytes:-}" ]]; then
@@ -508,7 +516,7 @@ print("WT_USED="+s["bytes currently in the cache"]);'
       echo "                (Still worth right-sizing on that host if it's > available RAM there.)"
     fi
   else
-    echo "  WT cache:   (probe failed) — $(head -3 <<<"$out" | tr '\n' ' ')"
+    echo "  WT cache:   (probe failed) — $(head -3 <<<"$safe_out" | tr '\n' ' ')"
     wt_max_bytes=0
   fi
 fi