-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwatchdog.ps1
More file actions
156 lines (132 loc) · 5.1 KB
/
watchdog.ps1
File metadata and controls
156 lines (132 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# watchdog.ps1 — Verify Enso guardian + server are running; restart if needed.
# Runs on a 2-minute interval via the "Enso Guardian Watchdog" Scheduled Task.
#
# Defense layer 3: catches the case where both guardian and server are dead.
# Under normal operation the guardian (layer 2) handles server restarts, so
# this script only intervenes when the guardian itself is gone.
# ── Configuration ──
# Errors are silenced script-wide: a failing call is treated by the checks
# below as "not healthy" instead of aborting the run.
$ErrorActionPreference = 'SilentlyContinue'

# Per-user state directory holding the watchdog log and the guardian PID file.
$EnsoDir     = '{0}\.enso' -f $env:USERPROFILE
$LogFile     = '{0}\watchdog.log' -f $EnsoDir
$MaxLogLines = 500
$GuardianPid = '{0}\guardian.pid' -f $EnsoDir

# Folder containing this script; used as the working directory when the
# guardian is launched.
$RepoDir = Split-Path -Parent $MyInvocation.MyCommand.Path
$Port    = 3001

# Create the state directory on first run so logging has somewhere to write.
if (-not (Test-Path $EnsoDir)) {
    $null = New-Item -ItemType Directory -Path $EnsoDir -Force
}
# Append a single line to the watchdog log, prefixed with the current local
# time formatted as "yyyy-MM-dd HH:mm:ss".
#   $msg - text to record after the timestamp.
function Write-Log($msg) {
    $stamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    "$stamp $msg" | Add-Content -Path $LogFile
}
# Cap the watchdog log at $MaxLogLines by rewriting it with only its most
# recent lines. Does nothing when the log is missing or already short enough.
function Trim-Log {
    if (-not (Test-Path $LogFile)) { return }

    # Read the whole file first so the rewrite below does not race the read.
    $content = Get-Content $LogFile
    if ($content.Count -le $MaxLogLines) { return }

    $content | Select-Object -Last $MaxLogLines | Set-Content $LogFile
}
# Report whether a process with the given ID is currently running.
#   $ProcessId - numeric process ID; $null, empty or 0 yields $false.
# Returns $true when Get-Process finds the process, $false otherwise
# (including when the lookup throws because no such process exists).
#
# The parameter is deliberately NOT named $pid: PowerShell variable names are
# case-insensitive, so $pid is the automatic $PID variable (the current
# process ID), which is read-only and cannot be bound as a parameter.
function Test-ProcessAlive($ProcessId) {
    if (-not $ProcessId) { return $false }
    try {
        # -ErrorAction Stop turns "process not found" into a catchable error
        # regardless of the script-wide SilentlyContinue preference.
        $proc = Get-Process -Id $ProcessId -ErrorAction Stop
        return ($null -ne $proc)
    } catch {
        return $false
    }
}
# Keep the log bounded before this run appends to it.
Trim-Log

# Cleared by any check that finds a problem; decides the script's exit code.
$allHealthy = $true

# ── 1. Check guardian process via PID file ──
# A missing PID file, or one whose content cannot be parsed, leaves the
# guardian marked as not alive.
$guardianAlive = $false
if (Test-Path $GuardianPid) {
    $pidText = Get-Content $GuardianPid -Raw
    $gpid = [int]$pidText.Trim()
    $guardianAlive = Test-ProcessAlive $gpid
}

# ── 2. Check server health endpoint ──
# Only an HTTP 200 from /health counts as up; any failure leaves $serverUp false.
$serverUp = $false
try {
    $healthUrl = "http://localhost:$Port/health"
    $r = Invoke-WebRequest -Uri $healthUrl -TimeoutSec 10 -UseBasicParsing
    $serverUp = ($r.StatusCode -eq 200)
} catch {}
# ── 3. Decide what to do ──
# Three outcomes:
#   guardian alive + server up   -> log and move on
#   guardian alive + server down -> leave recovery to the guardian
#   guardian not alive           -> clean up stray node processes and relaunch it
if ($guardianAlive -and $serverUp) {
    Write-Log "[ok] Guardian (PID $gpid) + server healthy"
}
elseif ($guardianAlive -and -not $serverUp) {
    # Guardian is alive but server is down — let the guardian handle it.
    # It has its own health polling and will restart the server.
    Write-Log "[wait] Guardian alive (PID $gpid) but server not responding — guardian will handle"
    $allHealthy = $false
}
else {
    # Guardian is dead (or PID file missing) — we need to restart it.
    $allHealthy = $false
    Write-Log "[FAIL] Guardian not running — restarting"

    # Kill any orphaned Enso node processes, identified by their command line.
    # Get-CimInstance is used instead of Get-WmiObject, which is not available
    # in PowerShell 6+ and would fail silently under SilentlyContinue.
    $nodeProcs = Get-CimInstance -ClassName Win32_Process -Filter "Name='node.exe'" | Where-Object {
        $_.CommandLine -match "standalone\.ts" -or $_.CommandLine -match "guardian\.ts"
    }
    foreach ($proc in $nodeProcs) {
        Write-Log "[restart] Killing orphaned node.exe PID $($proc.ProcessId)"
        Stop-Process -Id $proc.ProcessId -Force -ErrorAction SilentlyContinue
    }
    # Give the killed processes a moment to exit before relaunching.
    if ($nodeProcs) { Start-Sleep -Seconds 2 }

    # Start the guardian
    # NOTE(review): this empty file is created but never passed to the launched
    # process — presumably meant as a stdin redirect; confirm before wiring or removing.
    $nullInput = Join-Path $env:TEMP "enso-null-input.txt"
    if (-not (Test-Path $nullInput)) { [System.IO.File]::WriteAllText($nullInput, "") }

    # cmd.exe performs the stdout/stderr redirection into the temp-dir logs.
    $guardianLog = Join-Path $env:TEMP "enso-guardian-stdout.log"
    $guardianErrLog = Join-Path $env:TEMP "enso-guardian-stderr.log"
    Start-Process -FilePath "cmd.exe" `
        -ArgumentList "/c", "npx tsx server/guardian.ts > `"$guardianLog`" 2> `"$guardianErrLog`"" `
        -WorkingDirectory $RepoDir `
        -WindowStyle Hidden

    # Wait for server to become healthy.
    # The wait is bounded by wall-clock time so the "30s" in the log messages is
    # accurate even when individual probes block until their own timeout.
    $recovered = $false
    $waitLimitSec = 30
    $timer = [System.Diagnostics.Stopwatch]::StartNew()
    while ($timer.Elapsed.TotalSeconds -lt $waitLimitSec) {
        # Reset so a response left over from an earlier probe is never
        # mistaken for a fresh success.
        $r = $null
        try {
            $r = Invoke-WebRequest -Uri "http://localhost:$Port/health" -TimeoutSec 5 -UseBasicParsing
        } catch {}
        if ($r -and $r.StatusCode -eq 200) {
            $elapsed = [int][Math]::Ceiling($timer.Elapsed.TotalSeconds)
            Write-Log "[restart] Server recovered after ${elapsed}s"
            $recovered = $true
            break
        }
        Start-Sleep -Seconds 1
    }
    if (-not $recovered) {
        Write-Log "[restart] Server did NOT recover within 30s"
    }
}
# ── 4. Check Vite dev server (optional, non-critical) ──
# Probes TCP port 5173 on the loopback address. The result is only logged;
# it does not affect $allHealthy.
$viteUp = $false
$tcp = $null
try {
    $tcp = New-Object System.Net.Sockets.TcpClient
    $tcp.Connect("127.0.0.1", 5173)
    $viteUp = $true
} catch {
    # Connect failed: treat the dev server as not running.
} finally {
    # Release the socket on both the success and the failure path.
    if ($tcp) { $tcp.Close() }
}
if ($viteUp) {
    Write-Log "[ok] Vite dev server healthy (:5173)"
} else {
    Write-Log "[info] Vite dev server not running (:5173) — this is normal in production"
}
# ── 5. Check Cloudflare tunnel ──
# If no cloudflared process exists and the executable is installed at the
# expected path, start the "enso" tunnel and re-check after a short pause.
# When the executable is absent, nothing is logged and health is unaffected.
$cloudflaredExe = "C:\Program Files (x86)\cloudflared\cloudflared.exe"
$tunnelProc = Get-Process -Name cloudflared -ErrorAction SilentlyContinue

if ($tunnelProc) {
    Write-Log "[ok] Cloudflare tunnel running (PID $($tunnelProc[0].Id))"
} elseif (Test-Path $cloudflaredExe) {
    $allHealthy = $false
    Write-Log "[FAIL] Cloudflare tunnel not running — restarting"

    $launch = @{
        FilePath     = $cloudflaredExe
        ArgumentList = "tunnel", "run", "enso"
        WindowStyle  = "Hidden"
    }
    Start-Process @launch
    Start-Sleep -Seconds 2

    $restarted = Get-Process -Name cloudflared -ErrorAction SilentlyContinue
    if ($restarted) {
        Write-Log "[restart] Cloudflare tunnel started (PID $($restarted[0].Id))"
    } else {
        Write-Log "[restart] Cloudflare tunnel failed to start"
    }
}

# Exit 0 only when every check passed without intervention; otherwise exit 1.
if (-not $allHealthy) { exit 1 }
exit 0