-
Notifications
You must be signed in to change notification settings - Fork 9
158 lines (145 loc) · 6.76 KB
/
Copy pathobservability.docs.plus.yml
File metadata and controls
158 lines (145 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Server-only observability stack deploy. Decoupled from the app's blue-green
# release: no quality-gate dependency and outside the app deploy concurrency
# group — deliberate (the self-hosted runner serializes jobs). See docs/observability.md.
name: CI/CD Observability
on:
  # Reusable entry point: a caller workflow may pass `action`; it is optional
  # and falls back to 'setup'.
  workflow_call:
    inputs:
      action:
        type: string
        required: false
        default: setup
  # Manual entry point: the operator picks the action from a fixed list.
  workflow_dispatch:
    inputs:
      action:
        description: 'Action to run on the stack'
        type: choice
        options:
          - setup
          - update
          - restart
          - down
        required: true
        default: setup
# Every `run:` step in this workflow is executed with bash.
defaults:
  run:
    shell: bash
# Workflow-wide token scope: read-only access to repository contents.
permissions:
  contents: read
env:
  # Absolute path of the env file on the runner host; copied into the workspace
  # by the "Prepare environment" steps.
  ENV_SOURCE: /opt/projects/prod.docs.plus/.env.observability
  # Workspace-relative copy handed to `docker compose --env-file`.
  ENV_FILE: .env.observability
  # Compose file describing the observability stack.
  COMPOSE_FILE: docker-compose.observability.yml
jobs:
  # Brings the observability stack up (setup/update) or restarts it (restart),
  # then verifies Grafana and GlitchTip are healthy. Skipped for action 'down'.
  deploy:
    name: 🔭 Deploy Observability
    runs-on: prod.docs.plus
    timeout-minutes: 15
    concurrency:
      group: observability-deploy
      cancel-in-progress: false
    # Gate on the action input, NOT github.event_name: inside a reusable called via
    # workflow_call, github.event_name is the CALLER's event ('push'), never
    # 'workflow_call', so the old check skipped every orchestrated deploy. This
    # workflow only triggers on workflow_call/workflow_dispatch (no push), and
    # `action` defaults to 'setup' in both, so this runs unless it's a teardown.
    if: inputs.action != 'down'
    # No `environment: production` — it would inherit the production environment's
    # required-reviewer protection and gate every monitoring deploy on manual approval.
    permissions:
      contents: read
    steps:
      - name: 📦 Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 1

      - name: 🔐 Prepare environment
        run: |
          cp "${ENV_SOURCE}" "${ENV_FILE}"
          # Require a real first character: dotenv strips quotes, so KEY="" or
          # whitespace-only values are empty at runtime and must fail here.
          for k in GRAFANA_ADMIN_PASSWORD GLITCHTIP_SECRET_KEY GLITCHTIP_POSTGRES_PASSWORD TELEGRAM_BOT_TOKEN TELEGRAM_CHAT_ID DATABASE_URL DEADMAN_WEBHOOK_URL; do
            if ! grep -Eq "^${k}=[\"']?[^\"'[:space:]]" "${ENV_FILE}"; then
              echo "::error::${k} missing or empty in ${ENV_SOURCE}"; exit 1
            fi
          done
          echo "✅ Environment ready"

      - name: 📨 Render Telegram contact point
        # Grafana's native ${VAR} expansion in alerting provisioning mistypes a numeric
        # chat id (-100…) into a JSON number and fails to load; envsubst renders a literal
        # quoted string instead. Only the allowlisted vars are substituted so the Go
        # message template and the LogQL/PromQL `$` in rules-*.yml are left intact.
        run: |
          # Export everything the env file defines so envsubst can see it.
          set -a
          # shellcheck disable=SC1090
          . "./${ENV_FILE}"
          set +a
          # Warnings route falls back to the main ops chat until a dedicated one is set.
          : "${TELEGRAM_WARNINGS_CHAT_ID:=${TELEGRAM_CHAT_ID}}"
          export TELEGRAM_WARNINGS_CHAT_ID
          # shellcheck disable=SC2016
          envsubst '${TELEGRAM_BOT_TOKEN} ${TELEGRAM_CHAT_ID} ${TELEGRAM_WARNINGS_CHAT_ID} ${DEADMAN_WEBHOOK_URL}' \
            < scripts/observability/grafana/provisioning/alerting/contactpoints.yml.tmpl \
            > scripts/observability/grafana/provisioning/alerting/contactpoints.yml

      - name: 💾 Disk guard
        run: |
          # `df --output=avail` reports 1K blocks; two divisions by 1024 give GB.
          AVAIL_GB=$(( $(df --output=avail / | tail -1) / 1024 / 1024 ))
          echo "📊 ${AVAIL_GB} GB free on /"
          if [ "${AVAIL_GB}" -lt 10 ]; then
            echo "::error::Less than 10 GB free on /. Aborting."; df -h /; exit 1
          fi

      - name: 🔧 Ensure network
        # Best-effort: an "already exists" error is expected on every run after the first.
        run: docker network create docsplus-network 2>/dev/null || true

      - name: 🚀 Deploy stack
        env:
          # Passed through the environment rather than interpolated into the script,
          # so the input value is never parsed as shell.
          ACTION_IN: ${{ inputs.action }}
        run: |
          ACTION="${ACTION_IN:-setup}"
          # Single dispatch: every action is validated before any docker command runs.
          case "${ACTION}" in
            restart)
              docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" restart ;;
            setup|update)
              docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" pull
              # Validate the mounted config before touching the running stack.
              docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
                run --rm --entrypoint promtool prometheus check config /etc/prometheus/prometheus.yml
              docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d --remove-orphans
              # `up -d` never re-reads bind-mounted config files on unchanged containers;
              # restart forces prometheus.yml + grafana provisioning to be picked up.
              docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" restart prometheus grafana ;;
            down)
              # The job-level `if` already excludes 'down'; refuse it here too so a
              # teardown request can never fall through to a pull + up.
              echo "::error::action 'down' is handled by the teardown job, not deploy"; exit 1 ;;
            *)
              echo "::error::unknown observability action '${ACTION}'"; exit 1 ;;
          esac

      - name: 🩺 Verify
        run: |
          docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" ps
          # Grafana ships wget. Poll up to ~60s and FAIL the step if it never reports
          # ok: a bare `probe | grep -q ok && echo …` cannot fail the step, because
          # `set -e` ignores a failing non-final command of an `&&` list.
          grafana_ok=0
          for _i in $(seq 1 12); do
            # Capture first, then match, so grep exiting early cannot SIGPIPE the probe.
            if health="$(docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T grafana \
                wget -qO- http://localhost:3000/api/health 2>/dev/null)" \
                && grep -q ok <<<"${health}"; then
              grafana_ok=1; echo "✅ grafana healthy"; break
            fi
            sleep 5
          done
          [ "${grafana_ok}" = 1 ] || { echo "::error::grafana not healthy after ~60s"; exit 1; }
          # GlitchTip cold-boots + migrates and ships no wget/curl (only python3) — poll up to ~90s.
          ok=0
          for _i in $(seq 1 18); do
            if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T glitchtip-web \
              python3 -c 'import urllib.request,sys; sys.exit(0 if urllib.request.urlopen("http://localhost:8080/_health/").status==200 else 1)' 2>/dev/null; then
              ok=1; echo "✅ glitchtip healthy"; break
            fi
            sleep 5
          done
          [ "${ok}" = 1 ] || { echo "::error::glitchtip-web not healthy after ~90s"; exit 1; }

      - name: 📣 Notify failure
        if: failure()
        # Reads ENV_SOURCE (not the copied file) — the copy step itself may be what failed.
        run: bash scripts/ci/notify-telegram.sh "${ENV_SOURCE}" "🔭 Observability deploy FAILED (${GITHUB_SHA}, run ${GITHUB_RUN_ID})"

  # Stops the stack with `docker compose down` (no `-v`, so volumes are kept).
  teardown:
    name: 🧹 Down
    runs-on: prod.docs.plus
    timeout-minutes: 10
    # NOTE(review): inside a workflow_call, github.event_name is the caller's event,
    # so a called `action: down` runs neither job unless the originating event was a
    # manual dispatch. Presumably deliberate (teardown stays manual) — confirm.
    if: github.event_name == 'workflow_dispatch' && inputs.action == 'down'
    permissions:
      contents: read
    steps:
      - name: 📦 Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: 🔐 Prepare environment
        run: cp "${ENV_SOURCE}" "${ENV_FILE}"

      - name: 🧹 Down (keep volumes)
        run: docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" down