From ab91d7421e18b39dadf4c466cb37feec3b71bfcd Mon Sep 17 00:00:00 2001
From: Nick Cote <ncote@ucar.edu>
Date: Tue, 10 Feb 2026 13:17:27 -0700
Subject: [PATCH 1/4] Add readme with details on cnpg postgres clusters,
 rebuilding, and other good info

---
 README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4a09180..3dab035 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,69 @@
-RDA python Package to manage RDA PostgreSQL databases.
+# RDA PostgreSQL Management
+
+This repository contains:
+- Python package to manage RDA PostgreSQL databases
+- Helm charts for PostgreSQL clusters via CloudNativePG
+
+## PostgreSQL Clusters
+
+### Architecture
+- **pgdb01** (NWSC cluster): Primary database
+- **pgdb02** (ML cluster): Replication target (streaming replica from pgdb01)
+- **pgdb03** (ML cluster)
+
+Both clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`.
+
+### Deployment
+All PostgreSQL resources are automatically deployed and updated via **ArgoCD**. To make changes:
+
+1. Update the relevant YAML files in this repository
+2. Commit and push changes
+3. ArgoCD will automatically apply the updates to the clusters
+
+No manual `kubectl apply` or Helm commands are needed for routine updates.
+
+### Rebuilding Replication (pgdb02)
+
+If replication fails or pgdb02 becomes out of sync with pgdb01, you can rebuild it by deleting the cluster resource. The cluster will automatically re-bootstrap from pgdb01 via `pg_basebackup`.
+
+**Note:** Make sure you are targeting the right database before running any of these commands. 
+
+**Option 1: Via ArgoCD UI**
+1. Navigate to the pgdb02 application in [ArgoCD](https://mlc1-argo.k8s.ucar.edu/applications/argocd/rda-pgdb02?view=tree&resource=)
+2. Find the `Cluster` resource named `pgdb02`
+3. Delete the resource
+4. Click "Sync" to recreate it
+5. Monitor the logs in ArgoCD
+
+**Option 2: Via kubectl**
+```bash
+# Delete the cluster
+kubectl delete cluster pgdb02 -n rda --context mlc1
+
+# ArgoCD will automatically recreate it on the next sync
+# Or manually sync in the ArgoCD UI
+```
+
+**Monitoring rebuild progress:**
+```bash
+# Watch cluster status
+kubectl get cluster pgdb02 -n rda --context mlc1 -w
+
+# Get the pg_basebackup pod name (it will have a random suffix)
+kubectl get pods -n rda --context mlc1 | grep pgbasebackup
+
+# View pg_basebackup logs (replace with actual pod name from above)
+kubectl logs -n rda pgdb02-1-pgbasebackup-xxxxx -f --context mlc1
+
+# Check that the pgdb02 PVCs exist and are Bound (CAPACITY is the provisioned size, not the space used)
+kubectl get pvc -n rda --context mlc1 | grep pgdb02
+
+# Re-check PVC status every 5 seconds
+watch -n 5 'kubectl get pvc -n rda --context mlc1 | grep pgdb02'
+```
+
+The rebuild process typically takes several hours depending on database size. The pg_basebackup job runs in a pod with a name like `pgdb02-1-pgbasebackup-xxxxx` (where `xxxxx` is a random suffix). Once the backup completes, the main `pgdb02-1` pod will start and the cluster status will show as "Cluster in healthy state".
+
+### Important Notes
+- **Version Compatibility**: pgdb02 must use the same PostgreSQL major version as pgdb01. Currently both run `ghcr.io/cloudnative-pg/postgresql:17.4`
+- **Data Loss**: Deleting and rebuilding pgdb02 discards its local copy and resyncs all data from pgdb01. This is safe because pgdb02 is only a replica of pgdb01, but the resync takes time.
\ No newline at end of file

From b9a223b7e7f3a1f1fec8923183c836c081f164bc Mon Sep 17 00:00:00 2001
From: Nick Cote <ncote@ucar.edu>
Date: Tue, 10 Feb 2026 14:15:04 -0700
Subject: [PATCH 2/4] Fix some things copilot called out

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3dab035..c59e31a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # RDA PostgreSQL Management
 
 This repository contains:
+
 - Python package to manage RDA PostgreSQL databases
 - Helm charts for PostgreSQL clusters via CloudNativePG
 
@@ -11,7 +12,7 @@ This repository contains:
 - **pgdb02** (ML cluster): Replication target (streaming replica from pgdb01)
 - **pgdb03** (ML cluster)
 
-Both clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`.
+All Postgres clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`.
 
 ### Deployment
 All PostgreSQL resources are automatically deployed and updated via **ArgoCD**. To make changes:
@@ -26,7 +27,7 @@ No manual `kubectl apply` or Helm commands are needed for routine updates.
 
 If replication fails or pgdb02 becomes out of sync with pgdb01, you can rebuild it by deleting the cluster resource. The cluster will automatically re-bootstrap from pgdb01 via `pg_basebackup`.
 
-**Note:** Make sure you are targeting the right database before running any of these commands. 
+**Note:** Make sure you are targeting the right database before running any of these commands.
 
 **Option 1: Via ArgoCD UI**
 1. Navigate to the pgdb02 application in [ArgoCD](https://mlc1-argo.k8s.ucar.edu/applications/argocd/rda-pgdb02?view=tree&resource=)

From 5c3676b99b9aad69c11b350f4bdad85981d92741 Mon Sep 17 00:00:00 2001
From: Nick Cote <ncote@ucar.edu>
Date: Thu, 12 Feb 2026 09:58:58 -0700
Subject: [PATCH 3/4] Add working alert configuration

---
 pgdb02-cirrus/templates/alert-email.yaml      | 27 +++++++++++++++++++
 .../{alerts.yaml => alert-rule.yaml}          | 18 ++++++-------
 2 files changed, 36 insertions(+), 9 deletions(-)
 create mode 100644 pgdb02-cirrus/templates/alert-email.yaml
 rename pgdb02-cirrus/templates/{alerts.yaml => alert-rule.yaml} (66%)

diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml
new file mode 100644
index 0000000..38ef924
--- /dev/null
+++ b/pgdb02-cirrus/templates/alert-email.yaml
@@ -0,0 +1,27 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: gdex-alerts
+  namespace: rda
+  labels:
+    alertmanagerConfig: gdex
+    namespace: rda
+spec:
+  route:
+    receiver: gdex-alerts
+    groupBy:
+      - alertname
+    groupWait: 10s
+    groupInterval: 1m
+    repeatInterval: 5m 
+    matchers:
+      - name: namespace
+        value: rda
+        matchType: "="
+
+  receivers:
+    - name: gdex-alerts
+      emailConfigs:
+        - to: decs-info@ucar.edu
+          from: alertmanager@k8s.ucar.edu
+          smarthost: vdir.ucar.edu:25  # SMTP relay (host:port) used to send the alert emails
\ No newline at end of file
diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alert-rule.yaml
similarity index 66%
rename from pgdb02-cirrus/templates/alerts.yaml
rename to pgdb02-cirrus/templates/alert-rule.yaml
index 1bcd531..bc0254e 100644
--- a/pgdb02-cirrus/templates/alerts.yaml
+++ b/pgdb02-cirrus/templates/alert-rule.yaml
@@ -1,16 +1,16 @@
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
-  name: cnpg-replication-alerts
+  name: gdex-pg-replication-alerts
   namespace: rda
   labels:
     team: gdex-app-team
 spec:
   groups:
-  - name: cnpg.replication
+  - name: pg.replication
     interval: 60s
     rules:
-    - alert: CNPGReplicationLagHigh
+    - alert: PGReplicationLagHigh
       expr: |
         cnpg_pg_replication_lag{namespace="rda"} > 100
       for: 15m
@@ -19,10 +19,10 @@ spec:
         team: gdex-app-team
         namespace: rda
       annotations:
-        summary: "CNPG replication lag high on {{ $labels.pod }}"
+        summary: "PostgresDB replication lag high on {{ $labels.pod }}"
         description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}"
 
-    - alert: CNPGReplicationBroken
+    - alert: PGReplicationBroken
       expr: |
         cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0
       for: 5m
@@ -31,10 +31,10 @@ spec:
         team: gdex-app-team
         namespace: rda
       annotations:
-        summary: "CNPG replication broken for {{ $labels.cluster }}"
+        summary: "PostgresDB replication broken for {{ $labels.cluster }}"
         description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken."
 
-    - alert: CNPGClusterNotHealthy
+    - alert: PGClusterNotHealthy
       expr: |
         cnpg_collector_up{namespace="rda"} == 0
       for: 5m
@@ -43,5 +43,5 @@ spec:
         team: gdex-app-team
         namespace: rda
       annotations:
-        summary: "CNPG cluster {{ $labels.cluster }} is not healthy"
-        description: "The CNPG exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues"
\ No newline at end of file
+        summary: "PostgresDB cluster {{ $labels.cluster }} is not healthy"
+        description: "The CNPG metrics collector reports PostgreSQL as down for cluster {{ $labels.cluster }}, indicating cluster health issues"
\ No newline at end of file

From b9daa5e7dc7d3e3ec0c909f44801b4d4ecd4e035 Mon Sep 17 00:00:00 2001
From: Nick Cote <ncote@ucar.edu>
Date: Thu, 12 Feb 2026 11:48:48 -0700
Subject: [PATCH 4/4] Increase email timing to 1 hr and ensure team name is the
 same across the board

---
 pgdb02-cirrus/templates/alert-email.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml
index 38ef924..465e4f8 100644
--- a/pgdb02-cirrus/templates/alert-email.yaml
+++ b/pgdb02-cirrus/templates/alert-email.yaml
@@ -1,26 +1,26 @@
 apiVersion: monitoring.coreos.com/v1alpha1
 kind: AlertmanagerConfig
 metadata:
-  name: gdex-alerts
+  name: gdex-app-team
   namespace: rda
   labels:
     alertmanagerConfig: gdex
     namespace: rda
 spec:
   route:
-    receiver: gdex-alerts
+    receiver: gdex-app-team
     groupBy:
       - alertname
     groupWait: 10s
     groupInterval: 1m
-    repeatInterval: 5m 
+    repeatInterval: 60m  # wait 1h before re-sending a notification for a still-firing alert
     matchers:
       - name: namespace
         value: rda
         matchType: "="
 
   receivers:
-    - name: gdex-alerts
+    - name: gdex-app-team
       emailConfigs:
         - to: decs-info@ucar.edu
           from: alertmanager@k8s.ucar.edu