From ab91d7421e18b39dadf4c466cb37feec3b71bfcd Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 10 Feb 2026 13:17:27 -0700 Subject: [PATCH 1/4] Add readme with details on cnpg postgres clusters, rebuilding, and other good info --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a09180..3dab035 100644 --- a/README.md +++ b/README.md @@ -1 +1,69 @@ -RDA python Package to manage RDA PostgreSQL databases. +# RDA PostgreSQL Management + +This repository contains: +- Python package to manage RDA PostgreSQL databases +- Helm charts for PostgreSQL clusters via CloudNativePG + +## PostgreSQL Clusters + +### Architecture +- **pgdb01** (NWSC cluster): Primary database +- **pgdb02** (ML cluster): Replication target (streaming replica from pgdb01) +- **pgdb03** (ML cluster) + +Both clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`. + +### Deployment +All PostgreSQL resources are automatically deployed and updated via **ArgoCD**. To make changes: + +1. Update the relevant YAML files in this repository +2. Commit and push changes +3. ArgoCD will automatically apply the updates to the clusters + +No manual `kubectl apply` or Helm commands are needed for routine updates. + +### Rebuilding Replication (pgdb02) + +If replication fails or pgdb02 becomes out of sync with pgdb01, you can rebuild it by deleting the cluster resource. The cluster will automatically re-bootstrap from pgdb01 via `pg_basebackup`. + +**Note:** Make sure you are targeting the right database before running any of these commands. + +**Option 1: Via ArgoCD UI** +1. Navigate to the pgdb02 application in [ArgoCD](https://mlc1-argo.k8s.ucar.edu/applications/argocd/rda-pgdb02?view=tree&resource=) +2. Find the `Cluster` resource named `pgdb02` +3. Delete the resource +4. Click "Sync" to recreate it +5. 
Monitor the logs in ArgoCD + +**Option 2: Via kubectl** +```bash +# Delete the cluster +kubectl delete cluster pgdb02 -n rda --context mlc1 + +# ArgoCD will automatically recreate it on the next sync +# Or manually sync in the ArgoCD UI +``` + +**Monitoring rebuild progress:** +```bash +# Watch cluster status +kubectl get cluster pgdb02 -n rda --context mlc1 -w + +# Get the pg_basebackup pod name (it will have a random suffix) +kubectl get pods -n rda --context mlc1 | grep pgbasebackup + +# View pg_basebackup logs (replace with actual pod name from above) +kubectl logs -n rda pgdb02-1-pgbasebackup-xxxxx -f --context mlc1 + +# Check PVC size growth +kubectl get pvc -n rda --context mlc1 | grep pgdb02 + +# Watch PVC size in real-time +watch -n 5 'kubectl get pvc -n rda --context mlc1 | grep pgdb02' +``` + +The rebuild process typically takes several hours depending on database size. The pg_basebackup job runs in a pod with a name like `pgdb02-1-pgbasebackup-xxxxx` (where `xxxxx` is a random suffix). Once the backup completes, the main `pgdb02-1` pod will start and the cluster status will show as "Cluster in healthy state". + +### Important Notes +- **Version Compatibility**: pgdb02 must use the same PostgreSQL major version as pgdb01. Currently both run `ghcr.io/cloudnative-pg/postgresql:17.4` +- **Data Loss**: Deleting and rebuilding pgdb02 discards the replica's local copy and resyncs all data from pgdb01. This is safe because pgdb02 is only a replica (pgdb01 keeps the authoritative copy), but the resync takes time. 
\ No newline at end of file From b9a223b7e7f3a1f1fec8923183c836c081f164bc Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 10 Feb 2026 14:15:04 -0700 Subject: [PATCH 2/4] Fix some things copilot called out --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3dab035..c59e31a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # RDA PostgreSQL Management This repository contains: + - Python package to manage RDA PostgreSQL databases - Helm charts for PostgreSQL clusters via CloudNativePG @@ -11,7 +12,7 @@ This repository contains: - **pgdb02** (ML cluster): Replication target (streaming replica from pgdb01) - **pgdb03** (ML cluster) -Both clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`. +All Postgres clusters are managed via the [CloudNativePG operator](https://cloudnative-pg.io/) using Helm charts from `https://cloudnative-pg.github.io/charts`. ### Deployment All PostgreSQL resources are automatically deployed and updated via **ArgoCD**. To make changes: @@ -26,7 +27,7 @@ No manual `kubectl apply` or Helm commands are needed for routine updates. If replication fails or pgdb02 becomes out of sync with pgdb01, you can rebuild it by deleting the cluster resource. The cluster will automatically re-bootstrap from pgdb01 via `pg_basebackup`. -**Note:** Make sure you are targeting the right database before running any of these commands. +**Note:** Make sure you are targeting the right database before running any of these commands. **Option 1: Via ArgoCD UI** 1. 
Navigate to the pgdb02 application in [ArgoCD](https://mlc1-argo.k8s.ucar.edu/applications/argocd/rda-pgdb02?view=tree&resource=) From 5c3676b99b9aad69c11b350f4bdad85981d92741 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 09:58:58 -0700 Subject: [PATCH 3/4] Add working alert configuration --- pgdb02-cirrus/templates/alert-email.yaml | 27 +++++++++++++++++++ .../{alerts.yaml => alert-rule.yaml} | 18 ++++++------- 2 files changed, 36 insertions(+), 9 deletions(-) create mode 100644 pgdb02-cirrus/templates/alert-email.yaml rename pgdb02-cirrus/templates/{alerts.yaml => alert-rule.yaml} (66%) diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml new file mode 100644 index 0000000..38ef924 --- /dev/null +++ b/pgdb02-cirrus/templates/alert-email.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: gdex-alerts + namespace: rda + labels: + alertmanagerConfig: gdex + namespace: rda +spec: + route: + receiver: gdex-alerts + groupBy: + - alertname + groupWait: 10s + groupInterval: 1m + repeatInterval: 5m + matchers: + - name: namespace + value: rda + matchType: "=" + + receivers: + - name: gdex-alerts + emailConfigs: + - to: decs-info@ucar.edu + from: alertmanager@k8s.ucar.edu + smarthost: vdir.ucar.edu:25 \ No newline at end of file diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alert-rule.yaml similarity index 66% rename from pgdb02-cirrus/templates/alerts.yaml rename to pgdb02-cirrus/templates/alert-rule.yaml index 1bcd531..bc0254e 100644 --- a/pgdb02-cirrus/templates/alerts.yaml +++ b/pgdb02-cirrus/templates/alert-rule.yaml @@ -1,16 +1,16 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - name: cnpg-replication-alerts + name: gdex-pg-replication-alerts namespace: rda labels: team: gdex-app-team spec: groups: - - name: cnpg.replication + - name: pg.replication interval: 60s rules: - - alert: 
CNPGReplicationLagHigh + - alert: PGReplicationLagHigh expr: | cnpg_pg_replication_lag{namespace="rda"} > 100 for: 15m @@ -19,10 +19,10 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "CNPG replication lag high on {{ $labels.pod }}" + summary: "PostgresDB replication lag high on {{ $labels.pod }}" description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}" - - alert: CNPGReplicationBroken + - alert: PGReplicationBroken expr: | cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 for: 5m @@ -31,10 +31,10 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "CNPG replication broken for {{ $labels.cluster }}" + summary: "PostgresDB replication broken for {{ $labels.cluster }}" description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken." - - alert: CNPGClusterNotHealthy + - alert: PGClusterNotHealthy expr: | cnpg_collector_up{namespace="rda"} == 0 for: 5m @@ -43,5 +43,5 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "CNPG cluster {{ $labels.cluster }} is not healthy" - description: "The CNPG exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file + summary: "PostgresDB cluster {{ $labels.cluster }} is not healthy" + description: "The Postgres exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file From b9daa5e7dc7d3e3ec0c909f44801b4d4ecd4e035 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 11:48:48 -0700 Subject: [PATCH 4/4] Increase email timing to 1 hr and ensure team name is the same across the board --- pgdb02-cirrus/templates/alert-email.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml index 38ef924..465e4f8 100644 --- a/pgdb02-cirrus/templates/alert-email.yaml 
+++ b/pgdb02-cirrus/templates/alert-email.yaml @@ -1,26 +1,26 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: AlertmanagerConfig metadata: - name: gdex-alerts + name: gdex-app-team namespace: rda labels: alertmanagerConfig: gdex namespace: rda spec: route: - receiver: gdex-alerts + receiver: gdex-app-team groupBy: - alertname groupWait: 10s groupInterval: 1m - repeatInterval: 5m + repeatInterval: 60m matchers: - name: namespace value: rda matchType: "=" receivers: - - name: gdex-alerts + - name: gdex-app-team emailConfigs: - to: decs-info@ucar.edu from: alertmanager@k8s.ucar.edu