From bdceecbb0bd3d82d5889ced7da81e3fa97362762 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 18 Jun 2025 11:45:03 -0600 Subject: [PATCH 001/126] Move the cluster and service out to create secrets first --- pgdb01-cirrus/{templates => }/pg_service.yaml | 0 pgdb01-cirrus/{templates => }/postgres_cluster.yaml | 0 pgdb02-cirrus/{templates => }/pg_service.yaml | 0 pgdb02-cirrus/{templates => }/postgres_cluster.yaml | 0 pgdb03-cirrus/{templates => }/pg_service.yaml | 0 pgdb03-cirrus/{templates => }/postgres_cluster.yaml | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename pgdb01-cirrus/{templates => }/pg_service.yaml (100%) rename pgdb01-cirrus/{templates => }/postgres_cluster.yaml (100%) rename pgdb02-cirrus/{templates => }/pg_service.yaml (100%) rename pgdb02-cirrus/{templates => }/postgres_cluster.yaml (100%) rename pgdb03-cirrus/{templates => }/pg_service.yaml (100%) rename pgdb03-cirrus/{templates => }/postgres_cluster.yaml (100%) diff --git a/pgdb01-cirrus/templates/pg_service.yaml b/pgdb01-cirrus/pg_service.yaml similarity index 100% rename from pgdb01-cirrus/templates/pg_service.yaml rename to pgdb01-cirrus/pg_service.yaml diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/postgres_cluster.yaml similarity index 100% rename from pgdb01-cirrus/templates/postgres_cluster.yaml rename to pgdb01-cirrus/postgres_cluster.yaml diff --git a/pgdb02-cirrus/templates/pg_service.yaml b/pgdb02-cirrus/pg_service.yaml similarity index 100% rename from pgdb02-cirrus/templates/pg_service.yaml rename to pgdb02-cirrus/pg_service.yaml diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/postgres_cluster.yaml similarity index 100% rename from pgdb02-cirrus/templates/postgres_cluster.yaml rename to pgdb02-cirrus/postgres_cluster.yaml diff --git a/pgdb03-cirrus/templates/pg_service.yaml b/pgdb03-cirrus/pg_service.yaml similarity index 100% rename from pgdb03-cirrus/templates/pg_service.yaml rename to pgdb03-cirrus/pg_service.yaml 
diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/postgres_cluster.yaml similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml rename to pgdb03-cirrus/postgres_cluster.yaml From 5adfeb4d1383fc2e88f2b8e919299f10e0a5a185 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 23 Jun 2025 12:09:15 -0600 Subject: [PATCH 002/126] update secret store name to be rda-ro --- pgdb01-cirrus/templates/su_external_secret.yaml | 2 +- pgdb02-cirrus/templates/su_external_secret.yaml | 2 +- pgdb03-cirrus/templates/su_external_secret.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/templates/su_external_secret.yaml b/pgdb01-cirrus/templates/su_external_secret.yaml index 8ceb3bf..c1b0829 100644 --- a/pgdb01-cirrus/templates/su_external_secret.yaml +++ b/pgdb01-cirrus/templates/su_external_secret.yaml @@ -6,7 +6,7 @@ metadata: spec: refreshInterval: 1h secretStoreRef: - name: user-ro + name: rda-ro kind: SecretStore target: name: {{ .Values.db.name }}-superuser diff --git a/pgdb02-cirrus/templates/su_external_secret.yaml b/pgdb02-cirrus/templates/su_external_secret.yaml index 8ceb3bf..c1b0829 100644 --- a/pgdb02-cirrus/templates/su_external_secret.yaml +++ b/pgdb02-cirrus/templates/su_external_secret.yaml @@ -6,7 +6,7 @@ metadata: spec: refreshInterval: 1h secretStoreRef: - name: user-ro + name: rda-ro kind: SecretStore target: name: {{ .Values.db.name }}-superuser diff --git a/pgdb03-cirrus/templates/su_external_secret.yaml b/pgdb03-cirrus/templates/su_external_secret.yaml index 8ceb3bf..c1b0829 100644 --- a/pgdb03-cirrus/templates/su_external_secret.yaml +++ b/pgdb03-cirrus/templates/su_external_secret.yaml @@ -6,7 +6,7 @@ metadata: spec: refreshInterval: 1h secretStoreRef: - name: user-ro + name: rda-ro kind: SecretStore target: name: {{ .Values.db.name }}-superuser From f82de55e705ca5cf3a7b785c1b9fab2e51047a60 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 23 Jun 2025 12:16:59 -0600 Subject: 
[PATCH 003/126] add in the service and cluster --- pgdb01-cirrus/{ => templates}/pg_service.yaml | 0 pgdb01-cirrus/{ => templates}/postgres_cluster.yaml | 0 pgdb01-cirrus/templates/su_external_secret.yaml | 1 - pgdb02-cirrus/{ => templates}/pg_service.yaml | 0 pgdb02-cirrus/{ => templates}/postgres_cluster.yaml | 0 pgdb02-cirrus/templates/su_external_secret.yaml | 1 - pgdb03-cirrus/{ => templates}/pg_service.yaml | 0 pgdb03-cirrus/{ => templates}/postgres_cluster.yaml | 0 pgdb03-cirrus/templates/su_external_secret.yaml | 1 - 9 files changed, 3 deletions(-) rename pgdb01-cirrus/{ => templates}/pg_service.yaml (100%) rename pgdb01-cirrus/{ => templates}/postgres_cluster.yaml (100%) rename pgdb02-cirrus/{ => templates}/pg_service.yaml (100%) rename pgdb02-cirrus/{ => templates}/postgres_cluster.yaml (100%) rename pgdb03-cirrus/{ => templates}/pg_service.yaml (100%) rename pgdb03-cirrus/{ => templates}/postgres_cluster.yaml (100%) diff --git a/pgdb01-cirrus/pg_service.yaml b/pgdb01-cirrus/templates/pg_service.yaml similarity index 100% rename from pgdb01-cirrus/pg_service.yaml rename to pgdb01-cirrus/templates/pg_service.yaml diff --git a/pgdb01-cirrus/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml similarity index 100% rename from pgdb01-cirrus/postgres_cluster.yaml rename to pgdb01-cirrus/templates/postgres_cluster.yaml diff --git a/pgdb01-cirrus/templates/su_external_secret.yaml b/pgdb01-cirrus/templates/su_external_secret.yaml index c1b0829..d00f818 100644 --- a/pgdb01-cirrus/templates/su_external_secret.yaml +++ b/pgdb01-cirrus/templates/su_external_secret.yaml @@ -10,7 +10,6 @@ spec: kind: SecretStore target: name: {{ .Values.db.name }}-superuser - type: kubernetes.io/basic-auth data: - secretKey: username remoteRef: diff --git a/pgdb02-cirrus/pg_service.yaml b/pgdb02-cirrus/templates/pg_service.yaml similarity index 100% rename from pgdb02-cirrus/pg_service.yaml rename to pgdb02-cirrus/templates/pg_service.yaml diff --git 
a/pgdb02-cirrus/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml similarity index 100% rename from pgdb02-cirrus/postgres_cluster.yaml rename to pgdb02-cirrus/templates/postgres_cluster.yaml diff --git a/pgdb02-cirrus/templates/su_external_secret.yaml b/pgdb02-cirrus/templates/su_external_secret.yaml index c1b0829..d00f818 100644 --- a/pgdb02-cirrus/templates/su_external_secret.yaml +++ b/pgdb02-cirrus/templates/su_external_secret.yaml @@ -10,7 +10,6 @@ spec: kind: SecretStore target: name: {{ .Values.db.name }}-superuser - type: kubernetes.io/basic-auth data: - secretKey: username remoteRef: diff --git a/pgdb03-cirrus/pg_service.yaml b/pgdb03-cirrus/templates/pg_service.yaml similarity index 100% rename from pgdb03-cirrus/pg_service.yaml rename to pgdb03-cirrus/templates/pg_service.yaml diff --git a/pgdb03-cirrus/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml similarity index 100% rename from pgdb03-cirrus/postgres_cluster.yaml rename to pgdb03-cirrus/templates/postgres_cluster.yaml diff --git a/pgdb03-cirrus/templates/su_external_secret.yaml b/pgdb03-cirrus/templates/su_external_secret.yaml index c1b0829..d00f818 100644 --- a/pgdb03-cirrus/templates/su_external_secret.yaml +++ b/pgdb03-cirrus/templates/su_external_secret.yaml @@ -10,7 +10,6 @@ spec: kind: SecretStore target: name: {{ .Values.db.name }}-superuser - type: kubernetes.io/basic-auth data: - secretKey: username remoteRef: From ed1b0b392a782bd9e28432ba96ec141821cf27da Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 23 Jun 2025 12:26:22 -0600 Subject: [PATCH 004/126] Update postgres settings to remove fixed parameters from cnpg --- pgdb01-cirrus/templates/postgres_cluster.yaml | 11 ----------- pgdb02-cirrus/templates/postgres_cluster.yaml | 14 ++------------ pgdb03-cirrus/templates/postgres_cluster.yaml | 12 +----------- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml 
b/pgdb01-cirrus/templates/postgres_cluster.yaml index 49242ca..4451f14 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -27,14 +27,10 @@ spec: postgresql: parameters: # Connection settings - listen_addresses: "*" - port: "5432" max_connections: "500" # SSL Configuration - ssl: "on" ssl_ciphers: "HIGH:!aNULL" - ssl_prefer_server_ciphers: "on" ssl_min_protocol_version: "TLSv1.3" # Memory settings @@ -59,19 +55,12 @@ spec: max_replication_slots: "3" wal_keep_size: "256" max_slot_wal_keep_size: "-1" - hot_standby: "on" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" # Logging settings - log_destination: "stderr" logging_collector: "on" - log_directory: "log" - log_filename: "postgresql-%Y-%m-%d_%H%M%S.log" - log_file_mode: "0644" log_rotation_age: "0" - log_rotation_size: "1GB" - log_truncate_on_rotation: "off" log_min_duration_statement: "120000" log_line_prefix: "%t %a [%p] " log_timezone: "America/Denver" diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 09a38cc..7559d7b 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -29,17 +29,14 @@ spec: name: "{{ .Values.db.name }}-superuser" # Allow outside hosts to connect to the database + postgresql: postgresql: parameters: # Connection settings - listen_addresses: "*" - port: "5432" max_connections: "500" # SSL Configuration - ssl: "on" ssl_ciphers: "HIGH:!aNULL" - ssl_prefer_server_ciphers: "on" ssl_min_protocol_version: "TLSv1.3" # Memory settings @@ -64,19 +61,12 @@ spec: max_replication_slots: "3" wal_keep_size: "256" max_slot_wal_keep_size: "-1" - hot_standby: "on" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" # Logging settings - log_destination: "stderr" logging_collector: "on" - log_directory: "log" - log_filename: "postgresql-%Y-%m-%d_%H%M%S.log" - log_file_mode: "0644" log_rotation_age: 
"0" - log_rotation_size: "1GB" - log_truncate_on_rotation: "off" log_min_duration_statement: "120000" log_line_prefix: "%t %a [%p] " log_timezone: "America/Denver" @@ -92,7 +82,7 @@ spec: # Lock management max_locks_per_transaction: "1024" - + pg_hba: # Local connections with md5 authentication - local all root md5 diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 09a38cc..edd9f2c 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -29,17 +29,14 @@ spec: name: "{{ .Values.db.name }}-superuser" # Allow outside hosts to connect to the database + postgresql: postgresql: parameters: # Connection settings - listen_addresses: "*" - port: "5432" max_connections: "500" # SSL Configuration - ssl: "on" ssl_ciphers: "HIGH:!aNULL" - ssl_prefer_server_ciphers: "on" ssl_min_protocol_version: "TLSv1.3" # Memory settings @@ -64,19 +61,12 @@ spec: max_replication_slots: "3" wal_keep_size: "256" max_slot_wal_keep_size: "-1" - hot_standby: "on" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" # Logging settings - log_destination: "stderr" logging_collector: "on" - log_directory: "log" - log_filename: "postgresql-%Y-%m-%d_%H%M%S.log" - log_file_mode: "0644" log_rotation_age: "0" - log_rotation_size: "1GB" - log_truncate_on_rotation: "off" log_min_duration_statement: "120000" log_line_prefix: "%t %a [%p] " log_timezone: "America/Denver" From 14cff9fcb5986791e095d0fd5924d81117aebd3d Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 24 Jun 2025 09:57:16 -0600 Subject: [PATCH 005/126] only need 1 instance --- pgdb01-cirrus/values.yaml | 2 +- pgdb02-cirrus/values.yaml | 2 +- pgdb03-cirrus/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 3bae528..8db756e 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 
group: pgdb01 - instances: 3 + instances: 1 size: 5000Gi superUser: usernameKey: username diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 12a3629..2dea6cf 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb02 group: pgdb02 - instances: 3 + instances: 1 size: 5000Gi superUser: usernameKey: username diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 2c17abc..df7e1b9 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 3 + instances: 1 size: 7000Gi superUser: usernameKey: username From 4a65e72f6c6d23d95b1c0f5479c8d8a9cafd9af1 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 26 Jun 2025 11:48:54 -0600 Subject: [PATCH 006/126] Change pgdb01 wal_level to logical --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 4451f14..748ba29 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -44,7 +44,7 @@ spec: max_files_per_process: "2000" # WAL settings - wal_level: "replica" + wal_level: "logical" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" max_wal_size: "20GB" From 6b2740069e9e401cc31de3148d36d9e98d159303 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 26 Jun 2025 12:52:32 -0600 Subject: [PATCH 007/126] 02 to logical wal_level --- pgdb02-cirrus/templates/postgres_cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 7559d7b..2c5495e 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -50,7 +50,7 @@ spec: max_files_per_process: "2000" # WAL settings - wal_level: "replica" + 
wal_level: "logical" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" max_wal_size: "20GB" From b4b6558914703afac4d607cfa0764f42efe9d441 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 26 Jun 2025 12:53:08 -0600 Subject: [PATCH 008/126] All cirrus pgdbs to 2 instances --- pgdb01-cirrus/values.yaml | 2 +- pgdb02-cirrus/values.yaml | 2 +- pgdb03-cirrus/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 8db756e..b61c8e8 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 1 + instances: 2 size: 5000Gi superUser: usernameKey: username diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 2dea6cf..87c16d6 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb02 group: pgdb02 - instances: 1 + instances: 2 size: 5000Gi superUser: usernameKey: username diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index df7e1b9..246179b 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 1 + instances: 2 size: 7000Gi superUser: usernameKey: username From 1fced41973f2edaef41b05a8c27ee6bd5a14aef2 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 14 Jul 2025 17:20:32 -0600 Subject: [PATCH 009/126] update postgres01 --- pgdb01-cirrus/templates/postgres_cluster.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 748ba29..33dc37a 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -16,14 +16,18 @@ spec: serverTLSSecret: {{ .Values.db.name }}-server-cert serverCASecret: {{ .Values.db.name }}-server-cert + # Allow outside hosts to connect to the database + 
postgresql: + pg_hba: + - "host all all 0.0.0.0/0 md5" + # Enable superuser access enableSuperuserAccess: true # Configure postgres superuser from su_external_secret superuserSecret: name: "{{ .Values.db.name }}-superuser" - - # Allow outside hosts to connect to the database + postgresql: parameters: # Connection settings @@ -76,7 +80,7 @@ spec: # Lock management max_locks_per_transaction: "1024" - + pg_hba: # Local connections with md5 authentication - local all root md5 From 6cdf3474ae86ff8f1f9fb1e815c62dae57220299 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 18 Jul 2025 10:28:37 -0600 Subject: [PATCH 010/126] Change wal_level back to replica --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- pgdb02-cirrus/templates/postgres_cluster.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 33dc37a..3a460df 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -48,7 +48,7 @@ spec: max_files_per_process: "2000" # WAL settings - wal_level: "logical" + wal_level: "replica" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" max_wal_size: "20GB" diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 2c5495e..7559d7b 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -50,7 +50,7 @@ spec: max_files_per_process: "2000" # WAL settings - wal_level: "logical" + wal_level: "replica" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" max_wal_size: "20GB" From c670016e29b209ec956bf2ebd3085a387a7d5a36 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 21 Jul 2025 14:47:19 -0600 Subject: [PATCH 011/126] Add replication user to eso --- pgdb02-cirrus/templates/su_external_secret.yaml | 8 ++++++++ pgdb02-cirrus/values.yaml | 2 ++ 2 files changed, 10 
insertions(+) diff --git a/pgdb02-cirrus/templates/su_external_secret.yaml b/pgdb02-cirrus/templates/su_external_secret.yaml index d00f818..d8419a0 100644 --- a/pgdb02-cirrus/templates/su_external_secret.yaml +++ b/pgdb02-cirrus/templates/su_external_secret.yaml @@ -19,4 +19,12 @@ spec: remoteRef: key: {{ .Values.db.superUser.secretPath }} property: {{ .Values.db.superUser.passwordKey }} + - secretKey: replication-username + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.replicationUserKey }} + - secretKey: replication-password + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.replicationPassKey }} \ No newline at end of file diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 87c16d6..ac117f0 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -6,4 +6,6 @@ db: superUser: usernameKey: username passwordKey: password + replicationUserKey: repuser + replicationPassKey: reppass secretPath: gdex/pgdb02 \ No newline at end of file From 336b5c6c1ac43a4afbeaf1f51d5345d8d384f1af Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 09:57:01 -0600 Subject: [PATCH 012/126] try to add replication to pgdb02 --- pgdb02-cirrus/templates/postgres_cluster.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 7559d7b..c6666de 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -11,6 +11,22 @@ spec: storage: size: {{ .Values.db.size }} + replica: + enabled: true + source: pgdb01-external + + externalClusters: + - name: pgdb01-external + connectionParameters: + host: pgdb01.k8s.ucar.edu + user: + name: {{ .Values.db.name }}-superuser + key: replication-username + password: + name: {{ .Values.db.name }}-superuser + key: replication-password + sslmode: prefer + # Add 
TLS certificates for encrypted communication certificates: serverTLSSecret: {{ .Values.db.name }}-server-cert From 92952c0c9fa9a8fad45d28a0a2371082a7b4b03c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 11:11:06 -0600 Subject: [PATCH 013/126] hardcode replication username --- pgdb02-cirrus/templates/postgres_cluster.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index c6666de..653f803 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -19,9 +19,7 @@ spec: - name: pgdb01-external connectionParameters: host: pgdb01.k8s.ucar.edu - user: - name: {{ .Values.db.name }}-superuser - key: replication-username + user: repl password: name: {{ .Values.db.name }}-superuser key: replication-password From 3f96393cadc4d9183a56d5d635ebf4ba9f43bdf0 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 11:24:13 -0600 Subject: [PATCH 014/126] user to string --- pgdb02-cirrus/templates/postgres_cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 653f803..f163f59 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -19,7 +19,7 @@ spec: - name: pgdb01-external connectionParameters: host: pgdb01.k8s.ucar.edu - user: repl + user: "repl" password: name: {{ .Values.db.name }}-superuser key: replication-password From 99fd0a936c558537b82601603828600193d34f40 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 11:33:07 -0600 Subject: [PATCH 015/126] Fix indentation for the password --- pgdb02-cirrus/templates/postgres_cluster.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml 
b/pgdb02-cirrus/templates/postgres_cluster.yaml index f163f59..88d578d 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -20,10 +20,10 @@ spec: connectionParameters: host: pgdb01.k8s.ucar.edu user: "repl" - password: - name: {{ .Values.db.name }}-superuser - key: replication-password sslmode: prefer + password: + name: {{ .Values.db.name }}-superuser + key: replication-password # Add TLS certificates for encrypted communication certificates: From a123dff8e0f41589238a95a6931a30dc65958bdd Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 11:38:44 -0600 Subject: [PATCH 016/126] add pg_basebackup --- pgdb02-cirrus/templates/postgres_cluster.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 88d578d..b089903 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -11,6 +11,10 @@ spec: storage: size: {{ .Values.db.size }} + bootstrap: + pg_basebackup: + source: pgdb01-external + replica: enabled: true source: pgdb01-external From 9977d1e72e8e0d05ecdc4a310608fe20bb24fbbf Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 22 Jul 2025 11:40:36 -0600 Subject: [PATCH 017/126] Update wal senders and replication slots --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 3a460df..34f68ab 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -55,8 +55,8 @@ spec: min_wal_size: "1GB" # Replication settings - max_wal_senders: "3" - max_replication_slots: "3" + max_wal_senders: "6" + max_replication_slots: "6" wal_keep_size: "256" max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" From 
afc9c8ba7223681d4b42c44e3412abf434dec77b Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 31 Jul 2025 10:52:15 -0600 Subject: [PATCH 018/126] Reduce WAL size to see if that helps the issue --- pgdb03-cirrus/templates/postgres_cluster.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index edd9f2c..6ae3fee 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -16,11 +16,6 @@ spec: serverTLSSecret: {{ .Values.db.name }}-server-cert serverCASecret: {{ .Values.db.name }}-server-cert - # Allow outside hosts to connect to the database - postgresql: - pg_hba: - - "host all all 0.0.0.0/0 md5" - # Enable superuser access enableSuperuserAccess: true @@ -30,7 +25,8 @@ spec: # Allow outside hosts to connect to the database postgresql: - postgresql: + pg_hba: + - "host all all 0.0.0.0/0 md5" parameters: # Connection settings max_connections: "500" @@ -53,8 +49,8 @@ spec: wal_level: "replica" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" - max_wal_size: "20GB" - min_wal_size: "1GB" + max_wal_size: "5GB" + min_wal_size: "512MB" # Replication settings max_wal_senders: "3" From 9ba184622bb48548c4e515aecf93b23956bb016f Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 31 Jul 2025 14:10:01 -0600 Subject: [PATCH 019/126] from 7 to 8GB --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 246179b..0e07589 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -2,7 +2,7 @@ db: name: pgdb03 group: pgdb03 instances: 2 - size: 7000Gi + size: 8000Gi superUser: usernameKey: username passwordKey: password From 076b48560f780f3327b85c567335a44c2d02e138 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 09:50:04 -0600 Subject: [PATCH 020/126] Increase WAL 
retention --- pgdb03-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 6ae3fee..9707301 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -49,13 +49,13 @@ spec: wal_level: "replica" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" - max_wal_size: "5GB" + max_wal_size: "10GB" min_wal_size: "512MB" # Replication settings max_wal_senders: "3" max_replication_slots: "3" - wal_keep_size: "256" + wal_keep_size: "2GB" max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" From e555f1027f709e8c9691bed2458210202f781cda Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:01:14 -0600 Subject: [PATCH 021/126] 8 to 9 --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 0e07589..b68bd64 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -2,7 +2,7 @@ db: name: pgdb03 group: pgdb03 instances: 2 - size: 8000Gi + size: 9000Gi superUser: usernameKey: username passwordKey: password From 29f1a584580b01d01b3ece75183eb616b9360f5c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:09:37 -0600 Subject: [PATCH 022/126] Scale down to 1 replica --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index b68bd64..cea5d9e 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 2 + instances: 1 size: 9000Gi superUser: usernameKey: username From f66278dc16e56bdda5379a4c40fe56b17bd16478 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:51:40 -0600 Subject: [PATCH 023/126] Add protection 
for the pgdb03-4 PVC --- pgdb03-cirrus/templates/pvc-protection.yaml | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 pgdb03-cirrus/templates/pvc-protection.yaml diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml new file mode 100644 index 0000000..fa07bdf --- /dev/null +++ b/pgdb03-cirrus/templates/pvc-protection.yaml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pgdb03-4 + namespace: rda + annotations: + cnpg.io/nodeSerial: '4' + cnpg.io/operatorVersion: 1.25.1 + cnpg.io/pvcStatus: ready + pv.kubernetes.io/bind-completed: 'yes' + pv.kubernetes.io/bound-by-controller: 'yes' + volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + argocd.argoproj.io/sync-options: Delete=false + cluster.x-k8s.io/managed-by: manual-override + + labels: + cnpg.io/cluster: pgdb03 + cnpg.io/instanceName: pgdb03-4 + cnpg.io/instanceRole: primary + cnpg.io/pvcRole: PG_DATA + role: primary + + finalizers: + - kubernetes.io/pvc-protection + - custom.preserve/pgdb03-data + +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 9000Gi + storageClassName: ceph-kubepv + volumeMode: Filesystem + volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file From f22e112fd468ca437404b7d163ea0fb292cff2ed Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:54:15 -0600 Subject: [PATCH 024/126] remove the owners reference --- pgdb03-cirrus/templates/pvc-protection.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml index fa07bdf..0a16f9d 100644 --- a/pgdb03-cirrus/templates/pvc-protection.yaml +++ b/pgdb03-cirrus/templates/pvc-protection.yaml @@ -25,6 +25,8 @@ metadata: - kubernetes.io/pvc-protection - custom.preserve/pgdb03-data + ownerReferences: [] + spec: 
accessModes: - ReadWriteOnce From 1f68a46adfddd94b73283605f43060b0510a11c9 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:56:05 -0600 Subject: [PATCH 025/126] remove the pgdb03 cluster --- .../{postgres_cluster.yaml => postgres_cluster.yaml.bak} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pgdb03-cirrus/templates/{postgres_cluster.yaml => postgres_cluster.yaml.bak} (100%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml.bak similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml rename to pgdb03-cirrus/templates/postgres_cluster.yaml.bak From dc8a34fc7439683778b13abef7e0fec08aee9888 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 10:58:44 -0600 Subject: [PATCH 026/126] Try to roll out simple cluster to get everything healthy --- pgdb03-cirrus/templates/postgres_cluster.yaml | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 pgdb03-cirrus/templates/postgres_cluster.yaml diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml new file mode 100644 index 0000000..b804dc9 --- /dev/null +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -0,0 +1,47 @@ +# temporary-cluster.yaml +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: pgdb03 + namespace: rda + labels: + app: pgdb03 + argocd.argoproj.io/instance: rda-pgdb03 + group: pgdb03 +spec: + instances: 1 # Start with just primary + + # Basic storage config + storage: + size: 9000Gi + storageClass: ceph-kubepv + + # Essential configs only + enableSuperuserAccess: true + + superuserSecret: + name: pgdb03-superuser + + certificates: + serverTLSSecret: pgdb03-server-cert + serverCASecret: pgdb03-server-cert + + # Minimal PostgreSQL config to get started + postgresql: + parameters: + max_connections: "500" + shared_buffers: "32GB" + max_wal_size: "10GB" + wal_keep_size: "2GB" + wal_level: "replica" + 
max_wal_senders: "3" + max_replication_slots: "3" + + pg_hba: + - local all all md5 + - host all all 127.0.0.1/32 md5 + - host all all ::1/128 md5 + - host all all 128.117.0.0/16 md5 + - local replication all md5 + - host replication all 127.0.0.1/32 md5 + - host replication all 128.117.0.0/16 trust \ No newline at end of file From 4b7e4c3ecca112d20d67534b7726b0fe4c5110ef Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:04:47 -0600 Subject: [PATCH 027/126] Switch back to the original cluster yaml with 1 instance --- pgdb03-cirrus/templates/postgres_cluster.yaml | 96 +++++++++++++---- .../templates/postgres_cluster.yaml.bak | 101 ------------------ 2 files changed, 75 insertions(+), 122 deletions(-) delete mode 100644 pgdb03-cirrus/templates/postgres_cluster.yaml.bak diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index b804dc9..9707301 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -1,47 +1,101 @@ -# temporary-cluster.yaml apiVersion: postgresql.cnpg.io/v1 kind: Cluster metadata: - name: pgdb03 - namespace: rda + name: {{ .Values.db.name }} + namespace: {{ .Release.Namespace }} labels: - app: pgdb03 - argocd.argoproj.io/instance: rda-pgdb03 - group: pgdb03 + app: {{ .Values.db.name }} + group: {{ .Values.db.group }} spec: - instances: 1 # Start with just primary - - # Basic storage config + instances: {{ .Values.db.instances }} storage: - size: 9000Gi - storageClass: ceph-kubepv - - # Essential configs only + size: {{ .Values.db.size }} + + # Add TLS certificates for encrypted communication + certificates: + serverTLSSecret: {{ .Values.db.name }}-server-cert + serverCASecret: {{ .Values.db.name }}-server-cert + + # Enable superuser access enableSuperuserAccess: true + # Configure postgres superuser from su_external_secret superuserSecret: - name: pgdb03-superuser + name: "{{ .Values.db.name }}-superuser" - certificates: - 
serverTLSSecret: pgdb03-server-cert - serverCASecret: pgdb03-server-cert - - # Minimal PostgreSQL config to get started + # Allow outside hosts to connect to the database postgresql: + pg_hba: + - "host all all 0.0.0.0/0 md5" parameters: + # Connection settings max_connections: "500" + + # SSL Configuration + ssl_ciphers: "HIGH:!aNULL" + ssl_min_protocol_version: "TLSv1.3" + + # Memory settings shared_buffers: "32GB" - max_wal_size: "10GB" - wal_keep_size: "2GB" + temp_buffers: "64MB" + work_mem: "32MB" + maintenance_work_mem: "128MB" + dynamic_shared_memory_type: "posix" + + # Resource limits + max_files_per_process: "2000" + + # WAL settings wal_level: "replica" + checkpoint_timeout: "15min" + checkpoint_completion_target: "0.9" + max_wal_size: "10GB" + min_wal_size: "512MB" + + # Replication settings max_wal_senders: "3" max_replication_slots: "3" + wal_keep_size: "2GB" + max_slot_wal_keep_size: "-1" + max_standby_archive_delay: "-1" + max_standby_streaming_delay: "-1" + # Logging settings + logging_collector: "on" + log_rotation_age: "0" + log_min_duration_statement: "120000" + log_line_prefix: "%t %a [%p] " + log_timezone: "America/Denver" + + # Locale and timezone settings + datestyle: "iso, mdy" + timezone: "America/Denver" + lc_messages: "en_US.UTF-8" + lc_monetary: "en_US.UTF-8" + lc_numeric: "en_US.UTF-8" + lc_time: "en_US.UTF-8" + default_text_search_config: "pg_catalog.english" + + # Lock management + max_locks_per_transaction: "1024" + pg_hba: + # Local connections with md5 authentication + - local all root md5 - local all all md5 + + # IPv4 local connections with md5 - host all all 127.0.0.1/32 md5 + + # IPv6 local connections with md5 - host all all ::1/128 md5 + + # IPv4 remote connections for UCAR network - host all all 128.117.0.0/16 md5 + + # Replication connections - local replication all md5 - host replication all 127.0.0.1/32 md5 + + # Remote replication - host replication all 128.117.0.0/16 trust \ No newline at end of file diff --git 
a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak b/pgdb03-cirrus/templates/postgres_cluster.yaml.bak deleted file mode 100644 index 9707301..0000000 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak +++ /dev/null @@ -1,101 +0,0 @@ -apiVersion: postgresql.cnpg.io/v1 -kind: Cluster -metadata: - name: {{ .Values.db.name }} - namespace: {{ .Release.Namespace }} - labels: - app: {{ .Values.db.name }} - group: {{ .Values.db.group }} -spec: - instances: {{ .Values.db.instances }} - storage: - size: {{ .Values.db.size }} - - # Add TLS certificates for encrypted communication - certificates: - serverTLSSecret: {{ .Values.db.name }}-server-cert - serverCASecret: {{ .Values.db.name }}-server-cert - - # Enable superuser access - enableSuperuserAccess: true - - # Configure postgres superuser from su_external_secret - superuserSecret: - name: "{{ .Values.db.name }}-superuser" - - # Allow outside hosts to connect to the database - postgresql: - pg_hba: - - "host all all 0.0.0.0/0 md5" - parameters: - # Connection settings - max_connections: "500" - - # SSL Configuration - ssl_ciphers: "HIGH:!aNULL" - ssl_min_protocol_version: "TLSv1.3" - - # Memory settings - shared_buffers: "32GB" - temp_buffers: "64MB" - work_mem: "32MB" - maintenance_work_mem: "128MB" - dynamic_shared_memory_type: "posix" - - # Resource limits - max_files_per_process: "2000" - - # WAL settings - wal_level: "replica" - checkpoint_timeout: "15min" - checkpoint_completion_target: "0.9" - max_wal_size: "10GB" - min_wal_size: "512MB" - - # Replication settings - max_wal_senders: "3" - max_replication_slots: "3" - wal_keep_size: "2GB" - max_slot_wal_keep_size: "-1" - max_standby_archive_delay: "-1" - max_standby_streaming_delay: "-1" - - # Logging settings - logging_collector: "on" - log_rotation_age: "0" - log_min_duration_statement: "120000" - log_line_prefix: "%t %a [%p] " - log_timezone: "America/Denver" - - # Locale and timezone settings - datestyle: "iso, mdy" - timezone: "America/Denver" - 
lc_messages: "en_US.UTF-8" - lc_monetary: "en_US.UTF-8" - lc_numeric: "en_US.UTF-8" - lc_time: "en_US.UTF-8" - default_text_search_config: "pg_catalog.english" - - # Lock management - max_locks_per_transaction: "1024" - - pg_hba: - # Local connections with md5 authentication - - local all root md5 - - local all all md5 - - # IPv4 local connections with md5 - - host all all 127.0.0.1/32 md5 - - # IPv6 local connections with md5 - - host all all ::1/128 md5 - - # IPv4 remote connections for UCAR network - - host all all 128.117.0.0/16 md5 - - # Replication connections - - local replication all md5 - - host replication all 127.0.0.1/32 md5 - - # Remote replication - - host replication all 128.117.0.0/16 trust \ No newline at end of file From 25fdc9b7f54423d0551e6ac07791f5013fad5eb1 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:08:37 -0600 Subject: [PATCH 028/126] Switch the PVC back to cluster ownership --- pgdb03-cirrus/templates/pvc-protection.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml index 0a16f9d..811084c 100644 --- a/pgdb03-cirrus/templates/pvc-protection.yaml +++ b/pgdb03-cirrus/templates/pvc-protection.yaml @@ -11,8 +11,6 @@ metadata: pv.kubernetes.io/bound-by-controller: 'yes' volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - argocd.argoproj.io/sync-options: Delete=false - cluster.x-k8s.io/managed-by: manual-override labels: cnpg.io/cluster: pgdb03 @@ -23,9 +21,6 @@ metadata: finalizers: - kubernetes.io/pvc-protection - - custom.preserve/pgdb03-data - - ownerReferences: [] spec: accessModes: From 81cfea69b1769032e59e2bb825f1bfafb7243a7d Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:11:44 -0600 Subject: [PATCH 029/126] remove pvc protection and trigger the PVC to be owned by CNPG cluster again --- pgdb03-cirrus/templates/pvc-protection.yaml | 
33 --------------------- 1 file changed, 33 deletions(-) delete mode 100644 pgdb03-cirrus/templates/pvc-protection.yaml diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml deleted file mode 100644 index 811084c..0000000 --- a/pgdb03-cirrus/templates/pvc-protection.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: pgdb03-4 - namespace: rda - annotations: - cnpg.io/nodeSerial: '4' - cnpg.io/operatorVersion: 1.25.1 - cnpg.io/pvcStatus: ready - pv.kubernetes.io/bind-completed: 'yes' - pv.kubernetes.io/bound-by-controller: 'yes' - volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - - labels: - cnpg.io/cluster: pgdb03 - cnpg.io/instanceName: pgdb03-4 - cnpg.io/instanceRole: primary - cnpg.io/pvcRole: PG_DATA - role: primary - - finalizers: - - kubernetes.io/pvc-protection - -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 9000Gi - storageClassName: ceph-kubepv - volumeMode: Filesystem - volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file From 4da97063400d61a0176df14237e580f9dd21aefb Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:14:01 -0600 Subject: [PATCH 030/126] Add the PVC back --- pgdb03-cirrus/templates/pvc-protection.yaml | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 pgdb03-cirrus/templates/pvc-protection.yaml diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml new file mode 100644 index 0000000..811084c --- /dev/null +++ b/pgdb03-cirrus/templates/pvc-protection.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pgdb03-4 + namespace: rda + annotations: + cnpg.io/nodeSerial: '4' + cnpg.io/operatorVersion: 1.25.1 + cnpg.io/pvcStatus: ready + pv.kubernetes.io/bind-completed: 'yes' + 
pv.kubernetes.io/bound-by-controller: 'yes' + volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + + labels: + cnpg.io/cluster: pgdb03 + cnpg.io/instanceName: pgdb03-4 + cnpg.io/instanceRole: primary + cnpg.io/pvcRole: PG_DATA + role: primary + + finalizers: + - kubernetes.io/pvc-protection + +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 9000Gi + storageClassName: ceph-kubepv + volumeMode: Filesystem + volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file From 65896c16f1bdb5c0c90a60e89d5144de2deeb41b Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:17:49 -0600 Subject: [PATCH 031/126] Try this strong deletion --- pgdb03-cirrus/templates/pvc-protection.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml index 811084c..956c522 100644 --- a/pgdb03-cirrus/templates/pvc-protection.yaml +++ b/pgdb03-cirrus/templates/pvc-protection.yaml @@ -12,7 +12,11 @@ metadata: volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + argocd.argoproj.io/sync-options: Delete=false,Replace=true,Force=true + cluster.x-k8s.io/managed-by: manual-override + labels: + argocd.argoproj.io/instance: rda-pgdb03 cnpg.io/cluster: pgdb03 cnpg.io/instanceName: pgdb03-4 cnpg.io/instanceRole: primary @@ -21,7 +25,8 @@ metadata: finalizers: - kubernetes.io/pvc-protection - + - custom.preserve/critical-data + spec: accessModes: - ReadWriteOnce From 5722842e081c555d65f795576caf3de2bdda7ef7 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:26:21 -0600 Subject: [PATCH 032/126] remove the pvc file again --- pgdb03-cirrus/templates/pvc-protection.yaml | 38 --------------------- 1 file changed, 38 deletions(-) delete mode 100644 
pgdb03-cirrus/templates/pvc-protection.yaml diff --git a/pgdb03-cirrus/templates/pvc-protection.yaml b/pgdb03-cirrus/templates/pvc-protection.yaml deleted file mode 100644 index 956c522..0000000 --- a/pgdb03-cirrus/templates/pvc-protection.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: pgdb03-4 - namespace: rda - annotations: - cnpg.io/nodeSerial: '4' - cnpg.io/operatorVersion: 1.25.1 - cnpg.io/pvcStatus: ready - pv.kubernetes.io/bind-completed: 'yes' - pv.kubernetes.io/bound-by-controller: 'yes' - volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - - argocd.argoproj.io/sync-options: Delete=false,Replace=true,Force=true - cluster.x-k8s.io/managed-by: manual-override - - labels: - argocd.argoproj.io/instance: rda-pgdb03 - cnpg.io/cluster: pgdb03 - cnpg.io/instanceName: pgdb03-4 - cnpg.io/instanceRole: primary - cnpg.io/pvcRole: PG_DATA - role: primary - - finalizers: - - kubernetes.io/pvc-protection - - custom.preserve/critical-data - -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 9000Gi - storageClassName: ceph-kubepv - volumeMode: Filesystem - volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file From 397c976bf3083c7478946fe04e1fc5d6a714d42f Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:26:37 -0600 Subject: [PATCH 033/126] scale to 2 instances --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index cea5d9e..b68bd64 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 1 + instances: 2 size: 9000Gi superUser: usernameKey: username From c8d03a10ac32aa32b2b38907bcb5468e503f0546 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:33:41 -0600 Subject: [PATCH 034/126] rename the 
cluster --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index b68bd64..f479639 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,5 +1,5 @@ db: - name: pgdb03 + name: pgdb03-v2 group: pgdb03 instances: 2 size: 9000Gi From 67c800faca1dbfc48b8aec93f7a889c29055b205 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:34:16 -0600 Subject: [PATCH 035/126] don't rename --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index f479639..b68bd64 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,5 +1,5 @@ db: - name: pgdb03-v2 + name: pgdb03 group: pgdb03 instances: 2 size: 9000Gi From 46c6d9b8dd8b7a0172341619d739daa214928884 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:40:33 -0600 Subject: [PATCH 036/126] Try and fix the PVC pending deletion --- pgdb03-cirrus/templates/clean-pvc.yaml | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 pgdb03-cirrus/templates/clean-pvc.yaml diff --git a/pgdb03-cirrus/templates/clean-pvc.yaml b/pgdb03-cirrus/templates/clean-pvc.yaml new file mode 100644 index 0000000..94cc21e --- /dev/null +++ b/pgdb03-cirrus/templates/clean-pvc.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pgdb03-4 + namespace: rda + annotations: + cnpg.io/nodeSerial: '4' + cnpg.io/operatorVersion: 1.25.1 + cnpg.io/pvcStatus: ready + pv.kubernetes.io/bind-completed: 'yes' + pv.kubernetes.io/bound-by-controller: 'yes' + volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com + argocd.argoproj.io/sync-options: Replace=true,Force=true + + labels: + argocd.argoproj.io/instance: rda-pgdb03 + cnpg.io/cluster: pgdb03 + cnpg.io/instanceName: pgdb03-4 + 
cnpg.io/instanceRole: primary + cnpg.io/pvcRole: PG_DATA + role: primary + + finalizers: + - kubernetes.io/pvc-protection + + ownerReferences: + - apiVersion: postgresql.cnpg.io/v1 + controller: true + kind: Cluster + name: pgdb03 + uid: f50f620b-6959-4529-8234-c604f241a820 + +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 9000Gi + storageClassName: ceph-kubepv + volumeMode: Filesystem + volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file From 6c0958cd5db8819c675fb45a46c009728bc66aba Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:49:07 -0600 Subject: [PATCH 037/126] Try to switch the PVC to remove the deletion --- pgdb03-cirrus/templates/clean-pvc.yaml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pgdb03-cirrus/templates/clean-pvc.yaml b/pgdb03-cirrus/templates/clean-pvc.yaml index 94cc21e..2a0c295 100644 --- a/pgdb03-cirrus/templates/clean-pvc.yaml +++ b/pgdb03-cirrus/templates/clean-pvc.yaml @@ -1,22 +1,21 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: pgdb03-4 + name: pgdb03-1 namespace: rda annotations: - cnpg.io/nodeSerial: '4' + cnpg.io/nodeSerial: '1' cnpg.io/operatorVersion: 1.25.1 cnpg.io/pvcStatus: ready pv.kubernetes.io/bind-completed: 'yes' pv.kubernetes.io/bound-by-controller: 'yes' volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - argocd.argoproj.io/sync-options: Replace=true,Force=true labels: argocd.argoproj.io/instance: rda-pgdb03 cnpg.io/cluster: pgdb03 - cnpg.io/instanceName: pgdb03-4 + cnpg.io/instanceName: pgdb03-1 cnpg.io/instanceRole: primary cnpg.io/pvcRole: PG_DATA role: primary @@ -24,13 +23,6 @@ metadata: finalizers: - kubernetes.io/pvc-protection - ownerReferences: - - apiVersion: postgresql.cnpg.io/v1 - controller: true - kind: Cluster - name: pgdb03 - uid: f50f620b-6959-4529-8234-c604f241a820 - spec: accessModes: - ReadWriteOnce From 
6a512d7018a603aface396dd902e59cc7777ffd1 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 11:58:41 -0600 Subject: [PATCH 038/126] remove the cluster --- .../{postgres_cluster.yaml => postgres_cluster.yaml.bak} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pgdb03-cirrus/templates/{postgres_cluster.yaml => postgres_cluster.yaml.bak} (100%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml.bak similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml rename to pgdb03-cirrus/templates/postgres_cluster.yaml.bak From 5bb166d1b4a1ae7a749d73c0f78ed2d95871ac08 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:34:11 -0600 Subject: [PATCH 039/126] Just try to redeploy --- .../{postgres_cluster.yaml.bak => postgres_cluster.yaml} | 0 pgdb03-cirrus/values.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename pgdb03-cirrus/templates/{postgres_cluster.yaml.bak => postgres_cluster.yaml} (100%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak b/pgdb03-cirrus/templates/postgres_cluster.yaml similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml.bak rename to pgdb03-cirrus/templates/postgres_cluster.yaml diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index b68bd64..cea5d9e 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 2 + instances: 1 size: 9000Gi superUser: usernameKey: username From 6512904cfa773af7e2b8e277ee6ff0fb8658cfb4 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:36:16 -0600 Subject: [PATCH 040/126] remove the cluster again --- .../{postgres_cluster.yaml => postgres_cluster.yaml.bak} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pgdb03-cirrus/templates/{postgres_cluster.yaml => postgres_cluster.yaml.bak} (100%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml 
b/pgdb03-cirrus/templates/postgres_cluster.yaml.bak similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml rename to pgdb03-cirrus/templates/postgres_cluster.yaml.bak From 620b37a8b3619e69ce6c4a63eb3f322d9a25a595 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:45:16 -0600 Subject: [PATCH 041/126] Restart with a clean 03 --- pgdb03-cirrus/templates/clean-pvc.yaml | 34 ------------------- ...cluster.yaml.bak => postgres_cluster.yaml} | 0 2 files changed, 34 deletions(-) delete mode 100644 pgdb03-cirrus/templates/clean-pvc.yaml rename pgdb03-cirrus/templates/{postgres_cluster.yaml.bak => postgres_cluster.yaml} (100%) diff --git a/pgdb03-cirrus/templates/clean-pvc.yaml b/pgdb03-cirrus/templates/clean-pvc.yaml deleted file mode 100644 index 2a0c295..0000000 --- a/pgdb03-cirrus/templates/clean-pvc.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: pgdb03-1 - namespace: rda - annotations: - cnpg.io/nodeSerial: '1' - cnpg.io/operatorVersion: 1.25.1 - cnpg.io/pvcStatus: ready - pv.kubernetes.io/bind-completed: 'yes' - pv.kubernetes.io/bound-by-controller: 'yes' - volume.beta.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - volume.kubernetes.io/storage-provisioner: rbd.csi.ceph.com - - labels: - argocd.argoproj.io/instance: rda-pgdb03 - cnpg.io/cluster: pgdb03 - cnpg.io/instanceName: pgdb03-1 - cnpg.io/instanceRole: primary - cnpg.io/pvcRole: PG_DATA - role: primary - - finalizers: - - kubernetes.io/pvc-protection - -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 9000Gi - storageClassName: ceph-kubepv - volumeMode: Filesystem - volumeName: pvc-03d9fa68-f05e-4bc0-815f-b9e0bce8fbd4 \ No newline at end of file diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak b/pgdb03-cirrus/templates/postgres_cluster.yaml similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml.bak rename to 
pgdb03-cirrus/templates/postgres_cluster.yaml From 483edbad87f438f1eedb57e10327950cb266bf25 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:47:29 -0600 Subject: [PATCH 042/126] bak --- .../{postgres_cluster.yaml => postgres_cluster.yaml.bak} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pgdb03-cirrus/templates/{postgres_cluster.yaml => postgres_cluster.yaml.bak} (100%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml.bak similarity index 100% rename from pgdb03-cirrus/templates/postgres_cluster.yaml rename to pgdb03-cirrus/templates/postgres_cluster.yaml.bak From 1f9643262c3d06066919283d583297803ee105f6 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:49:17 -0600 Subject: [PATCH 043/126] Try and force it to start fresh --- .../{postgres_cluster.yaml.bak => postgres_cluster.yaml} | 2 ++ 1 file changed, 2 insertions(+) rename pgdb03-cirrus/templates/{postgres_cluster.yaml.bak => postgres_cluster.yaml} (98%) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak b/pgdb03-cirrus/templates/postgres_cluster.yaml similarity index 98% rename from pgdb03-cirrus/templates/postgres_cluster.yaml.bak rename to pgdb03-cirrus/templates/postgres_cluster.yaml index 9707301..0a7b2c3 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml.bak +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -6,6 +6,8 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} + annotations: + cnpg.io/skipWalArchiveCheck: "true" spec: instances: {{ .Values.db.instances }} storage: From 74571534a46cb2c092d7ac96e2c855d7eab80dfa Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:57:30 -0600 Subject: [PATCH 044/126] remove the skip archive check line --- pgdb03-cirrus/templates/postgres_cluster.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 
0a7b2c3..9707301 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -6,8 +6,6 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} - annotations: - cnpg.io/skipWalArchiveCheck: "true" spec: instances: {{ .Values.db.instances }} storage: From 6678fb470228f079c5f3f3c2a03f01f33b45a233 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 12:58:01 -0600 Subject: [PATCH 045/126] Scale to 2 instances --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index cea5d9e..b68bd64 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb03 group: pgdb03 - instances: 1 + instances: 2 size: 9000Gi superUser: usernameKey: username From e12c16eca994082f778630f2f687bd721371490d Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Mon, 4 Aug 2025 15:12:15 -0600 Subject: [PATCH 046/126] add a weekly backup --- pgdb03-cirrus/templates/scheduled-backup.yaml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 pgdb03-cirrus/templates/scheduled-backup.yaml diff --git a/pgdb03-cirrus/templates/scheduled-backup.yaml b/pgdb03-cirrus/templates/scheduled-backup.yaml new file mode 100644 index 0000000..ec3bad4 --- /dev/null +++ b/pgdb03-cirrus/templates/scheduled-backup.yaml @@ -0,0 +1,22 @@ +# scheduled-backup.yaml +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: {{ .Values.db.name }}-weekly-backup + namespace: {{ .Release.Namespace }} + labels: + app: {{ .Values.db.name }} +spec: + # Schedule: Every Friday at 11:00 PM + schedule: "0 23 * * 5" + + backupOwnerReference: self + + cluster: + name: {{ .Values.db.name }} + + target: primary + method: volumeSnapshot + + # Keep 8 weekly backups + retentionPolicy: "8" \ No newline at end of file From 190946c8313d7dd4b36bbad39736e7857d0e9fd0 Mon Sep 17 00:00:00 2001 From: 
Nick Cote Date: Mon, 4 Aug 2025 15:28:36 -0600 Subject: [PATCH 047/126] Remove snapshot as we need to install the operator --- pgdb03-cirrus/templates/scheduled-backup.yaml | 22 ------------------- 1 file changed, 22 deletions(-) delete mode 100644 pgdb03-cirrus/templates/scheduled-backup.yaml diff --git a/pgdb03-cirrus/templates/scheduled-backup.yaml b/pgdb03-cirrus/templates/scheduled-backup.yaml deleted file mode 100644 index ec3bad4..0000000 --- a/pgdb03-cirrus/templates/scheduled-backup.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# scheduled-backup.yaml -apiVersion: postgresql.cnpg.io/v1 -kind: ScheduledBackup -metadata: - name: {{ .Values.db.name }}-weekly-backup - namespace: {{ .Release.Namespace }} - labels: - app: {{ .Values.db.name }} -spec: - # Schedule: Every Friday at 11:00 PM - schedule: "0 23 * * 5" - - backupOwnerReference: self - - cluster: - name: {{ .Values.db.name }} - - target: primary - method: volumeSnapshot - - # Keep 8 weekly backups - retentionPolicy: "8" \ No newline at end of file From a8714cf45be5cf58d1baad93dc74c4252f29f36c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 6 Aug 2025 10:13:32 -0600 Subject: [PATCH 048/126] Add resource limits for 03 and tweak some settings to get better resource utilization --- pgdb03-cirrus/templates/postgres_cluster.yaml | 14 ++++++++------ pgdb03-cirrus/values.yaml | 6 +++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 9707301..c60b0d9 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -10,6 +10,10 @@ spec: instances: {{ .Values.db.instances }} storage: size: {{ .Values.db.size }} + resources: + limits: + cpu: {{ .Values.db.resource.limits.cpu }} + memory: {{ .Values.db.resource.limits.memory }} # Add TLS certificates for encrypted communication certificates: @@ -25,8 +29,6 @@ spec: # Allow outside hosts to connect to the 
database postgresql: - pg_hba: - - "host all all 0.0.0.0/0 md5" parameters: # Connection settings max_connections: "500" @@ -38,8 +40,8 @@ spec: # Memory settings shared_buffers: "32GB" temp_buffers: "64MB" - work_mem: "32MB" - maintenance_work_mem: "128MB" + work_mem: "64MB" + maintenance_work_mem: "512MB" dynamic_shared_memory_type: "posix" # Resource limits @@ -49,13 +51,13 @@ spec: wal_level: "replica" checkpoint_timeout: "15min" checkpoint_completion_target: "0.9" - max_wal_size: "10GB" + max_wal_size: "16GB" min_wal_size: "512MB" # Replication settings max_wal_senders: "3" max_replication_slots: "3" - wal_keep_size: "2GB" + wal_keep_size: "4GB" max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index b68bd64..6250043 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -6,4 +6,8 @@ db: superUser: usernameKey: username passwordKey: password - secretPath: gdex/pgdb03 \ No newline at end of file + secretPath: gdex/pgdb03 + resource: + limits: + cpu: 16 + memory: 128Gi \ No newline at end of file From cf042d849aa6e916c0cb082cc3ccf8a58a9b2cd3 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 6 Aug 2025 12:29:23 -0600 Subject: [PATCH 049/126] Add single quotes --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 6250043..24c9d90 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -9,5 +9,5 @@ db: secretPath: gdex/pgdb03 resource: limits: - cpu: 16 + cpu: '16' memory: 128Gi \ No newline at end of file From 6f22c62c5a5e1b540c473dd6b4722a869dcbe0c2 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 20 Aug 2025 15:35:38 -0600 Subject: [PATCH 050/126] Add weekly backup to pgdb01 now that new csi driver is in place --- pgdb01-cirrus/templates/weekly-backup.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 
insertions(+) create mode 100644 pgdb01-cirrus/templates/weekly-backup.yaml diff --git a/pgdb01-cirrus/templates/weekly-backup.yaml b/pgdb01-cirrus/templates/weekly-backup.yaml new file mode 100644 index 0000000..61ad70f --- /dev/null +++ b/pgdb01-cirrus/templates/weekly-backup.yaml @@ -0,0 +1,21 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: {{ .Values.db.name }}-weekly-backup + namespace: {{ .Release.Namespace }} + labels: + app: {{ .Values.db.name }} +spec: + # Schedule: Every Friday at 11:00 PM + schedule: "0 23 * * 5" + + backupOwnerReference: self + + cluster: + name: {{ .Values.db.name }} + + target: primary + method: volumeSnapshot + + # Keep 8 weekly backups + retentionPolicy: "8" \ No newline at end of file From dbff4d321904d962f687188cc28c180bd40e5de6 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 20 Aug 2025 16:11:10 -0600 Subject: [PATCH 051/126] move retention policy to cluster yaml --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++++ pgdb01-cirrus/templates/weekly-backup.yaml | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 34f68ab..bb4b947 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -11,6 +11,10 @@ spec: storage: size: {{ .Values.db.size }} + backup: + # Keep 8 weekly backups + retentionPolicy: "8" + # Add TLS certificates for encrypted communication certificates: serverTLSSecret: {{ .Values.db.name }}-server-cert diff --git a/pgdb01-cirrus/templates/weekly-backup.yaml b/pgdb01-cirrus/templates/weekly-backup.yaml index 61ad70f..e64418c 100644 --- a/pgdb01-cirrus/templates/weekly-backup.yaml +++ b/pgdb01-cirrus/templates/weekly-backup.yaml @@ -16,6 +16,3 @@ spec: target: primary method: volumeSnapshot - - # Keep 8 weekly backups - retentionPolicy: "8" \ No newline at end of file From 
988f57cfeb6f6cfaee64f655473a5643d49ec2ab Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 20 Aug 2025 16:12:36 -0600 Subject: [PATCH 052/126] 8 -> 8w --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index bb4b947..475388c 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -13,8 +13,8 @@ spec: backup: # Keep 8 weekly backups - retentionPolicy: "8" - + retentionPolicy: "8w" + # Add TLS certificates for encrypted communication certificates: serverTLSSecret: {{ .Values.db.name }}-server-cert From c17b51924ecce52fc7c249d1a359a33e841a9993 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 10:37:21 -0600 Subject: [PATCH 053/126] Add K8s network CIDR for replications allow --- pgdb01-cirrus/templates/postgres_cluster.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 475388c..863ec26 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -20,11 +20,6 @@ spec: serverTLSSecret: {{ .Values.db.name }}-server-cert serverCASecret: {{ .Values.db.name }}-server-cert - # Allow outside hosts to connect to the database - postgresql: - pg_hba: - - "host all all 0.0.0.0/0 md5" - # Enable superuser access enableSuperuserAccess: true @@ -104,4 +99,7 @@ spec: - host replication all 127.0.0.1/32 md5 # Remote replication - - host replication all 128.117.0.0/16 trust \ No newline at end of file + - host replication all 128.117.0.0/16 trust + + # Remote replication from Kubernetes pod network + - host replication all 10.0.0.0/16 trust \ No newline at end of file From e7e43a1a3e77413610934e7ca31e1e942b39c918 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 
2025 10:47:20 -0600 Subject: [PATCH 054/126] Add some annotations to try ang get the DB to reconcile --- pgdb01-cirrus/templates/postgres_cluster.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 863ec26..d5eb685 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -6,6 +6,9 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} + annotations: + cnpg.io/reconcileAt: '2025-08-27T16:45:00Z' + cnpg.io/reloadAt: '2025-08-27T16:45:00Z' spec: instances: {{ .Values.db.instances }} storage: From 7a4e209167c43dbd0d912a9c4e80a0be3da4a579 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 10:48:46 -0600 Subject: [PATCH 055/126] Add annotations to right cluster --- pgdb01-cirrus/templates/postgres_cluster.yaml | 3 --- pgdb02-cirrus/templates/postgres_cluster.yaml | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index d5eb685..863ec26 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -6,9 +6,6 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} - annotations: - cnpg.io/reconcileAt: '2025-08-27T16:45:00Z' - cnpg.io/reloadAt: '2025-08-27T16:45:00Z' spec: instances: {{ .Values.db.instances }} storage: diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index b089903..994ec1b 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -6,6 +6,9 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} + annotations: + cnpg.io/reconcileAt: '2025-08-27T16:45:00Z' + cnpg.io/reloadAt: '2025-08-27T16:45:00Z' spec: instances: {{ .Values.db.instances }} 
storage: From 4ce90163e96e2bbe6e2fc241c3667b0946f6433a Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 11:35:00 -0600 Subject: [PATCH 056/126] try to trigger a resync --- pgdb02-cirrus/templates/postgres_cluster.yaml | 3 --- pgdb02-cirrus/values.yaml | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 994ec1b..b089903 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -6,9 +6,6 @@ metadata: labels: app: {{ .Values.db.name }} group: {{ .Values.db.group }} - annotations: - cnpg.io/reconcileAt: '2025-08-27T16:45:00Z' - cnpg.io/reloadAt: '2025-08-27T16:45:00Z' spec: instances: {{ .Values.db.instances }} storage: diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index ac117f0..0bfcdb8 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -2,7 +2,7 @@ db: name: pgdb02 group: pgdb02 instances: 2 - size: 5000Gi + size: 5001Gi superUser: usernameKey: username passwordKey: password From 776772fab066989ae616f68086bffff5c3bc675e Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 11:36:57 -0600 Subject: [PATCH 057/126] Comment out the replica info for 02 --- pgdb02-cirrus/templates/postgres_cluster.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index b089903..15e99c3 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -11,13 +11,13 @@ spec: storage: size: {{ .Values.db.size }} - bootstrap: - pg_basebackup: - source: pgdb01-external + #bootstrap: + # pg_basebackup: + # source: pgdb01-external - replica: - enabled: true - source: pgdb01-external + #replica: + # enabled: true + # source: pgdb01-external externalClusters: - name: pgdb01-external From 
e0d03b695d5dc5d4c3cea46831c5379039564f6f Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 13:16:19 -0600 Subject: [PATCH 058/126] Uncomment the replica and bootstrap info --- pgdb02-cirrus/templates/postgres_cluster.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 15e99c3..b089903 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -11,13 +11,13 @@ spec: storage: size: {{ .Values.db.size }} - #bootstrap: - # pg_basebackup: - # source: pgdb01-external + bootstrap: + pg_basebackup: + source: pgdb01-external - #replica: - # enabled: true - # source: pgdb01-external + replica: + enabled: true + source: pgdb01-external externalClusters: - name: pgdb01-external From 4e1a9d815364098a1e7e90d3d46fd262284c71d5 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 13:16:57 -0600 Subject: [PATCH 059/126] Remove some bloat --- pgdb02-cirrus/templates/postgres_cluster.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index b089903..167ba6b 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -34,11 +34,6 @@ spec: serverTLSSecret: {{ .Values.db.name }}-server-cert serverCASecret: {{ .Values.db.name }}-server-cert - # Allow outside hosts to connect to the database - postgresql: - pg_hba: - - "host all all 0.0.0.0/0 md5" - # Enable superuser access enableSuperuserAccess: true @@ -47,7 +42,6 @@ spec: name: "{{ .Values.db.name }}-superuser" # Allow outside hosts to connect to the database - postgresql: postgresql: parameters: # Connection settings From 33a021e2b0990d255ba47b23ed7a374e7665ddc2 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 13:18:11 -0600 Subject: [PATCH 060/126] Add 
resource limits to 01 & 02 that match 03 (16/128) --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++++ pgdb01-cirrus/values.yaml | 6 +++++- pgdb02-cirrus/templates/postgres_cluster.yaml | 4 ++++ pgdb02-cirrus/values.yaml | 6 +++++- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 863ec26..51eb057 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -10,6 +10,10 @@ spec: instances: {{ .Values.db.instances }} storage: size: {{ .Values.db.size }} + resources: + limits: + cpu: {{ .Values.db.resource.limits.cpu }} + memory: {{ .Values.db.resource.limits.memory }} backup: # Keep 8 weekly backups diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index b61c8e8..a34047f 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -6,4 +6,8 @@ db: superUser: usernameKey: username passwordKey: password - secretPath: gdex/pgdb01 \ No newline at end of file + secretPath: gdex/pgdb01 + resource: + limits: + cpu: '16' + memory: 128Gi \ No newline at end of file diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 167ba6b..9a2b4bb 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -10,6 +10,10 @@ spec: instances: {{ .Values.db.instances }} storage: size: {{ .Values.db.size }} + resources: + limits: + cpu: {{ .Values.db.resource.limits.cpu }} + memory: {{ .Values.db.resource.limits.memory }} bootstrap: pg_basebackup: diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 0bfcdb8..9345df8 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -8,4 +8,8 @@ db: passwordKey: password replicationUserKey: repuser replicationPassKey: reppass - secretPath: gdex/pgdb02 \ No newline at end of file + secretPath: gdex/pgdb02 + 
resource: + limits: + cpu: '16' + memory: 128Gi \ No newline at end of file From 18abfbb2588083afa6f1d2c7bdb6c944b1c30620 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 13:34:58 -0600 Subject: [PATCH 061/126] add | quote --- pgdb02-cirrus/templates/postgres_cluster.yaml | 2 +- pgdb03-cirrus/templates/postgres_cluster.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 9a2b4bb..ac9d2e6 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -12,7 +12,7 @@ spec: size: {{ .Values.db.size }} resources: limits: - cpu: {{ .Values.db.resource.limits.cpu }} + cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} bootstrap: diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index c60b0d9..b7b9ec4 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -12,7 +12,7 @@ spec: size: {{ .Values.db.size }} resources: limits: - cpu: {{ .Values.db.resource.limits.cpu }} + cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} # Add TLS certificates for encrypted communication From 96a9a8ada7e7bc93c0216107d5aa8df22f4b7dfb Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 14:42:57 -0600 Subject: [PATCH 062/126] scale pgdb01 down to 1 instance --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index a34047f..e5adfdb 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 2 + instances: 1 size: 5000Gi superUser: usernameKey: username From c8024d4ae952b09babb57763d056006175047396 Mon Sep 17 00:00:00 2001 From: 
Nick Cote Date: Wed, 27 Aug 2025 14:44:51 -0600 Subject: [PATCH 063/126] remove quotes? --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index e5adfdb..d5b2fc4 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -9,5 +9,5 @@ db: secretPath: gdex/pgdb01 resource: limits: - cpu: '16' + cpu: 16 memory: 128Gi \ No newline at end of file From 02012a6f025a501b4cc9799cb388d0223a9eb92c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 14:47:11 -0600 Subject: [PATCH 064/126] Fix the quote --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- pgdb01-cirrus/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 51eb057..37b8852 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -12,7 +12,7 @@ spec: size: {{ .Values.db.size }} resources: limits: - cpu: {{ .Values.db.resource.limits.cpu }} + cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} backup: diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index d5b2fc4..e5adfdb 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -9,5 +9,5 @@ db: secretPath: gdex/pgdb01 resource: limits: - cpu: 16 + cpu: '16' memory: 128Gi \ No newline at end of file From bda061c2d36c34246bd76768d92cf91703e40a4b Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Wed, 27 Aug 2025 14:55:34 -0600 Subject: [PATCH 065/126] scale back up to 2 --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index e5adfdb..a34047f 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 1 + instances: 
2 size: 5000Gi superUser: usernameKey: username From 6bb885f457970ea2dcf37a0fc3f8adf2beca6883 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 10:42:14 -0600 Subject: [PATCH 066/126] Update WAL settings to increase keep size --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 37b8852..c6191a4 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -60,8 +60,8 @@ spec: # Replication settings max_wal_senders: "6" max_replication_slots: "6" - wal_keep_size: "256" - max_slot_wal_keep_size: "-1" + wal_keep_size: "2GB" + max_slot_wal_keep_size: "10GB" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" From 8583dfb528dfb0c700e6d82b4149d30ed8098dda Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 10:46:24 -0600 Subject: [PATCH 067/126] set to the default value --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index c6191a4..f867f33 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -60,8 +60,8 @@ spec: # Replication settings max_wal_senders: "6" max_replication_slots: "6" - wal_keep_size: "2GB" - max_slot_wal_keep_size: "10GB" + wal_keep_size: "512MB" + max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" From 4d32a01c6bc5d8b151b11136c4cd7488ecfc0fdf Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 11:18:25 -0600 Subject: [PATCH 068/126] wal keep size to 1024 --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml 
b/pgdb01-cirrus/templates/postgres_cluster.yaml index f867f33..fcf380c 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -60,7 +60,7 @@ spec: # Replication settings max_wal_senders: "6" max_replication_slots: "6" - wal_keep_size: "512MB" + wal_keep_size: "1024MB" max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" From fb60485760668d22cb2804f92c45ea95a3218fd7 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 14:58:18 -0600 Subject: [PATCH 069/126] change the backup to 02 and increase max slot wal keep size --- pgdb01-cirrus/templates/postgres_cluster.yaml | 6 +----- pgdb02-cirrus/templates/postgres_cluster.yaml | 4 ++++ .../templates/weekly-backup.yaml | 0 3 files changed, 5 insertions(+), 5 deletions(-) rename {pgdb01-cirrus => pgdb02-cirrus}/templates/weekly-backup.yaml (100%) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index fcf380c..9e809ca 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -15,10 +15,6 @@ spec: cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} - backup: - # Keep 8 weekly backups - retentionPolicy: "8w" - # Add TLS certificates for encrypted communication certificates: serverTLSSecret: {{ .Values.db.name }}-server-cert @@ -61,7 +57,7 @@ spec: max_wal_senders: "6" max_replication_slots: "6" wal_keep_size: "1024MB" - max_slot_wal_keep_size: "-1" + max_slot_wal_keep_size: "100GB" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index ac9d2e6..509832c 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -15,6 +15,10 @@ spec: cpu: {{ .Values.db.resource.limits.cpu | quote }} 
memory: {{ .Values.db.resource.limits.memory }} + backup: + # Keep 8 weekly backups + retentionPolicy: "8w" + bootstrap: pg_basebackup: source: pgdb01-external diff --git a/pgdb01-cirrus/templates/weekly-backup.yaml b/pgdb02-cirrus/templates/weekly-backup.yaml similarity index 100% rename from pgdb01-cirrus/templates/weekly-backup.yaml rename to pgdb02-cirrus/templates/weekly-backup.yaml From c30681673514f14ad54cc5cedea89b67022ac7b6 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 14:58:40 -0600 Subject: [PATCH 070/126] add 1 TB to volume --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index a34047f..82db182 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -2,7 +2,7 @@ db: name: pgdb01 group: pgdb01 instances: 2 - size: 5000Gi + size: 6000Gi superUser: usernameKey: username passwordKey: password From e42e256988b19fabcbe143071a171be475ff1c6d Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 15:33:48 -0600 Subject: [PATCH 071/126] increase size and scale down to 1 instance --- pgdb01-cirrus/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 82db182..48342f8 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,8 +1,8 @@ db: name: pgdb01 group: pgdb01 - instances: 2 - size: 6000Gi + instances: 1 + size: 8000Gi superUser: usernameKey: username passwordKey: password From ae8af813ff785fa055dddeb75b7ea5321c2aa947 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 18:55:39 -0600 Subject: [PATCH 072/126] remove the backup for now and increase pgdb01 to 3 instances to clean up --- pgdb01-cirrus/values.yaml | 2 +- .../templates/{weekly-backup.yaml => weekly-backup.yaml.bak} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pgdb02-cirrus/templates/{weekly-backup.yaml => 
weekly-backup.yaml.bak} (100%) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 48342f8..24554f4 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 1 + instances: 3 size: 8000Gi superUser: usernameKey: username diff --git a/pgdb02-cirrus/templates/weekly-backup.yaml b/pgdb02-cirrus/templates/weekly-backup.yaml.bak similarity index 100% rename from pgdb02-cirrus/templates/weekly-backup.yaml rename to pgdb02-cirrus/templates/weekly-backup.yaml.bak From a4ac68ea7c0cbb3cf4532084f44d74b3fa6f216b Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 18:59:28 -0600 Subject: [PATCH 073/126] pgdb01 back down to 2 instances --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 24554f4..89819f0 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 3 + instances: 2 size: 8000Gi superUser: usernameKey: username From 5c970aba85aea158c986dc00ecfae09bf591e1fe Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 28 Aug 2025 19:37:35 -0600 Subject: [PATCH 074/126] pgdb01 scale down to 1 --- pgdb01-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 89819f0..48342f8 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 2 + instances: 1 size: 8000Gi superUser: usernameKey: username From 75ab1853e72cb8597a4de51b8e6b5ff1da499b4f Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 2 Sep 2025 15:40:26 -0600 Subject: [PATCH 075/126] reenable snapshot backups on pgdb02 --- .../templates/{weekly-backup.yaml.bak => weekly-backup.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 
pgdb02-cirrus/templates/{weekly-backup.yaml.bak => weekly-backup.yaml} (100%) diff --git a/pgdb02-cirrus/templates/weekly-backup.yaml.bak b/pgdb02-cirrus/templates/weekly-backup.yaml similarity index 100% rename from pgdb02-cirrus/templates/weekly-backup.yaml.bak rename to pgdb02-cirrus/templates/weekly-backup.yaml From ecce20080ba7b7c2d686d6867aee2724467db897 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 2 Sep 2025 16:40:36 -0600 Subject: [PATCH 076/126] increase pgdb01 instances to 2 --- pgdb01-cirrus/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 48342f8..eb8cf5d 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb01 group: pgdb01 - instances: 1 + instances: 2 size: 8000Gi superUser: usernameKey: username @@ -10,4 +10,4 @@ db: resource: limits: cpu: '16' - memory: 128Gi \ No newline at end of file + memory: 128Gi From e0e21143112d093e10ce1cf9cdd199a9a546cc14 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 11:55:38 -0600 Subject: [PATCH 077/126] add backups external secret --- .../templates/backups_external_secret.yaml | 24 +++++++++++++++++++ pgdb02-cirrus/values.yaml | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 pgdb02-cirrus/templates/backups_external_secret.yaml diff --git a/pgdb02-cirrus/templates/backups_external_secret.yaml b/pgdb02-cirrus/templates/backups_external_secret.yaml new file mode 100644 index 0000000..df65f96 --- /dev/null +++ b/pgdb02-cirrus/templates/backups_external_secret.yaml @@ -0,0 +1,24 @@ +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: backup-s3-creds + namespace: {{ .Release.Namespace }} +spec: + data: + - remoteRef: + key: {{ .Values.db.backups.secretPath }} + property: access_key + secretKey: access_key + - remoteRef: + key: {{ .Values.db.backups.secretPath }} + property: secret_key + secretKey: 
secret_key + refreshInterval: 1h + secretStoreRef: + kind: SecretStore + name: rda-ro + target: + creationPolicy: Owner + deletionPolicy: Retain + name: backup-s3-creds + diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 9345df8..0e7e7cb 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -12,4 +12,6 @@ db: resource: limits: cpu: '16' - memory: 128Gi \ No newline at end of file + memory: 128Gi + backups: + secretPath: gdex/boreas From 0d1209bcd7e14824c0b50e73671241a2ed9f3224 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 14:21:18 -0600 Subject: [PATCH 078/126] add s3 backups --- pgdb02-cirrus/templates/postgres_cluster.yaml | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 509832c..a844577 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -16,6 +16,24 @@ spec: memory: {{ .Values.db.resource.limits.memory }} backup: + {{- if .Values.db.backups.volumeSnapshot }} + volumeSnapshot: + className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} + {{- end }} + {{- if .Values.db.backups.s3 }} + barmanObjectStore: + destinationPath: {{ .Values.db.backups.s3.destinationPath }} + endpointURL: {{ .Values.db.backups.s3.endpointURL }} + s3Credentials: + accessKeyId: + name: {{ .Values.db.backups.s3.secretName }} + key: access_key + secretAccessKey: + name: {{ .Values.db.backups.s3.secretName }} + key: secret_key + {{- end }} + + # Keep 8 weekly backups retentionPolicy: "8w" @@ -122,4 +140,4 @@ spec: - host replication all 127.0.0.1/32 md5 # Remote replication - - host replication all 128.117.0.0/16 trust \ No newline at end of file + - host replication all 128.117.0.0/16 trust From 9f1edfdc1938fbdef87891f341ad814d4d357d0e Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 14:30:13 -0600 
Subject: [PATCH 079/126] add s3 backups --- pgdb02-cirrus/values.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 0e7e7cb..81f0e9c 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -14,4 +14,10 @@ db: cpu: '16' memory: 128Gi backups: - secretPath: gdex/boreas + volumeSnapshot: + snapshotClassName: csi-rbdplugin-snapclass + s3: + secretName: backup-s3-creds + endpointURL: https://boreas.hpc.ucar.edu:6443 + destinationPath: s3://pgdb02/ + secretPath: gdex/boreas From 61a63b426ab6376a280b626d6c8935eb3d5348b4 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 14:39:01 -0600 Subject: [PATCH 080/126] add s3 backups --- pgdb02-cirrus/templates/backups_external_secret.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb02-cirrus/templates/backups_external_secret.yaml b/pgdb02-cirrus/templates/backups_external_secret.yaml index df65f96..fb757cf 100644 --- a/pgdb02-cirrus/templates/backups_external_secret.yaml +++ b/pgdb02-cirrus/templates/backups_external_secret.yaml @@ -6,11 +6,11 @@ metadata: spec: data: - remoteRef: - key: {{ .Values.db.backups.secretPath }} + key: {{ .Values.db.backups.s3.secretPath }} property: access_key secretKey: access_key - remoteRef: - key: {{ .Values.db.backups.secretPath }} + key: {{ .Values.db.backups.s3.secretPath }} property: secret_key secretKey: secret_key refreshInterval: 1h @@ -20,5 +20,5 @@ spec: target: creationPolicy: Owner deletionPolicy: Retain - name: backup-s3-creds + name: {{ .Values.db.backups.s3.secretName }} From 205682775ceb4df8f4e5db7a00e88c327948a6b0 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 14:48:41 -0600 Subject: [PATCH 081/126] add s3 backups --- pgdb02-cirrus/templates/s3-backup.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 pgdb02-cirrus/templates/s3-backup.yaml diff --git 
a/pgdb02-cirrus/templates/s3-backup.yaml b/pgdb02-cirrus/templates/s3-backup.yaml new file mode 100644 index 0000000..a06d45e --- /dev/null +++ b/pgdb02-cirrus/templates/s3-backup.yaml @@ -0,0 +1,18 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: {{ .Values.db.name }}-weekly-backup + namespace: {{ .Release.Namespace }} + labels: + app: {{ .Values.db.name }} +spec: + # Schedule: Roughly midnight local time + schedule: "0 5 * * *" + + backupOwnerReference: self + + cluster: + name: {{ .Values.db.name }} + + target: primary + method: barmanObjectStore From 750d43a06ec04d8a49f082a30d91a30d627d5a31 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 24 Sep 2025 14:49:23 -0600 Subject: [PATCH 082/126] add s3 backups --- pgdb02-cirrus/templates/s3-backup.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/templates/s3-backup.yaml b/pgdb02-cirrus/templates/s3-backup.yaml index a06d45e..5171ccd 100644 --- a/pgdb02-cirrus/templates/s3-backup.yaml +++ b/pgdb02-cirrus/templates/s3-backup.yaml @@ -1,7 +1,7 @@ apiVersion: postgresql.cnpg.io/v1 kind: ScheduledBackup metadata: - name: {{ .Values.db.name }}-weekly-backup + name: {{ .Values.db.name }}-s3-backup namespace: {{ .Release.Namespace }} labels: app: {{ .Values.db.name }} From f7ddcc4943bb49272b206c30572dd5d628b02cbc Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Mon, 29 Sep 2025 15:40:55 -0600 Subject: [PATCH 083/126] test prometheus metrics --- pgdb02-cirrus/templates/postgres_cluster.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index a844577..16418b4 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -41,6 +41,8 @@ spec: pg_basebackup: source: pgdb01-external + monitoring: + enablePodMonitor: true replica: enabled: true source: pgdb01-external From 
3f02906f880986da52c4d212ff44b6daedcf452e Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 30 Sep 2025 13:36:21 -0600 Subject: [PATCH 084/126] remove pgdb02 local replica because pgdb02 needs to be rebuilt --- pgdb02-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 81f0e9c..6a131b3 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb02 group: pgdb02 - instances: 2 + instances: 1 size: 5001Gi superUser: usernameKey: username From 2c9d2ecbd2a62595ebec7725ef438db0cdd69271 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 30 Sep 2025 16:46:58 -0600 Subject: [PATCH 085/126] add backup spec to pgdb01 --- .../templates/backups_external_secret.yaml | 24 +++++++++++++++++++ pgdb01-cirrus/templates/postgres_cluster.yaml | 20 ++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 pgdb01-cirrus/templates/backups_external_secret.yaml diff --git a/pgdb01-cirrus/templates/backups_external_secret.yaml b/pgdb01-cirrus/templates/backups_external_secret.yaml new file mode 100644 index 0000000..fb757cf --- /dev/null +++ b/pgdb01-cirrus/templates/backups_external_secret.yaml @@ -0,0 +1,24 @@ +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: backup-s3-creds + namespace: {{ .Release.Namespace }} +spec: + data: + - remoteRef: + key: {{ .Values.db.backups.s3.secretPath }} + property: access_key + secretKey: access_key + - remoteRef: + key: {{ .Values.db.backups.s3.secretPath }} + property: secret_key + secretKey: secret_key + refreshInterval: 1h + secretStoreRef: + kind: SecretStore + name: rda-ro + target: + creationPolicy: Owner + deletionPolicy: Retain + name: {{ .Values.db.backups.s3.secretName }} + diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 9e809ca..0e059e1 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml 
+++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -14,6 +14,26 @@ spec: limits: cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} + + {{- if .Values.db.backups }} + backup: + {{- if .Values.db.backups.volumeSnapshot }} + volumeSnapshot: + className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} + {{- end }} + {{- if .Values.db.backups.s3 }} + barmanObjectStore: + destinationPath: {{ .Values.db.backups.s3.destinationPath }} + endpointURL: {{ .Values.db.backups.s3.endpointURL }} + s3Credentials: + accessKeyId: + name: {{ .Values.db.backups.s3.secretName }} + key: access_key + secretAccessKey: + name: {{ .Values.db.backups.s3.secretName }} + key: secret_key + {{- end }} + {{- end }} # Add TLS certificates for encrypted communication certificates: From 90480487f1962e1536377c1a8631886514e498ad Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 30 Sep 2025 16:52:11 -0600 Subject: [PATCH 086/126] add backup spec to pgdb01 --- pgdb01-cirrus/values.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index eb8cf5d..a1a1b8d 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -11,3 +11,10 @@ db: limits: cpu: '16' memory: 128Gi + + backups: + s3: + secretName: backup-s3-creds + endpointURL: https://boreas.hpc.ucar.edu:6443 + destinationPath: s3://pgdb01/ + secretPath: gdex/boreas From ebb9431267a674309dc49dc8a7bc23c60af91677 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 1 Oct 2025 11:29:14 -0600 Subject: [PATCH 087/126] update barman backup to use compression --- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 0e059e1..0d8daa9 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -23,6 +23,10 @@ spec: {{- end 
}} {{- if .Values.db.backups.s3 }} barmanObjectStore: + wal: + compression: bzip2 + data: + compression: bzip2 destinationPath: {{ .Values.db.backups.s3.destinationPath }} endpointURL: {{ .Values.db.backups.s3.endpointURL }} s3Credentials: From f8e7bcdd4c16ee43da0b02cab4c7137081fb734f Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 9 Oct 2025 13:28:09 -0600 Subject: [PATCH 088/126] pgdb02 back to 2 instances and disable s3 backups --- .../templates/{s3-backup.yaml => s3-backup.yaml.disabled} | 0 pgdb02-cirrus/values.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename pgdb02-cirrus/templates/{s3-backup.yaml => s3-backup.yaml.disabled} (100%) diff --git a/pgdb02-cirrus/templates/s3-backup.yaml b/pgdb02-cirrus/templates/s3-backup.yaml.disabled similarity index 100% rename from pgdb02-cirrus/templates/s3-backup.yaml rename to pgdb02-cirrus/templates/s3-backup.yaml.disabled diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 6a131b3..81f0e9c 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb02 group: pgdb02 - instances: 1 + instances: 2 size: 5001Gi superUser: usernameKey: username From 37da59ee45bd0cc42c2ea8350aaca2c85a4ef84c Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 9 Oct 2025 16:44:36 -0600 Subject: [PATCH 089/126] add pgdb04 for ml outage --- pgdb04-cirrus/.helmignore | 23 ++++ pgdb04-cirrus/Chart.yaml | 24 ++++ pgdb04-cirrus/README.md | 28 ++++ pgdb04-cirrus/templates/cert.yaml | 30 +++++ pgdb04-cirrus/templates/pg_service.yaml | 19 +++ pgdb04-cirrus/templates/postgres_cluster.yaml | 123 ++++++++++++++++++ .../templates/su_external_secret.yaml | 22 ++++ pgdb04-cirrus/values.yaml | 15 +++ 8 files changed, 284 insertions(+) create mode 100644 pgdb04-cirrus/.helmignore create mode 100644 pgdb04-cirrus/Chart.yaml create mode 100644 pgdb04-cirrus/README.md create mode 100644 pgdb04-cirrus/templates/cert.yaml create mode 100644 pgdb04-cirrus/templates/pg_service.yaml 
create mode 100644 pgdb04-cirrus/templates/postgres_cluster.yaml create mode 100644 pgdb04-cirrus/templates/su_external_secret.yaml create mode 100644 pgdb04-cirrus/values.yaml diff --git a/pgdb04-cirrus/.helmignore b/pgdb04-cirrus/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/pgdb04-cirrus/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/pgdb04-cirrus/Chart.yaml b/pgdb04-cirrus/Chart.yaml new file mode 100644 index 0000000..39aaa9a --- /dev/null +++ b/pgdb04-cirrus/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: CloudNative-PostgreSQL Template +description: A CloudnativePG Helm chart template for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. 
They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "0.1.0" diff --git a/pgdb04-cirrus/README.md b/pgdb04-cirrus/README.md new file mode 100644 index 0000000..43d678e --- /dev/null +++ b/pgdb04-cirrus/README.md @@ -0,0 +1,28 @@ +# postgres-helm +A chart for deploying a PostgreSQL database cluster to the CISL cloud with Helm. This requires a superuser and regular user username and password to access the PostgreSQL database to be stored in bao.k8s.ucar.edu so it can be injected in to the required containers appropriately. + +```{note} +Information required to create a Helm chart for your web application: +1. A Name for the database. This will be used as a hostname to connect to via .k8s.ucar.edu +2. The number of PostgreSQL servers to run in the database cluster +3. The size of the volume to mount to the database and a unique name for the volume. +4. Secret information to access the database. This should be stored in bao.k8s.ucar.edu. An example of what a path would look like is, /database01. Under that path use the keys postgresuser and postgrespass to store the username and password for the DB securely. +``` + +## Update values.yaml file +In the `postgres-helm/` directory is a file named `values.yaml` which contains all the specific details for your application. You need to update the following values to be unique for your deployment: + + - `#DATABASE_NAME` : The name of the database. + - `#DATABASE_APP_GROUP` : The group of applications to run the database with. If it's a standalone DB this can just be the DB name. + - `#DATABASE_CLUSTER_MEMBERS` : The number of PostgreSQL database servers running for the cluster + - `#DATABASE_SIZE` : How large to make the database in Gi. + - `#SU_USERNAME_SECRET_KEY` : The superuser username key as designated in bao.k8s.ucar.edu. + - `#SU_PASSWORD_SECRET_KEY` : The superuser password key as designated in bao.k8s.ucar.edu. 
+ - `#SU_SECRET_PATH` : The superuser secret path designated in bao.k8s.ucar.edu. + - `#APP_USERNAME_SECRET_KEY` : The database username key to query in bao.k8s.ucar.edu in order to get the username value. + - `#APP_PASSWORD_SECRET_KEY` : The database password key to query in bao.k8s.ucar.edu in order to get the password value. + - `#APP_USER_SECRET_PATH` : The path in bao.k8s.ucar.edu where the DB secrets are stored. + + +## Update Chart.yaml +The Chart.yaml file is mostly used to describe your application and keep track of what versions you are on and running. \ No newline at end of file diff --git a/pgdb04-cirrus/templates/cert.yaml b/pgdb04-cirrus/templates/cert.yaml new file mode 100644 index 0000000..ec7310f --- /dev/null +++ b/pgdb04-cirrus/templates/cert.yaml @@ -0,0 +1,30 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ .Values.db.name }}-selfsigned-issuer +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ .Values.db.name }}-server-cert +spec: + secretName: {{ .Values.db.name }}-server-cert + usages: + - server auth + dnsNames: + - "{{ .Values.db.name }}.k8s.ucar.edu" + - {{ .Values.db.name }}-rw + - {{ .Values.db.name }}-rw.{{ .Release.Namespace }} + - {{ .Values.db.name }}-rw.{{ .Release.Namespace }}.svc + - {{ .Values.db.name }}-r + - {{ .Values.db.name }}-r.{{ .Release.Namespace }} + - {{ .Values.db.name }}-r.{{ .Release.Namespace }}.svc + - {{ .Values.db.name }}-ro + - {{ .Values.db.name }}-ro.{{ .Release.Namespace }} + - {{ .Values.db.name }}-ro.{{ .Release.Namespace }}.svc + issuerRef: + name: {{ .Values.db.name }}-selfsigned-issuer + kind: Issuer + group: cert-manager.io \ No newline at end of file diff --git a/pgdb04-cirrus/templates/pg_service.yaml b/pgdb04-cirrus/templates/pg_service.yaml new file mode 100644 index 0000000..68ae506 --- /dev/null +++ b/pgdb04-cirrus/templates/pg_service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ 
.Values.db.name }}-service + namespace: {{ .Release.Namespace }} + labels: + cirrus/lb: internal + annotations: + external-dns.alpha.kubernetes.io/hostname: "{{ .Values.db.name }}.k8s.ucar.edu" + external-dns.alpha.kubernetes.io/ttl: "300" +spec: + type: LoadBalancer + selector: + cnpg.io/cluster: {{ .Values.db.name }} + cnpg.io/instanceRole: primary + ports: + - port: 5432 + targetPort: 5432 + protocol: TCP \ No newline at end of file diff --git a/pgdb04-cirrus/templates/postgres_cluster.yaml b/pgdb04-cirrus/templates/postgres_cluster.yaml new file mode 100644 index 0000000..4069e31 --- /dev/null +++ b/pgdb04-cirrus/templates/postgres_cluster.yaml @@ -0,0 +1,123 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: {{ .Values.db.name }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ .Values.db.name }} + group: {{ .Values.db.group }} +spec: + instances: {{ .Values.db.instances }} + storage: + size: {{ .Values.db.size }} + resources: + limits: + cpu: {{ .Values.db.resource.limits.cpu | quote }} + memory: {{ .Values.db.resource.limits.memory }} + + monitoring: + enablePodMonitor: true + + bootstrap: + pg_basebackup: + source: pgdb03-external + replica: + enabled: true + source: pgdb03-external + + externalClusters: + - name: pgdb03-external + connectionParameteres: + host: pgdb03.k8s.ucar.edu + user: "rep1" + sslmode: prefer + password: + name: {{ .Values.db.name }}-superuser + key: replication-password + + # Add TLS certificates for encrypted communication + certificates: + serverTLSSecret: {{ .Values.db.name }}-server-cert + serverCASecret: {{ .Values.db.name }}-server-cert + + # Enable superuser access + enableSuperuserAccess: true + + # Configure postgres superuser from su_external_secret + superuserSecret: + name: "{{ .Values.db.name }}-superuser" + + # Allow outside hosts to connect to the database + postgresql: + parameters: + # Connection settings + max_connections: "500" + + # SSL Configuration + ssl_ciphers: "HIGH:!aNULL" + 
ssl_min_protocol_version: "TLSv1.3" + + # Memory settings + shared_buffers: "32GB" + temp_buffers: "64MB" + work_mem: "64MB" + maintenance_work_mem: "512MB" + dynamic_shared_memory_type: "posix" + + # Resource limits + max_files_per_process: "2000" + + # WAL settings + wal_level: "replica" + checkpoint_timeout: "15min" + checkpoint_completion_target: "0.9" + max_wal_size: "16GB" + min_wal_size: "512MB" + + # Replication settings + max_wal_senders: "3" + max_replication_slots: "3" + wal_keep_size: "4GB" + max_slot_wal_keep_size: "-1" + max_standby_archive_delay: "-1" + max_standby_streaming_delay: "-1" + + # Logging settings + logging_collector: "on" + log_rotation_age: "0" + log_min_duration_statement: "120000" + log_line_prefix: "%t %a [%p] " + log_timezone: "America/Denver" + + # Locale and timezone settings + datestyle: "iso, mdy" + timezone: "America/Denver" + lc_messages: "en_US.UTF-8" + lc_monetary: "en_US.UTF-8" + lc_numeric: "en_US.UTF-8" + lc_time: "en_US.UTF-8" + default_text_search_config: "pg_catalog.english" + + # Lock management + max_locks_per_transaction: "1024" + + pg_hba: + # Local connections with md5 authentication + - local all root md5 + - local all all md5 + + # IPv4 local connections with md5 + - host all all 127.0.0.1/32 md5 + + # IPv6 local connections with md5 + - host all all ::1/128 md5 + + # IPv4 remote connections for UCAR network + - host all all 128.117.0.0/16 md5 + + # Replication connections + - local replication all md5 + - host replication all 127.0.0.1/32 md5 + + # Remote replication + - host replication all 128.117.0.0/16 trust \ No newline at end of file diff --git a/pgdb04-cirrus/templates/su_external_secret.yaml b/pgdb04-cirrus/templates/su_external_secret.yaml new file mode 100644 index 0000000..d00f818 --- /dev/null +++ b/pgdb04-cirrus/templates/su_external_secret.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: {{ .Values.db.name }}-superuser-esos + namespace: {{ 
.Release.Namespace }} +spec: + refreshInterval: 1h + secretStoreRef: + name: rda-ro + kind: SecretStore + target: + name: {{ .Values.db.name }}-superuser + data: + - secretKey: username + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.usernameKey }} + - secretKey: password + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.passwordKey }} + \ No newline at end of file diff --git a/pgdb04-cirrus/values.yaml b/pgdb04-cirrus/values.yaml new file mode 100644 index 0000000..6ca75e2 --- /dev/null +++ b/pgdb04-cirrus/values.yaml @@ -0,0 +1,15 @@ +db: + name: pgdb04 + group: pgdb04 + instances: 1 + size: 9000Gi + superUser: + usernameKey: username + passwordKey: password + replicationUserKey: repuser + replicationPassKey: reppass + secretPath: gdex/pgdb04 + resource: + limits: + cpu: '16' + memory: 128Gi \ No newline at end of file From a2af6641adc8be328cd32a81cd90a998ab341a27 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 9 Oct 2025 16:54:59 -0600 Subject: [PATCH 090/126] typo --- pgdb04-cirrus/templates/postgres_cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb04-cirrus/templates/postgres_cluster.yaml b/pgdb04-cirrus/templates/postgres_cluster.yaml index 4069e31..d3aee41 100644 --- a/pgdb04-cirrus/templates/postgres_cluster.yaml +++ b/pgdb04-cirrus/templates/postgres_cluster.yaml @@ -27,7 +27,7 @@ spec: externalClusters: - name: pgdb03-external - connectionParameteres: + connectionParameters: host: pgdb03.k8s.ucar.edu user: "rep1" sslmode: prefer From 83be26cf9df58dc0097b60970bbae3a4ffb72fe2 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 9 Oct 2025 17:01:43 -0600 Subject: [PATCH 091/126] pgdb04 secret futz --- pgdb04-cirrus/templates/su_external_secret.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pgdb04-cirrus/templates/su_external_secret.yaml b/pgdb04-cirrus/templates/su_external_secret.yaml index d00f818..d8419a0 
100644 --- a/pgdb04-cirrus/templates/su_external_secret.yaml +++ b/pgdb04-cirrus/templates/su_external_secret.yaml @@ -19,4 +19,12 @@ spec: remoteRef: key: {{ .Values.db.superUser.secretPath }} property: {{ .Values.db.superUser.passwordKey }} + - secretKey: replication-username + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.replicationUserKey }} + - secretKey: replication-password + remoteRef: + key: {{ .Values.db.superUser.secretPath }} + property: {{ .Values.db.superUser.replicationPassKey }} \ No newline at end of file From ab50eed9593360db267451280ff5079dc02ac3f6 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 9 Oct 2025 17:10:17 -0600 Subject: [PATCH 092/126] pgdb03: increase max wal senders --- pgdb03-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index b7b9ec4..276aec4 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -55,8 +55,8 @@ spec: min_wal_size: "512MB" # Replication settings - max_wal_senders: "3" - max_replication_slots: "3" + max_wal_senders: "6" + max_replication_slots: "6" wal_keep_size: "4GB" max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" From 0ccf75aff0304c354491454631a8e8c231a3cb6e Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Mon, 13 Oct 2025 10:34:21 -0600 Subject: [PATCH 093/126] increase pgdb04 instances to 2 --- pgdb04-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb04-cirrus/values.yaml b/pgdb04-cirrus/values.yaml index 6ca75e2..9a06f83 100644 --- a/pgdb04-cirrus/values.yaml +++ b/pgdb04-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb04 group: pgdb04 - instances: 1 + instances: 2 size: 9000Gi superUser: usernameKey: username From d6ced0b7e7b06b0a7321d873c95fd1a3e6bcab86 Mon Sep 17 00:00:00 2001 From: Kevin 
Hrpcek Date: Tue, 14 Oct 2025 11:00:38 -0600 Subject: [PATCH 094/126] pgdb02 backup fixes --- pgdb02-cirrus/templates/s3-backup.yaml.disabled | 2 +- pgdb02-cirrus/templates/weekly-backup.yaml | 2 +- pgdb02-cirrus/values.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgdb02-cirrus/templates/s3-backup.yaml.disabled b/pgdb02-cirrus/templates/s3-backup.yaml.disabled index 5171ccd..10c5780 100644 --- a/pgdb02-cirrus/templates/s3-backup.yaml.disabled +++ b/pgdb02-cirrus/templates/s3-backup.yaml.disabled @@ -7,7 +7,7 @@ metadata: app: {{ .Values.db.name }} spec: # Schedule: Roughly midnight local time - schedule: "0 5 * * *" + schedule: "0 0 5 * * *" backupOwnerReference: self diff --git a/pgdb02-cirrus/templates/weekly-backup.yaml b/pgdb02-cirrus/templates/weekly-backup.yaml index e64418c..c9d1899 100644 --- a/pgdb02-cirrus/templates/weekly-backup.yaml +++ b/pgdb02-cirrus/templates/weekly-backup.yaml @@ -7,7 +7,7 @@ metadata: app: {{ .Values.db.name }} spec: # Schedule: Every Friday at 11:00 PM - schedule: "0 23 * * 5" + schedule: "0 0 23 * * 5" backupOwnerReference: self diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 81f0e9c..b38ec66 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -17,7 +17,7 @@ db: volumeSnapshot: snapshotClassName: csi-rbdplugin-snapclass s3: - secretName: backup-s3-creds - endpointURL: https://boreas.hpc.ucar.edu:6443 + secretName: backup-rgw-creds + endpointURL: https://s3.k8s.ucar.edu:5443 destinationPath: s3://pgdb02/ secretPath: gdex/boreas From db3a2f4e10bf1d965ec7b1c0845a91b2e77b12f7 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 6 Nov 2025 16:29:36 -0700 Subject: [PATCH 095/126] pgdb01: disable backups --- pgdb01-cirrus/values.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index a1a1b8d..3baf7ea 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -12,9 +12,3 @@ db: 
cpu: '16' memory: 128Gi - backups: - s3: - secretName: backup-s3-creds - endpointURL: https://boreas.hpc.ucar.edu:6443 - destinationPath: s3://pgdb01/ - secretPath: gdex/boreas From 6c75aca1bc3a848768e98509aa3fe8396e912f4a Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 6 Nov 2025 17:05:14 -0700 Subject: [PATCH 096/126] pgdb01: disable backups --- pgdb01-cirrus/templates/backups_external_secret.yaml | 3 ++- pgdb01-cirrus/templates/postgres_cluster.yaml | 4 ++-- pgdb01-cirrus/values.yaml | 5 ++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pgdb01-cirrus/templates/backups_external_secret.yaml b/pgdb01-cirrus/templates/backups_external_secret.yaml index fb757cf..f859b56 100644 --- a/pgdb01-cirrus/templates/backups_external_secret.yaml +++ b/pgdb01-cirrus/templates/backups_external_secret.yaml @@ -1,3 +1,4 @@ +{{- if .Values.db.backups.s3.enabled }} apiVersion: external-secrets.io/v1beta1 kind: ExternalSecret metadata: @@ -21,4 +22,4 @@ spec: creationPolicy: Owner deletionPolicy: Retain name: {{ .Values.db.backups.s3.secretName }} - +{{- end -}} diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 0d8daa9..e6aa4c6 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -21,7 +21,7 @@ spec: volumeSnapshot: className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} {{- end }} - {{- if .Values.db.backups.s3 }} + {{- if .Values.db.backups.s3.enabled }} barmanObjectStore: wal: compression: bzip2 @@ -126,4 +126,4 @@ spec: - host replication all 128.117.0.0/16 trust # Remote replication from Kubernetes pod network - - host replication all 10.0.0.0/16 trust \ No newline at end of file + - host replication all 10.0.0.0/16 trust diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index 3baf7ea..f1e8418 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -11,4 +11,7 @@ db: limits: cpu: '16' 
memory: 128Gi - + + backups: + s3: + enabled: false From cd83195f0b435ddb743f7d200ab5ad138c757aa1 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Mon, 10 Nov 2025 16:11:43 -0700 Subject: [PATCH 097/126] pgdb02: update chart and remove barman config --- pgdb02-cirrus/templates/backups_external_secret.yaml | 3 ++- pgdb02-cirrus/templates/postgres_cluster.yaml | 7 +++++-- pgdb02-cirrus/templates/weekly-backup.yaml | 2 ++ pgdb02-cirrus/values.yaml | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pgdb02-cirrus/templates/backups_external_secret.yaml b/pgdb02-cirrus/templates/backups_external_secret.yaml index fb757cf..29b529f 100644 --- a/pgdb02-cirrus/templates/backups_external_secret.yaml +++ b/pgdb02-cirrus/templates/backups_external_secret.yaml @@ -1,3 +1,4 @@ +{{- if .Values.db.backups.s3.enabled }} apiVersion: external-secrets.io/v1beta1 kind: ExternalSecret metadata: @@ -21,4 +22,4 @@ spec: creationPolicy: Owner deletionPolicy: Retain name: {{ .Values.db.backups.s3.secretName }} - +{{- end }} \ No newline at end of file diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 16418b4..d817bc0 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -15,12 +15,13 @@ spec: cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} + {{- if .Values.db.backups }} backup: - {{- if .Values.db.backups.volumeSnapshot }} + {{- if .Values.db.backups.volumeSnapshot.enabled }} volumeSnapshot: className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} {{- end }} - {{- if .Values.db.backups.s3 }} + {{- if .Values.db.backups.s3.enabled }} barmanObjectStore: destinationPath: {{ .Values.db.backups.s3.destinationPath }} endpointURL: {{ .Values.db.backups.s3.endpointURL }} @@ -32,6 +33,7 @@ spec: name: {{ .Values.db.backups.s3.secretName }} key: secret_key {{- end }} + {{- end }} # Keep 8 weekly 
backups @@ -103,6 +105,7 @@ spec: max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" + primary_slot_name: "stream_slot" # Logging settings logging_collector: "on" diff --git a/pgdb02-cirrus/templates/weekly-backup.yaml b/pgdb02-cirrus/templates/weekly-backup.yaml index c9d1899..8825106 100644 --- a/pgdb02-cirrus/templates/weekly-backup.yaml +++ b/pgdb02-cirrus/templates/weekly-backup.yaml @@ -1,3 +1,4 @@ +{{- if .Values.db.backups.volumeSnapshot.enabled }} apiVersion: postgresql.cnpg.io/v1 kind: ScheduledBackup metadata: @@ -16,3 +17,4 @@ spec: target: primary method: volumeSnapshot +{{- end }} \ No newline at end of file diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index b38ec66..67adedb 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -15,8 +15,10 @@ db: memory: 128Gi backups: volumeSnapshot: + enabled: true snapshotClassName: csi-rbdplugin-snapclass s3: + enabled: false secretName: backup-rgw-creds endpointURL: https://s3.k8s.ucar.edu:5443 destinationPath: s3://pgdb02/ From b7e8b2092e76fc2ca2034b2f4ccc302274d9ae23 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Mon, 10 Nov 2025 16:21:19 -0700 Subject: [PATCH 098/126] pgdb02: remove primary stream slot name --- pgdb02-cirrus/templates/postgres_cluster.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index d817bc0..540f058 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -105,7 +105,6 @@ spec: max_slot_wal_keep_size: "-1" max_standby_archive_delay: "-1" max_standby_streaming_delay: "-1" - primary_slot_name: "stream_slot" # Logging settings logging_collector: "on" From 251df28c97fb7b4f724769d413652c54f7e9b741 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 12 Nov 2025 13:03:29 -0700 Subject: [PATCH 099/126] add barman plugin method --- 
pgdb02-cirrus/templates/postgres_cluster.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 540f058..5276c4c 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -39,6 +39,13 @@ spec: # Keep 8 weekly backups retentionPolicy: "8w" + plugins: + - enabled: true + isWALArchiver: false + name: barman-cloud.cloudnative-pg.io + parameters: + barmanObjectName: cirrus-s3 + bootstrap: pg_basebackup: source: pgdb01-external From cb6f1a1d5195f929b18b9bba06dffbe1f39b5c64 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 20 Nov 2025 10:47:38 -0700 Subject: [PATCH 100/126] pgdb02: remove legacy barman config and add objectstore --- pgdb02-cirrus/templates/objectstore.yaml | 19 +++++++++++++++++++ pgdb02-cirrus/templates/postgres_cluster.yaml | 18 ++++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) create mode 100644 pgdb02-cirrus/templates/objectstore.yaml diff --git a/pgdb02-cirrus/templates/objectstore.yaml b/pgdb02-cirrus/templates/objectstore.yaml new file mode 100644 index 0000000..e152efb --- /dev/null +++ b/pgdb02-cirrus/templates/objectstore.yaml @@ -0,0 +1,19 @@ +{{- if .Values.db.backups.s3.enabled }} +apiVersion: barmancloud.cnpg.io/v1 +kind: ObjectStore +metadata: + name: boreas +spec: + configuration: + destinationPath: {{ .Values.db.backups.s3.destinationPath }} + endpointURL: {{ .Values.db.backups.s3.endpointURL }} + s3Credentials: + accessKeyId: + name: {{ .Values.db.backups.s3.secretName }} + key: access_key + secretAccessKey: + name: {{ .Values.db.backups.s3.secretName }} + key: secret_key + wal: + compression: bzip2 +{{- end }} \ No newline at end of file diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 5276c4c..75ff6c7 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++
b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -21,30 +21,20 @@ spec: volumeSnapshot: className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} {{- end }} - {{- if .Values.db.backups.s3.enabled }} - barmanObjectStore: - destinationPath: {{ .Values.db.backups.s3.destinationPath }} - endpointURL: {{ .Values.db.backups.s3.endpointURL }} - s3Credentials: - accessKeyId: - name: {{ .Values.db.backups.s3.secretName }} - key: access_key - secretAccessKey: - name: {{ .Values.db.backups.s3.secretName }} - key: secret_key - {{- end }} - {{- end }} + {{- end }} # Keep 8 weekly backups retentionPolicy: "8w" + {{- if .Values.db.backups.s3.enabled }} plugins: - enabled: true isWALArchiver: false name: barman-cloud.cloudnative-pg.io parameters: - barmanObjectName: cirrus-s3 + barmanObjectName: boreas + {{- end }} bootstrap: pg_basebackup: From fa00b35f2eef9618547f5dc1de8bc29520922c52 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 20 Nov 2025 11:18:45 -0700 Subject: [PATCH 101/126] pgdb02: forgot this file --- pgdb02-cirrus/values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 67adedb..4a1e2c5 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -18,8 +18,8 @@ db: enabled: true snapshotClassName: csi-rbdplugin-snapclass s3: - enabled: false + enabled: true secretName: backup-rgw-creds - endpointURL: https://s3.k8s.ucar.edu:5443 - destinationPath: s3://pgdb02/ + endpointURL: https://boreas.hpc.ucar.edu:6443 + destinationPath: s3://gdex secretPath: gdex/boreas From fac40c4de364feeb225da4c71781f1640c3c90fd Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 20 Nov 2025 11:51:07 -0700 Subject: [PATCH 102/126] pgdb02: forgot this file --- pgdb02-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 4a1e2c5..242cf90 100644 --- a/pgdb02-cirrus/values.yaml +++ 
b/pgdb02-cirrus/values.yaml @@ -19,7 +19,7 @@ db: snapshotClassName: csi-rbdplugin-snapclass s3: enabled: true - secretName: backup-rgw-creds + secretName: backup-s3-creds endpointURL: https://boreas.hpc.ucar.edu:6443 destinationPath: s3://gdex secretPath: gdex/boreas From eb3a732b9e8fe0c6ff1738c5daf54a5ca7c93d84 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 20 Nov 2025 16:15:45 -0700 Subject: [PATCH 103/126] pgdb02: update s3 scheduledbackup --- .../templates/{s3-backup.yaml.disabled => s3-backup.yaml} | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) rename pgdb02-cirrus/templates/{s3-backup.yaml.disabled => s3-backup.yaml} (75%) diff --git a/pgdb02-cirrus/templates/s3-backup.yaml.disabled b/pgdb02-cirrus/templates/s3-backup.yaml similarity index 75% rename from pgdb02-cirrus/templates/s3-backup.yaml.disabled rename to pgdb02-cirrus/templates/s3-backup.yaml index 10c5780..b78c1f2 100644 --- a/pgdb02-cirrus/templates/s3-backup.yaml.disabled +++ b/pgdb02-cirrus/templates/s3-backup.yaml @@ -7,7 +7,7 @@ metadata: app: {{ .Values.db.name }} spec: # Schedule: Roughly midnight local time - schedule: "0 0 5 * * *" + schedule: "0 0 5 */3 * *" backupOwnerReference: self @@ -15,4 +15,6 @@ spec: name: {{ .Values.db.name }} target: primary - method: barmanObjectStore + method: plugin + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io From d07c64e7ad7b057be69705be010d4821db798b83 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Mon, 24 Nov 2025 13:30:08 -0700 Subject: [PATCH 104/126] pgdb02: add backup compressor --- .../templates/backup_compressor.yaml | 40 +++++++++++++++++++ .../templates/backups_external_secret.yaml | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 pgdb02-cirrus/templates/backup_compressor.yaml diff --git a/pgdb02-cirrus/templates/backup_compressor.yaml b/pgdb02-cirrus/templates/backup_compressor.yaml new file mode 100644 index 0000000..e6f52f4 --- /dev/null +++ 
b/pgdb02-cirrus/templates/backup_compressor.yaml @@ -0,0 +1,40 @@ +{{- if .Values.db.backups.s3.enabled }} +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ .Values.db.name }}-s3-backup-compressor +spec: + schedule: "0 8 * * *" + jobTemplate: + spec: + template: + spec: + restartPolicy: OnFailure + containers: + - name: compressor + image: hub.k8s.ucar.edu/khrpcek/backup-compressor:kmh + command: ["/multi_compress.sh"] + imagePullPolicy: Always + env: + - name: BASEDIR + value: "{{ .Values.db.backups.s3.destinationPath }}/{{ .Values.db.name }}/base/" + - name: AWS_ENDPOINT_URL + value: "{{ .Values.db.backups.s3.endpointURL }}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ .Values.db.backups.s3.secretName }} + key: access_key + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.db.backups.s3.secretName }} + key: secret_key + resources: + requests: + memory: 12Gi + cpu: 30 + limits: + memory: 16Gi + cpu: 32 +{{- end }} \ No newline at end of file diff --git a/pgdb02-cirrus/templates/backups_external_secret.yaml b/pgdb02-cirrus/templates/backups_external_secret.yaml index 29b529f..bd15724 100644 --- a/pgdb02-cirrus/templates/backups_external_secret.yaml +++ b/pgdb02-cirrus/templates/backups_external_secret.yaml @@ -2,7 +2,7 @@ apiVersion: external-secrets.io/v1beta1 kind: ExternalSecret metadata: - name: backup-s3-creds + name: {{ .Values.db.name }}-backup-s3-creds namespace: {{ .Release.Namespace }} spec: data: From c9a991486ea1a6da39061d4f72117b7051250e92 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Tue, 25 Nov 2025 13:55:02 -0700 Subject: [PATCH 105/126] pgdb02: give s3 backups more time before compression starts --- pgdb02-cirrus/templates/backup_compressor.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb02-cirrus/templates/backup_compressor.yaml b/pgdb02-cirrus/templates/backup_compressor.yaml index e6f52f4..6f34df4 100644 --- 
a/pgdb02-cirrus/templates/backup_compressor.yaml +++ b/pgdb02-cirrus/templates/backup_compressor.yaml @@ -4,7 +4,7 @@ kind: CronJob metadata: name: {{ .Values.db.name }}-s3-backup-compressor spec: - schedule: "0 8 * * *" + schedule: "0 12 * * *" jobTemplate: spec: template: @@ -37,4 +37,4 @@ spec: limits: memory: 16Gi cpu: 32 -{{- end }} \ No newline at end of file +{{- end }} From 3229f5a49233edf1fe990d1c6aef83f0319cf313 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Wed, 3 Dec 2025 11:19:06 -0700 Subject: [PATCH 106/126] add retention policy to s3 backups --- pgdb02-cirrus/templates/objectstore.yaml | 1 + pgdb02-cirrus/values.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pgdb02-cirrus/templates/objectstore.yaml b/pgdb02-cirrus/templates/objectstore.yaml index e152efb..0ef5d9c 100644 --- a/pgdb02-cirrus/templates/objectstore.yaml +++ b/pgdb02-cirrus/templates/objectstore.yaml @@ -4,6 +4,7 @@ kind: ObjectStore metadata: name: boreas spec: + retentionPolicy: "14d" configuration: destinationPath: {{ .Values.db.backups.s3.destinationPath }} endpointURL: {{ .Values.db.backups.s3.endpointURL }} diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 242cf90..55c5649 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -19,7 +19,7 @@ db: snapshotClassName: csi-rbdplugin-snapclass s3: enabled: true - secretName: backup-s3-creds + secretName: pgdb02-backup-s3-creds endpointURL: https://boreas.hpc.ucar.edu:6443 destinationPath: s3://gdex secretPath: gdex/boreas From 1242fa8b3d82f0b2321bf1286f0aaa28d66960b5 Mon Sep 17 00:00:00 2001 From: Kevin Hrpcek Date: Thu, 29 Jan 2026 13:29:45 -0700 Subject: [PATCH 107/126] update external secret api --- pgdb01-cirrus/templates/backups_external_secret.yaml | 2 +- pgdb01-cirrus/templates/su_external_secret.yaml | 2 +- pgdb02-cirrus/templates/backups_external_secret.yaml | 2 +- pgdb02-cirrus/templates/su_external_secret.yaml | 2 +- 
pgdb03-cirrus/templates/su_external_secret.yaml | 2 +- pgdb04-cirrus/templates/su_external_secret.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgdb01-cirrus/templates/backups_external_secret.yaml b/pgdb01-cirrus/templates/backups_external_secret.yaml index f859b56..f7dd7e6 100644 --- a/pgdb01-cirrus/templates/backups_external_secret.yaml +++ b/pgdb01-cirrus/templates/backups_external_secret.yaml @@ -1,5 +1,5 @@ {{- if .Values.db.backups.s3.enabled }} -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: backup-s3-creds diff --git a/pgdb01-cirrus/templates/su_external_secret.yaml b/pgdb01-cirrus/templates/su_external_secret.yaml index d00f818..3567a2e 100644 --- a/pgdb01-cirrus/templates/su_external_secret.yaml +++ b/pgdb01-cirrus/templates/su_external_secret.yaml @@ -1,4 +1,4 @@ -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: {{ .Values.db.name }}-superuser-esos diff --git a/pgdb02-cirrus/templates/backups_external_secret.yaml b/pgdb02-cirrus/templates/backups_external_secret.yaml index bd15724..74dc9f8 100644 --- a/pgdb02-cirrus/templates/backups_external_secret.yaml +++ b/pgdb02-cirrus/templates/backups_external_secret.yaml @@ -1,5 +1,5 @@ {{- if .Values.db.backups.s3.enabled }} -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: {{ .Values.db.name }}-backup-s3-creds diff --git a/pgdb02-cirrus/templates/su_external_secret.yaml b/pgdb02-cirrus/templates/su_external_secret.yaml index d8419a0..f66ebc9 100644 --- a/pgdb02-cirrus/templates/su_external_secret.yaml +++ b/pgdb02-cirrus/templates/su_external_secret.yaml @@ -1,4 +1,4 @@ -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: {{ .Values.db.name }}-superuser-esos diff --git a/pgdb03-cirrus/templates/su_external_secret.yaml 
b/pgdb03-cirrus/templates/su_external_secret.yaml index d00f818..3567a2e 100644 --- a/pgdb03-cirrus/templates/su_external_secret.yaml +++ b/pgdb03-cirrus/templates/su_external_secret.yaml @@ -1,4 +1,4 @@ -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: {{ .Values.db.name }}-superuser-esos diff --git a/pgdb04-cirrus/templates/su_external_secret.yaml b/pgdb04-cirrus/templates/su_external_secret.yaml index d8419a0..f66ebc9 100644 --- a/pgdb04-cirrus/templates/su_external_secret.yaml +++ b/pgdb04-cirrus/templates/su_external_secret.yaml @@ -1,4 +1,4 @@ -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: {{ .Values.db.name }}-superuser-esos From 29ce54eaad9c7f14367676fb2e217e59457d4caa Mon Sep 17 00:00:00 2001 From: Nicholas Cote <131394540+NicholasCote@users.noreply.github.com> Date: Fri, 6 Feb 2026 12:09:16 -0700 Subject: [PATCH 108/126] Disable S3 backups until plugin cert issue is fixed There's a cert issue with the barman S3 backup plugin. We still have snapshots in place, and this is a replica so I feel good with disabling this for now in order to get the replication restarted and this cluster healthy and up to date. 
--- pgdb02-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 55c5649..78f66ec 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -18,7 +18,7 @@ db: enabled: true snapshotClassName: csi-rbdplugin-snapclass s3: - enabled: true + enabled: false secretName: pgdb02-backup-s3-creds endpointURL: https://boreas.hpc.ucar.edu:6443 destinationPath: s3://gdex From 4c07325dcb062658dffff8092ddc667366888c9e Mon Sep 17 00:00:00 2001 From: Nicholas Cote <131394540+NicholasCote@users.noreply.github.com> Date: Mon, 9 Feb 2026 10:20:20 -0700 Subject: [PATCH 109/126] Update values.yaml Scale replication target down to 1 --- pgdb02-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb02-cirrus/values.yaml b/pgdb02-cirrus/values.yaml index 78f66ec..9fec517 100644 --- a/pgdb02-cirrus/values.yaml +++ b/pgdb02-cirrus/values.yaml @@ -1,7 +1,7 @@ db: name: pgdb02 group: pgdb02 - instances: 2 + instances: 1 size: 5001Gi superUser: usernameKey: username From 1fb59ad8775f418ac35d264b9cd387a27cb0cf15 Mon Sep 17 00:00:00 2001 From: Nicholas Cote <131394540+NicholasCote@users.noreply.github.com> Date: Mon, 9 Feb 2026 16:07:56 -0700 Subject: [PATCH 110/126] Update postgres_cluster.yaml pin pgdb02 image to the same as pgdb01 --- pgdb02-cirrus/templates/postgres_cluster.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 75ff6c7..7dd76b9 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -7,6 +7,7 @@ metadata: app: {{ .Values.db.name }} group: {{ .Values.db.group }} spec: + imageName: ghcr.io/cloudnative-pg/postgresql:17.4 instances: {{ .Values.db.instances }} storage: size: {{ .Values.db.size }} From a08a351e1462ef5943b91814890d4ee053d4436f Mon Sep 17 00:00:00 2001 From: Nick Cote Date: 
Tue, 10 Feb 2026 13:15:04 -0700 Subject: [PATCH 111/126] Add alert rules for replication --- pgdb02-cirrus/templates/alerts.yaml | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 pgdb02-cirrus/templates/alerts.yaml diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alerts.yaml new file mode 100644 index 0000000..1bcd531 --- /dev/null +++ b/pgdb02-cirrus/templates/alerts.yaml @@ -0,0 +1,47 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cnpg-replication-alerts + namespace: rda + labels: + team: gdex-app-team +spec: + groups: + - name: cnpg.replication + interval: 60s + rules: + - alert: CNPGReplicationLagHigh + expr: | + cnpg_pg_replication_lag{namespace="rda"} > 100 + for: 15m + labels: + severity: warning + team: gdex-app-team + namespace: rda + annotations: + summary: "CNPG replication lag high on {{ $labels.pod }}" + description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}" + + - alert: CNPGReplicationBroken + expr: | + cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "CNPG replication broken for {{ $labels.cluster }}" + description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken." 
+ + - alert: CNPGClusterNotHealthy + expr: | + cnpg_collector_up{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "CNPG cluster {{ $labels.cluster }} is not healthy" + description: "The CNPG exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file From f507db8d32748c22568b32f78f09d5293cad475c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 10 Feb 2026 13:15:43 -0700 Subject: [PATCH 112/126] Add if s3 enabled --- pgdb02-cirrus/templates/s3-backup.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pgdb02-cirrus/templates/s3-backup.yaml b/pgdb02-cirrus/templates/s3-backup.yaml index b78c1f2..62bb677 100644 --- a/pgdb02-cirrus/templates/s3-backup.yaml +++ b/pgdb02-cirrus/templates/s3-backup.yaml @@ -1,3 +1,4 @@ +{{- if .Values.db.backups.s3.enabled }} apiVersion: postgresql.cnpg.io/v1 kind: ScheduledBackup metadata: @@ -18,3 +19,4 @@ spec: method: plugin pluginConfiguration: name: barman-cloud.cloudnative-pg.io +{{- end }} \ No newline at end of file From c254d9dd49e9d3cdebaeeb47f5df6d2429df9cb9 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 10 Feb 2026 13:34:55 -0700 Subject: [PATCH 113/126] Update alerts for Go syntax for {{}} variables from grafana --- pgdb02-cirrus/templates/alerts.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alerts.yaml index 1bcd531..64518c3 100644 --- a/pgdb02-cirrus/templates/alerts.yaml +++ b/pgdb02-cirrus/templates/alerts.yaml @@ -4,7 +4,7 @@ metadata: name: cnpg-replication-alerts namespace: rda labels: - team: gdex-app-team + team: rda-app-team spec: groups: - name: cnpg.replication @@ -16,11 +16,11 @@ spec: for: 15m labels: severity: warning - team: gdex-app-team + team: rda-app-team namespace: rda annotations: - summary: "CNPG replication lag high on {{ $labels.pod }}" - 
description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}" + summary: "CNPG replication lag high on {{ `{{` }} $labels.pod {{ `}}` }}" + description: "Replication lag is {{ `{{` }} $value {{ `}}` }} WAL segments behind on {{ `{{` }} $labels.pod {{ `}}` }} in cluster {{ `{{` }} $labels.cluster {{ `}}` }}" - alert: CNPGReplicationBroken expr: | @@ -28,11 +28,11 @@ spec: for: 5m labels: severity: critical - team: gdex-app-team + team: rda-app-team namespace: rda annotations: - summary: "CNPG replication broken for {{ $labels.cluster }}" - description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken." + summary: "CNPG replication broken for {{ `{{` }} $labels.cluster {{ `}}` }}" + description: "Cluster {{ `{{` }} $labels.cluster {{ `}}` }} has no streaming replicas. Replication may be broken." - alert: CNPGClusterNotHealthy expr: | @@ -40,8 +40,8 @@ spec: for: 5m labels: severity: critical - team: gdex-app-team + team: rda-app-team namespace: rda annotations: - summary: "CNPG cluster {{ $labels.cluster }} is not healthy" - description: "The CNPG exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file + summary: "CNPG cluster {{ `{{` }} $labels.cluster {{ `}}` }} is not healthy" + description: "The CNPG exporter for cluster {{ `{{` }} $labels.cluster {{ `}}` }} is down, indicating cluster health issues" \ No newline at end of file From ca99fcb64111a9fe7401e0b8e3eca0b23cda4158 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Tue, 10 Feb 2026 13:36:23 -0700 Subject: [PATCH 114/126] switch team name to gdex --- pgdb02-cirrus/templates/alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alerts.yaml index 64518c3..8f8197f 100644 --- a/pgdb02-cirrus/templates/alerts.yaml +++ b/pgdb02-cirrus/templates/alerts.yaml @@ -4,7 +4,7 @@ 
metadata: name: cnpg-replication-alerts namespace: rda labels: - team: rda-app-team + team: gdex-app-team spec: groups: - name: cnpg.replication @@ -16,7 +16,7 @@ spec: for: 15m labels: severity: warning - team: rda-app-team + team: gdex-app-team namespace: rda annotations: summary: "CNPG replication lag high on {{ `{{` }} $labels.pod {{ `}}` }}" @@ -28,7 +28,7 @@ spec: for: 5m labels: severity: critical - team: rda-app-team + team: gdex-app-team namespace: rda annotations: summary: "CNPG replication broken for {{ `{{` }} $labels.cluster {{ `}}` }}" @@ -40,7 +40,7 @@ spec: for: 5m labels: severity: critical - team: rda-app-team + team: gdex-app-team namespace: rda annotations: summary: "CNPG cluster {{ `{{` }} $labels.cluster {{ `}}` }} is not healthy" From 40b3d856c60d9035d175ddaadc35b97c5f080331 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 11:54:58 -0700 Subject: [PATCH 115/126] Update alert configuration --- pgdb02-cirrus/templates/alert-email.yaml | 27 ++++++++++++++ pgdb02-cirrus/templates/alert-rule.yaml | 47 ++++++++++++++++++++++++ pgdb02-cirrus/templates/alerts.yaml | 47 ------------------------ 3 files changed, 74 insertions(+), 47 deletions(-) create mode 100644 pgdb02-cirrus/templates/alert-email.yaml create mode 100644 pgdb02-cirrus/templates/alert-rule.yaml delete mode 100644 pgdb02-cirrus/templates/alerts.yaml diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml new file mode 100644 index 0000000..465e4f8 --- /dev/null +++ b/pgdb02-cirrus/templates/alert-email.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: gdex-app-team + namespace: rda + labels: + alertmanagerConfig: gdex + namespace: rda +spec: + route: + receiver: gdex-app-team + groupBy: + - alertname + groupWait: 10s + groupInterval: 1m + repeatInterval: 60m + matchers: + - name: namespace + value: rda + matchType: "=" + + receivers: + - name: gdex-app-team + 
emailConfigs: + - to: decs-info@ucar.edu + from: alertmanager@k8s.ucar.edu + smarthost: vdir.ucar.edu:25 \ No newline at end of file diff --git a/pgdb02-cirrus/templates/alert-rule.yaml b/pgdb02-cirrus/templates/alert-rule.yaml new file mode 100644 index 0000000..bc0254e --- /dev/null +++ b/pgdb02-cirrus/templates/alert-rule.yaml @@ -0,0 +1,47 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: gdex-pg-replication-alerts + namespace: rda + labels: + team: gdex-app-team +spec: + groups: + - name: pg.replication + interval: 60s + rules: + - alert: PGReplicationLagHigh + expr: | + cnpg_pg_replication_lag{namespace="rda"} > 100 + for: 15m + labels: + severity: warning + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB replication lag high on {{ $labels.pod }}" + description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}" + + - alert: PGReplicationBroken + expr: | + cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB replication broken for {{ $labels.cluster }}" + description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken." 
+ + - alert: PGClusterNotHealthy + expr: | + cnpg_collector_up{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB cluster {{ $labels.cluster }} is not healthy" + description: "The Postgres exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file diff --git a/pgdb02-cirrus/templates/alerts.yaml b/pgdb02-cirrus/templates/alerts.yaml deleted file mode 100644 index 8f8197f..0000000 --- a/pgdb02-cirrus/templates/alerts.yaml +++ /dev/null @@ -1,47 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: cnpg-replication-alerts - namespace: rda - labels: - team: gdex-app-team -spec: - groups: - - name: cnpg.replication - interval: 60s - rules: - - alert: CNPGReplicationLagHigh - expr: | - cnpg_pg_replication_lag{namespace="rda"} > 100 - for: 15m - labels: - severity: warning - team: gdex-app-team - namespace: rda - annotations: - summary: "CNPG replication lag high on {{ `{{` }} $labels.pod {{ `}}` }}" - description: "Replication lag is {{ `{{` }} $value {{ `}}` }} WAL segments behind on {{ `{{` }} $labels.pod {{ `}}` }} in cluster {{ `{{` }} $labels.cluster {{ `}}` }}" - - - alert: CNPGReplicationBroken - expr: | - cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 - for: 5m - labels: - severity: critical - team: gdex-app-team - namespace: rda - annotations: - summary: "CNPG replication broken for {{ `{{` }} $labels.cluster {{ `}}` }}" - description: "Cluster {{ `{{` }} $labels.cluster {{ `}}` }} has no streaming replicas. Replication may be broken." 
- - - alert: CNPGClusterNotHealthy - expr: | - cnpg_collector_up{namespace="rda"} == 0 - for: 5m - labels: - severity: critical - team: gdex-app-team - namespace: rda - annotations: - summary: "CNPG cluster {{ `{{` }} $labels.cluster {{ `}}` }} is not healthy" - description: "The CNPG exporter for cluster {{ `{{` }} $labels.cluster {{ `}}` }} is down, indicating cluster health issues" \ No newline at end of file From 21491af5d115df96745673bbffd69f1337d4c80a Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 11:57:01 -0700 Subject: [PATCH 116/126] Escape variables for Golang --- pgdb02-cirrus/templates/alert-rule.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgdb02-cirrus/templates/alert-rule.yaml b/pgdb02-cirrus/templates/alert-rule.yaml index bc0254e..0efdfb6 100644 --- a/pgdb02-cirrus/templates/alert-rule.yaml +++ b/pgdb02-cirrus/templates/alert-rule.yaml @@ -19,8 +19,8 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "PostgresDB replication lag high on {{ $labels.pod }}" - description: "Replication lag is {{ $value }} WAL segments behind on {{ $labels.pod }} in cluster {{ $labels.cluster }}" + summary: "PostgresDB replication lag high on {{`{{ $labels.pod }}`}}" + description: "Replication lag is {{`{{ $value }}`}} WAL segments behind on {{`{{ $labels.pod }}`}} in cluster {{`{{ $labels.cluster }}`}}" - alert: PGReplicationBroken expr: | @@ -31,8 +31,8 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "PostgresDB replication broken for {{ $labels.cluster }}" - description: "Cluster {{ $labels.cluster }} has no streaming replicas. Replication may be broken." + summary: "PostgresDB replication broken for {{`{{ $labels.cluster }}`}}" + description: "Cluster {{`{{ $labels.cluster }}`}} has no streaming replicas. Replication may be broken." 
- alert: PGClusterNotHealthy expr: | @@ -43,5 +43,5 @@ spec: team: gdex-app-team namespace: rda annotations: - summary: "PostgresDB cluster {{ $labels.cluster }} is not healthy" - description: "The Postgres exporter for cluster {{ $labels.cluster }} is down, indicating cluster health issues" \ No newline at end of file + summary: "PostgresDB cluster {{`{{ $labels.cluster }}`}} is not healthy" + description: "The Postgres exporter for cluster {{`{{ $labels.cluster }}`}} is down, indicating cluster health issues" \ No newline at end of file From a085df73c0781bb26ed00a6a82683f38f5f04b10 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 13:33:00 -0700 Subject: [PATCH 117/126] Add release label --- pgdb02-cirrus/templates/alert-rule.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pgdb02-cirrus/templates/alert-rule.yaml b/pgdb02-cirrus/templates/alert-rule.yaml index 0efdfb6..3b7769d 100644 --- a/pgdb02-cirrus/templates/alert-rule.yaml +++ b/pgdb02-cirrus/templates/alert-rule.yaml @@ -5,6 +5,7 @@ metadata: namespace: rda labels: team: gdex-app-team + release: kube-prometheus-stack spec: groups: - name: pg.replication From c4974f2f04deaca1c4579bb5f9e1aa8061e5fd82 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 13:50:18 -0700 Subject: [PATCH 118/126] Fix replication broken, this is inside a cluster, so 1 replica will trigger this alert --- pgdb02-cirrus/templates/alert-rule.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb02-cirrus/templates/alert-rule.yaml b/pgdb02-cirrus/templates/alert-rule.yaml index 3b7769d..5357e24 100644 --- a/pgdb02-cirrus/templates/alert-rule.yaml +++ b/pgdb02-cirrus/templates/alert-rule.yaml @@ -25,14 +25,14 @@ spec: - alert: PGReplicationBroken expr: | - cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 + cnpg_pg_replication_streaming_replicas{namespace="rda", cluster="pgdb03"} == 0 for: 5m labels: severity: critical team: gdex-app-team namespace: rda annotations: - 
summary: "PostgresDB replication broken for {{`{{ $labels.cluster }}`}}" + summary: "PostgresDB cluster replication broken for {{`{{ $labels.cluster }}`}}" description: "Cluster {{`{{ $labels.cluster }}`}} has no streaming replicas. Replication may be broken." - alert: PGClusterNotHealthy From 45a89b2018f7283678b126e1d7c362724712409b Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Thu, 12 Feb 2026 13:51:28 -0700 Subject: [PATCH 119/126] Add alerts to nwc1 --- pgdb01-cirrus/templates/alert-email.yaml | 27 +++++++++++++ pgdb01-cirrus/templates/alert-rule.yaml | 48 ++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 pgdb01-cirrus/templates/alert-email.yaml create mode 100644 pgdb01-cirrus/templates/alert-rule.yaml diff --git a/pgdb01-cirrus/templates/alert-email.yaml b/pgdb01-cirrus/templates/alert-email.yaml new file mode 100644 index 0000000..465e4f8 --- /dev/null +++ b/pgdb01-cirrus/templates/alert-email.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: gdex-app-team + namespace: rda + labels: + alertmanagerConfig: gdex + namespace: rda +spec: + route: + receiver: gdex-app-team + groupBy: + - alertname + groupWait: 10s + groupInterval: 1m + repeatInterval: 60m + matchers: + - name: namespace + value: rda + matchType: "=" + + receivers: + - name: gdex-app-team + emailConfigs: + - to: decs-info@ucar.edu + from: alertmanager@k8s.ucar.edu + smarthost: vdir.ucar.edu:25 \ No newline at end of file diff --git a/pgdb01-cirrus/templates/alert-rule.yaml b/pgdb01-cirrus/templates/alert-rule.yaml new file mode 100644 index 0000000..15ed310 --- /dev/null +++ b/pgdb01-cirrus/templates/alert-rule.yaml @@ -0,0 +1,48 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: gdex-pg-replication-alerts + namespace: rda + labels: + team: gdex-app-team + release: kube-prometheus-stack +spec: + groups: + - name: pg.replication + interval: 60s + rules: + - alert: 
PGReplicationLagHigh + expr: | + cnpg_pg_replication_lag{namespace="rda"} > 100 + for: 15m + labels: + severity: warning + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB replication lag high on {{`{{ $labels.pod }}`}}" + description: "Replication lag is {{`{{ $value }}`}} WAL segments behind on {{`{{ $labels.pod }}`}} in cluster {{`{{ $labels.cluster }}`}}" + + - alert: PGReplicationBroken + expr: | + cnpg_pg_replication_streaming_replicas{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB cluster replication broken for {{`{{ $labels.cluster }}`}}" + description: "Cluster {{`{{ $labels.cluster }}`}} has no streaming replicas. Replication may be broken." + + - alert: PGClusterNotHealthy + expr: | + cnpg_collector_up{namespace="rda"} == 0 + for: 5m + labels: + severity: critical + team: gdex-app-team + namespace: rda + annotations: + summary: "PostgresDB cluster {{`{{ $labels.cluster }}`}} is not healthy" + description: "The Postgres exporter for cluster {{`{{ $labels.cluster }}`}} is down, indicating cluster health issues" \ No newline at end of file From 6fe61e77b47431309660843db04544c5ad28220c Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 12:05:02 -0700 Subject: [PATCH 120/126] fix some if statements, add s3 backup to pgdb03 for testing --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- pgdb01-cirrus/values.yaml | 1 + pgdb02-cirrus/templates/postgres_cluster.yaml | 3 --- pgdb03-cirrus/templates/postgres_cluster.yaml | 25 +++++++++++++++++++ pgdb03-cirrus/values.yaml | 12 ++++++++- 5 files changed, 38 insertions(+), 5 deletions(-) diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index e6aa4c6..17f143d 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -37,7 +37,7 @@ spec: name: {{ 
.Values.db.backups.s3.secretName }} key: secret_key {{- end }} - {{- end }} + {{- end }} # Add TLS certificates for encrypted communication certificates: diff --git a/pgdb01-cirrus/values.yaml b/pgdb01-cirrus/values.yaml index f1e8418..0d63122 100644 --- a/pgdb01-cirrus/values.yaml +++ b/pgdb01-cirrus/values.yaml @@ -13,5 +13,6 @@ db: memory: 128Gi backups: + enabled: false s3: enabled: false diff --git a/pgdb02-cirrus/templates/postgres_cluster.yaml b/pgdb02-cirrus/templates/postgres_cluster.yaml index 7dd76b9..8cac490 100644 --- a/pgdb02-cirrus/templates/postgres_cluster.yaml +++ b/pgdb02-cirrus/templates/postgres_cluster.yaml @@ -16,14 +16,11 @@ spec: cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} - {{- if .Values.db.backups }} backup: {{- if .Values.db.backups.volumeSnapshot.enabled }} volumeSnapshot: className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} {{- end }} - {{- end }} - # Keep 8 weekly backups retentionPolicy: "8w" diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 276aec4..28984ae 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -15,6 +15,31 @@ spec: cpu: {{ .Values.db.resource.limits.cpu | quote }} memory: {{ .Values.db.resource.limits.memory }} + {{- if .Values.db.backups }} + backup: + target: "prefer-standby" + {{- if .Values.db.backups.volumeSnapshot }} + volumeSnapshot: + className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} + {{- end }} + {{- if .Values.db.backups.s3.enabled }} + barmanObjectStore: + wal: + compression: bzip2 + data: + compression: bzip2 + destinationPath: {{ .Values.db.backups.s3.destinationPath }} + endpointURL: {{ .Values.db.backups.s3.endpointURL }} + s3Credentials: + accessKeyId: + name: {{ .Values.db.backups.s3.secretName }} + key: access_key + secretAccessKey: + name: {{ .Values.db.backups.s3.secretName }} + key: 
secret_key + {{- end }} + {{- end }} + # Add TLS certificates for encrypted communication certificates: serverTLSSecret: {{ .Values.db.name }}-server-cert diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 24c9d90..45a4512 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -10,4 +10,14 @@ db: resource: limits: cpu: '16' - memory: 128Gi \ No newline at end of file + memory: 128Gi + backups: + enabled: false + volumeSnapshot: + enabled: false + s3: + enabled: true + secretName: pgdb02-backup-s3-creds + endpointURL: https://boreas.hpc.ucar.edu:6443 + destinationPath: s3://gdex + secretPath: gdex/boreas \ No newline at end of file From 0b1ec3790b26c54dd2efd50bacf140a0a91bb94f Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 12:49:58 -0700 Subject: [PATCH 121/126] add s3 eso --- .../templates/backups_external_secret.yaml | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 pgdb03-cirrus/templates/backups_external_secret.yaml diff --git a/pgdb03-cirrus/templates/backups_external_secret.yaml b/pgdb03-cirrus/templates/backups_external_secret.yaml new file mode 100644 index 0000000..74dc9f8 --- /dev/null +++ b/pgdb03-cirrus/templates/backups_external_secret.yaml @@ -0,0 +1,25 @@ +{{- if .Values.db.backups.s3.enabled }} +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: {{ .Values.db.name }}-backup-s3-creds + namespace: {{ .Release.Namespace }} +spec: + data: + - remoteRef: + key: {{ .Values.db.backups.s3.secretPath }} + property: access_key + secretKey: access_key + - remoteRef: + key: {{ .Values.db.backups.s3.secretPath }} + property: secret_key + secretKey: secret_key + refreshInterval: 1h + secretStoreRef: + kind: SecretStore + name: rda-ro + target: + creationPolicy: Owner + deletionPolicy: Retain + name: {{ .Values.db.backups.s3.secretName }} +{{- end }} \ No newline at end of file From 052ec80b109d3dd068ccb38f577287757a2f3425 Mon Sep 17 00:00:00 2001 From: Nick 
Cote Date: Fri, 6 Mar 2026 12:51:48 -0700 Subject: [PATCH 122/126] change secret name --- pgdb03-cirrus/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdb03-cirrus/values.yaml b/pgdb03-cirrus/values.yaml index 45a4512..c6c3b13 100644 --- a/pgdb03-cirrus/values.yaml +++ b/pgdb03-cirrus/values.yaml @@ -17,7 +17,7 @@ db: enabled: false s3: enabled: true - secretName: pgdb02-backup-s3-creds + secretName: pgdb03-backup-s3-creds endpointURL: https://boreas.hpc.ucar.edu:6443 destinationPath: s3://gdex secretPath: gdex/boreas \ No newline at end of file From 66233518f74f77c76ef061aa920c179ec9f29bb7 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 13:15:21 -0700 Subject: [PATCH 123/126] add null route for info alerts to decrease tds noise --- pgdb01-cirrus/templates/alert-email.yaml | 9 ++++++++- pgdb02-cirrus/templates/alert-email.yaml | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pgdb01-cirrus/templates/alert-email.yaml b/pgdb01-cirrus/templates/alert-email.yaml index 465e4f8..51150d3 100644 --- a/pgdb01-cirrus/templates/alert-email.yaml +++ b/pgdb01-cirrus/templates/alert-email.yaml @@ -18,10 +18,17 @@ spec: - name: namespace value: rda matchType: "=" + routes: + - receiver: "null" + matchers: + - name: alertname + value: InfoInhibitor + matchType: "=" receivers: - name: gdex-app-team emailConfigs: - to: decs-info@ucar.edu from: alertmanager@k8s.ucar.edu - smarthost: vdir.ucar.edu:25 \ No newline at end of file + smarthost: vdir.ucar.edu:25 + - name: "null" \ No newline at end of file diff --git a/pgdb02-cirrus/templates/alert-email.yaml b/pgdb02-cirrus/templates/alert-email.yaml index 465e4f8..cd5933c 100644 --- a/pgdb02-cirrus/templates/alert-email.yaml +++ b/pgdb02-cirrus/templates/alert-email.yaml @@ -18,10 +18,17 @@ spec: - name: namespace value: rda matchType: "=" + routes: + - receiver: "null" + matchers: + - name: alertname + value: InfoInhibitor + matchType: "=" receivers: - name: 
gdex-app-team emailConfigs: - to: decs-info@ucar.edu from: alertmanager@k8s.ucar.edu - smarthost: vdir.ucar.edu:25 \ No newline at end of file + smarthost: vdir.ucar.edu:25 + - name: "null" \ No newline at end of file From 355f702d02723061450ef78541fc5aebd47fcd56 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 13:45:13 -0700 Subject: [PATCH 124/126] add 03 s3 backup retention period and a schedule backup --- pgdb01-cirrus/templates/postgres_cluster.yaml | 2 +- pgdb03-cirrus/templates/postgres_cluster.yaml | 2 ++ pgdb03-cirrus/templates/s3_backup.yaml | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 pgdb03-cirrus/templates/s3_backup.yaml diff --git a/pgdb01-cirrus/templates/postgres_cluster.yaml b/pgdb01-cirrus/templates/postgres_cluster.yaml index 17f143d..de0defb 100644 --- a/pgdb01-cirrus/templates/postgres_cluster.yaml +++ b/pgdb01-cirrus/templates/postgres_cluster.yaml @@ -16,7 +16,7 @@ spec: memory: {{ .Values.db.resource.limits.memory }} {{- if .Values.db.backups }} - backup: + backup: {{- if .Values.db.backups.volumeSnapshot }} volumeSnapshot: className: {{ .Values.db.backups.volumeSnapshot.snapshotClassName }} diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index 28984ae..eaef4c7 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -24,6 +24,8 @@ spec: {{- end }} {{- if .Values.db.backups.s3.enabled }} barmanObjectStore: + # Keep 8 weekly backups + retentionPolicy: "8w" wal: compression: bzip2 data: diff --git a/pgdb03-cirrus/templates/s3_backup.yaml b/pgdb03-cirrus/templates/s3_backup.yaml new file mode 100644 index 0000000..02f7b25 --- /dev/null +++ b/pgdb03-cirrus/templates/s3_backup.yaml @@ -0,0 +1,11 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: {{ .Values.db.name }}-backup + namespace: {{ .Release.Namespace }} +spec: + schedule: "0 0 2 * * *" # daily 
at 2am + backupOwnerReference: self + cluster: + name: {{ .Values.db.name }} + method: barmanObjectStore \ No newline at end of file From 2f3a7ec3acb937fd78dce20cd47872694f9069ba Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 13:47:05 -0700 Subject: [PATCH 125/126] put the retention policy in the right place --- pgdb03-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index eaef4c7..a62521d 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -17,6 +17,8 @@ spec: {{- if .Values.db.backups }} backup: + # Keep 8 weekly backups + retentionPolicy: "8w" target: "prefer-standby" {{- if .Values.db.backups.volumeSnapshot }} volumeSnapshot: @@ -24,8 +26,6 @@ spec: {{- end }} {{- if .Values.db.backups.s3.enabled }} barmanObjectStore: - # Keep 8 weekly backups - retentionPolicy: "8w" wal: compression: bzip2 data: From 90fd92a4276e6a7baeb81f0a4a6f204b1360f1c1 Mon Sep 17 00:00:00 2001 From: Nick Cote Date: Fri, 6 Mar 2026 15:59:09 -0700 Subject: [PATCH 126/126] switch to gzip to stream compression instead of compressing locally. --- pgdb03-cirrus/templates/postgres_cluster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgdb03-cirrus/templates/postgres_cluster.yaml b/pgdb03-cirrus/templates/postgres_cluster.yaml index a62521d..56de9d8 100644 --- a/pgdb03-cirrus/templates/postgres_cluster.yaml +++ b/pgdb03-cirrus/templates/postgres_cluster.yaml @@ -27,9 +27,9 @@ spec: {{- if .Values.db.backups.s3.enabled }} barmanObjectStore: wal: - compression: bzip2 + compression: gzip data: - compression: bzip2 + compression: gzip destinationPath: {{ .Values.db.backups.s3.destinationPath }} endpointURL: {{ .Values.db.backups.s3.endpointURL }} s3Credentials: