From 034710a7f73d0c6383ed0eba717d31e72eb3271f Mon Sep 17 00:00:00 2001 From: titigmr Date: Thu, 25 Jun 2026 07:42:24 +0200 Subject: [PATCH] feat: :tada: Add configurable CRD health checks --- README.md | 15 +++ .../properties/CrdHealthCheckProperties.java | 101 ++++++++++++++++ .../api/services/impl/HelmAppsService.java | 10 +- .../impl/HelmReleaseHealthResolver.java | 112 +++++++++++++----- .../src/main/resources/application.properties | 13 ++ 5 files changed, 216 insertions(+), 35 deletions(-) create mode 100644 onyxia-api/src/main/java/fr/insee/onyxia/api/configuration/properties/CrdHealthCheckProperties.java diff --git a/README.md b/README.md index c1986e54..a756ab51 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,22 @@ Configurable properties : | `event.webhook.includes` | | List of events types to send the webhook for (empty = all events). e.g `service.uninstall,service.install` | | `event.webhook.excludes` | | List of events types to ignore for the webhook. e.g `service.uninstall,service.install` | +### Health checks configuration +Deployments, StatefulSets and DaemonSets are monitored natively. Additional CRDs can be configured as a list: + +| Key | Default | Description | +| --- | ------- | ----------- | +| `health.custom-crd.checks[n].group` | | API group of the CRD. e.g. `postgresql.cnpg.io` | +| `health.custom-crd.checks[n].version` | | API version of the CRD. e.g. `v1` | +| `health.custom-crd.checks[n].plural` | | Plural resource name used in the API URL. e.g. `clusters` | +| `health.custom-crd.checks[n].kind` | | Kind name as it appears in the manifest. e.g. `Cluster` | +| `health.custom-crd.checks[n].strategy` | `FIELDS` | `FIELDS`: read two integer fields from `status`. `CONDITION`: check `status.conditions` | +| `health.custom-crd.checks[n].desired-field` | | Status field for the desired count (`FIELDS` strategy). e.g. `instances` | +| `health.custom-crd.checks[n].ready-field` | | Status field for the ready count (`FIELDS` strategy). e.g. 
`readyInstances` | +| `health.custom-crd.checks[n].condition-type` | `Ready` | Condition type to look for in `status.conditions` (`CONDITION` strategy) | + ### Admin configuration: + :warning: This section should be considered pre-alpha and may be subject to major changes and revamps :warning: | Key | Default | Description | diff --git a/onyxia-api/src/main/java/fr/insee/onyxia/api/configuration/properties/CrdHealthCheckProperties.java b/onyxia-api/src/main/java/fr/insee/onyxia/api/configuration/properties/CrdHealthCheckProperties.java new file mode 100644 index 00000000..0f1b72a8 --- /dev/null +++ b/onyxia-api/src/main/java/fr/insee/onyxia/api/configuration/properties/CrdHealthCheckProperties.java @@ -0,0 +1,101 @@ +package fr.insee.onyxia.api.configuration.properties; + +import java.util.ArrayList; +import java.util.List; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ConfigurationProperties(prefix = "health.custom-crd") +public class CrdHealthCheckProperties { + + private List checks = new ArrayList<>(); + + public List getChecks() { + return checks; + } + + public void setChecks(List checks) { + this.checks = checks; + } + + public enum Strategy { + FIELDS, + CONDITION + } + + public static class CrdHealthCheck { + private String group; + private String version; + private String plural; + private String kind; + private Strategy strategy = Strategy.FIELDS; + private String desiredField; + private String readyField; + private String conditionType = "Ready"; + + public String getGroup() { + return group; + } + + public void setGroup(String group) { + this.group = group; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public String getPlural() { + return plural; + } + + public void setPlural(String plural) { + this.plural = plural; + } + + public String getKind() { + 
return kind; + } + + public void setKind(String kind) { + this.kind = kind; + } + + public Strategy getStrategy() { + return strategy; + } + + public void setStrategy(Strategy strategy) { + this.strategy = strategy; + } + + public String getDesiredField() { + return desiredField; + } + + public void setDesiredField(String desiredField) { + this.desiredField = desiredField; + } + + public String getReadyField() { + return readyField; + } + + public void setReadyField(String readyField) { + this.readyField = readyField; + } + + public String getConditionType() { + return conditionType; + } + + public void setConditionType(String conditionType) { + this.conditionType = conditionType; + } + } +} diff --git a/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmAppsService.java b/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmAppsService.java index 35319101..e6b596f6 100644 --- a/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmAppsService.java +++ b/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmAppsService.java @@ -1,6 +1,5 @@ package fr.insee.onyxia.api.services.impl; -import static fr.insee.onyxia.api.services.impl.HelmReleaseHealthResolver.checkHelmReleaseHealth; import static fr.insee.onyxia.api.services.impl.ServiceUrlResolver.getServiceUrls; import com.fasterxml.jackson.databind.JsonNode; @@ -63,6 +62,8 @@ public class HelmAppsService implements AppsService { private final HelmClientProvider helmClientProvider; + private final HelmReleaseHealthResolver helmReleaseHealthResolver; + final OnyxiaEventPublisher onyxiaEventPublisher; public static final String ONYXIA_SECRET_PREFIX = "sh.onyxia.release.v1."; @@ -77,12 +78,14 @@ public HelmAppsService( KubernetesService kubernetesService, KubernetesClientProvider kubernetesClientProvider, HelmClientProvider helmClientProvider, - OnyxiaEventPublisher onyxiaEventPublisher) { + OnyxiaEventPublisher onyxiaEventPublisher, + HelmReleaseHealthResolver 
helmReleaseHealthResolver) { this.mapperHelm = mapperHelm; this.kubernetesService = kubernetesService; this.kubernetesClientProvider = kubernetesClientProvider; this.helmClientProvider = helmClientProvider; this.onyxiaEventPublisher = onyxiaEventPublisher; + this.helmReleaseHealthResolver = helmReleaseHealthResolver; } private HelmConfiguration getHelmConfiguration(Region region, User user) { @@ -540,7 +543,8 @@ private Service getServiceFromRelease( try { List controllers = - checkHelmReleaseHealth(release.getNamespace(), manifest, client); + helmReleaseHealthResolver.checkHelmReleaseHealth( + release.getNamespace(), manifest, client); service.setControllers(controllers); } catch (Exception e) { LOGGER.warn( diff --git a/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmReleaseHealthResolver.java b/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmReleaseHealthResolver.java index 985682f2..a5645ab5 100644 --- a/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmReleaseHealthResolver.java +++ b/onyxia-api/src/main/java/fr/insee/onyxia/api/services/impl/HelmReleaseHealthResolver.java @@ -1,5 +1,7 @@ package fr.insee.onyxia.api.services.impl; +import fr.insee.onyxia.api.configuration.properties.CrdHealthCheckProperties; +import fr.insee.onyxia.api.configuration.properties.CrdHealthCheckProperties.CrdHealthCheck; import fr.insee.onyxia.model.service.HealthCheckResult; import io.fabric8.kubernetes.api.model.GenericKubernetesResource; import io.fabric8.kubernetes.api.model.HasMetadata; @@ -18,22 +20,21 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Component; -public final class HelmReleaseHealthResolver { +@Component +public class HelmReleaseHealthResolver { private static final Logger LOGGER = LoggerFactory.getLogger(HelmReleaseHealthResolver.class); - private static final ResourceDefinitionContext CNPG_CLUSTER = - new ResourceDefinitionContext.Builder() - 
.withGroup("postgresql.cnpg.io") - .withVersion("v1") - .withPlural("clusters") - .withNamespaced(true) - .build(); + private final CrdHealthCheckProperties crdHealthCheckProperties; - static List checkHelmReleaseHealth( + public HelmReleaseHealthResolver(CrdHealthCheckProperties crdHealthCheckProperties) { + this.crdHealthCheckProperties = crdHealthCheckProperties; + } + + List checkHelmReleaseHealth( String namespace, String manifest, KubernetesClient kubernetesClient) { - // Identify the Helm release secret List resources; try (InputStream inputStream = new ByteArrayInputStream(manifest.getBytes(StandardCharsets.UTF_8))) { @@ -45,13 +46,12 @@ static List checkHelmReleaseHealth( return checkHealth(namespace, resources, kubernetesClient); } - private static List checkHealth( + private List checkHealth( String namespace, List resources, KubernetesClient kubernetesClient) { List results = new ArrayList<>(); for (HasMetadata resource : resources) { String name = resource.getMetadata().getName(); String kind = resource.getKind(); - String apiVersion = resource.getApiVersion(); HealthCheckResult result = new HealthCheckResult(); result.setName(name); result.setKind(kind); @@ -109,27 +109,11 @@ private static List checkHealth( details.setReady(daemonSet.getStatus().getNumberReady()); } break; - case "Cluster": - if (!"postgresql.cnpg.io/v1".equals(apiVersion)) continue; - GenericKubernetesResource raw = - kubernetesClient - .genericKubernetesResources(CNPG_CLUSTER) - .inNamespace(namespace) - .withName(name) - .get(); - if (raw.getAdditionalProperties().get("status") instanceof Map) { - Map status = - Collections.unmodifiableMap( - (Map) - raw.getAdditionalProperties().get("status")); - details.setDesired( - Integer.parseInt(status.get("instances").toString())); - details.setReady( - Integer.parseInt(status.get("readyInstances").toString())); - } - break; default: - continue; + CrdHealthCheck check = findConfiguredCrd(kind, resource.getApiVersion()); + if (check == 
null) continue; + resolveCustomCrdHealth(namespace, name, check, kubernetesClient, details); + break; } } catch (Exception e) { LOGGER.warn( @@ -144,4 +128,68 @@ private static List checkHealth( } return results; } + + private CrdHealthCheck findConfiguredCrd(String kind, String apiVersion) { + return crdHealthCheckProperties.getChecks().stream() + .filter(c -> c.getKind().equals(kind)) + .filter( + c -> + apiVersion == null + || apiVersion.equals(c.getGroup() + "/" + c.getVersion())) + .findFirst() + .orElse(null); + } + + private void resolveCustomCrdHealth( + String namespace, + String name, + CrdHealthCheck check, + KubernetesClient kubernetesClient, + HealthCheckResult.HealthDetails details) { + ResourceDefinitionContext ctx = + new ResourceDefinitionContext.Builder() + .withGroup(check.getGroup()) + .withVersion(check.getVersion()) + .withPlural(check.getPlural()) + .withNamespaced(true) + .build(); + + GenericKubernetesResource raw = + kubernetesClient + .genericKubernetesResources(ctx) + .inNamespace(namespace) + .withName(name) + .get(); + + if (raw == null || !(raw.getAdditionalProperties().get("status") instanceof Map)) { + return; + } + + Map status = + Collections.unmodifiableMap( + (Map) raw.getAdditionalProperties().get("status")); + + switch (check.getStrategy()) { + case FIELDS: + details.setDesired(Integer.parseInt(status.get(check.getDesiredField()).toString())); + details.setReady(Integer.parseInt(status.get(check.getReadyField()).toString())); + break; + case CONDITION: + details.setDesired(1); + details.setReady(isConditionTrue(status, check.getConditionType()) ? 
1 : 0); + break; + } + } + + @SuppressWarnings("unchecked") + private boolean isConditionTrue(Map status, String conditionType) { + Object conditionsObj = status.get("conditions"); + if (!(conditionsObj instanceof List)) { + return false; + } + return ((List>) conditionsObj) + .stream() + .filter(c -> conditionType.equals(c.get("type"))) + .anyMatch(c -> "True".equals(c.get("status"))); + } } diff --git a/onyxia-api/src/main/resources/application.properties b/onyxia-api/src/main/resources/application.properties index c4ad58b9..2be3b8a8 100644 --- a/onyxia-api/src/main/resources/application.properties +++ b/onyxia-api/src/main/resources/application.properties @@ -51,3 +51,16 @@ spring.mvc.async.request-timeout=600000 #logging.structured.format.console=gelf # Enable admin endpoints admin.enabled=false +# Custom CRD health checks. Deployment, StatefulSet and DaemonSet are handled natively; each entry below adds a check for one additional CRD kind. +# kind: Kubernetes kind name as it appears in the manifest +# group/version/plural: CRD API coordinates (plural is the URL path segment, e.g. clusters) +# strategy: FIELDS (read two integer fields from status; this is the default) or CONDITION (check status.conditions) +# desired-field/ready-field: status field names for FIELDS strategy. e.g. instances/readyInstances +# condition-type: condition type to look for in status.conditions for CONDITION strategy (default: Ready) +health.custom-crd.checks[0].group=postgresql.cnpg.io +health.custom-crd.checks[0].version=v1 +health.custom-crd.checks[0].plural=clusters +health.custom-crd.checks[0].kind=Cluster +health.custom-crd.checks[0].strategy=FIELDS +health.custom-crd.checks[0].desired-field=instances +health.custom-crd.checks[0].ready-field=readyInstances