diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f5a4ab9..3544692 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -1,40 +1,22 @@ { "$schema": "https://anthropic.com/claude-code/marketplace.schema.json", "name": "cozystack-claude-plugins", - "description": "Claude Code plugins for the Cozystack ecosystem — deployment skills, development workflows, and infrastructure tools", + "description": "Claude Code plugins for the Cozystack ecosystem — cozystack:* platform skills (wizard, talos-bootstrap, talos-reset, ubuntu-bootstrap, cluster-install, debug, cluster-upgrade, package-deploy, package-bump, external-app-create) plus linstor:* DRBD/LINSTOR operations", "owner": { "name": "Cozystack", "url": "https://github.com/cozystack" }, "plugins": [ { - "name": "cozy-deploy", - "description": "Deploy a Cozystack package to a dev cluster via make + cozyhr — handles fresh install and dev-loop iteration with ExternalArtifact support", - "source": "./skills/cozy-deploy", + "name": "cozystack", + "description": "Cozystack platform skills bundle — wizard (entry-point orchestrator that interviews + dispatches the chain), talos-bootstrap (Talos node prep via talm with maintenance-mode probe, cert-SAN NAT guardrail, multidoc machine-config, and opt-in boot-method picker), talos-reset (cloud-provider terminate+relaunch helper for unrecoverable Talos nodes — OCI/AWS/GCP/Hetzner; preserves disks + VNICs + NSGs), ubuntu-bootstrap (Ubuntu/Debian k3s bootstrap wrapping ansible-cozystack), cluster-install (Cozystack on a ready cluster — node-readiness, ZFS pool provisioning, extractedprism HA proxy, all-HRs-Ready + storage-pools-registered gate), debug (investigate stuck installs — classify operator-error/config-drift/upstream-bug/not-supported, apply fixes or workarounds, draft upstream issues on approval), cluster-upgrade (release-notes-driven v1.x patch/minor upgrade), package-deploy (dev-loop deploy with 
ExternalArtifact support), package-bump (single-package version bump with changelog adaptation), external-app-create (scaffold a new external-apps package). Invoked as cozystack:wizard, cozystack:talos-bootstrap, cozystack:talos-reset, cozystack:ubuntu-bootstrap, cozystack:cluster-install, cozystack:debug, cozystack:cluster-upgrade, cozystack:package-deploy, cozystack:package-bump, cozystack:external-app-create.", + "source": "./plugins/cozystack", "category": "infrastructure" }, { - "name": "cozy-external-app", - "description": "Scaffold a new Cozystack external app package — generates chart skeleton, ApplicationDefinition, and handles dependency integration (e.g. Immich → Postgres) via managed CNPG clusters or external secret references", - "source": "./skills/cozy-external-app", - "category": "infrastructure" - }, - { - "name": "drbd-recovery", - "description": "Diagnose and recover DRBD/LINSTOR storage issues in Kubernetes clusters — handles StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors, and other common failure modes", - "source": "./skills/drbd-recovery", - "category": "infrastructure" - }, - { - "name": "cozystack-upgrade", - "description": "Guided upgrade of a running Cozystack v1.x cluster to a newer v1.x patch or minor version — release-notes analysis, prechecks, stop gates, helm upgrade, targeted post-upgrade verification, known failure recovery", - "source": "./skills/cozystack-upgrade", - "category": "infrastructure" - }, - { - "name": "cozy-bump", - "description": "Bump a cozystack monorepo package — reads upstream changelog, adapts to breaking changes, regenerates schema, optionally deploys to a dev cluster", - "source": "./skills/cozy-bump", + "name": "linstor", + "description": "LINSTOR / DRBD operations bundle for Kubernetes — invoked as linstor:recover for diagnosing and recovering broken DRBD resources (StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors). 
Useful on any Kubernetes cluster that runs piraeus-operator / LINSTOR.", + "source": "./plugins/linstor", "category": "infrastructure" } ] diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 0000000..dd5ea02 --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,34 @@ +name: validate + +on: + push: + branches: [main] + pull_request: + +jobs: + jq: + name: jq lint manifests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Validate marketplace.json + run: jq . .claude-plugin/marketplace.json > /dev/null + - name: Validate every plugin.json + run: | + set -euo pipefail + fail=0 + while IFS= read -r f; do + if ! jq . "$f" > /dev/null; then + echo "FAIL: $f is not valid JSON" >&2 + fail=1 + fi + done < <(find plugins -name plugin.json -type f) + exit "$fail" + + cross-refs: + name: cross-reference validator + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: tools/check-refs.sh + run: bash tools/check-refs.sh diff --git a/CLAUDE.md b/CLAUDE.md index 2ea793e..69ea9f6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,22 +4,67 @@ This file provides guidance to Claude Code when working with this repository. ## What This Is -Cozystack Claude Plugins (CCP) — an external marketplace repository for Claude Code -plugins for the Cozystack ecosystem. +Cozystack Claude Plugins (CCP) — an external marketplace repository for Claude Code plugins for the Cozystack ecosystem. 
## Repository Structure -- **`agents/`** — Agent definitions -- **`skills/`** — Skill definitions (SKILL.md with frontmatter) -- **`mcp/`** — MCP server definitions -- **`hooks/`** — Hook plugins +```text +plugins/ + <plugin>/ + .claude-plugin/plugin.json # plugin metadata + skills/ + <skill>/ + SKILL.md # skill spec (frontmatter + workflow) + references/ # supporting docs the skill reads +.claude-plugin/ + marketplace.json # registry; lists every plugin + description +tools/ + check-refs.sh # cross-reference validator (CI gate) +.github/workflows/ + validate.yml # PR validation: jq + check-refs.sh +README.md # operator-facing skill catalogue +CLAUDE.md # this file — contributor guidance +``` -Registry: `.claude-plugin/marketplace.json` +Two plugins ship today: + +- `plugins/cozystack/` — platform bundle (10 skills: wizard, talos-bootstrap, talos-reset, ubuntu-bootstrap, cluster-install, debug, cluster-upgrade, package-deploy, package-bump, external-app-create). +- `plugins/linstor/` — storage-recovery (1 skill: recover). + +Multi-skill plugin shape: every plugin has one `.claude-plugin/plugin.json` at its root, and one directory per skill under `skills/`. Skills are addressed by Claude Code as `/<plugin>:<skill>` (e.g. `/cozystack:wizard`). + +## Adding a New Skill to an Existing Plugin + +1. `mkdir -p plugins/<plugin>/skills/<skill>/references` (the `references/` subdirectory is optional). +2. Write `plugins/<plugin>/skills/<skill>/SKILL.md` with YAML frontmatter (`name:`, `description:`, optional `argument-hint:`). +3. Update `plugins/<plugin>/.claude-plugin/plugin.json` `description` to mention the new skill (the cross-reference checker in `tools/check-refs.sh` enforces this). +4. Update `.claude-plugin/marketplace.json` `plugins[].description` for the parent plugin — list every skill the plugin ships. +5. Update `README.md` skills table. +6. `bash tools/check-refs.sh` locally before commit. ## Adding a New Plugin -1. Create directory under the appropriate type -2. Add `.claude-plugin/plugin.json` with metadata -3. 
Add content files (SKILL.md, agent .md, .mcp.json, or hooks.json) -4. Register in `.claude-plugin/marketplace.json` -5. Update README.md +1. `mkdir -p plugins/<plugin>/{.claude-plugin,skills}`. +2. Write `plugins/<plugin>/.claude-plugin/plugin.json` with `name`, `version`, `description` (mentioning every skill). +3. Add one or more skills per the section above. +4. Register the plugin in `.claude-plugin/marketplace.json` `plugins[]` with `name`, `description`, `source: ./plugins/<plugin>`, `category`. +5. Update `README.md`. +6. `bash tools/check-refs.sh`. + +## Cross-reference discipline + +The skills lean heavily on each other (`cozystack:wizard` dispatches `cozystack:talos-bootstrap` etc.), and skill bodies reference sibling skills and `references/<name>.md` documents. Stale paths and renamed skill identifiers cause silent breakage — operators type a skill name that no longer exists, or follow a link to a file that's been moved. `tools/check-refs.sh` walks the plugin tree and validates: + +- Every `references/<name>.md` mentioned in a SKILL.md exists on disk. +- Every `cozystack:<skill>` / `linstor:<skill>` mention resolves to an actual directory under `plugins/<plugin>/skills/`. +- Every plugin's `description` in `marketplace.json` and in its own `plugin.json` mentions every skill present under `plugins/<plugin>/skills/`. + +Run before any commit that touches skill names, references, or descriptions. + +## Versioning + +`plugin.json` `version` follows semver. Bump: + +- patch — text-only fixes (typos, doc cleanup). +- minor — new skills, new features, schema additions. +- major — breaking changes for installed users (renames, removals, layout shifts). 
diff --git a/README.md b/README.md index 58e8b36..be9bf3a 100644 --- a/README.md +++ b/README.md @@ -14,20 +14,80 @@ Add the marketplace: Install a plugin: ```text -/plugin install @cozystack-claude-plugins +/plugin install cozystack@cozystack-claude-plugins +/plugin install linstor@cozystack-claude-plugins ``` ## Plugins -### Skills +### cozystack -| Plugin | Description | +Platform skills bundle. One install gives you ten skills, invoked as `/cozystack:<skill>`. Start with `/cozystack:wizard` — it asks Talos / Ubuntu / Existing and picks the chain. + +| Skill | Description | +| --- | --- | +| **/cozystack:wizard** | Entry point. Opens with a free-form "tell me about your setup and goal" question so context comes through in the operator's own words and pre-fills the structured questions. Then asks Talos / Ubuntu / Existing, builds a chain, dispatches downstream skills via a cluster config directory the operator picks. Artifacts (inventory, kubeconfig, state, platform-package YAML) all live there — operator manages git on their own. Every skill in the chain matches the operator's natural language. | +| **/cozystack:talos-bootstrap** | Bootstrap Talos nodes via talm. Default: probe nodes for maintenance mode (Talos-1.12-aware `get disks --insecure`); if ready, write per-node multidoc machine-config stubs with VIP-link static IPv4, run NAT-provider cert-SAN guardrail (auto-populates `values.yaml.certSANs` with public IPs before first `talm apply`), then `talm apply` + `talosctl bootstrap` + kubeconfig fetch + cozystack-tuned shape verification with auto-upgrade to tuned image when nodes booted from base Talos. Opt-in boot-method picker (OCI Custom Image / boot-to-talos / ISO / PXE) only when nodes aren't yet imaged. | +| **/cozystack:talos-reset** | Cloud-provider recovery helper when Talos nodes are unrecoverable from inside the cluster (cert-SAN trap, broken machine-config, lost talosconfig). 
Wraps `oci` / `aws` / `gcloud` / `hcloud` to terminate + relaunch from the cozystack-tuned image while preserving block volumes, secondary VNICs, NSG memberships. Sequential per-node to maintain etcd quorum. Hands off to `cozystack:talos-bootstrap` for re-bootstrap. | +| **/cozystack:ubuntu-bootstrap** | Bootstrap Ubuntu / Debian nodes by wrapping `cozystack/ansible-cozystack/examples/ubuntu/` — OS prep, drbd-dkms for Secure Boot, ZFS + KubeVirt modules, k3s install with cozystack-compatible flags, kubeconfig retrieval. Stops before Cozystack itself. | +| **/cozystack:cluster-install** | Cozystack on a ready cluster — node-readiness validation, variant picker, interactive values, per-node ZFS pool provisioning, extractedprism for kube-apiserver HA, cozy-installer chart, Platform Package apply, root Tenant ingress patch, wait until every HelmRelease is Ready, NOTES summary. | +| **/cozystack:debug** | Investigate a stuck or broken Cozystack install. Gathers symptoms, classifies (operator error / config drift / upstream bug / not-yet-supported), applies fixes or workarounds, drafts upstream issues with diagnostic bundle on approval. Never opens PRs or files silently. Auto-dispatched by the wizard when any chain step fails. | +| **/cozystack:cluster-upgrade** | Guided upgrade of a running Cozystack v1.x cluster — release-notes analysis, prechecks, stop gates, helm upgrade, targeted post-upgrade verification, known-failure recovery. | +| **/cozystack:package-deploy** | Deploy a single Cozystack package to a dev cluster via make + cozyhr — handles fresh install and dev-loop iteration with ExternalArtifact support. | +| **/cozystack:package-bump** | Bump a single package inside the cozystack monorepo — reads upstream changelog, adapts to breaking changes, regenerates schema, optionally deploys to a dev cluster. 
| +| **/cozystack:external-app-create** | Scaffold a new Cozystack external app package with dependency integration (managed CNPG Postgres, external secret references). | + +Chains the wizard builds: + +| Target | Chain | +| ----------- | ----------- | +| Bare-metal Talos | `talos-bootstrap` → `cluster-install` | +| Bare-metal Ubuntu / Debian | `ubuntu-bootstrap` → `cluster-install` | +| Existing Kubernetes (self-managed or managed) | `cluster-install` | +| Existing Cozystack | refuse → `cozystack:cluster-upgrade` | + +### linstor + +LINSTOR / DRBD operations bundle. Useful on any Kubernetes cluster that runs piraeus-operator / LINSTOR, not just on Cozystack. + +| Skill | Description | | --- | --- | -| **cozy-deploy** | Deploy a Cozystack package to a dev cluster via make + cozyhr | -| **cozy-external-app** | Scaffold a new Cozystack external app package with dependency integration | -| **drbd-recovery** | Diagnose and recover DRBD/LINSTOR storage issues in Kubernetes clusters | -| **cozystack-upgrade** | Guided upgrade of a running Cozystack v1.x cluster to a newer v1.x patch or minor version | -| **cozy-bump** | Bump a cozystack monorepo package — reads upstream changelog, adapts to breaking changes, regenerates schema, optionally deploys to a dev cluster | +| **/linstor:recover** | Diagnose and recover broken DRBD resources — handles StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors, and other common failure modes. | + +## Third-party dependencies + +`cozystack:cluster-install` default-installs [extractedprism](https://github.com/lexfrei/extractedprism) on `generic` variant clusters (k3s / kubeadm / RKE2). extractedprism is a per-node TCP load balancer that gives generic Linux Kubernetes the same `localhost:7445` kube-apiserver shape Talos has built-in (KubePrism), so Cilium and KubeOVN can dial a stable local address regardless of which control-plane node is up. 
+ +Project metadata: + +- Source: `https://github.com/lexfrei/extractedprism` (BSD-3-Clause). +- Helm chart: `oci://ghcr.io/lexfrei/charts/extractedprism`. +- Maintained independently by a Cozystack contributor; reviewed and approved by the Cozystack platform team for use as the generic-variant HA proxy. + +Operators can opt out with `--no-extractedprism` and supply their own `--api-host=` (external LB, VIP, or single CP IP with the SPOF caveat) — see `cozystack:cluster-install` Phase 4. Talos and hosted variants do not need extractedprism. + +## Repository layout + +```text +plugins/ + cozystack/ # platform bundle (10 skills) + .claude-plugin/plugin.json + skills/ + wizard/ # entry point: interview + chain dispatcher + talos-bootstrap/ # Talos node prep + talos-reset/ # cloud-provider terminate+relaunch helper + ubuntu-bootstrap/ # Ubuntu/Debian via ansible-cozystack wrapper + cluster-install/ # Cozystack on a ready cluster + debug/ # investigate + classify + workaround + issue draft + cluster-upgrade/ # v1.x patch/minor upgrade + package-deploy/ # dev-loop deploy of a single package + package-bump/ # bump a monorepo package + external-app-create/ # scaffold a new external-apps package + linstor/ # storage bundle (1 skill) + .claude-plugin/plugin.json + skills/ + recover/ +``` ## License diff --git a/plugins/cozystack/.claude-plugin/plugin.json b/plugins/cozystack/.claude-plugin/plugin.json new file mode 100644 index 0000000..a105b3e --- /dev/null +++ b/plugins/cozystack/.claude-plugin/plugin.json @@ -0,0 +1,9 @@ +{ + "name": "cozystack", + "version": "1.14.1", + "description": "Cozystack platform skills bundle. 
Start with cozystack:wizard — it begins with a free-form 'tell me about your setup and goal' question, parses hints, then asks Talos / Ubuntu / Existing, builds a chain, and dispatches downstream skills via a cluster config directory the operator picks (every artifact lives there: inventory.yml, kubeconfig, .state.yaml, cozystack-platform-package.yaml — operator manages git on their own; optional sops opt-in encrypts secret files in-tree). Skills, invoked as cozystack:: wizard (orchestrator + 3-route dispatcher + Phase 4.5 active research + auto-dispatches debug on any failed_at), talos-bootstrap (Talos node prep via talm — Talos-1.12-aware maintenance probe, NAT-provider cert-SAN guardrail before first talm apply, multidoc machine-config with per-node VIP-link IPv4 stubs, etcd bootstrap, kubeconfig fetch, cozystack-tuned shape verification with Phase 11.5 auto-upgrade), talos-reset (cloud-provider terminate+relaunch helper for OCI/AWS/GCP/Hetzner when nodes are unrecoverable from inside; preserves block volumes + secondary VNICs + NSG memberships), ubuntu-bootstrap (wraps cozystack/ansible-cozystack — OS prep + k3s install in one go), cluster-install (Cozystack on a ready cluster — node-readiness, ZFS pool provisioning via privileged DaemonSet on Talos with hostNetwork, extractedprism for kube-apiserver HA, OCI-tag-normalized cozy-installer chart, Platform Package, inline tenants/root ingress patch + LINSTOR pool registration during watch loop with combined HRs-Ready + pools-registered gate, Phase 8.6 default StorageClasses for v1.3.x, Phase 9.1 end-to-end reachability probe), debug (investigate a stuck or broken install — gathers symptoms, classifies operator error / config drift / upstream bug / not-yet-supported, applies fixes or workarounds, drafts upstream issues on approval; never opens PRs or files silently), cluster-upgrade (v1.x patch/minor upgrade with release-notes analysis), package-deploy (dev-loop deploy of a single package with ExternalArtifact 
support), package-bump (single-package version bump with changelog adaptation), external-app-create (scaffold a new external-apps package). All skills match the operator's natural language detected from conversation context — code identifiers, commands, file paths, and GitHub-public text stay canonical. All skills follow the same gate-and-confirm discipline: read-only lookups run freely; any mutation needs explicit per-step approval.", + "author": { + "name": "Cozystack", + "url": "https://github.com/cozystack" + } +} diff --git a/plugins/cozystack/skills/cluster-install/SKILL.md b/plugins/cozystack/skills/cluster-install/SKILL.md new file mode 100644 index 0000000..6ad9a62 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/SKILL.md @@ -0,0 +1,1059 @@ +--- +name: cluster-install +description: Use when installing Cozystack on an existing Kubernetes cluster (kubeadm / k3s / RKE2 / managed). Discovers cluster facts, validates node readiness via kubectl debug, recommends an installer + platform variant, gathers values interactively, creates the ZFS pool on each storage node through kubectl debug (cozystack standardises on ZFS for LINSTOR — LVM / LVM-thin paths are not supported), installs extractedprism (per-node kube-apiserver HA proxy, generic variant default), installs the cozy-installer chart, applies the Platform Package, patches the root Tenant for ingress (breaks the OIDC chicken-and-egg), waits until every HelmRelease is Ready, prints a NOTES-style access summary, and offers an issue-template handoff to cozystack/* on fatal failure. Not for Kubernetes bootstrap and not for upgrades — see `cozystack:cluster-upgrade` for that. Talos node-prep is out of scope — `cozystack:cluster-install` refuses if Talos nodes are missing cozystack-tuned extensions and points at `cozystack:talos-bootstrap`. 
+argument-hint: "[--config-dir=] [--context=] [--installer-version=] [--installer-variant=] [--platform-variant=] [--no-extractedprism] [--api-host=] [--allow-prod] [--dry-run]" +--- + +# cozystack:cluster-install + +Work in reasoning mode. Use the phrasing `cozystack:cluster-install` (not "the skill") in user-facing messages. Announce phase transitions: `cozystack:cluster-install Phase N — `. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (or read from `/.state.yaml` `operator_language` when the wizard chain is in progress). Code identifiers, commands, file paths, and any text destined for GitHub stay canonical. + +Entry point note: `cozystack:wizard` is the recommended way to reach this skill. The wizard runs node-bootstrap (`talos-bootstrap` / `ubuntu-bootstrap`) before handing off here when needed. This skill is also callable directly when the operator already has a ready cluster with prepared nodes. + +Source of truth, in priority order: + +1. Live cluster state — `kubectl --context $CTX ...`. +2. Upstream chart values: `~/git/github.com/cozystack/cozystack/packages/core/{installer,platform}/values.yaml` and the variant overlays. +3. Install guide for the major version matching `--installer-version`: `https://cozystack.io/docs/v/install/kubernetes/generic/` (or `talos/`, `air-gapped`, etc.). +4. Ansible reference: `~/git/github.com/cozystack/ansible-cozystack/roles/cozystack/{defaults,tasks}/main.yml`. + +Never guess versions, IPs, label values, or CIDRs — read them from the cluster or ask. + +## Core principles + +- Match the operator's natural language. Read from `/.state.yaml` `operator_language` (set by `cozystack:wizard` Phase 0) or detect from prior messages when invoked directly. Use it in prompts, AskUserQuestion options, summaries, and gates. 
Code identifiers, commands, file paths, and GitHub-public text stay in their canonical form. +- One valid path → just do it. After the operator approved the consolidated plan in Phase 5 (STOP GATE 2), the skill runs helm install + Platform Package apply + Tenant patch + HR wait + verification back-to-back without re-prompting. Approval gates remain only for (a) multi-option questions in Phase 4 (storage layout, network values, publishing host), (b) destructive operations (Phase 5.5 per-node storage provisioning — each `zpool create` is a real choice with data implications), (c) STOP GATEs 1/2/3 themselves. No "are you ready to continue?" between phases that have one valid outcome. +- Front-load the interview. **Every question the skill might ask in any phase is collected upfront in Phase 4**. That includes per-node storage devices (Phase 5.5), per-node provisioning approvals (Phase 5.5), extractedprism opt-out (Phase 5.6), Tenant ingress patch confirmation (Phase 8, inline), and HR stuck-state recovery preferences (Phase 8). Phase 2 cluster lookup + Phase 0 `intent_hints` from `/.state.yaml` must fill every slot they can before any question fires. Phase 4 presents **one consolidated summary** with every slot filled (defaults marked) and quick-edit affordances. Phases 5.5 onward execute against the collected answers — no re-prompts mid-flow except destructive STOP GATEs that have to ask by their nature (e.g. `zpool destroy` on an existing pool the operator chose to wipe). +- Layer-pure operator output. The skill never says "returning control to wizard", "the wizard will dispatch next", or any other orchestration commentary in the **operator-facing** summary. Whoever invoked the skill (a human running `/cozystack:cluster-install` directly, or the wizard's dispatch loop) figures out what's next on their own. Internal SKILL.md references to `cozystack:wizard` are fine for documentation; `wizard` does not appear in any text shown to the operator. 
+- The user is click-ops. Show what you're about to do, in plain language, before doing it. Wait for `Continue`. +- Three non-negotiable gates: **cluster fits**, **values gathered**, **all HRs Ready**. None can be skipped. +- Read-only lookups (`get`, `describe`, logs, ephemeral debug pods) need no approval. Mutating actions (`apply`, `patch`, `helm install`, `label`) need explicit user approval each time. +- Errors are not the user's fault. When a check fails, explain what's wrong, why it matters, and offer concrete next steps (with commands). Don't leave the user staring at a Helm stack trace. +- On a fatal failure that looks upstream: stop, assemble a diagnostic bundle (`references/issue-templates.md`), draft an issue body, hand it to the user. + +## Phase 1 — Parse arguments and pin the context + +Parse flags. Defaults: + +- `--context` → use the one you parse from `kubectl config current-context` **but show it to the user and ask `Use this context or pick another?` via AskUserQuestion**. Do not silently inherit current-context. +- `--installer-version` → if absent, ask. Suggest the latest known stable from the cozystack repo's tags (read `git -C ~/git/github.com/cozystack/cozystack tag --list 'v*' --sort=-v:refname | head -5`). +- `--installer-variant` and `--platform-variant` → recommended by Phase 2 unless preset. + +After the user picks a context, run and show: + +```bash +kubectl --context $CTX cluster-info | head -1 +kubectl --context $CTX get nodes --output wide +``` + +Detect prod-like signals (`prod`, `production`, `prd` in the context name). If present, refuse without `--allow-prod` and explain. The user must re-invoke with the flag. 
+ +Check for an existing install: + +```bash +kubectl --context $CTX get namespace cozy-system --ignore-not-found --output name +kubectl --context $CTX get package cozystack.cozystack-platform --ignore-not-found --output name +``` + +If `cozy-system` exists **and** holds resources (any pods, any HRs, any operator deployment), refuse — point the user at `cozystack:cluster-upgrade`. The only "exists but empty" case to handle is a stale namespace with no helm metadata (see Phase 6 namespace-adoption). + +## Phase 2 — Cluster lookup and variant recommendation + +All read-only. Don't ask the user anything yet — gather facts first, then present. + +Collect: + +- k8s version (`kubectl version --output json`). +- Per-node `nodeInfo.osImage`, `kernelVersion`, `kubeletVersion`, `architecture`, `kubeProxyVersion`, plus all labels and `status.addresses`. +- Cluster domain — from coredns Corefile (see `references/node-checks.md`). Must be `cozy.local`. +- Pod CIDR — `kubectl get nodes -o jsonpath='{.items[*].spec.podCIDR}'`. +- Service CIDR — try `kubectl --namespace kube-system get pod --selector component=kube-apiserver -o yaml`; fall back to `kubectl get svc kubernetes -o jsonpath='{.spec.clusterIP}'` and infer. +- CNI pods in `kube-system`. +- Conflicting workloads (ingress-nginx, cert-manager, metrics-server, kube-proxy, traefik, servicelb). +- Existing storage classes and the default. +- Existing LoadBalancer-class services. +- Control-plane node labels: `node-role.kubernetes.io/control-plane` key **and value** per node (see `references/node-checks.md`). +- **Storage discovery** (per node, via `kubectl debug node` in read-only mode — `lsblk`, `pvs`, `vgs`, `lvs --all`, `zpool list`, `command -v pvcreate vgcreate lvcreate lvs`; on Talos additionally `lsmod` for `drbd`/`zfs`/`openvswitch` and `/etc/lvm/lvm.conf` global_filter). 
Records: unmounted devices ≥ 50 GiB, existing VGs / LVs / zpools (will be reused as Phase 4 defaults), LVM-tools availability, Talos extension readiness. Skip on `isp-hosted`. + +Apply variant recommendation logic from `references/variants.md`. Print: + +```text +cluster lookup + context: $CTX + api server: $API_URL + distribution: $DIST (kubelet $K8S_VERSION) + nodes: $N (cp: $CPN, worker: $WN, mixed-arch: $MIXED) + cluster domain: $DOMAIN (required: cozy.local — $MATCH) + pod CIDR: $POD_CIDR + service CIDR: $SVC_CIDR (apiserver flag: $SVC_FLAG_MATCH) + cni: $CNI_PODS ($CNI_VERDICT) + conflicts: $CONFLICT_LIST or "none" + cp label values: $CP_LABEL_VALUES (per node) + storage: + per-node disks: $UNMOUNTED_DISKS_PER_NODE + existing pools: $EXISTING_VGS_OR_ZPOOLS or "none" + lvm tools: $LVM_TOOLS_VERDICT + talos modules: $TALOS_DRBD_ZFS_OVS_VERDICT (talos only) + +recommended: + installer variant: $INSTALLER_VARIANT + platform variant: $PLATFORM_VARIANT + why: $REASON +``` + +Hard refusals (don't move on, surface clearly): + +- Cluster domain is not `cozy.local` → refuse, link to bootstrap docs for the user's distribution. +- CNI conflict (non-hosted) → refuse, list pods to remove. +- Conflicting workloads (ingress-nginx, cert-manager, etc., non-hosted) → refuse, list workloads. +- kubectl can't reach the cluster → refuse, suggest `kubectl auth can-i '*' '*' --all-namespaces`. + +## Phase 3 — Node readiness validation + +Skip on `isp-hosted` (no node access required). + +For every other case, drive `kubectl debug node` per `references/node-checks.md`. For homogeneous clusters, check one sample CP node and one sample worker; warn that the rest are assumed identical. For heterogeneous, check every node. + +**Talos-specific early-exit.** If Phase 2 detected Talos on any node, run the four Talos checks from `references/node-checks.md` (lsmod drbd / lsmod zfs / lsmod openvswitch / `/etc/lvm/lvm.conf` cozystack global_filter). 
If any of them fail on any Talos node, **STOP GATE 1 fails immediately** with this message: + +```text +Talos nodes detected without cozystack-tuned extensions. + +Missing on $NODE: $MISSING_LIST (drbd / zfs / openvswitch / lvm.conf filter) + +`cozystack:cluster-install` installs Cozystack on top of an already-prepared +cluster — Talos node prep is out of scope. + +Fix paths: + - Run `/cozystack:talos-bootstrap` (separate skill; prepares Talos nodes). + - Or reinstall the affected nodes from the cozystack-tuned image + `ghcr.io/cozystack/cozystack/talos:vX.Y.Z` (see + https://cozystack.io/docs/v1.3/install/talos/). + +Refusing to proceed — no mutations performed. +``` + +No partial install on a half-tuned Talos cluster. + +For non-Talos and tuned-Talos, aggregate findings into a node readiness matrix: + +```text +node-readiness + node1 (cp): kmod ✓ pkgs ✓ svcs ✓ sysctl ✓ multipath-bl ✓ disks ✓ cp-label ✓ storage-tools ✓ + node2 (cp): kmod ✓ pkgs ✓ svcs ✗ (iscsid inactive) sysctl ✓ multipath-bl ✗ ... + node3 (worker): ... +``` + +For every ✗ surface: the exact check that failed, the value observed vs required, and one concrete fix command. If many nodes share the same gap on Ubuntu / Debian, the recommended fix is to run `cozystack:ubuntu-bootstrap` first (it wraps `cozystack/ansible-cozystack/examples/ubuntu/prepare-ubuntu.yml` which covers every node-prep concern Cozystack has). On Talos, the fix is `cozystack:talos-bootstrap`. For other distributions (RHEL family, SUSE), point operators at `cozystack/ansible-cozystack/examples/{rhel,suse}/prepare-*.yml` directly — out of scope for v1 of `cozystack:wizard`. + +**STOP GATE 1 — Cluster readiness** + +Present: + +```text +gate 1 — cluster readiness + +summary: + +blockers: + - +warnings: + - + +next: + options: Continue / Fix and re-check / Cancel +``` + +If blockers present → only `Cancel` is honoured. If only warnings → user picks. + +## Phase 4 — Consolidated intake + +This is the **one** interview phase. 
**Policy slots are read from `state.cozystack_intake`** (written by `cozystack:wizard` Phase 4 — the chain orchestrator front-loads every operator-decidable value before any skill runs). Discovery slots — the ones that need post-bootstrap probing (actual device paths, KubeOVN label values, real `Node.status.addresses`) — are resolved here against the live cluster. + +Two-tier read pattern, followed by a single approval gate: + +1. **Read `state.cozystack_intake` first.** Every value the wizard collected is the operator's authoritative answer; this skill never re-prompts what's already there. +2. **Discovery-driven fill.** Run Phase 2's lookups and Phase 3's node checks against the policy values. Pre-fill remaining slots from the discovery output: largest unmounted disk per node (matching `cozystack_intake.storage_pref.layout_per_node[]`), real `InternalIP` / `ExternalIP` (validates `cozystack_intake.external_ips.strategy`), KubeOVN MASTER_NODES label state, Talos KubePrism presence. +3. **Render the consolidated summary** with every slot filled and a single `Approve all / Edit / Cancel` gate. AskUserQuestion fires only when (a) `cozystack_intake` is missing — direct invocation without the wizard, (b) discovery contradicts the policy (e.g. `external_ips.strategy: external` but every node's `ExternalIP == InternalIP`), or (c) a destructive STOP GATE has to ask by its nature. + +When `state.cozystack_intake` is absent (operator ran `/cozystack:cluster-install` directly without the wizard), the skill falls back to asking each slot inline — same shape as before the front-load refactor. This keeps direct invocation viable; running through the wizard is the optimised path. 
+ +The shape of the consolidated summary (operator sees this **once**, not 10 times): + +```text +cozystack:cluster-install — collected values + +bundles: system, paas, iaas, naas (default for platform_variant: isp-full-generic on installer_variant: generic) + +storage (ZFS): + cp1 (10.0.0.10): /dev/nvme1n1 → zpool data (single) + cp2 (10.0.0.11): /dev/nvme1n1 → zpool data (single) + cp3 (10.0.0.12): /dev/nvme1n1 → zpool data (single) + linstor pool name: data + +networking: + podCIDR: 10.244.0.0/16 (cozystack default; matches apiserver: ✓) + podGateway: 10.244.0.1 + serviceCIDR: 10.96.0.0/16 (cozystack default; matches apiserver: ✓) + joinCIDR: 100.64.0.0/16 + apiServerHost: 127.0.0.1 (extractedprism, default) + apiServerPort: 7445 + kubeovn MASTER_NODES: "" (Helm lookup, label matches) + +publishing: + external IPs: 10.0.0.50 (from MetalLB pool / operator-supplied) + mode: externalIPs + host: 10-0-0-50.nip.io (nip.io default; ownership gate auto-passes) + apiServerEndpoint: https://api.10-0-0-50.nip.io + exposed: api, dashboard + cert solver: http01 + +operations: + storage provisioning: auto (one approve per node in Phase 5.5) + extractedprism: enabled (default for generic) + tenant ingress patch: enabled (default for system bundle) + +options: + - Approve all — proceed to Phase 5 plan gate + - Edit — name the slot (storage, networking.podCIDR, publishing.host, …) + - Cancel +``` + +Slot legend (every slot the operator may want to edit): + +1. **Bundles** (multiSelect, defaults from variant overlay): + - system (required for `isp-full*`; off for `isp-hosted`) + - paas (databases / applications) + - iaas (Cluster API + VMs) + - naas (Network as a Service) + +2. **Storage** (only when `system` bundle is on — `isp-hosted` skips). Cozystack standardises on **ZFS** for LINSTOR pools; see `references/storage-backends.md`. No backend question — only device selection and pool layout per node. + + a. 
**Per-node disk selection** — for every storage-providing node, show Phase 2's unmounted-device list and ask which device(s) to use. Default is the largest unmounted disk ≥ 50 GiB. If a node has multiple candidates or the operator wants a mirror / RAID-Z, prompt for a vdev layout: + + - `single` — one disk, no redundancy (Recommended for dev / sandbox). + - `mirror` — two disks, two-way mirror (Recommended for prod on 2-disk nodes). + - `raidz` — 3+ disks, one parity disk. + + Refuse and ask again if the node has no qualifying device — operator picks: re-check / cancel / proceed without this node (the latter excludes the node from storage). + + b. **Names**: + - zpool name — default `data`. Same name on every storage node (the LINSTOR storage-pool entry surfaces them under one logical name). + - LINSTOR storage-pool name — default `data`. What `linstor storage-pool list` shows; referenced by every StorageClass `parameters.linstor.csi.linbit.com/storagePool`. + + c. **Summary echo** after collection: + + ```text + storage decision (ZFS) + node1: /dev/nvme1n1 → zpool data (single) + node2: /dev/nvme1n1, /dev/nvme2n1 → zpool data (mirror) + node3: /dev/nvme1n1 → zpool data (single) + linstor pool: data + ``` + + Refuse to proceed if any storage node lacks ZFS tooling (`zpool` / `zfs` binaries) — the Phase 3 storage discovery would have caught it, but re-verify here in case a fix was applied since. RHEL 10 family is not supported on the storage path; see `references/known-failures.md`. + +3. **podCIDR / serviceCIDR / joinCIDR** — show detected as defaults. If user picks Other, validate format (CIDR notation, no overlap with host networks the cluster sees, joinCIDR ≠ podCIDR ≠ serviceCIDR). +4. **podGateway** — auto-derive as the first IP of podCIDR, confirm. +5. **apiServerHost** — the address Cilium / KubeOVN / cozystack-operator dial to reach kube-apiserver. 
The skill picks this automatically by installer variant — **do not ask the operator** unless they passed `--api-host=`. + + - **`talos` variant** — `localhost:7445` (KubePrism, built into Talos machine-config). Operator can't override; Cozystack's `values-isp-full.yaml` overlay hard-codes this for Cilium. + - **`generic` variant — default** — `127.0.0.1:7445` via **extractedprism DaemonSet** (a per-node TCP load balancer for kube-apiserver HA; mirrors what KubePrism does on Talos). The skill installs the chart in Phase 5.6 before the cozy-installer chart. No VIP, no keepalived. Single-CP sandboxes work too — extractedprism just proxies to one endpoint. + - **`generic` variant with `--no-extractedprism`** — operator must supply `--api-host=` (internal IP of a CP, or a VIP / external LB IP they manage themselves). Single point of failure if a single CP IP is given on a multi-CP cluster; the skill warns but does not refuse. + - **`hosted` variant** — the managed provider handles kube-apiserver HA; the skill does not install extractedprism and does not set apiServerHost (the cozy-installer chart's `hosted` variant doesn't need it). + + The plan presentation in Phase 5 always shows which choice landed and how to flip it. +6. **LB / external IPs** — + - Mode: `externalIPs` (recommended for now; deprecated upstream in k8s v1.36) vs `loadBalancer` (Cilium L2/BGP or external cloud LB; needs more wiring). + - Pool: resolved from `state.cozystack_intake.external_ips` (wizard Phase 4). The wizard already asked the operator's strategy — `internal` / `external` / `explicit` — and the reason. Here the skill **validates the strategy against live `Node.status.addresses`** and refuses to silently override: + + ```bash + kubectl --context $CTX get nodes --output json \ + | jq -r '.items[] | {name: .metadata.name, + internal: (.status.addresses[] | select(.type=="InternalIP") | .address), + external: (.status.addresses[]? 
| select(.type=="ExternalIP") | .address // "")}' + ``` + + Cases: + + - `strategy: internal` and every node's `InternalIP` is present → use those. + - `strategy: external` and every node's `ExternalIP` is present and differs from `InternalIP` → use the external set, but **first print the NAT-warning probe**: surface the strategy + reason from intake, and warn that on OCI 1:1 NAT / GCP NAT'd external IPs / AWS EIP the public IP is **not** present on the interface that receives the packet — the kernel only sees the InternalIP. Picking external here on such providers causes Cilium externalIPs BPF to never match, producing `Connection refused` on the dashboard host even when every HR is Ready. If `intent_hints.platform` ∈ {oci, aws-with-eip, gcp-with-nat}, **refuse the external choice** and force `internal` with a one-line justification; the operator can override with `--allow-external-on-nat-provider` if they know better. + - `strategy: explicit` → take `cozystack_intake.external_ips.explicit` verbatim; sanity-check that each IP is reachable from at least one node (`nc -zw1 80` from the first CP via `kubectl debug` if `--check-externals` is set; otherwise informational only). + - `cozystack_intake.external_ips` is missing (direct invocation without the wizard) → ask inline. Default `internal` when InternalIP ≠ ExternalIP on any node. Surface the NAT warning. + + The chosen pool is recorded as `cozystack.publishing.external_ips` and lands in the Platform Package CR `spec.components.platform.values.publishing.externalIPs`. + + Collect these **before** the publishing.host question — the domain gate references the chosen IPs. + +7. **publishing.host (FQDN) + domain ownership gate** — this is the public domain under which every service lives: `dashboard.${HOST}`, `keycloak.${HOST}`, `api.${HOST}`, `grafana.${HOST}`. 
Cozystack creates a wildcard ingress for `*.${HOST}` and asks Let's Encrypt for certificates via the **HTTP-01 solver by default** (`publishing.certificates.solver: http01`). + + This means the domain must be: + + 1. **Owned by the operator** — they need to configure DNS for it. + 2. **Publicly resolvable** — wildcard `*.${HOST}` A-records pointing at the external IPs picked in question #6. + 3. **Reachable on port 80 from the public internet** — Let's Encrypt validators hit `http://<host>/.well-known/acme-challenge/...`. + + Failure modes if any of these is wrong: cert-manager `Order` CRs go pending, certificates never issue, ingress serves a default cert that browsers reject, dashboard / keycloak unreachable. + + Options: + + a) **Custom FQDN** (`cluster.example.com`) — Recommended for real deployments. **Hard gate**: show this confirmation: + + ```text + You picked publishing.host = cluster.example.com + + Cozystack will request Let's Encrypt certificates for *.cluster.example.com + via the HTTP-01 solver. For this to work, you must: + + 1. Own the domain cluster.example.com (or its parent example.com). + 2. Configure a wildcard A-record: *.cluster.example.com → <external IPs> + (the same IPs you picked at the "external IPs" question above). + 3. Make port 80 reachable from the public internet at those IPs. + + If any of this is not true, certificates will never issue, every browser + visit to https://dashboard.cluster.example.com will warn, and the install + verification at Phase 9 will fail the dashboard reachability probe. + + Confirm: I own this domain and DNS will be configured before this install completes. + + options: Yes, I own this domain / No, let me pick a different host / Cancel install + ``` + + Without an explicit yes — do not proceed. + + b) **nip.io playground** (`<dashed-ip>.nip.io`, e.g. `192-0-2-10.nip.io`) — Recommended for sandbox / dev.
Works out of the box: nip.io is a public service that resolves `<ip>.nip.io` to the embedded IP, Let's Encrypt accepts nip.io domains, no DNS configuration needed. Skip the ownership gate for nip.io patterns. + + After collection, run a soft DNS pre-flight (warning only, not a refusal — operator may be configuring DNS in parallel): + + ```bash + probe="precheck-$(TZ=UTC date +%s).${HOST}" + dig +short "$probe" | head -3 + ``` + + On a nip.io host this returns the embedded IP immediately. On a fresh custom FQDN it may return empty until DNS propagates. Surface either way: + + ```text + DNS pre-flight for *.${HOST}: $RESULT + + Expected: at least one of <external IPs> appears in the output. + Got: $DIG_OUTPUT + + - If matches: DNS already configured. Proceed. + - If empty / different: configure DNS now (wildcard A-record) before + Phase 8 reaches dashboard / keycloak HRs (~10 min into wait). + The install will still progress, but will fail those HRs until DNS + resolves. + + options: Continue (DNS will be configured shortly) / Re-check DNS / Cancel + ``` + +8. **Cert-manager solver** — `http01` (default, Recommended — only works with public DNS + port 80) vs `dns01` (works on internal networks, but needs DNS-provider credentials in cert-manager values, out of scope for v1 of `cozystack:cluster-install`). On nip.io always pick http01. If operator insists on dns01, set `publishing.certificates.solver: dns01` and remind them the issuer config has to be applied manually. + +9. **apiServerEndpoint** — default `https://api.<host>:6443`. Explain it goes into client kubeconfigs. + +10. **exposedServices** (multiSelect): `api`, `dashboard`, `vm-exportproxy`, `cdi-uploadproxy`. Default `api,dashboard`. + +11. **KubeOVN `MASTER_NODES`** — branch on Phase 2 finding: + - If CP label value matches variant expectation on at least one node → default empty (let lookup work). + - Otherwise → pre-fill comma-separated INTERNAL-IPs of CP nodes from Phase 2 and explain why the lookup would fail.
+ +After all questions, render the full Package CR to `<config_dir>/cozystack-platform-package.yaml` and show the file path + content. Offer: `Accept` / `Edit <slot>` / `Cancel`. `<config_dir>` comes from `state.config_dir` written by `cozystack:wizard` Phase 1; if invoked directly, ask the operator (default `$PWD`). + +If `state.sops.enabled` is true, the skill runs `sops --encrypt --in-place` on the file after Accept and decrypts it to a tempfile for the Phase 7 `kubectl apply --filename`. The encrypted form is what gets committed; the tempfile is removed immediately after apply. + +## Phase 5 — Plan presentation (STOP GATE 2) + +After Accept of the Phase 4 intake values, build the consolidated plan view. This is the final operator confirmation before the skill runs helm install + Platform Package apply + watch loop: + +```text +cozystack:cluster-install plan + +context: $CTX ($API_URL) +installer release: oci://ghcr.io/cozystack/cozystack/cozy-installer:$INSTALLER_VERSION_OCI (OCI tag = git tag with the v stripped) +installer variant: $INSTALLER_VARIANT +helm release ns: kube-system (chart templates Namespace cozy-system itself) +platform variant: $PLATFORM_VARIANT +bundles: $BUNDLES_CSV + +networking: + podCIDR: $POD_CIDR (matches apiserver: ✓/✗) + podGateway: $POD_GW + serviceCIDR: $SVC_CIDR (matches apiserver: ✓/✗) + joinCIDR: $JOIN_CIDR + kubeovn MASTER_NODES: $MASTER_NODES or "(label lookup)" + apiServerHost: $API_HOST ($API_HOST_SOURCE) + # $API_HOST_SOURCE values: + # "Talos KubePrism" — variant=talos + # "extractedprism (default)" — variant=generic + # "operator override" — --api-host= or --no-extractedprism + # "managed (hosted)" — variant=hosted, not set + +publishing: + host: $HOST (ownership confirmed: ✓ / nip.io) + dns pre-flight: $DNS_PROBE_RESULT (✓ matches external IPs / ⚠ pending / nip.io magic) + apiServerEndpoint: $API_ENDPOINT + exposure mode: $MODE + external IPs: $EXT_IPS + exposed: $EXPOSED_CSV + cert solver: $SOLVER (http01 / dns01) + +storage (ZFS): + per-node: + $NODE1:
$DEVICES1 → zpool $POOL_NAME ($LAYOUT) + $NODE2: $DEVICES2 → zpool $POOL_NAME ($LAYOUT) + linstor pool: $LINSTOR_POOL_NAME + +actions on Continue: + 1. Storage provisioning per node (Phase 5.5; one approval per node) + 2. (generic only, unless --no-extractedprism) install extractedprism DaemonSet for kube-apiserver HA (~1 min) + 3. (if cozy-system namespace exists but unowned) adopt namespace into kube-system/cozy-installer + 4. helm upgrade --install cozy-installer ... --namespace kube-system (~2 min) + 5. wait deploy/cozystack-operator Available; wait CRD packages.cozystack.io Established + 6. kubectl apply --filename /tmp/.../platform-package.yaml + 7. wait root Tenant CR, patch spec.ingress=true (~3 min — required for Phase 8 to ever finish; breaks the OIDC chicken-and-egg) + 8. poll HRs every 30s until all Ready=True (~30–60 min) + 9. print access summary + +options: Continue / Edit values / Cancel +``` + +If `Skip — pool already managed externally` was picked in Phase 4, the storage block is replaced with `storage: skipped (externally managed)` and Phase 5.5 is omitted from the action list. + +## Phase 5.5 — Storage provisioning (STOP GATE per node, or batch when identical) + +Skip entirely if Phase 2 detected an existing zpool with the name Phase 4 collected and the operator chose `reuse` — in that case the skill verifies the existing pool with `zpool status` instead of creating. + +**Batch-all-identical pattern**: when every storage-scope node has the same layout (single device path, same `cozystack_intake.storage_pref.layout_per_node[*]` value, same zpool name, same distribution), the per-node STOP GATE collapses to one approval that covers all nodes. Three identical CPs each picking `/dev/sdb` for `single` layout shouldn't require three identical yes/no rounds. 
Determine identical-batch shape: + +```bash +LAYOUTS=$(yq '[.cozystack_intake.storage_pref.layout_per_node[]] | unique' "$STATE_FILE") +DEVICES_PER_NODE=$(yq '[.cozystack.storage.discovered_devices[][] | length] | unique' "$STATE_FILE") +DISTRIBUTIONS=$(yq '[.inventory.nodes[].distribution // .cluster.distribution] | unique' "$STATE_FILE") + +if [ "$(jq 'length' <<<"$LAYOUTS")" -eq 1 ] && \ + [ "$(jq 'length' <<<"$DEVICES_PER_NODE")" -eq 1 ] && \ + [ "$(jq 'length' <<<"$DISTRIBUTIONS")" -eq 1 ]; then + BATCHABLE=1 +fi +``` + +When `BATCHABLE=1`, surface a single STOP GATE that covers all nodes: + +```text +storage provisioning — $N identical nodes ($DISTRIBUTION) + + layout: $LAYOUT (single device per node, same path on each) + per node: /dev/sdb → zpool $POOL_NAME + + nodes that will be provisioned: + node0 /dev/sdb → zpool $POOL_NAME (single) + node1 /dev/sdb → zpool $POOL_NAME (single) + node2 /dev/sdb → zpool $POOL_NAME (single) + + options: + - Provision all $N (Recommended — identical config across nodes) + - Step through each node — switch to one-STOP-GATE-per-node flow + - Cancel install +``` + +On `Provision all`, run the per-node loop without per-node prompts. On any per-node failure, abort the batch and surface which node failed + the failure mode (do not silently continue with the remaining nodes — partial cluster is harder to debug than no cluster). + +For non-identical configurations (mixed layouts: one node `single`, another `mirror`; or different device paths discovered per node), drop to the per-node STOP GATE flow below — each node gets its own approval because the operator is making a real choice each time. + +For every other node in storage scope, run the per-node loop. The mechanism splits on `cluster.distribution`: + +- **Talos**: cannot use `kubectl debug node --image=alpine:3 -- chroot /host zpool create`. Three reasons it fails: + 1. 
`alpine:3` ships musl libc; the Talos host's `/usr/local/sbin/zpool` is glibc-linked and depends on `/lib64/ld-linux-x86-64.so.2`, which doesn't exist on Talos rootfs (Talos itself is musl-statically-linked; the zfs userspace lives only inside the `ext-zfs-service` namespace). + 2. `chroot /host /bin/sh` — `/bin/sh` does not exist in Talos rootfs. + 3. The Pod Security Admission `baseline` enforced on `default` and `kube-system` rejects the privileged debug Pod the `sysadmin` profile creates. + + Use a privileged DaemonSet bootstrap pattern instead — see `references/storage-backends.md` Talos section for the exact ubuntu+apt+sgdisk+bind-mount commands. Summary: create a `cozy-storage-bootstrap` namespace with PSA label `pod-security.kubernetes.io/enforce=privileged`, run a one-shot privileged pod from `ubuntu:24.04`, `apt-get install -y zfsutils-linux`, bind-mount `/dev/zfs` and the chosen disk's host path, `sgdisk` partition before `zpool create` (Talos has no udev inside the pod, so `zpool create /dev/sdb` cannot wait for partition discovery — partition manually first), then `zpool create -f $POOL_NAME /dev/sdbN`. + +- **Ubuntu / k3s / kubeadm**: the original `kubectl debug node --image=alpine:3 -- chroot /host` path works (host has glibc + `/bin/sh` + zfs userspace pre-installed by `ansible-cozystack`). + +1. **Present** the exact commands per distribution, with `$DEVICE` (or `$DEVICES` for mirror / raidz) and `$POOL_NAME` substituted. Pull the bodies from `references/storage-backends.md`. 
+ + ```text + storage provisioning — node $NODE ($DISTRIBUTION) + + layout: $LAYOUT (single / mirror / raidz) + target: $DEVICES → zpool $POOL_NAME + + commands (Talos path — privileged DaemonSet from ubuntu:24.04): + # one-shot pod in cozy-storage-bootstrap; PSA=privileged + apt-get install -y zfsutils-linux + sgdisk --zap-all $DEVICE && sgdisk --new=1:0:0 --typecode=1:bf01 $DEVICE + partprobe $DEVICE && sleep 1 + zpool create -o ashift=12 $POOL_NAME ${DEVICE}1 + zfs set compression=lz4 $POOL_NAME + zfs set atime=off $POOL_NAME + + commands (Ubuntu / k3s / kubeadm path — kubectl debug chroot /host): + zpool create -o ashift=12 $POOL_NAME $VDEV_SPEC + zfs set compression=lz4 $POOL_NAME + zfs set atime=off $POOL_NAME + + verify (read-only) commands the skill will run afterwards: + zpool status $POOL_NAME + zpool list -H -o name,size,free,health $POOL_NAME + + options: Provision this node / Skip this node / Cancel install + ``` + + `$VDEV_SPEC` resolves per layout: `$DEVICE` for `single`, `mirror $DEVICE1 $DEVICE2` for `mirror`, `raidz $DEVICE1 $DEVICE2 $DEVICE3` for `raidz`. The Talos path is single-device only in v1 of this skill — multi-device layouts on Talos need per-device sgdisk and the pod's bind-mount list grows accordingly; surface as "not yet supported, use Ubuntu route for mirror/raidz on Talos workers". + +2. **On `Provision this node`**: + + - Talos path: + + ```bash + # Ensure the bootstrap namespace exists with the right PSA label + kubectl --context $CTX get ns cozy-storage-bootstrap >/dev/null 2>&1 || \ + kubectl --context $CTX create ns cozy-storage-bootstrap + kubectl --context $CTX label ns cozy-storage-bootstrap \ + pod-security.kubernetes.io/enforce=privileged --overwrite + + # Run the bootstrap pod; image is held in references/storage-backends.md + # so a pinned digest is the single source of truth. 
+ kubectl --context $CTX --namespace cozy-storage-bootstrap run zpool-create-$NODE \ + --image=ubuntu:24.04 --restart=Never --overrides='{...}' \ + --command -- /usr/local/bin/cozy-zpool-bootstrap.sh "$DEVICE" "$POOL_NAME" + kubectl --context $CTX --namespace cozy-storage-bootstrap wait pod/zpool-create-$NODE \ + --for=condition=Ready --timeout=300s + ``` + + The `--overrides` payload mounts `/dev` into the pod, sets `nodeName: $NODE` so it runs where the disk lives, sets `hostPID: true` so `partprobe` is visible to the host kernel, and sets `hostNetwork: true` because Phase 5.5 runs **before** Phase 6 installs cozy-installer (which installs Cilium). Without `hostNetwork: true`, a pod with the default pod-network needs CNI to assign an IP — Cilium isn't up yet, so the pod stays `ContainerCreating`. Using the host network sidesteps the need for CNI entirely; the bootstrap pod doesn't open any listening ports. See `references/storage-backends.md` for the verbatim JSON. + + - Ubuntu / k3s / kubeadm path: + + ```bash + kubectl --context $CTX debug node/$NODE \ + --image=alpine:3 --profile=sysadmin --quiet --stdin=false --tty=false \ + -- chroot /host /bin/sh -c '' + ``` + + Capture stdout/stderr. Print them to the operator verbatim. On Talos, the pod's logs are the source. + +3. **Verify**. Run the verify block from step 1. The zpool must show `health: ONLINE`. If the output does not match (`DEGRADED` on a fresh single-disk pool = device failure; missing pool entirely = `zpool create` silently failed), **mark the node as failed** and surface the diff. + +4. **On any failure** in step 2 or step 3: + + ```text + storage provisioning failed on $NODE + + stage: create | verify + stderr: + + options: Retry / Skip this node / Cancel install (and offer backout commands) + ``` + +5. **On `Skip this node`** — record the decision. The skill excludes this node from the LINSTOR storage-pool registration block inside the Phase 8 watch loop. 
Warn the operator: a single-node-skipped cluster degrades DRBD replica count. + +6. **On `Cancel install`** — show backout commands from `references/storage-backends.md` (`zpool destroy` + `wipefs --all`, plus `kubectl delete ns cozy-storage-bootstrap` on Talos) for every node that was already provisioned in this Phase 5.5 run. The skill does not auto-rollback (operator must own the destructive step). + +7. **Pre-existing-data check** (every distribution). Before `zpool create`, probe the target device for residual LVM / DRBD / LINSTOR thin-pool state from previous installs: + + ```bash + # On the same debug pod / privileged pod: + pvs --noheadings --options vg_name "$DEVICE" 2>/dev/null + dmsetup ls | grep -E "(linstor|drbd|thin)" || true + wipefs --output=label,type "$DEVICE" + ``` + + If anything is reported, **refuse to proceed** without an explicit operator confirmation. A `talosctl reset` does not wipe user disks by default; previous-install state on `/dev/sdb` is a common cause of `zpool create` failing with `EBUSY` or silently producing a DEGRADED pool. Offer the wipe command (`vgchange -an; dmsetup remove --force; dd if=/dev/zero of=$DEVICE bs=1M count=10; sgdisk --zap-all $DEVICE; wipefs --all $DEVICE`) for the operator to approve and execute, then re-run. + +After every storage-scope node is either provisioned or explicitly skipped, persist the per-node mapping to `/.state.yaml` under `cozystack.storage.nodes[]`. The LINSTOR storage-pool registration is **not** declarative on the ZFS path (the CRD has no `zfsPool` slot); the skill runs the registration inside the Phase 8 watch loop as soon as `linstor-controller` reports Ready — see Phase 8 below for the implementation, not a forward-reference. + +## Phase 5.6 — Install extractedprism DaemonSet (generic HA) + +Skip entirely when: + +- installer variant is `talos` — KubePrism is built-in. +- installer variant is `hosted` — provider handles kube-apiserver HA. 
+- operator passed `--no-extractedprism` — skill respects the override and uses `--api-host=` instead. + +For `generic` variant without opt-out, install the chart **before** cozy-installer so the operator's `apiServerHost: 127.0.0.1 + apiServerPort: 7445` already resolves when cozystack-operator starts dialing kube-apiserver: + +```bash +# Endpoints = list of :6443 for every CP in inventory. +# Read from Phase 2 node lookup (filter by node-role.kubernetes.io/control-plane). +CP_IPS=$(kubectl --context $CTX get nodes \ + --selector node-role.kubernetes.io/control-plane \ + --output jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}') + +# Chart contract: endpoints is a comma-separated string scalar +# (values.schema.json type=string). A YAML list is rejected by helm +# schema validation before render. +ENDPOINTS=$(printf '%s\n' "$CP_IPS" | awk 'NF{printf "%s%s:6443", sep, $0; sep=","}') + +# Render the values file under . This is the canonical +# artifact the orchestrator advertises and the sops opt-in encrypts. +cat > "$CONFIG_DIR/extractedprism-values.yaml" < "$tmp" + kubectl --context $CTX apply --filename "$tmp"; rm -f "$tmp" +else + kubectl --context $CTX apply --filename "$CONFIG_DIR/cozystack-platform-package.yaml" +fi +``` + +Confirm the Package was created: + +```bash +kubectl --context $CTX get package cozystack.cozystack-platform --output yaml +``` + +## Phase 8 — Watch HelmReleases until green (with inline root Tenant patch) + +See `references/helmrelease-monitoring.md` for the full polling pattern and stuck-state diagnostics. + +This phase merges what used to be Phase 7.5 (root Tenant ingress patch) into the watch loop. The motivation: on a real install the `tenants.apps.cozystack.io` CRD can take longer to install than the wait deadline of a separate phase — sometimes the CRD doesn't exist until after most HRs are Ready, so a fixed 5-minute wait-for-Tenant expires before the CRD lands. 
Folding the patch into the watch loop makes it event-driven: as soon as the CR appears, the skill patches it and continues monitoring, regardless of where Phase 8 is in its own progression. + +**Why the patch is needed**: cozystack's dashboard ships gatekeeper (oauth2-proxy) which, on startup, does OIDC discovery against the **public FQDN** `https://keycloak.${HOST}/realms/cozy/.well-known/openid-configuration` — not an in-cluster service. Without the root ingress controller running, nothing listens on 443, gatekeeper CrashLoopBackOffs, the `cozy-dashboard/dashboard` HR sits in `Unknown: Running 'install' action with timeout of 10m0s` and then `InstallFailed: context deadline exceeded`, Flux remediates and retries forever. `cozy-fluxcd/flux-plunger` has a hard dependency on `cozy-dashboard/dashboard` and stays `False: dependency is not ready`. The phase would never go green. + +Skip the root-Tenant patch entirely on `isp-hosted` or when the `system` bundle was disabled in Phase 4 — there is no root Tenant CR in those modes. + +Watch loop (per 30 s poll): + +```bash +# 1) Has the root Tenant CR landed? If yes and not yet patched, patch it. 
+if kubectl --context $CTX --namespace tenant-root get tenants.apps.cozystack.io root \ + --output jsonpath='{.metadata.name}' 2>/dev/null | grep -q '^root$'; then + CURRENT=$(kubectl --context $CTX --namespace tenant-root get tenants.apps.cozystack.io root \ + --output jsonpath='{.spec.ingress}') + if [ "$CURRENT" != "true" ]; then + kubectl --context $CTX --namespace tenant-root patch tenants.apps.cozystack.io root \ + --type=merge --patch '{"spec":{"ingress":true}}' + echo "patched tenants/root.spec.ingress=true at $(TZ=UTC date -Iseconds)" + fi +fi + +# 2) Standard HR readiness summary +kubectl --context $CTX get hr --all-namespaces \ + --output jsonpath='{range .items[?(@.status.conditions[?(@.type=="Ready" && @.status!="True")])]}{.metadata.namespace}/{.metadata.name} {end}' +``` + +```text +HelmRelease $NS/$NAME has been Failing for $T minutes. +Last condition: +options: Keep waiting / Capture diagnostics and pause / Abort +``` + +`Capture diagnostics` runs the bundle script from `references/issue-templates.md`. 
+ +**STOP GATE 3 — All HRs Ready AND storage pools registered** + +Don't print success until **both** conditions hold: + +```bash +# Condition 1: no HelmRelease is non-Ready +HR_NOT_READY=$(kubectl --context $CTX get hr --all-namespaces \ + --output jsonpath='{range .items[?(@.status.conditions[?(@.type=="Ready" && @.status!="True")])]}{.metadata.namespace}/{.metadata.name} {end}') + +# Condition 2: storage-pool count matches the storage-node count (skip when no ZFS scope) +EXPECTED_POOLS=$(yq '.cozystack.storage.nodes | length' "$STATE_FILE") +if [ "$EXPECTED_POOLS" -gt 0 ]; then + ACTUAL_POOLS=$(kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list --output-version v1 2>/dev/null \ + | jq --raw-output '[.[] | select(.provider_kind == "ZFS") | .node_name] | unique | length') +else + ACTUAL_POOLS=0 + EXPECTED_POOLS=0 +fi + +# Exit watch loop only when BOTH are satisfied +[ -z "$HR_NOT_READY" ] && [ "$ACTUAL_POOLS" -eq "$EXPECTED_POOLS" ] +``` + +Why both: an HR can report `Ready=True` (from Flux's lifecycle: "install action succeeded") **before** the underlying Deployment has any ready replicas. The LINSTOR-controller HR is the canonical case — Ready=True at the moment the helm release lands, but `kubectl get deploy linstor-controller --output jsonpath='{.status.readyReplicas}'` is still empty for another 30–60 s while the pod starts. Exiting on HR-Ready-only races the inline storage-pool registration block: the watch loop sees "all HRs Ready" and exits before the per-poll registration check has fired against a ready linstor-controller, leaving pools unregistered. Operator then has to register them manually after `cluster-install` already declared success. + +The combined gate avoids that race without re-introducing the post-watch deadlock (paas / monitoring HRs that depend on PVCs would never reach Ready if registration ran *after* all HRs were required to be Ready — see `references/known-failures.md`). 
+ +### LINSTOR storage-pool registration — inline (gated on linstor-controller Ready) + +Why this is folded into the watch loop rather than a post-watch phase: any HR in the `paas` or `monitoring` bundles that requests a PVC stays `Pending` until LINSTOR has a registered storage pool for at least one node. If pool registration runs *after* "all HRs Ready", the watch loop can never exit — Phase 8 deadlocks waiting for HRs whose PVCs deadlock waiting for the pool. This is the same shape as the root-Tenant ingress patch: the right gate is "the producer became Ready", not "everything is Ready". + +The same watch loop documented above is extended with a storage-pool registration block, gated on `linstor-controller` Ready, not on all-HR-Ready: + +```bash +# Per poll, alongside the root Tenant patch check: + +# Gate: linstor-controller Deployment has >= 1 Ready replica +LINSTOR_READY=$(kubectl --context $CTX --namespace cozy-linstor get deploy linstor-controller \ + --output jsonpath='{.status.readyReplicas}' 2>/dev/null) +if [ "${LINSTOR_READY:-0}" -ge 1 ]; then + yq --output-format=json '.cozystack.storage.nodes' "$STATE_FILE" \ + | jq --compact-output '.[]' \ + | while IFS= read -r entry; do + NODE=$(jq --raw-output '.name' <<<"$entry") + ZPOOL=$(jq --raw-output '.zpool' <<<"$entry") + LINPOOL=$(jq --raw-output '.linstor_pool' <<<"$entry") + + # Idempotent: skip if the pool is already registered on this node. + # linstor CLI lives inside the controller pod and uses the in-cluster mTLS + # client certs at /etc/linstor/client/ — no external CLI install needed. 
+ if kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list --node "$NODE" --storage-pool "$LINPOOL" \ + --output-version v1 2>/dev/null | grep -q "$LINPOOL"; then + continue + fi + + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool create zfs "$NODE" "$LINPOOL" "$ZPOOL" \ + || { echo "storage-pool register failed on $NODE"; exit 1; } + done || exit 1 # the loop runs in a pipeline subshell — propagate its failure to the watch loop +fi +``` + +On per-node registration failure, the watch loop aborts Phase 8 and writes `failed_at: "linstor-storage-pool"` with `error_detail` containing the failing node and the `linstor storage-pool create` stderr. Partial registrations are kept (idempotent retry-friendly); operator can re-invoke after fixing the root cause. + +Verification (after all HRs Ready, separately from registration): + +```bash +kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list | grep -iE "zfs|node" +# Expect one ZFS row per storage-providing node with non-zero Capacity. +``` + +## Phase 8.6 — Default StorageClasses (cozystack v1.3.x compatibility) + +Skip on `cluster.cozystack.installer_version` ≥ `1.4.0`. The cozystack `tenants.apps.cozystack.io` CRD in v1.4+ exposes `spec.storageClasses` and the operator creates the StorageClasses based on the tenant declaration. v1.3.x does **not** do this — the cluster reaches "all HRs Ready" with zero StorageClasses, and every stateful tenant workload sits in `Pending: pod has unbound immediate PersistentVolumeClaims` until the operator applies them by hand.
+ +The skill writes two StorageClasses by default for v1.3.x: + +```yaml +# /storageclasses-default.yaml +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: local + annotations: + storageclass.kubernetes.io/is-default-class: "false" +provisioner: linstor.csi.linbit.com +parameters: + linstor.csi.linbit.com/storagePool: "${LINSTOR_POOL_NAME}" + linstor.csi.linbit.com/placementCount: "1" +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: replicated + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: linstor.csi.linbit.com +parameters: + linstor.csi.linbit.com/storagePool: "${LINSTOR_POOL_NAME}" + linstor.csi.linbit.com/placementCount: "3" + linstor.csi.linbit.com/allowRemoteVolumeAccess: "true" +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete +``` + +```bash +kubectl --context $CTX apply --filename "$CONFIG_DIR/storageclasses-default.yaml" +kubectl --context $CTX get storageclass +# Expect (kubectl flags the default class in the NAME column; there is no DEFAULT column): +# NAME PROVISIONER ... +# local linstor.csi.linbit.com ... +# replicated (default) linstor.csi.linbit.com ... +``` + +`replicated` is marked as the default; `local` is a single-replica fallback for system workloads that don't need replication. On clusters with fewer than 3 storage-providing nodes, lower `placementCount` for `replicated` to the number of storage-providing nodes — the skill auto-derives this from `cozystack.storage.nodes[]` count.
+ +## Phase 9 — Post-install verification + +```bash +# --no-headers: otherwise the header row always survives `grep -v` and "all HRs Ready" is never printed +kubectl --context $CTX get hr --all-namespaces --no-headers | grep -v ' True ' || echo "all HRs Ready" +kubectl --context $CTX get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded +``` + +Confirm the root Tenant patch (applied inline by Phase 8) actually landed: + +```bash +kubectl --context $CTX --namespace tenant-root get tenants.apps.cozystack.io root \ + --output jsonpath='{.spec.ingress}' +# Expect: true +``` + +Spot-check dashboard ingress and certificate: + +```bash +kubectl --context $CTX get ingress --all-namespaces | grep dashboard +kubectl --context $CTX get certificate --all-namespaces | grep -E 'dashboard|keycloak' +# Expect: Ready=True. +``` + +If certificates are still `Ready=False` **after 2 minutes**, do not advise the operator to wait longer — "first issuance takes ~5 min" is a misleading mitigation that hides the actual failure mode. On a healthy install, an ACME HTTP-01 challenge resolves in 10–30 seconds; anything past 2 minutes means the challenge is stuck on a definite cause (rate limit, DNS unreachable, port 80 firewalled, externalIPs mismatch, wildcard config).
Inspect the ACME `Challenge` CRs and the cert-manager log directly: + +```bash +# Per-domain challenge state — the most direct signal +kubectl --context $CTX get challenges --all-namespaces \ + --output custom-columns=NS:.metadata.namespace,NAME:.metadata.name,STATE:.status.state,REASON:.status.reason,DOMAIN:.spec.dnsName + +# Order state — surfaces rate-limit hits and DNS verification failures +kubectl --context $CTX get orders --all-namespaces + +# cert-manager log, filtered to ACME activity +kubectl --context $CTX --namespace cozy-cert-manager logs deploy/cert-manager --tail=100 \ + | grep -iE 'acme|challenge|order' +``` + +Common stuck states and where to look: + +| `Challenge.status.state` | Reason in log | Likely cause | Fix | +|---|---|---|---| +| `pending` | `Waiting for HTTP-01 challenge propagation` | DNS not resolving challenge host | Configure wildcard A-record at the registrar | +| `pending` | `dial tcp ...:80: i/o timeout` | port 80 not reachable from LE validators | Open port 80 at firewall / LB | +| `pending` | `connect: connection refused` (RST) | externalIPs on a NAT'd provider (OCI / GCP NAT / AWS EIP) match wrong address | Re-pick external_ips.strategy=internal — see Phase 4 | +| `invalid` | `urn:ietf:params:acme:error:rateLimited` | Let's Encrypt 5-cert-per-week rate limit hit | Wait 7 days, or switch to staging issuer, or `dns01` solver | +| `invalid` | `incorrect response from authoritative server` | wildcard A-record pointing at the wrong IP | Re-verify which IPs cozystack actually advertises with `kubectl get svc -n tenant-root-ingress` | +| `valid` but `Certificate` not Ready | (none — challenge done) | Order Issuing slow (usually <5s) | Wait one more minute; if still stuck, `kubectl describe certificate` for the precise error | + +For nip.io hosts: A 2-minute pending challenge is itself anomalous because nip.io DNS is instantaneous. Suspect port 80 firewalling or wrong external IPs first. 
+ +That is the operator's fix, not the skill's — DNS / firewall configuration is outside cluster scope. The skill surfaces the diagnosis without speculating on the cause. + +## Phase 9.1 — End-to-end reachability probe (mandatory final gate) + +"All HelmReleases Ready" is a cluster-side signal — it reports the lifecycle of helm-controller, not whether the dashboard is actually reachable from outside the cluster. Real-world failure modes (OCI 1:1 NAT externalIPs mismatch, DNS not yet propagated, ingress controller running on the wrong nodes) routinely produce Ready=True HRs with an unreachable dashboard. The skill **must** verify reachability before declaring success, otherwise the next operator action is "click the dashboard URL → 530 / RST → debug from scratch", which is the worst possible failure UX. + +From the operator's workstation (not from inside the cluster): + +```bash +# DNS resolves to expected IPs +EXPECTED_IPS=$(jq -r '.publishing.external_ips[]' <<< "$STATE_JSON" | sort -u) +RESOLVED=$(dig +short "dashboard.${HOST}" | sort -u) + +# TCP-level reachability on the canonical port (HTTPS over the ingress). +# %{exitcode} requires curl 7.75+; older curl prints the template literal — +# guard with `curl --version | head -1` if you suspect a pre-2021 workstation. +curl --silent --output /dev/null --write-out '%{http_code} %{exitcode}\n' \ + --connect-timeout 5 --max-time 10 \ + --insecure \ + "https://dashboard.${HOST}/" +``` + +Pass conditions (any of): + +- HTTP 200 / 302 / 401 — dashboard answered (401 is the gatekeeper redirect; that's fine). +- HTTP 200 from `https://api.${HOST}:6443/healthz` with `--insecure` — apiserver ingress is up. + +Fail conditions (any of these is `failed_at`, **not** a successful install): + +- `curl exit 6` (`Couldn't resolve host`) — DNS not configured. On nip.io this should never happen; on custom-fqdn it means the wildcard A-record isn't published yet. Cross-reference: `RESOLVED` (will be empty). 
+- `curl exit 7` (`Couldn't connect to host`) — covers both `ECONNREFUSED` (TCP RST — nothing listening at that IP/port) **and** `EHOSTUNREACH` / `ENETUNREACH` (no route to the IP). They look identical at the curl layer; distinguish by checking `ip route get <resolved-ip>` from the workstation: + - Route returns `unreachable` → no path; workstation can't actually reach the cluster (firewall, VPN not up). + - Route returns a normal egress → RST; almost always externalIPs misconfig on a NAT'd provider. Cross-reference: `cozystack_intake.external_ips.strategy` and `Node.status.addresses`. +- `curl exit 28` (`Timeout`) — the SYN goes out but no SYN/ACK comes back (packets are silently dropped somewhere on the path). Usually port 443 firewalled at the cloud-provider security group level; can also be wrong IPs in DNS (`EXPECTED_IPS` ≠ `RESOLVED`). +- HTTP 530 (Cloudflare-style "origin unreachable") — DNS points through a CDN to wrong upstream. +- HTTP 502 / 503 — ingress is up but backend isn't ready; usually transient, retry once after 30 s. Persistent 502/503 (over 2 min) → real failure. + +When the probe fails, write `failed_at: "external-reachability"` with `error_detail` containing the curl output, the resolved-vs-expected IP set, and the `Node.status.addresses` mismatch (if any). The cluster stays as-is — no rollback — and `cozystack:debug` gets dispatched by the wizard. + +For sandbox / nip.io installs where the operator deliberately picked a non-routable address (`192.168.x.x` on a home LAN where the workstation can reach the cluster), the operator can pass `--skip-external-reachability` to downgrade the probe to a warning. The skill prints the warning and still writes `completed_at`, but the Phase 10 NOTES carry an "external reachability not verified — operator opted out" line so the result is auditable. + +## Phase 9.2 — Write status.cluster-install (mandatory) + +Before printing Phase 10 NOTES, the skill **must** write exactly one of `completed_at` / `failed_at` to `/.state.yaml`.
This is the contract the `cozystack:wizard` Phase 5 dispatch loop relies on — without it the chain hangs at the next dispatch decision (no `completed_at` ⇒ no progression; no `failed_at` ⇒ no `cozystack:debug` auto-dispatch). + +Success path (every gate from Phase 8 + Phase 9 passed): + +```yaml +status: + cluster-install: + dispatched_at: + completed_at: + installer_version: + platform_variant: + bundles: [...] + helm_releases: { ready: , total: } +``` + +Failure path (any STOP GATE refused, any Phase 8 HR stuck past timeout, any Phase 9 verification failed): + +```yaml +status: + cluster-install: + dispatched_at: + failed_at: + error: "" + error_detail: "" +``` + +Write with `sops --encrypt --in-place` afterwards when `state.sops.enabled: true`. Never write both fields. Never leave both unset. + +## Phase 10 — NOTES-style access summary + +Lean. Entry points, file paths, where credentials live, where to read more. No prose paragraphs. + +```text +cozystack ready + +cluster: + context: $CTX + api: $API_URL + installer: oci://ghcr.io/cozystack/cozystack/cozy-installer:$VERSION + installer variant: $INSTALLER_VARIANT + platform variant: $PLATFORM_VARIANT (bundles: $BUNDLES_CSV) + nodes: $N ready | helmreleases: $M / $M ready + +access: + dashboard: https://dashboard.$HOST + api: $API_ENDPOINT + grafana: https://grafana.$HOST (if monitoring HR ready) + cert solver: $SOLVER (http01 / dns01) + tls certs: $CERT_READY_COUNT / $CERT_TOTAL Ready (cert-manager — HTTP-01 normally issues in under 1 min; if not Ready after 2 min, inspect Challenges per Phase 9) + +storage (ZFS): + per-node: $N × zpool '$POOL_NAME' ($LAYOUTS) + linstor: storage pool '$LINSTOR_POOL_NAME' + verify: kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- linstor storage-pool list + +api server HA: + apiServerHost: $API_HOST ($API_HOST_SOURCE) + # On generic with extractedprism (default), every node dials 127.0.0.1:7445 and the DaemonSet + # forwards to a healthy CP endpoint.
To remove extractedprism later: + # helm --kube-context $CTX uninstall extractedprism --namespace kube-system + # (and re-apply the cozy-installer chart with the operator's chosen apiServerHost). + verify: kubectl --context $CTX --namespace kube-system get daemonset extractedprism 2>/dev/null \ + || echo "extractedprism not installed (Talos KubePrism / operator override / hosted variant)" + +credentials: + keycloak admin (initial): + kubectl --context $CTX --namespace cozy-keycloak get secret keycloak-credentials \ + --output jsonpath='{.data.admin-password}' | base64 --decode; echo + dashboard sso: SSO via Keycloak — first user provisioned by tenant-root + +artifacts on disk: + values file: /cozystack-platform-package.yaml + helm release: kube-system/cozy-installer + cluster-scoped: package.cozystack.io/cozystack.cozystack-platform + +handy commands: + watch state: kubectl --context $CTX get hr --all-namespaces + operator logs: kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator -f + list tenants: kubectl --context $CTX get tenants.apps.cozystack.io --all-namespaces + +next: + docs: https://cozystack.io/docs/v/ + first tenant: https://cozystack.io/docs/v/getting-started/install-cozystack#51-setup-root-tenant-services + upgrade later: run cozystack:cluster-upgrade +``` + +Fill all `$placeholders` from collected values and live lookups. Drop lines whose preconditions weren't met (e.g. omit grafana line when the `paas` bundle was not enabled). + +## Error handling — upstream issue handoff + +If any phase hits a fatal failure that looks like an upstream bug or doc gap, follow `references/issue-templates.md`: + +1. Stop the current phase. Don't try further fixes. +2. Run the diagnostic-bundle script. Show the bundle path. +3. Pick the right repo from the routing table. +4. Render the matching issue-body template, fill placeholders from collected state, write to `/tmp/.../issue-body.md`. Show the rendered text to the user. +5. 
Print the exact `gh issue create` command but **do not execute it**. + +## Guardrails + +- NEVER apply, patch, label, install, or delete without showing the exact command and getting explicit user approval. Prior approval does not carry forward. +- NEVER overwrite values, namespaces, or labels that belong to another Helm release. Refuse, surface ownership, and let the user decide. +- NEVER skip a STOP GATE because earlier gates passed cleanly. +- NEVER assume current `kubectl` context is the right one — pin `--context` in every command, every time. +- NEVER call success while any HR is not Ready, even on a 59-minute clock. +- NEVER open a GitHub issue automatically. Draft, show, hand the command to the user. +- NEVER include private infrastructure names (cluster name, client, internal namespaces) in drafted public issue bodies. Replace with generic placeholders. +- NEVER create a storage pool without explicit per-node approval in Phase 5.5 — even if the operator approved the consolidated plan in Phase 5. Plan approval does not cascade into mutations. +- NEVER `zpool create` over an existing pool. If `zpool list` shows the target name already exists, refuse and ask the operator to either pick a different name, reuse the existing pool, or destroy it manually with `zpool destroy` + `wipefs` (`references/storage-backends.md` backout section). +- NEVER offer LVM / LVM-thin as a storage backend. Cozystack standardises on ZFS; LVM paths were removed because cozystack does not validate or document them. Operators who need LVM are on their own with the piraeus-operator CRD. +- NEVER bootstrap Talos nodes or invoke `boot-to-talos` / `talm` from inside this skill — that flow lives in `/cozystack:talos-bootstrap`. Refuse and hand off. +- NEVER auto-rollback a partially provisioned storage state — print backout commands and let the operator decide. 
+- NEVER accept a custom `publishing.host` without an explicit operator confirmation that they own the domain and will configure wildcard DNS — the HTTP-01 cert solver fails silently otherwise. nip.io patterns skip this gate because nip.io is publicly hosted DNS. +- ALWAYS patch `tenants/root.spec.ingress=true` from inside the Phase 8 watch loop as soon as the CR appears, on `system`-bundle installs. The OIDC chicken-and-egg makes Phase 8 impossible to complete otherwise — dashboard / keycloak / flux-plunger loop forever, and every other downstream HR stalls on the missing root ingress. The CR can appear at any point during the watch loop; do not gate the patch behind a fixed pre-Phase-8 wait. +- ALWAYS read variant overlays and `requirements.md` before declaring "this looks fine" — variant-specific checks (CP-label value, ZFS availability, KubeOVN MASTER_NODES) are easy to miss. +- ALWAYS pull live data over cached assumption: `kubectl get` over "I think this is …". +- ALWAYS write Phase 4 collected values to disk in `/cozystack-platform-package.yaml` before applying — the file is part of the diagnostic bundle if Phase 8 fails. ZFS pool registration is stored separately under `/.state.yaml` `cozystack.storage.nodes[]` and replayed by the Phase 8 inline registration block as soon as `linstor-controller` is Ready — not after all HRs are Ready (there is no `LinstorSatelliteConfiguration` CR for the ZFS path). +- NEVER silently fall back to plain writes when `state.sops.enabled` is true but `sops` is missing. Refuse Phase 4 and tell the operator to install `sops` + the configured age/PGP key, or re-invoke with `--no-sops`. +- NEVER leave plain `/cozystack-platform-package.yaml` on disk between Phase 4 and Phase 7 when sops is on. Decrypt to a temp file for `kubectl apply`, remove the temp file immediately after. + +## References + +- `references/requirements.md` — cluster + node + variant matrix. +- `references/node-checks.md` — exact `kubectl debug node` commands and value thresholds. +- `references/variants.md` — picker logic and what each variant deploys.
+- `references/values-template.md` — canonical chart values, Platform Package YAML, extractedprism values shape, and the ZFS pool registration hook. +- `references/storage-backends.md` — ZFS pool create / verify / backout commands run in Phase 5.5. Cozystack standardises on ZFS; LVM paths are explicitly out of scope (the doc spells out why). Talos privileged DaemonSet bootstrap pattern lives here. +- `references/provider-pitfalls.md` — provider-specific networking and runtime gotchas: OCI 1:1 NAT, GCP NAT'd external IPs, AWS EIP / NLB proxy-protocol, Talos system-extension namespacing, PSA baseline on k8s 1.25+, talosctl reset preserving user disks, cozystack v1.3.x quirks. Cross-referenced from Phase 4 publishing slot and Phase 5.5 storage. +- `references/helmrelease-monitoring.md` — polling pattern, stuck-state triage. +- `references/known-failures.md` — top failures with recovery commands. +- `references/issue-templates.md` — diagnostic bundle script and issue-body templates per upstream repo. + +External: + +- `https://cozystack.io/docs/v1.3/install/kubernetes/generic/` — primary install guide. +- `https://cozystack.io/docs/v1.3/install/hardware-requirements/` — hardware sizing. +- `https://github.com/cozystack/cozystack` — chart, operator, packages. +- `https://github.com/cozystack/ansible-cozystack` — node-prep playbooks. diff --git a/plugins/cozystack/skills/cluster-install/references/helmrelease-monitoring.md b/plugins/cozystack/skills/cluster-install/references/helmrelease-monitoring.md new file mode 100644 index 0000000..397ebca --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/helmrelease-monitoring.md @@ -0,0 +1,106 @@ +# Watching HelmReleases until green + +After applying the Platform Package, the operator reconciles the bundle and emits dozens of HelmReleases. Don't declare success until every expected HR reports `Ready=True`. 
+ +## Shell compatibility (helper scripts the skill emits) + +Watch loops and other helper scripts the skill writes to `/tmp/` or `/` MUST be portable across macOS bash 3.2 (`/bin/bash` on macOS) and Linux bash 4+/5+: + +- Shebang: `#!/usr/bin/env bash` — picks up Homebrew bash 5 on macOS when on PATH, falls back to system bash on Linux. +- **Avoid bash 4+ features** in emitted scripts: no `declare -A` (associative arrays), no `mapfile -t` (use `while IFS= read -r ...`), no `${var,,}` / `${var^^}` case conversion (use `tr '[:upper:]' '[:lower:]'`), no `**` globstar (enable explicitly only when guarded with `(( BASH_VERSINFO[0] >= 4 ))`). +- When an associative-array-like structure is genuinely needed, fall back to two parallel arrays + a linear lookup loop, or write the data to a tempfile and use `awk` / `jq` / `yq` to query. +- For the rare case where bash 4+ is unavoidable, document it loudly in the script header and add a runtime guard: + + ```bash + #!/usr/bin/env bash + if (( BASH_VERSINFO[0] < 4 )); then + echo "This script needs bash 4+; on macOS install via 'brew install bash' and re-run." >&2 + exit 1 + fi + ``` + +A real session hit a bash-3.2 incompatibility on macOS where `declare -A` failed silently and the watch loop never ran. Default to POSIX-compatible constructs unless guarded. + +## Polling loop + +```bash +kubectl --context $CTX get hr --all-namespaces --output json +``` + +Parse the array; for each HR extract `metadata.namespace`, `metadata.name`, the `Ready` condition status, and `lastAppliedRevision`. Summarise as `Ready: N/M, Progressing: P, Failing: F` every 30 seconds. List the failing ones explicitly. + +Cap the loop at 60 minutes. Initial reconcile on a slow registry can take 30+ minutes. + +## What "expected" means by variant + +The Platform values render a different set of HRs per bundle. Read the actual list from the cluster — don't hard-code, the chart evolves. 
+ +```bash +# Once the operator has settled — usually 1–2 minutes after Package apply. +kubectl --context $CTX get hr --all-namespaces --output jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' | sort -u +``` + +That list is your denominator. Refresh it on every poll — new HRs appear as dependencies unlock. + +Typical counts for orientation (not gospel): + +| Variant | Approx HR count | +| ----------- | ----------- | +| `isp-full` | 40–50 | +| `isp-full-generic` | 40–50 | +| `isp-hosted` | 15–20 | +| `default` | 1–3 | + +## Reading a stuck HR + +For each HR not Ready after 5 minutes: + +```bash +kubectl --context $CTX --namespace $NS describe helmrelease $NAME | sed -n '/Status:/,$p' +kubectl --context $CTX --namespace $NS get events --sort-by=.lastTimestamp \ + --field-selector involvedObject.kind=HelmRelease,involvedObject.name=$NAME +``` + +Helm-controller log slice: + +```bash +kubectl --context $CTX --namespace flux-system logs deploy/helm-controller --tail=100 \ + | grep -i "$NAME" || true +``` + +Operator log slice: + +```bash +kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator --tail=100 +``` + +## Expected transient errors (not blockers) + +For the first ~5 minutes after Package apply: + +- `ExternalArtifact not found` — chart artifact not pulled yet. +- `dependency is not ready: cozy-cilium` — Cilium hasn't finished installing. +- HRs in `Progressing` with empty `lastAppliedRevision` — initial install. + +These are normal. Don't escalate before five minutes have passed since the HR's `lastTransitionTime`. + +## Reading the operator's view + +The operator exposes its own status on the `Package` CR: + +```bash +kubectl --context $CTX get package cozystack.cozystack-platform --output yaml | sed -n '/status:/,$p' +``` + +When the operator considers the bundle reconciled, `status.conditions[Type=Ready].status=True`. 
+ +## When to escalate + +A stuck HR is worth escalating (offer the user to abort or capture a diagnostic bundle) when: + +- It's been `Failing` for > 10 minutes with the same error. +- Helm-controller log shows a stack trace or `context deadline exceeded` repeating. +- Operator log shows a panic. +- Pods in the HR's namespace are in `ImagePullBackOff`, `CrashLoopBackOff`, or `Pending` with no scheduling event. + +Don't escalate just because progress is slow — pulls on a fresh cluster can take 20 minutes on a thin uplink. diff --git a/plugins/cozystack/skills/cluster-install/references/issue-templates.md b/plugins/cozystack/skills/cluster-install/references/issue-templates.md new file mode 100644 index 0000000..c5b9245 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/issue-templates.md @@ -0,0 +1,135 @@ +# Upstream issue handoff + +When `cozystack:cluster-install` can't continue and the cause looks upstream, assemble a diagnostic bundle and draft an issue body for the user. Never open the issue automatically — the user must read it and choose. + +## Routing decision tree + +| Failure shape | Repo | +| ----------- | ----------- | +| HelmRelease render or runtime bug; operator panic; resource quotas wrong | `cozystack/cozystack` | +| cozy-installer Helm chart misbehaves (template error, broken hooks, wrong namespace adoption) | `cozystack/cozystack` (label `area/installer`) | +| Docs ambiguous, missing a step, or contradict reality | `cozystack/website` | +| Node prep automation missing a task or broken on a distro | `cozystack/ansible-cozystack` | +| Storage / DRBD / LINSTOR specific failure on a working install | use `linstor:recover` skill before filing — the bug is most likely operational, not upstream | + +## Diagnostic bundle + +Drop everything below into `/diagnostics-/` and reference it from the issue body. Strip secrets (`Opaque` / `kubernetes.io/tls` / `kubernetes.io/dockerconfigjson` data fields) before sharing. 
The bundle tarball is gitignored automatically (covered by the `*.tar.gz` rule in the cozystack `.gitignore` section); that rule alone does not match the unpacked diagnostics directory, so remove the directory after archiving or ignore it explicitly. + +```bash +TS="$(TZ=UTC date +%Y%m%d-%H%M%S)" +DUMP="$CONFIG_DIR/diagnostics-${TS}" +mkdir -p "$DUMP" + +kubectl --context $CTX cluster-info dump --output-directory "$DUMP/cluster-info" +kubectl --context $CTX get nodes --output yaml > "$DUMP/nodes.yaml" +kubectl --context $CTX get hr --all-namespaces --output yaml > "$DUMP/helmreleases.yaml" +kubectl --context $CTX get pods --all-namespaces --output wide > "$DUMP/pods.txt" +kubectl --context $CTX get events --all-namespaces \ + --sort-by=.lastTimestamp > "$DUMP/events.txt" +kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator \ + --tail=2000 > "$DUMP/operator.log" 2>&1 || true +kubectl --context $CTX --namespace flux-system logs deploy/helm-controller \ + --tail=2000 > "$DUMP/helm-controller.log" 2>&1 || true +cp "$CONFIG_DIR/cozystack-platform-package.yaml" "$DUMP/platform-package.yaml" 2>/dev/null || true + +# Optional: failing HR detail +for hr in $(kubectl --context $CTX get hr --all-namespaces \ + --output jsonpath='{range .items[?(@.status.conditions[?(@.type=="Ready" && @.status!="True")])]}{.metadata.namespace}/{.metadata.name} {end}'); do + ns=${hr%%/*}; name=${hr##*/} + kubectl --context $CTX --namespace "$ns" describe hr "$name" > "$DUMP/hr-${ns}-${name}.txt" +done + +# Archive relative to the bundle's parent directory ($CONFIG_DIR), where $DUMP was created +tar -czf "${DUMP}.tar.gz" --directory "$(dirname "$DUMP")" "$(basename "$DUMP")" +echo "Bundle: ${DUMP}.tar.gz" +``` + +## Issue body templates + +All public. English. Singular first person. No private cluster names or client identifiers — replace them in the draft before showing the user. No internal tool names (the user is filing this as a generic bug report). + +### cozystack/cozystack (operator / chart / package bug) + +```markdown +### What happened + + + +### Expected behaviour + + + +### Steps to reproduce + +1. Fresh v cluster, bootstrapped per `docs/v/install/kubernetes//`. +2.
`helm upgrade --install cozy-installer oci://ghcr.io/cozystack/cozystack/cozy-installer --version --namespace kube-system --set cozystackOperator.variant= --set cozystack.apiServerHost=` +3. Apply Platform Package with `spec.variant: `. Full Package YAML attached. +4. Observe `kubectl get hr --all-namespaces` — . + +### Environment + +- Cozystack installer version: +- Kubernetes: +- Nodes: × on +- Variant: / + +### Logs and manifests + +Diagnostic bundle attached: `cozystack-install-.tar.gz` (logs redacted of secrets). +``` + +### cozystack/website (docs gap) + +```markdown +### Documentation page + +`https://cozystack.io/docs/v/install/kubernetes//` (or local path: `content/en/docs/v/install/kubernetes/.md`) + +### What I expected to find + + + +### What the page says + + + +### What I had to do instead + + + +### Suggested change + + +``` + +### cozystack/ansible-cozystack (playbook gap) + +```markdown +### Distribution / OS + + + +### Task that should exist but doesn't + +/install/kubernetes/generic.md` (or equivalent) the role does not automate.> + +### Manual workaround + + + +### Suggested role change + + +``` + +## `gh` commands (do NOT auto-run) + +After showing the draft and getting explicit approval, the user can run: + +```bash +gh issue create --repo cozystack/cozystack \ + --title "" \ + --body-file /diagnostics-/issue-body.md +``` + +`cozystack:cluster-install` writes the body to a file under the diagnostic bundle directory so the user can review it before posting. diff --git a/plugins/cozystack/skills/cluster-install/references/known-failures.md b/plugins/cozystack/skills/cluster-install/references/known-failures.md new file mode 100644 index 0000000..33fa08e --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/known-failures.md @@ -0,0 +1,362 @@ +# Known failure modes + +Each entry: symptom → likely cause → recovery. Reference this when an HR is stuck or post-install verification fails. 
Don't guess — match the exact symptom string before applying a fix. + +## Dashboard / Keycloak / flux-plunger stuck in OIDC chicken-and-egg + +**Symptom** + +After Platform Package apply, most HRs go Ready, but three (or a similar trio) stall indefinitely: + +```text +NAMESPACE NAME READY STATUS MESSAGE +cozy-dashboard dashboard Unknown Running 'install' action with timeout of 10m0s + then InstallFailed: context deadline exceeded + then retry, forever +cozy-keycloak keycloak similar +cozy-fluxcd flux-plunger False dependency 'cozy-dashboard/dashboard' is not ready +``` + +Pod-level look: + +```bash +kubectl --context $CTX -n cozy-dashboard get pods +# gatekeeper-... CrashLoopBackOff + +kubectl --context $CTX -n cozy-dashboard logs deploy/gatekeeper | head -20 +# Unable to fetch OIDC well-known: dial tcp :443: connection refused +# (or: x509: certificate signed by unknown authority) +``` + +**Cause** + +cozy-dashboard ships gatekeeper (oauth2-proxy), which on startup does OIDC discovery against the **public FQDN** `https://keycloak.${HOST}/realms/cozy/.well-known/openid-configuration` — not an in-cluster service. Without a root ingress controller running, nothing listens on the public IP at 443, gatekeeper crashes, the dashboard HR can't reach Ready, and anything that depends on the dashboard HR (notably `cozy-fluxcd/flux-plunger`) is stuck on the dependency. + +The root ingress controller doesn't start until `tenants.apps.cozystack.io/root` is patched with `spec.ingress: true`. The Platform Package does not apply that patch by itself — it's documented as a manual step in `cozystack/docs/cozystack-installation.md:160`. + +This is a chicken-and-egg of the `isp-full*` variant + OIDC combination, not a bug in any single component: + +- Platform Package does not patch `tenant root.spec.ingress`. +- The cozystack dependency graph is built so gatekeeper can't come up before ingress, and dashboard can't come up before gatekeeper. 
+- But flux-plunger waits on dashboard, which waits on ingress, which waits on the missing manual patch. + +`cozystack:cluster-install` Phase 8 patches `tenants/root.spec.ingress=true` inline as soon as the CR appears in the watch loop, which avoids the trap entirely on a fresh install regardless of when the CRD lands relative to other HRs. + +**Recovery on an install that has already stalled in Phase 8** + +```bash +kubectl --context $CTX --namespace tenant-root wait tenants.apps.cozystack.io/root \ + --for=jsonpath='{.metadata.name}'=root --timeout=300s + +kubectl --context $CTX --namespace tenant-root patch tenants.apps.cozystack.io root \ + --type=merge --patch '{"spec":{"ingress":true}}' +``` + +Within ~2 min: + +- `root-ingress-controller` pods come up in `tenant-root-ingress` namespace. +- External IPs from the LB pool get the wildcard ingress. +- `dashboard.${HOST}` / `keycloak.${HOST}` become reachable from the public internet. +- gatekeeper does OIDC discovery successfully, exits CrashLoop. +- dashboard / keycloak HRs reach Ready. +- flux-plunger picks up the dashboard dependency, reaches Ready. + +The whole cluster typically reaches `N/N Ready` within 5 min of the patch. + +If gatekeeper still crashes after the patch, double-check: + +1. DNS: `dig +short keycloak.${HOST}` returns the external IPs. +2. cert-manager certificate for `*.${HOST}` is Ready (gatekeeper rejects self-signed unless `insecureSkipVerify: true`). +3. Port 80 is reachable from the public internet (Let's Encrypt HTTP-01 needs this — without a valid cert, gatekeeper TLS verify still fails). 
+ +## LINSTOR: HR stuck with no storage pool registered + +**Symptom** + +`cozy-linstor` HelmRelease reaches Ready but the controller logs and the LINSTOR API show no storage pool: + +```bash +kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list +# Empty body (only the DfltDisklessStorPool entry on each node) +``` + +`cozy-linstor-csi` driver pods may then fail to mount volumes; downstream HRs that depend on PVCs (anything in `paas`, monitoring) sit Pending. + +**Cause** + +One of: + +- `cozystack:cluster-install` Phase 5.5 was skipped or aborted partway. The zpool was not created on the node, or `linstor storage-pool create zfs <node> <pool> <zpool>` was not run after `linstor-controller` reached Ready. +- The zpool exists with one name on the node, but `linstor storage-pool create` was called with a different name. LINSTOR does a byte-comparison and silently never imports the pool. +- The zpool was destroyed (operator-side wipefs / disk swap) and the LINSTOR pool entry is stale. + +**Recovery** + +1. Re-check what is on the node: + + ```bash + kubectl --context $CTX debug node/$NODE --image=alpine:3 --profile=sysadmin -- \ + chroot /host /bin/sh -c 'zpool list -H -o name,size,free,health' + ``` + +2. Re-check what LINSTOR thinks: + + ```bash + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list + ``` + +3. If the zpool exists on the node but LINSTOR doesn't know about it, register it (`$NODE` is the LINSTOR node name, `data` the LINSTOR pool name, `$ZPOOL` the backing zpool name reported in step 1): + + ```bash + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool create zfs "$NODE" data "$ZPOOL" + ``` + +4. If the zpool does not exist, run the create commands from `references/storage-backends.md` inside `kubectl debug node`, then re-run step 3. + +5. Bounce the satellite once after the registration lands: + + ```bash + kubectl --context $CTX --namespace cozy-linstor rollout restart daemonset/linstor-satellite + ``` + +6.
Verify: + + ```bash + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list + # Expect one row per node with the chosen pool name and a non-zero `Capacity`. + ``` + +## KubeOVN: chart render fails with "No nodes found with label …" + +**Symptom** + +The `kubeovn` HelmRelease never goes Ready. `kubectl describe hr` shows: + +```text +Reason: InstallFailed +Message: ... template: kube-ovn/templates/_helpers.tpl: error calling fail: + No nodes found with label 'node-role.kubernetes.io/control-plane=true'. + Please check your MASTER_NODES_LABEL configuration or ensure master nodes are properly labeled. +``` + +Or, when render succeeds but pods can't schedule, `ovn-central-*` pods sit in `Pending` and `kubectl describe` shows `0/N nodes are available`. + +**Cause** + +`node-checks.md` covers the mechanics in full. Short version: cozystack-platform's generic variant pins `MASTER_NODES_LABEL=node-role.kubernetes.io/control-plane=true` (literal value `true`). kubeadm sets that label with **empty** value; k3s and RKE2 set it to `true`. The chart's `fail` call fires when the value byte-comparison misses. 
+ +**Recovery** + +Option 1 — relabel CP nodes (works on kubeadm, no reconciler fights back): + +```bash +for n in $(kubectl --context $CTX get nodes \ + --selector node-role.kubernetes.io/control-plane \ + --output jsonpath='{.items[*].metadata.name}'); do + kubectl --context $CTX label node "$n" \ + node-role.kubernetes.io/control-plane=true --overwrite +done +``` + +Then poke the HR: + +```bash +kubectl --context $CTX --namespace cozy-system annotate helmrelease kubeovn \ + reconcile.fluxcd.io/requestedAt="$(TZ=UTC date +%s)" --overwrite +``` + +Option 2 — pin explicit IPs (safest for kubeadm and Cluster API / Rancher-managed clusters that reconcile labels): + +```bash +kubectl --context $CTX patch package cozystack.cozystack-platform --type=merge --patch ' +spec: + components: + platform: + values: + networking: + kubeovn: + MASTER_NODES: "10.0.0.10,10.0.0.11,10.0.0.12" +' +``` + +Source for the IPs: INTERNAL-IP column of `kubectl get nodes --output wide`. Comma-separated, no spaces. + +## linstor-scheduler: InvalidImageName + +**Symptom** + +```text +kubectl --context $CTX -n cozy-linstor get pods +linstor-scheduler-... 0/1 InvalidImageName +``` + +**Cause** + +k3s reports kubelet version as `v1.35.0+k3s1` — the `+` is illegal in Docker image tags. linstor-scheduler templates that tag into an image reference and the API server rejects it. + +**Recovery** + +Fixed in Cozystack v1.0.0+ — upgrade `--installer-version` to a release that has the fix. If pinned to an older release, patch the StatefulSet manually with a fixed tag (`docker.io/piraeusdatastore/...:vX.Y.Z`). + +## Cilium: API server unreachable when CP1 dies (generic HA without extractedprism) + +**Symptom** + +Multi-CP generic Kubernetes (k3s / kubeadm / RKE2) loses Cilium control on every node a few minutes after CP1 reboots or crashes. 
Cilium pods log: + +```text +level=fatal msg="Unable to connect to Kubernetes apiserver" error="Get https://<CP1-IP>:6443/api/v1/...: dial tcp <CP1-IP>:6443: connect: connection refused" +``` + +Other CP nodes are healthy and kube-apiserver is reachable on them at the same port, but Cilium was configured with `k8sServiceHost: <CP1-IP>` (or the cozystack-operator was started with `cozystack.apiServerHost: <CP1-IP>`), so it only dials CP1. + +**Cause** + +`cozystack:cluster-install` defaults the `cozystack.apiServerHost` for the generic variant to either CP1's internal IP (with `--no-extractedprism`) or `127.0.0.1` via extractedprism (default). On `--no-extractedprism` without a VIP / external LB, CP1 becomes a single point of failure for kube-apiserver routing on every other node — every node's Cilium agent loses its apiserver connection when CP1 dies. Talos avoids this with its built-in `localhost:7445` KubePrism; generic Linux has no KubePrism equivalent unless something explicit (extractedprism / kube-vip / external LB) is installed. + +**Recovery** + +If extractedprism wasn't installed at bootstrap time and a CP went down: + +1. Temporary fix — point the operator's apiServerHost at a live CP (set `DEAD_CP_IP` and `LIVE_CP_IP` to the failed and a surviving control-plane IP first): + + ```bash + kubectl --context $CTX --namespace kube-system get configmap cozystack \ + --output yaml | sed "s#${DEAD_CP_IP}#${LIVE_CP_IP}#g" \ + | kubectl --context $CTX apply --filename - + kubectl --context $CTX --namespace cozy-system rollout restart deploy/cozystack-operator + ``` + +2. Permanent fix — install extractedprism after the fact and re-point the operator. The chart's `endpoints` is a comma-separated string scalar (`values.schema.json` `type=string`); render it as a values file rather than relying on `--set`, which mangles the `:` and `,` characters in `host:port` lists: + + ```bash + cat > /tmp/extractedprism-values.yaml <" + k8sServicePort: "6443" +``` + +## Inotify limits exhausted + +**Symptom** + +Pods fail with `too many open files`, `inotify_add_watch failed`, or kubelet logs `Failed to start cAdvisor: inotify_add_watch ...`.
+ +**Cause** + +The required sysctl values (see `node-checks.md`) weren't applied or didn't persist across reboot. + +**Recovery** + +Drop `/etc/sysctl.d/99-cozystack.conf` (content in `node-checks.md`), `sysctl --system`, then reboot the affected node or restart kubelet. + +## cozy-system namespace owned by another release + +**Symptom** + +```text +helm --kube-context $CTX install ... cozy-installer ... +Error: namespace "cozy-system" exists and cannot be imported into the current release: invalid ownership metadata +``` + +**Cause** + +A previous failed install, or `kubectl create ns cozy-system` run by hand, left the namespace without Helm's ownership labels. The cozy-installer chart templates `Namespace cozy-system` itself and refuses to adopt an unmarked namespace. + +**Recovery** + +Adopt the namespace before re-running `helm install` (see the snippet in `values-template.md`). Or, if nothing meaningful lives in it yet, `kubectl delete namespace cozy-system` and retry. + +If it's owned by a *different* helm release (different name/namespace in the annotation), refuse. The operator must decide whether to uninstall the conflicting release. + +## ZFS unavailable on RHEL 10 / Rocky 10 / Alma 10 + +**Symptom** + +LINSTOR storage-pool creation fails on Phase 5.5 with `zpool: command not found` or `kernel module zfs not found`. + +**Cause** + +OpenZFS does not ship a release RPM for the RHEL 10 family yet (Rocky 10, Alma 10 in 2026). Cozystack standardises on ZFS for the LINSTOR backend — there is no first-class fallback. + +**Recovery** + +Switch the affected nodes to RHEL 9 / Rocky 9 / Alma 9 (OpenZFS does ship there) or to Ubuntu / Debian / Talos. `cozystack:cluster-install` refuses to proceed without ZFS available on every storage node — the LVM / LVM-thin paths existed in earlier revisions of this skill and were removed because cozystack does not validate or document them. 
Operators who insist on LVM are on their own with the piraeus-operator CRD's `lvmPool` / `lvmThinPool` slots; this skill will not help. + +## Iptables INPUT reject on cloud images (Ubuntu OCI) + +**Symptom** + +Inter-pod traffic broken; Cilium logs show drops on the INPUT chain. + +**Cause** + +Some cloud images (Ubuntu on OCI, Oracle Linux variants) ship with a restrictive `iptables -A INPUT -j REJECT` rule that fires before Cilium's chains. + +**Recovery** + +Flush the host iptables INPUT chain (`iptables -F INPUT`) and persist via the distro's iptables-persistent mechanism. The ansible-cozystack role has a `cozystack_flush_iptables: true` toggle that automates this — for click-ops users without ansible, document the manual fix and link to it. + +## Where to escalate + +If the failure doesn't match any entry above: + +- Cozystack chart / operator runtime → `cozystack/cozystack` issues. +- Install docs ambiguous or missing a step → `cozystack/website` issues. +- Node prep automation missing a task → `cozystack/ansible-cozystack` issues. +- Bug in the cozy-installer chart itself (label `area/installer`) → `cozystack/cozystack` issues. + +See `issue-templates.md` for ready-to-paste issue bodies that pull from the diagnostic bundle. diff --git a/plugins/cozystack/skills/cluster-install/references/node-checks.md b/plugins/cozystack/skills/cluster-install/references/node-checks.md new file mode 100644 index 0000000..44a0ffe --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/node-checks.md @@ -0,0 +1,250 @@ +# Node readiness checks + +Run from the workstation against each node via `kubectl debug node/ --profile=sysadmin --image=alpine:3 -it -- chroot /host /bin/sh`. For sysadmin profile to mount /host, the cluster must support ephemeral containers (default in k8s 1.25+). + +For homogeneous clusters, check one sample node and warn that the rest are assumed identical. For heterogeneous (mixed OS, mixed arch, control-plane vs worker), check every node. 
+ +## Pattern + +For non-interactive runs, drive the debug pod via `-- chroot /host /bin/sh -c ''` and parse the output. The `chroot /host` step pivots into the node's real root filesystem — without it, `lsmod`, `systemctl`, `sysctl` see the alpine container, not the node. + +`kubectl debug` leaves an ephemeral pod on the node. Delete it after checks: + +```bash +kubectl --context $CTX delete pod --field-selector=spec.nodeName=$NODE -n default \ + -l 'debug.kubernetes.io/component=ephemeral' --ignore-not-found +``` + +Better: run all checks in one `chroot /host /bin/sh -c '...'` invocation so the pod exits immediately after. + +## Checks + +### Kernel modules + +Cozystack needs these loaded (the chart fails opaquely otherwise): + +```sh +lsmod | awk '{print $1}' | grep -Ex 'overlay|br_netfilter|nf_conntrack|ip_tables|iptable_nat' +``` + +Required: `overlay`, `br_netfilter`, `nf_conntrack`, `ip_tables`, `iptable_nat`. + +For Kube-OVN (non-hosted): `openvswitch`, `geneve` (autoloaded by ovs userspace — soft check). + +For DRBD storage (non-hosted): either `drbd` already in `lsmod` or a buildable kernel headers tree (`/usr/src/linux-headers-$(uname -r)`) so piraeus-operator can compile in-cluster. Ubuntu Secure Boot needs `drbd-dkms` pre-installed and signed — `lsmod | grep drbd` must succeed before install. + +### Required services (systemd) + +```sh +systemctl is-active iscsid multipathd +``` + +Both must return `active`. If `inactive` or `not-found`, install + enable them: + +```sh +apt-get install -y nfs-common open-iscsi multipath-tools +systemctl enable --now iscsid multipathd +``` + +(Or distro equivalents — `dnf install` + `iscsi-initiator-utils` + `device-mapper-multipath` on RHEL family.) + +Skip this check on Talos and on hosted/managed (no SSH-style node prep there). 
+ +### sysctl values + +```sh +for k in \ + fs.inotify.max_user_watches \ + fs.inotify.max_user_instances \ + fs.inotify.max_queued_events \ + fs.file-max \ + fs.aio-max-nr \ + net.ipv4.ip_forward \ + net.ipv4.conf.all.forwarding \ + net.bridge.bridge-nf-call-iptables \ + net.bridge.bridge-nf-call-ip6tables \ + vm.swappiness; do + printf '%s = %s\n' "$k" "$(sysctl -n $k 2>/dev/null || echo MISSING)" +done +``` + +Required minimums (from `docs/v1.3/install/kubernetes/generic.md`): + +| Key | Required value | +| ----------- | ----------- | +| `fs.inotify.max_user_watches` | `>= 524288` | +| `fs.inotify.max_user_instances` | `>= 8192` | +| `fs.inotify.max_queued_events` | `>= 65536` | +| `fs.file-max` | `>= 2097152` | +| `fs.aio-max-nr` | `>= 1048576` | +| `net.ipv4.ip_forward` | `= 1` | +| `net.ipv4.conf.all.forwarding` | `= 1` | +| `net.bridge.bridge-nf-call-iptables` | `= 1` | +| `net.bridge.bridge-nf-call-ip6tables` | `= 1` | +| `vm.swappiness` | `<= 1` | + +If any value is missing or below minimum, the fix is `/etc/sysctl.d/99-cozystack.conf` + `sysctl --system`. + +### Multipath blacklist + +```sh +test -f /etc/multipath/conf.d/cozystack-drbd-blacklist.conf && echo PRESENT || echo MISSING +``` + +If `MISSING` and storage is part of the chosen variant, instruct user to add the file with content: + +```conf +blacklist { + devnode "^drbd[0-9]+" +} +``` + +Then `systemctl reload multipathd`. + +### Storage discovery (for LINSTOR — non-hosted only) + +This is a Phase 2 lookup, **before** Phase 5.5 provisioning. Cozystack standardises on ZFS — see `references/storage-backends.md`. Run the three checks per node and surface the result so the operator picks devices in Phase 4 with context. + +Unmounted block devices: + +```sh +lsblk --noheadings --output NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE | awk '$3=="disk" && $4=="" && $5==""' +``` + +Pick the largest unmounted disk per node and surface it to the user for confirmation in Phase 4. 
Don't auto-pick — operator must own the choice (especially for mirror / raidz layouts). + +Existing zpools (Phase 4 pre-fills defaults from whatever is already there): + +```sh +zpool list -H -o name,size,free,health 2>/dev/null +``` + +Empty output is fine (means nothing yet — Phase 5.5 will create). Non-empty output is a signal: either reuse (the skill takes the name as default) or refuse to touch (if the pool is already serving data). + +ZFS tooling availability: + +```sh +command -v zpool zfs +zfs version 2>/dev/null | head -1 +``` + +Both binaries must be present and the kernel module loadable. If missing on non-Talos, the install command per distro: + +- Ubuntu / Debian: `apt-get install -y zfsutils-linux` (ansible-cozystack's `prepare-ubuntu.yml` does this). +- RHEL 9 / Rocky 9 / Alma 9: install OpenZFS from `zfsonlinux.org` repo. +- RHEL 10 / Rocky 10 / Alma 10: **not supported** — OpenZFS does not publish for the RHEL 10 family yet. See `references/known-failures.md`. +- Talos: built into the cozystack-tuned image as the `siderolabs/zfs` extension. If `lsmod | grep zfs` returns nothing, the image is not cozystack-tuned — see Phase 3 Talos gate below. + +Skip the whole storage section on hosted variant. + +### Talos cozystack-tuned image — Phase 3 gate (Talos only) + +If Phase 2 detected Talos on any node, run these four checks. **All four must pass on every Talos node**, otherwise Phase 3 STOP GATE 1 fails with `cozystack:cluster-install` refusing to continue and pointing at `/cozystack:talos-bootstrap`. + +```sh +# Kernel modules (cozystack-tuned image ships these as extensions; vanilla Talos does not) +lsmod | awk '{print $1}' | grep -Ex 'drbd|zfs|openvswitch' +``` + +All three names must appear in the output. 
+ +```sh +# LVM filter — cozystack-tuned machine-config writes this verbatim +grep -E '^\s*global_filter\s*=' /etc/lvm/lvm.conf | grep -E 'drbd|zd|dm-' +``` + +The cozystack filter is `global_filter = [ "r|^/dev/drbd.*|", "r|^/dev/dm-.*|", "r|^/dev/zd.*|" ]` — the grep returns a non-empty line on a tuned image and nothing on vanilla. + +`talosctl` on the workstation (used by the operator if they need to fix things — not by the skill itself): + +```bash +talosctl version --client +``` + +If `talosctl` is missing, the install path is `brew install siderolabs/tap/talosctl` or downloading from `https://github.com/siderolabs/talos/releases`. + +Skip the whole Talos gate on non-Talos clusters. + +### Cluster-domain (cluster-wide, not per-node) + +```bash +kubectl --context $CTX --namespace kube-system get configmap coredns --output jsonpath='{.data.Corefile}' | grep -E 'kubernetes\s+\S+' || true +``` + +Must contain `kubernetes cozy.local`. If it shows `cluster.local` or anything else, refuse — see `requirements.md`. + +### Control-plane node label (cluster-wide) — KubeOVN compatibility gate + +KubeOVN's chart looks nodes up by a key=value pair from `MASTER_NODES_LABEL`. **The expected pair depends on the platform variant** (`~/git/github.com/cozystack/cozystack/packages/core/platform/templates/bundles/system.yaml`): + +| Platform variant | Expected label and value | +| ----------- | ----------- | +| `isp-full` (Talos) | `node-role.kubernetes.io/control-plane=""` (empty value — Talos default) | +| `isp-full-generic` (k3s / kubeadm / RKE2) | `node-role.kubernetes.io/control-plane=true` (literal string `true`) | +| `isp-hosted` | not used — KubeOVN is not deployed | +| `default` | not used unless the operator hand-rolls KubeOVN | + +The lookup compares the value byte-for-byte. If it doesn't match exactly, the chart `fail`s with: + +```text +No nodes found with label 'node-role.kubernetes.io/control-plane=true'. 
+Please check your MASTER_NODES_LABEL configuration or ensure master nodes are properly labeled. +``` + +The trap: **kubeadm** sets the label with an empty value, but the generic platform variant expects `=true`. k3s and RKE2 set it to `=true` and pass. Talos sets it to `=""` and passes only with `isp-full`. Mixing variants and distributions is the failure path. + +Check (returns `name=value` per node — value is empty when the label has no value): + +```bash +kubectl --context $CTX get nodes \ + --output jsonpath='{range .items[*]}{.metadata.name}={.metadata.labels.node-role\.kubernetes\.io/control-plane}{"\n"}{end}' +``` + +Parse the output and match against the variant's expected value. Treat a node as a CP node when the label exists at all (key present), regardless of value — but for the KubeOVN gate, the value must match the variant. + +If the value does **not** match what the variant expects, surface two recovery paths to the user (let them pick): + +1. **Relabel** — fast, but only safe when the cluster's own label management won't fight back. kubeadm leaves user labels alone; k3s preserves them across reboots; some operators (Cluster API, Rancher) reconcile labels. + + ```bash + # For isp-full-generic on kubeadm — set value to literal "true": + kubectl --context $CTX label node $CP_NODE \ + node-role.kubernetes.io/control-plane=true --overwrite + ``` + +2. **Pin explicit IPs via `MASTER_NODES`** — bypasses the lookup entirely. Collect the CP nodes' internal IPs (`kubectl get nodes -o wide`, INTERNAL-IP column) and set `networking.kubeovn.MASTER_NODES` to the comma-separated list in the values collected in Phase 4. This is the safest choice for kubeadm. + +Gate: do not apply the Platform Package until **either** the label matches the variant's expected value on at least one node, **or** `MASTER_NODES` is set explicitly in the collected values. Skip this gate on `isp-hosted`. 
+ +### CNI conflict (cluster-wide) + +```bash +kubectl --context $CTX --namespace kube-system get pods --output name | \ + grep -Ei 'calico|flannel|weave|kube-flannel|aws-node|azure-cni' +``` + +Empty output is the only safe result for non-hosted. On hosted variant, the provider CNI is expected and OK. + +### Pod CIDR / Service CIDR + +```bash +# Pod CIDR from node specs +kubectl --context $CTX get nodes --output jsonpath='{.items[*].spec.podCIDR}' + +# Service CIDR — best-effort: read the apiserver flag, else first IP of kubernetes Service +kubectl --context $CTX --namespace kube-system get pod --selector component=kube-apiserver --output yaml 2>/dev/null \ + | grep -oE -- '--service-cluster-ip-range=[^ ]+' | head -1 +kubectl --context $CTX get service kubernetes --output jsonpath='{.spec.clusterIP}' +``` + +On managed k8s the apiserver pod is invisible — fall back to the clusterIP of `default/kubernetes` and infer the CIDR (typically the IP's /16 or /12 base). + +### Conflicting workloads + +```bash +kubectl --context $CTX get deploy,daemonset --all-namespaces \ + --output jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' \ + | grep -Ei 'ingress-nginx|cert-manager|metrics-server|kube-proxy|traefik|servicelb' || true +``` + +Any hit (other than the managed-k8s provider's own copies) is a blocker. Either remove the workload or refuse install. diff --git a/plugins/cozystack/skills/cluster-install/references/provider-pitfalls.md b/plugins/cozystack/skills/cluster-install/references/provider-pitfalls.md new file mode 100644 index 0000000..5067f87 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/provider-pitfalls.md @@ -0,0 +1,133 @@ +# Provider-specific networking and runtime pitfalls + +Things that aren't deducible from "all HRs Ready" or from generic Kubernetes docs — they bite during a Cozystack install on specific providers and were each a multi-hour debug episode in a real install run. 
Cross-reference from SKILL.md Phase 4 (publishing slot) and Phase 5.5 (storage). + +## OCI 1:1 NAT — externalIPs must be internal + +**Symptom**: every HelmRelease is `Ready=True`, dashboard ingress controller pods are Running, dashboard URL resolves correctly via DNS, but `curl https://dashboard./` from the workstation returns `Connection refused` (TCP RST) or `Connection reset by peer`. cert-manager challenges sit `pending` because Let's Encrypt validators get the same RST on port 80. + +**Mechanism**: Oracle Cloud Infrastructure attaches a public IP to a VM via 1:1 NAT on the OCI virtual network fabric. Packets destined for the public IP are rewritten to the VM's VCN-internal address (`10.X.X.X`) **before** they reach the VM's NIC — the kernel never sees the public IP on any interface. Cilium's `externalIPs` BPF program matches on the packet's destination IP as observed by the host kernel; since the public IP is not present there, the match fails, no NAT rule applies, and the packet falls through to the default `tcp.reset` reply. + +**Fix**: set `Service.spec.externalIPs` (and Cozystack's `publishing.externalIPs`) to the VCN-internal IPs of each CP node, not the public IPs. OCI handles the public→internal NAT itself; from the cluster's point of view, the internal IPs are the only addresses that matter. + +How `cozystack:cluster-install` Phase 4 catches this: + +- Reads `cozystack_intake.external_ips.strategy` from the wizard (default `internal` when `intent_hints.platform: oci`). +- Validates against `Node.status.addresses`: when `InternalIP` ≠ `ExternalIP` on a NAT-fronted platform, refuses the `external` strategy and explains the failure mode. + +## GCP NAT'd external IPs + +**Symptom**: same as OCI — `Ready=True`, dashboard unreachable, RST/timeout. + +**Mechanism**: GCP VMs reached via Cloud NAT or alias IP mappings exhibit the same 1:1 NAT behaviour as OCI. 
Public IPs from `--external-ip` or via Cloud NAT egress are translated to the VM's internal IP before delivery; same Cilium-externalIPs miss. + +**Fix**: same as OCI — pick internal IPs for `Service.externalIPs`. Phase 4's NAT-provider gate (`intent_hints.platform: gcp-with-nat`) forces internal. + +Exception: GCP VMs with a directly-attached public ephemeral IP (not via Cloud NAT) **may** have the public IP on the interface — verify with `kubectl debug node` + `ip addr show`. When the public IP is present on the host interface, `external` strategy works. When it isn't, use `internal`. + +## AWS Elastic IP / NLB Proxy Protocol + +**Symptom**: TCP-level connectivity works but the dashboard backend logs every request as coming from the NLB's internal address, breaking RBAC and source-IP-based features. Or: HTTP 502 because the backend sees PROXY-protocol header bytes as garbage HTTP. + +**Mechanism**: AWS NLB in proxy-protocol mode prepends a PROXY v1/v2 header to each connection containing the real client IP. Workloads that don't speak proxy-protocol see the header bytes inside the TCP stream and return errors. + +**Fix**: either disable proxy-protocol on the NLB and use NLB's internal source-IP preservation (remove the `service.beta.kubernetes.io/aws-load-balancer-proxy-protocol` annotation from the Service — `"*"` is the value that turns proxy-protocol on), or enable proxy-protocol in the ingress controller (ingress-nginx supports it via `use-proxy-protocol: "true"` in its ConfigMap; cozystack's root-ingress-controller does not by default in v1.3.x — out of scope to enable here). + +EIP-only (no NLB): AWS maps an Elastic IP to the instance's private IP via 1:1 NAT at the internet gateway, so the public address normally does not appear on the host interface — expect the same failure mode as OCI and use the `internal` strategy. `external` strategy only works when the public IP is actually present on the interface. Verify with `ip addr show` on the host. + +## Talos system-extension binaries unavailable outside `ext-*` namespaces + +**Symptom**: `kubectl debug node --image=alpine:3 -- chroot /host zpool create` fails with `relocation error: /lib64/ld-linux-x86-64.so.2: not found` or `/bin/sh: not found`.
+ +**Mechanism**: Talos's host rootfs is musl-statically-linked and contains only the `machined` Go binary at PID 1. System extensions (zfs, drbd, openvswitch userspace) live in dedicated namespaces (`ext-zfs-service`, etc.) with their own glibc rootfs and dependencies. The host rootfs that `kubectl debug ... chroot /host` exposes has neither the glibc loader nor `/bin/sh`. + +**Fix**: see SKILL.md Phase 5.5 (Talos path) and `references/storage-backends.md` (Talos privileged DaemonSet bootstrap pattern). Run a privileged Pod from `ubuntu:24.04`, `apt-get install zfsutils-linux`, bind-mount `/dev/zfs` and the target disk, partition manually with `sgdisk`, then `zpool create`. + +## Pod Security Admission `baseline` blocks `kubectl debug node` + +**Symptom**: `kubectl debug node/$NODE --image=... --profile=sysadmin` fails with `pods "node-debugger-..." is forbidden: violates PodSecurity "baseline:v1.28": host namespaces (hostPID=true), hostPath volumes (volume "host-root"), allowPrivilegeEscalation != false (container "container-00")` — on a default `kube-system` or `default` namespace. + +**Mechanism**: upstream Kubernetes (1.25+) ships Pod Security Admission built in but defaults to `privileged`; distributions that configure a cluster-wide default (Talos sets `enforce: baseline`) apply `baseline` to every namespace without an explicit label override or exemption. The `sysadmin` profile of `kubectl debug` needs `hostPID`, `hostPath`, and `allowPrivilegeEscalation: true` — all forbidden by `baseline`. + +**Fix**: create a dedicated namespace with `pod-security.kubernetes.io/enforce=privileged`: + +```bash +kubectl --context $CTX create ns cozy-storage-bootstrap +kubectl --context $CTX label ns cozy-storage-bootstrap \ + pod-security.kubernetes.io/enforce=privileged --overwrite +``` + +Then run the debug Pod (or any privileged bootstrap workload) in that namespace. Delete the namespace after the bootstrap completes. + +## Talos `talosctl reset` leaves user disks intact + +**Symptom**: re-installing Cozystack on previously-Cozystack'd Talos nodes: `zpool create` fails with `EBUSY` or silently produces a `DEGRADED` pool.
`pvs` shows leftover LVM VG; `dmsetup ls` shows `linstor_data-thinpool-tdata` mappings; `linstor storage-pool list` would have shown them as registered before the reset. + +**Mechanism**: `talosctl reset` wipes the system disk (the one in `machine.install.disk`) but **does not** touch user/data disks by default. Previous-install LINSTOR LVM-thin pool state on `/dev/sdb` survives `talosctl reset` even with the maintenance-mode reset profile. dm-thin devices stay mapped, LVM PV signatures stay readable, `zpool create` then either refuses or creates a degraded pool that loses data on the next reboot. + +**Fix**: before `zpool create`, wipe explicitly: + +```bash +# Inside the privileged bootstrap Pod +vgchange -an # deactivate all LVM VGs +dmsetup ls | grep linstor | awk '{print $1}' | xargs --no-run-if-empty dmsetup remove --force +dd if=/dev/zero of="$DEVICE" bs=1M count=10 # wipe header +sgdisk --zap-all "$DEVICE" +wipefs --all "$DEVICE" +``` + +The skill's Phase 5.5 step 7 (pre-existing-data check) catches this before `zpool create` and refuses to proceed without operator approval of the wipe. + +## Cozystack v1.3.x does not create StorageClasses automatically + +**Symptom**: cluster reaches "all HRs Ready", but every stateful tenant workload sits in `Pending: pod has unbound immediate PersistentVolumeClaims`. `kubectl get storageclass` returns no rows. + +**Mechanism**: in v1.3.x, neither the cozy-installer chart nor the Platform Package emits StorageClasses; they expect the operator to apply them by hand after `linstor storage-pool create`. v1.4+ exposes `tenants.apps.cozystack.io spec.storageClasses` and the operator creates them based on the tenant declaration. + +**Fix**: SKILL.md Phase 8.6 creates `local` (placementCount=1) and `replicated` (placementCount=3, isDefaultClass=true) for v1.3.x. Skip on v1.4+. 
+ +## Cozystack v1.3.3 `isp-full` bundle does not include Keycloak + +**Symptom**: dashboard SSO link fails — operator expected Keycloak admin URL but `cozy-keycloak` namespace is absent. + +**Mechanism**: in v1.3.x the `isp-full` overlay does not enable Keycloak; cozystack's dashboard ships its own self-issued OIDC provider (`cozystack-issuer`). Keycloak is opt-in via a separate bundle. v1.4+ may change this. + +**Fix**: not really a fix — clarification. The dashboard works without Keycloak via the bundled self-issued OIDC. Operators expecting external SSO need to layer Keycloak themselves (out of scope for v1 of `cozystack:cluster-install`). + +## `api.` ingress speaks TCP passthrough, not HTTP + +**Symptom**: `curl https://api./healthz` returns HTTP 401 with a self-signed certificate that browsers reject, even though `dashboard.` returns a valid Let's Encrypt R12 cert. + +**Mechanism**: cozystack's `api.` ingress is a TCP passthrough to `kube-apiserver:6443`. apiserver terminates TLS itself with its own PKI (not cert-manager-issued); the ingress controller only forwards the encrypted bytes. The 401 is apiserver's auth challenge — expected. The self-signed cert is apiserver's, also expected. + +**Fix**: this is by design — the `api.` endpoint is for `kubectl` consumers, not for browsers. Use the apiserver's own kubeconfig CA bundle, not the public-CA chain. Cozystack docs call this out at "after install, access the cluster with `kubectl --kubeconfig `". + +## HelmRelease count varies during install + +**Symptom**: Phase 8 watch loop sees 84 HRs at the 1-minute mark, 86 HRs at the 4-minute mark, 88 HRs at the 8-minute mark. An operator who hard-codes "wait for 88 Ready" may declare success at 84/84 before the missing 4 HRs land. + +**Mechanism**: cozystack's Platform Package is unfolded incrementally by cozystack-operator as dependencies become available. Some HRs only get created after their prerequisite chart is Ready (cascading dependency graph). 
+ +**Fix**: the SKILL.md Phase 8 watch loop polls the HR-not-Ready list dynamically; it does not depend on a fixed expected count. The list-empty condition is the success signal. + +## `kubectl exec linstor` from outside the controller pod requires mTLS client cert + +**Symptom**: from the workstation, `linstor --controllers linstor+ssl://10.0.0.10:3371 storage-pool list` fails with `SSL handshake failed: peer not authenticated`. + +**Mechanism**: LINSTOR speaks mTLS by default in cozystack's deployment. The controller pod's `/etc/linstor/client/` directory has the client certificate; outside the pod, no certificate exists. + +**Fix**: always invoke the CLI inside the controller pod via `kubectl exec`: + +```bash +kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- linstor +``` + +The Phase 8 storage-pool registration block and `references/storage-backends.md` both use this form throughout. + +## HelmRelease cascade warnings ("secret not found", "rolebinding not found") + +**Symptom**: in the first 5–10 minutes of Phase 8 watch loop, `kubectl get events` shows warnings about secrets and rolebindings that don't yet exist. Operators reading the events stream see them as errors. + +**Mechanism**: cozystack's HelmReleases reconcile in parallel; some declare cross-namespace references (Secret consumers, ServiceAccount tokens) that race against the producing HRs. Flux retries with backoff; the warnings disappear once the producing HR completes. + +**Fix**: not a fix — clarify in operator-facing summary. Phase 8 watch-loop output ignores events older than the most recent reconcile attempt. If an operator surfaces a warning, the skill answers "transient cascade — Flux will retry; if still failing after 10 min on the same error, capture diagnostics". 
diff --git a/plugins/cozystack/skills/cluster-install/references/requirements.md b/plugins/cozystack/skills/cluster-install/references/requirements.md new file mode 100644 index 0000000..8456e50 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/requirements.md @@ -0,0 +1,71 @@ +# Cluster requirements matrix + +Source of truth: `https://cozystack.io/docs/v1.3/install/kubernetes/generic/` and `packages/core/installer/values.yaml` in the upstream cozystack monorepo. Pin the doc URL to the minor-version selector (e.g. `v1.3`) that matches `--installer-version`. + +## Kubernetes distribution & version + +| Distribution | Minimum version | Notes | +| ----------- | ----------- | ----------- | +| k3s | v1.32+ | `--flannel-backend=none --disable=traefik,servicelb,local-storage,metrics-server --disable-network-policy --disable-kube-proxy --cluster-domain=cozy.local` | +| kubeadm | v1.28+ | `dnsDomain: cozy.local` in ClusterConfiguration; KubeProxyConfiguration `mode: "none"`; `--skip-phases=addon/kube-proxy` | +| RKE2 | v1.28+ | `cni: none`, `disable: [rke2-ingress-nginx, rke2-metrics-server]`, `cluster-domain: cozy.local`, `disable-kube-proxy: true` | +| Talos | matched by talm | Use `cozystack:cluster-install` only on already-bootstrapped Talos. Bootstrap itself is out of scope. | +| Managed k8s (EKS / GKE / AKS / DOKS) | provider's current default | Hosted variant only — no LINSTOR, no KubeVirt, no Kube-OVN. | + +## Cluster-wide configuration (hard requirements) + +- **Cluster domain**: `cozy.local`. The Package chart enforces this in `networking.clusterDomain` and components assume it. If kube-apiserver was bootstrapped with `cluster.local` (or anything else), `cozystack:cluster-install` must refuse — re-bootstrap of the cluster is the only fix. +- **podCIDR / serviceCIDR**: must match what kube-apiserver and kubelet were started with. Mismatched values silently break service routing. +- **CNI**: none installed. Cozystack ships Cilium + Kube-OVN. 
If any other CNI pod (Calico, Flannel, Weave, AWS VPC CNI) is running in `kube-system`, refuse with explanation. Exception: managed/hosted variant — provider CNI stays. +- **kube-proxy**: disabled. Cilium replaces it. If kube-proxy DaemonSet exists, the install will conflict. +- **Ingress controller**: none installed. Cozystack ships ingress-nginx (`cozy-ingress-nginx`). +- **cert-manager**: none installed. Cozystack ships its own (`cozy-cert-manager`). +- **Storage provisioner**: none installed (for non-hosted). Cozystack ships LINSTOR (piraeus-operator). +- **metrics-server**: none installed. VictoriaMetrics covers metrics. + +## Per-node prerequisites (generic variant — Ubuntu/Debian) + +- OS: Ubuntu 22.04+ or Debian 12+ (kernel 5.x+, systemd). +- Arch: amd64 or arm64. +- Packages installed: `nfs-common`, `open-iscsi`, `multipath-tools` (or distro equivalents). +- Services enabled and running: `iscsid`, `multipathd`. +- Kernel module loaded: `br_netfilter` (persisted via `/etc/modules-load.d/`). +- sysctl values (see `node-checks.md` for the full list and exact thresholds). +- Secure Boot on Ubuntu: pre-install `drbd-dkms` from the LINBIT PPA before deploy — piraeus-operator's in-cluster compile path is rejected by kernel lockdown. See [Ubuntu + Secure Boot](https://cozystack.io/docs/v1.3/install/kubernetes/ubuntu-secure-boot/). + +## Per-node prerequisites for storage (LINSTOR — non-hosted) + +- One unmounted secondary block device per node (≥100 GB recommended; for prod sizes, see hardware requirements doc). +- Multipath blacklist file at `/etc/multipath/conf.d/cozystack-drbd-blacklist.conf` blocking `drbd*` devices. +- ZFS-on-Linux available **or** opt out of ZFS (LVM fallback) on distros where ZFS is unsupported (Rocky/Alma 10, etc.). + +## Hardware (per node, minimum) + +- Control plane: 4+ vCPU, 8+ GiB RAM. +- Worker: 4+ vCPU, 8+ GiB RAM (more if running VMs via KubeVirt). 
+- Node count: minimum 3 for production (1 CP + 2 workers); 1-node k3s sandbox works but no replicated storage. + +## Network + +- All nodes in the same L2 segment (or KubeSpan with RTT < 10 ms). +- LB IP range or external IPs reserved and routable. +- DNS: either real FQDN under operator's control, or `.nip.io` for sandbox. +- L2 anti-spoofing disabled on the upstream switch if MetalLB L2 mode is used. + +## Installer variant ↔ platform variant mapping + +The cozy-installer Helm chart has its own variant (`cozystackOperator.variant`) — this picks how the operator deploys and which extra wiring it expects. The Platform Package CR has its own `spec.variant`. The two must match. + +| Installer variant | Platform variant | When to pick | +| ----------- | ----------- | ----------- | +| `talos` | `isp-full` (or `distro-full`) | Talos Linux nodes — full IaaS + PaaS. | +| `generic` | `isp-full-generic` | kubeadm / k3s / RKE2 — full IaaS + PaaS on generic Linux. Requires `cozystack.apiServerHost` (internal IP of CP node). | +| `hosted` | `isp-hosted` | Managed k8s — PaaS only (no VMs, no LINSTOR). | +| any | `default` | Bare minimum — controller only, no bundles. Power-user / development. | + +## Where this is enforced + +- Chart values defaults: `~/git/github.com/cozystack/cozystack/packages/core/installer/values.yaml`. +- Platform defaults: `~/git/github.com/cozystack/cozystack/packages/core/platform/values.yaml`. +- Variant overlays: `packages/core/platform/values-isp-full*.yaml`, `values-isp-hosted.yaml`. +- Ansible reference: `~/git/github.com/cozystack/ansible-cozystack/roles/cozystack/{defaults,tasks}/main.yml` — read this when in doubt about a value's required form. 
diff --git a/plugins/cozystack/skills/cluster-install/references/storage-backends.md b/plugins/cozystack/skills/cluster-install/references/storage-backends.md new file mode 100644 index 0000000..a99a10f --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/storage-backends.md @@ -0,0 +1,208 @@ +# Storage backend: ZFS (the only supported path) + +Cozystack standardises on ZFS for LINSTOR storage pools. Other backends (LVM, LVM-thin) exist in piraeus-operator's CRD and `linstor physical-storage create-device-pool` supports them, but cozystack documentation and the upstream platform tooling assume ZFS — using anything else means leaving the supported path and dealing with edge cases the project hasn't validated. + +Reference: `https://cozystack.io/docs/next/storage/disk-preparation/`. + +This file is the source of truth for the commands `cozystack:cluster-install` Phase 5.5 runs in `kubectl debug` → `chroot /host` to provision the pool on each storage node. + +## Prerequisites + +- One dedicated unmounted block device per storage node (`/dev/nvme0n1`, `/dev/sdb`, …). Phase 2 lookup surfaces unmounted disks. +- `zfsutils-linux` (Ubuntu/Debian) or `zfs` (Talos via the cozystack-tuned image extension) installed on every storage node. +- Kernel module `zfs` loaded. On Talos this is in the cozystack-tuned image as `siderolabs/zfs:-`. On generic Linux ansible-cozystack's `prepare-ubuntu.yml` handles installation and module load. + +## Default names + +`cozystack:cluster-install` proposes these defaults; operator can override: + +| Slot | Default | Notes | +|---|---|---| +| ZFS pool | `data` | Conventional cozystack name. | +| LINSTOR storage pool | `data` | What `linstor storage-pool list` shows; referenced by every StorageClass `parameters.linstor.csi.linbit.com/storagePool`. | + +Names must be consistent between three places: + +1. The on-disk artefact (`zpool list` shows the pool). +2. The `linstor storage-pool create zfs ` registration. +3. 
Any StorageClass that references the pool. + +## Create command (per storage node, inside `chroot /host`) + +```sh +set -euo pipefail +test -b "$DEVICE" + +# Refuse if the device already has a zpool — operator must wipe first. +if zpool list -H -o name 2>/dev/null | grep -qx "$POOL_NAME"; then + echo "zpool '$POOL_NAME' already exists on this node" >&2 + exit 1 +fi + +# ashift=12 — 4 KiB physical sector alignment; safe for SSD + HDD. +# atime=off, compression=lz4 — cozystack-conventional tuning. +zpool create -o ashift=12 "$POOL_NAME" "$DEVICE" +zfs set compression=lz4 "$POOL_NAME" +zfs set atime=off "$POOL_NAME" +``` + +For multi-disk pools (mirror / RAID-Z), the cozystack docs recommend a single `zpool create` with the vdev layout inline: + +```sh +# Two-way mirror: +zpool create -o ashift=12 "$POOL_NAME" mirror "$DEVICE1" "$DEVICE2" + +# RAID-Z (3+ disks, one parity): +zpool create -o ashift=12 "$POOL_NAME" raidz "$DEVICE1" "$DEVICE2" "$DEVICE3" +``` + +Phase 4 collects the layout per node (single / mirror / raidz) so the skill renders the right `zpool create` invocation. + +## Register the pool with LINSTOR + +The `LinstorSatelliteConfiguration` CRD has **no** `zfsPool` slot — ZFS pools are registered at runtime via the LINSTOR API. `cozystack:cluster-install` runs the registration **inside the Phase 8 watch loop**, gated on `linstor-controller` reporting at least one Ready replica — not on all HRs being Ready, because paas / monitoring HRs that request PVCs depend on the storage pool existing, so an all-HRs-Ready gate would deadlock. See SKILL.md Phase 8 for the implementation, not a forward-reference. The block: + +```bash +yq --output-format=json '.cozystack.storage.nodes' "$STATE_FILE" \ + | jq --compact-output '.[]' \ + | while IFS= read -r entry; do + NODE=$(jq --raw-output '.name' <<<"$entry") + ZPOOL=$(jq --raw-output '.zpool' <<<"$entry") + LINPOOL=$(jq --raw-output '.linstor_pool' <<<"$entry") + # Idempotent: skip if already registered. 
+ if kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list --node "$NODE" --storage-pool "$LINPOOL" \ + --output-version v1 2>/dev/null | grep -q "$LINPOOL"; then + continue + fi + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool create zfs "$NODE" "$LINPOOL" "$ZPOOL" + done +``` + +The `while read -r` form is safer than `for entry in $(...)`: avoids word-splitting on whitespace inside JSON values, preserves quoting, and reads one JSON document per line as `jq -c` emits them. + +The loop iterates every storage-providing node persisted in `state.cozystack.storage.nodes[]`, **not** only control-plane. Phase 5.5 writes that list with the per-node zpool and linstor_pool names. + +## Verify + +```sh +# On the node, after create: +zpool status "$POOL_NAME" +zpool list -H -o name,size,free,health "$POOL_NAME" + +# From the workstation, after LINSTOR registration: +kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list +# Expect one ZFS row per storage node with the chosen name and non-zero Capacity. +``` + +## Backout (only on operator approval) + +The skill never auto-destroys. If operator changes their mind: + +```sh +# Inside chroot /host on the node: +zpool destroy "$POOL_NAME" +wipefs --all "$DEVICE" +``` + +`wipefs` clears the partition signature so a re-run of Phase 5.5 sees the device as truly empty. + +## What about LVM / LVM-thin? + +The piraeus-operator CRD has `lvmPool` and `lvmThinPool` slots and `linstor physical-storage create-device-pool` supports them. Cozystack itself does not validate or document these paths, the platform charts assume ZFS storage classes, and several troubleshooting playbooks in cozystack docs only cover ZFS. 
If an operator insists on LVM, the skill refuses to provision automatically and points at the upstream piraeus-operator docs — they're on their own for the integration. + +## Talos path — privileged Pod bootstrap (the actual mechanism) + +`kubectl debug node/ --image=alpine:3 -- chroot /host zpool create` **does not work on Talos**. Three independent reasons: + +1. The cozystack-tuned Talos image's `zpool` userspace lives at `/usr/local/sbin/zpool` and is glibc-linked, depending on `/lib64/ld-linux-x86-64.so.2`. Talos's host rootfs is musl-statically-linked and has no glibc loader; the loader is only present inside the `ext-zfs-service` system-extension namespace. Running `zpool` in a chroot to the host rootfs gets `relocation error: /lib64/ld-linux-x86-64.so.2: not found`. +2. `chroot /host /bin/sh` fails — `/bin/sh` does not exist in Talos rootfs. Talos's PID 1 is the `machined` Go binary, not a POSIX shell environment. +3. Pod Security Admission rejects the privileged debug Pod that `--profile=sysadmin` creates: Talos's default admission configuration enforces `baseline` on every namespace except `kube-system`, and `kubectl debug` creates its Pod in the current namespace (normally `default`). The pod gets refused before it can even attempt the chroot. + +The real path: a one-shot privileged Pod from `ubuntu:24.04` in a dedicated namespace with PSA `privileged`. The Pod installs `zfsutils-linux` via apt (the userspace ZFS 2.2.x is forward-compatible with the kernel ZFS 2.4.x the cozystack-tuned image ships), mounts the host's `/dev` as a single hostPath volume (which exposes `/dev/zfs` and the chosen target disk without separate mounts), and runs `sgdisk` + `zpool create` directly. 
+ +```bash +# 1) Bootstrap namespace with the right PSA label +kubectl --context $CTX create ns cozy-storage-bootstrap 2>/dev/null || true +kubectl --context $CTX label ns cozy-storage-bootstrap \ + pod-security.kubernetes.io/enforce=privileged --overwrite + +# 2) Pod manifest — pinned to one node, hostPID for partprobe visibility +cat </dev/null | grep -q .; then + echo "DEVICE $DEVICE has an LVM VG; refuse" >&2; exit 1 + fi + if zpool list -H -o name 2>/dev/null | grep -qx "$POOL_NAME"; then + echo "zpool '$POOL_NAME' already exists" >&2; exit 1 + fi + # Talos has no udev inside the pod; partition manually first + sgdisk --zap-all "$DEVICE" + sgdisk --new=1:0:0 --typecode=1:bf01 --change-name=1:cozystack-zpool "$DEVICE" + partprobe "$DEVICE"; sleep 1 + test -b "${DEVICE}1" + zpool create -o ashift=12 "$POOL_NAME" "${DEVICE}1" + zfs set compression=lz4 "$POOL_NAME" + zfs set atime=off "$POOL_NAME" + zpool status "$POOL_NAME" + zpool list -H -o name,size,free,health "$POOL_NAME" + securityContext: + privileged: true + volumeMounts: + - name: dev + mountPath: /dev + # /dev/zfs comes through the /dev mount above — devtmpfs is shared. + # A separate hostPath for the char device would nest inside an + # already-mounted volume and mask one of the two. + volumes: + - name: dev + hostPath: { path: /dev } +EOF + +# 3) Wait for the one-shot pod to terminate (Succeeded), capture logs, clean up. +# Use phase=Succeeded, NOT condition=Ready — Ready is False once a one-shot pod +# terminates, so --for=condition=Ready hangs the full timeout on every successful +# run. 
+kubectl --context $CTX --namespace cozy-storage-bootstrap wait pod/zpool-create-$NODE \ + --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s \ + || { kubectl --context $CTX --namespace cozy-storage-bootstrap describe pod/zpool-create-$NODE + kubectl --context $CTX --namespace cozy-storage-bootstrap logs pod/zpool-create-$NODE + exit 1; } +kubectl --context $CTX --namespace cozy-storage-bootstrap logs pod/zpool-create-$NODE +kubectl --context $CTX --namespace cozy-storage-bootstrap delete pod/zpool-create-$NODE +``` + +After every storage-providing node completes successfully, delete the bootstrap namespace: + +```bash +kubectl --context $CTX delete ns cozy-storage-bootstrap +``` + +### Why not `talosctl` from `nsenter` into `ext-zfs-service` + +Theoretically possible, but `ext-zfs-service` is a service-namespace not designed for ad-hoc command execution, the API isn't stable across Talos minor versions, and the `talosctl read` / `talosctl get` surface only exposes read-only Resource APIs (no shell-out). The privileged Pod path is the supported one. + +### Multi-disk on Talos (mirror / raidz) + +The bootstrap Pod above is single-device only. For mirror or raidz on Talos, the operator extends the pod manifest with additional `volumeMounts` for each `${DEVICE_N}`, runs `sgdisk` per disk, and ends with `zpool create ... mirror|raidz ${DEVICE1}1 ${DEVICE2}1 ${DEVICE3}1`. The skill does not auto-generate this path in v1 — surface as "multi-disk Talos requires manual Pod manifest, see this section for the shape". 
diff --git a/plugins/cozystack/skills/cluster-install/references/values-template.md b/plugins/cozystack/skills/cluster-install/references/values-template.md new file mode 100644 index 0000000..9cd6484 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/values-template.md @@ -0,0 +1,201 @@ +# Canonical values for cozy-installer and the Platform Package + +## Installer chart values + +The cozy-installer Helm release lives in `kube-system` (the chart itself templates `Namespace cozy-system`, so the release secret can't live there). Two keys matter at install time: + +```yaml +cozystackOperator: + variant: generic # or talos / hosted +cozystack: + apiServerHost: "" # see the table below + apiServerPort: "" # see the table below +``` + +`apiServerHost` / `apiServerPort` resolution by variant: + +| Variant | `apiServerHost` | `apiServerPort` | Source | +|---|---|---|---| +| `talos` | `localhost` | `7445` | Talos KubePrism, built into machine-config. Hard-coded in `values-isp-full.yaml` for Cilium. | +| `generic` (default) | `127.0.0.1` | `7445` | extractedprism DaemonSet installed in Phase 5.6, proxies to a healthy CP endpoint. See the [extractedprism](#extractedprism-generic-kube-apiserver-ha) section below. | +| `generic` with `--no-extractedprism` | `` | `6443` | Operator passes `--api-host=`; that IP is a CP, a VIP, or an external LB. Single CP IP is a SPOF — see `known-failures.md`. | +| `hosted` | not set | not set | Managed provider handles kube-apiserver HA. | + +Install command shape: + +```bash +helm --kube-context $CTX upgrade --install cozy-installer \ + oci://ghcr.io/cozystack/cozystack/cozy-installer \ + --version $INSTALLER_VERSION \ + --namespace kube-system \ + --set cozystackOperator.variant=$INSTALLER_VARIANT \ + --set cozystack.apiServerHost=$API_HOST \ + --set cozystack.apiServerPort=$API_PORT \ + --wait --timeout 10m +``` + +If `cozy-system` already exists, the chart refuses with `invalid ownership metadata`. 
Adopt it first: + +```bash +kubectl --context $CTX patch namespace cozy-system --type=merge --patch '{ + "metadata": { + "labels": {"app.kubernetes.io/managed-by": "Helm"}, + "annotations": { + "meta.helm.sh/release-name": "cozy-installer", + "meta.helm.sh/release-namespace": "kube-system" + } + } +}' +``` + +(Skip adoption if the namespace doesn't exist; `--create-namespace` would conflict with the chart's own `Namespace` template, so don't pass it.) + +## Platform Package CR + +The Package is **cluster-scoped** (`cozystack.io/v1alpha1 / Package`). Apply once the operator deployment is Available and the `packages.cozystack.io` CRD is Established. + +```yaml +apiVersion: cozystack.io/v1alpha1 +kind: Package +metadata: + name: cozystack.cozystack-platform +spec: + variant: isp-full-generic # or isp-full / isp-hosted / default + components: + platform: + values: + bundles: + system: + enabled: true + iaas: + enabled: true + paas: + enabled: true + naas: + enabled: true + networking: + podCIDR: "10.244.0.0/16" # cozystack default, from packages/core/platform/values.yaml + podGateway: "10.244.0.1" # first IP of podCIDR + serviceCIDR: "10.96.0.0/16" # cozystack default + joinCIDR: "100.64.0.0/16" # cozystack default + kubeovn: + MASTER_NODES: "" # comma-separated CP IPs; leave empty to let Helm lookup find them + publishing: + host: "example.com" + apiServerEndpoint: "https://api.example.com:6443" + exposedServices: + - api + - dashboard + externalIPs: + - 192.0.2.10 + exposure: externalIPs # or "loadBalancer" +``` + +## extractedprism (generic kube-apiserver HA) + +On the `generic` variant, `cozystack:cluster-install` Phase 5.6 installs the extractedprism DaemonSet **before** the cozy-installer chart so the operator's apiServerHost already resolves to a healthy CP endpoint when cozystack-operator starts dialing. + +Chart: `oci://ghcr.io/lexfrei/charts/extractedprism` (BSD-3-Clause; source `https://github.com/lexfrei/extractedprism`). 
+ +Default values the skill passes via a rendered file under `/extractedprism-values.yaml` (this is the canonical artifact the orchestrator advertises and the sops opt-in encrypts): + +```yaml +# /extractedprism-values.yaml +endpoints: "10.0.0.1:6443,10.0.0.2:6443,10.0.0.3:6443" +``` + +```bash +helm --kube-context $CTX upgrade --install extractedprism \ + oci://ghcr.io/lexfrei/charts/extractedprism \ + --version 0.2.0 \ + --namespace kube-system \ + --values "$CONFIG_DIR/extractedprism-values.yaml" \ + --wait --timeout 5m +``` + +`endpoints` is a **single string scalar** (chart `values.schema.json` `type=string`) holding a comma-separated list of every control-plane node's `:6443`. A YAML list shape (`endpoints: [...]`) is rejected by helm schema validation. The chart's defaults are sane for cozystack: + +- `bindAddress: 127.0.0.1` + `bindPort: 7445` — same shape as Talos KubePrism so `cozystack.apiServerHost=127.0.0.1` / `cozystack.apiServerPort=7445` in the cozy-installer values just works. +- `hostNetwork: true` — pod listens on the node's loopback. +- `priorityClassName: system-node-critical` — survives eviction storms. +- `tolerations: [{operator: Exists}]` — runs on every node regardless of taints; this proxy is critical infra. + +Skip on: + +- `talos` variant (built-in KubePrism at `localhost:7445`). +- `hosted` variant (managed provider handles HA). +- `--no-extractedprism` flag — operator supplied `--api-host=` (single CP, VIP, or external LB) and accepts the trade-off. + +## LINSTOR storage pool registration (ZFS, runtime via CLI) + +Cozystack standardises on ZFS for LINSTOR storage pools. The `LinstorSatelliteConfiguration` CRD does **not** have a `zfsPool` slot — ZFS pools are registered at runtime via the LINSTOR API once `linstor-controller` reaches Ready in Phase 8. There is no declarative-CR path the skill emits up front; instead it queues a per-node `linstor storage-pool create zfs` invocation. 
+ +The skill stores the per-node mapping in `/.state.yaml` under `cozystack.storage.nodes[]` so the Phase-8 hook knows what to register: + +```bash +# Run once linstor-controller is Ready (Phase 8 hook). +# Iterates over every node listed in state.cozystack.storage.nodes[]. +for entry in $STORAGE_NODES; do + node="${entry%%:*}" + zpool="${entry##*:}" + kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool create zfs "$node" data "$zpool" +done +``` + +Here `$STORAGE_NODES` is a whitespace-separated list of `node:zpool` pairs built from `cozystack.storage.nodes[]`, `data` is the LINSTOR pool name, and the zpool name defaults to `data` (collected in Phase 4, overridable per node). This is the minimal shape of the hook; `references/storage-backends.md` carries the idempotent variant that skips pools which are already registered. + +After registration, verify: + +```bash +kubectl --context $CTX --namespace cozy-linstor exec deploy/linstor-controller -- \ + linstor storage-pool list +# Expect one ZFS row per storage node with non-zero Capacity. +``` + +Schema source for the pool types that `LinstorSatelliteConfiguration` does support (for non-ZFS adventurers — not the cozystack path): `~/git/github.com/cozystack/cozystack/packages/system/piraeus-operator-crds/templates/crds.yaml`. + +## CIDR defaults from cozystack source + +Cozystack pins its own platform-level CIDRs in `~/git/github.com/cozystack/cozystack/packages/core/platform/values.yaml`. These are the values the Platform Package CR expects, regardless of the underlying distribution's default Pod / Service CIDRs: + +| Field | Cozystack default | +| ----------- | ----------- | +| `podCIDR` | `10.244.0.0/16` | +| `serviceCIDR` | `10.96.0.0/16` | +| `joinCIDR` | `100.64.0.0/16` | + +The defaults the cluster distribution itself uses are **not** what cozystack wants — cozystack runs Kube-OVN over the host's CNI shape, so the `podCIDR` cozystack expects is the Kube-OVN-managed range, not the distro's. Specifically: + +- k3s defaults to `10.42.0.0/16 / 10.43.0.0/16`, but on a cozystack install Kube-OVN overlays `10.244.0.0/16 / 10.96.0.0/16` on top — the distro defaults become irrelevant. 
+- kubeadm defaults to `10.244.0.0/16 / 10.96.0.0/16` (coincidentally the same as cozystack's). +- RKE2 defaults to `10.42.0.0/16 / 10.43.0.0/16` — same story as k3s. + +For cozystack purposes always use cozystack's defaults. Change only if they overlap with host routing or another in-network range; on conflict, pick non-overlapping `/16`s and document. + +`wizard` Phase 4 reads the canonical values from `packages/core/platform/values.yaml` (see `wizard/SKILL.md` for the resolution order) and surfaces the source path alongside the value so the operator knows which file informed the default. + +## When the user has no domain + +Use `nip.io` dash notation: if the LB IP is `192.0.2.10`, set `publishing.host: "192-0-2-10.nip.io"`. Every subdomain resolves to that IP without DNS provisioning. Spell this out — click-ops users don't always know the trick. + +## After Package apply + +If `system` bundle is on and `cozystack_tenant_root_ingress` semantics are desired, patch the root tenant after the operator creates it: + +```bash +kubectl --context $CTX wait tenants.apps.cozystack.io/root --namespace tenant-root \ + --for=jsonpath='{.metadata.name}'=root --timeout=300s +kubectl --context $CTX --namespace tenant-root patch tenants.apps.cozystack.io root \ + --type=merge --patch '{"spec":{"ingress":true}}' +``` + +This is what creates the `IngressClass` and brings up `ingress-nginx`. + +## Per-variant overlay files in upstream + +- `~/git/github.com/cozystack/cozystack/packages/core/platform/values-isp-full.yaml` +- `~/git/github.com/cozystack/cozystack/packages/core/platform/values-isp-full-generic.yaml` +- `~/git/github.com/cozystack/cozystack/packages/core/platform/values-isp-hosted.yaml` + +These describe **only** the bundle deltas. Use the merged values from `packages/core/platform/values.yaml` plus the variant overlay as the baseline for what the user is editing in Phase 4. 
diff --git a/plugins/cozystack/skills/cluster-install/references/variants.md b/plugins/cozystack/skills/cluster-install/references/variants.md new file mode 100644 index 0000000..0f671b0 --- /dev/null +++ b/plugins/cozystack/skills/cluster-install/references/variants.md @@ -0,0 +1,58 @@ +# Variant picker + +Two variant axes: + +1. **Installer variant** — `cozystackOperator.variant` in the cozy-installer Helm chart. Picks how the controller wires itself to the cluster. +2. **Platform variant** — `spec.variant` in the `cozystack.cozystack-platform` Package CR. Picks which bundle of system/IaaS/PaaS components is rendered. + +They must match per the table in `requirements.md`. + +## What each variant gives you + +### `talos` / `isp-full` +Full Cozystack stack on Talos Linux: Cilium + Kube-OVN, LINSTOR storage, KubeVirt, Cluster API, full PaaS (databases, applications), monitoring. The platform owns the OS too — Talos machine-config carries the kernel modules, sysctl, services. `cozystack:cluster-install` only deploys onto an already-bootstrapped Talos cluster — it doesn't bootstrap Talos itself. + +### `generic` / `isp-full-generic` +Same stack as `isp-full`, but for generic Linux (kubeadm / k3s / RKE2). Requires the user to have prepared the OS (kernel modules, sysctl, iscsid, multipathd) themselves. `cozystack:cluster-install` checks for OS readiness via `kubectl debug node` and refuses if anything is missing. + +Requires the internal IP of the API server (`cozystack.apiServerHost`) — the operator needs to reach kube-apiserver to read state for things like KubeOVN's `MASTER_NODES` lookup. + +### `hosted` / `isp-hosted` +PaaS-only on top of a managed Kubernetes (EKS / GKE / AKS / DOKS) or any vanilla cluster where Cozystack should not manage networking/storage/VMs. No Cilium override, no Kube-OVN, no LINSTOR, no KubeVirt. Bundles: `paas` and `naas` on by default; `system` and `iaas` off. + +### `default` +Bare minimum — operator + PackageSource + reconciler. 
No bundles enabled. The operator can later be steered by hand-rolled Package CRs. Power-user territory; rarely the right pick for click-ops. + +## Recommendation logic for `cozystack:cluster-install` + +Drive it off cluster lookups in this order: + +1. Any node has `feature.node.kubernetes.io/system-os_release.ID=talos` or `nodeInfo.osImage` starts with `Talos`? + → recommend `talos` + `isp-full`. +2. Provider label on any node (`eks.amazonaws.com/*`, `gke.io/*`, `aks.io/*`, `kubernetes.azure.com/cluster`, `node.kubernetes.io/instance-type` matches a known managed pattern), or apiserver hostname looks managed? + → recommend `hosted` + `isp-hosted`. +3. Any node has `nodeInfo.kubeletVersion` ending in `+k3s*`, `+rke2*`, or `osImage` matches Ubuntu/Debian/RHEL family? + → recommend `generic` + `isp-full-generic`. +4. Otherwise: + → recommend `default`, but warn that the user is on their own. + +Surface the chosen recommendation as the first option in the AskUserQuestion. Always allow override. + +## Bundles inside a variant + +Independently of variant, bundles are flags: + +| Bundle | Default for variant | Contains | +| ----------- | ----------- | ----------- | +| `system` | on for `isp-full*`, off for `isp-hosted` | Cilium, Kube-OVN, LINSTOR, ingress-nginx, cert-manager, KubeVirt — the platform infra. | +| `iaas` | on for `isp-full*`, off for `isp-hosted` | Cluster API, Kamaji (managed k8s for tenants), VM provisioning. | +| `paas` | on everywhere | MariaDB / PostgreSQL / Redis / RabbitMQ / Kafka / Grafana / VictoriaMetrics operators. | +| `naas` | on everywhere | Network-as-a-Service for tenants. | + +`enabledPackages` / `disabledPackages` override individual packages inside whichever bundle they belong to. + +## Source of truth + +- `~/git/github.com/cozystack/cozystack/packages/core/installer/values.yaml` — variant key for the operator. 
+- `~/git/github.com/cozystack/cozystack/packages/core/platform/values-isp-full.yaml`, `values-isp-full-generic.yaml`, `values-isp-hosted.yaml` — what each platform variant turns on. +- `https://cozystack.io/docs/v1.3/install/kubernetes/generic/` — canonical generic flow. diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/SKILL.md b/plugins/cozystack/skills/cluster-upgrade/SKILL.md similarity index 65% rename from skills/cozystack-upgrade/skills/cozystack-upgrade/SKILL.md rename to plugins/cozystack/skills/cluster-upgrade/SKILL.md index aa7124e..863f48c 100644 --- a/skills/cozystack-upgrade/skills/cozystack-upgrade/SKILL.md +++ b/plugins/cozystack/skills/cluster-upgrade/SKILL.md @@ -1,16 +1,24 @@ --- -name: cozystack-upgrade +name: cluster-upgrade description: Use when upgrading a running Cozystack v1.x cluster to a newer v1.x patch or minor version. Not for v0.x → v1.0 major migration. --- -# Cozystack Upgrade +# cozystack:cluster-upgrade Guided upgrade of a running Cozystack v1.x cluster. Source of truth: `https://cozystack.io/docs//operations/cluster/upgrade/` (substitute the version selector value matching your target — e.g. `v1.3`). -## Core principle +## Core principles **Release-notes-driven upgrade.** Every release changes a specific set of components. Generic "all green" checks miss regressions in the areas that actually changed. Read the target's release notes, extract the change list, and run targeted pre/post checks — not just the default health checklist. +**Match the operator's natural language.** Detect from prior conversation messages (or read `/.state.yaml` `operator_language` if a wizard chain is in progress). Use it in every prompt, AskUserQuestion option, summary, and gate. Never ask "what language?" separately. Code identifiers, commands, file paths, and GitHub-public text stay in their canonical form. 
+ +**One valid path → just do it.** After the operator has approved the risk summary at the upgrade stop gate, the skill runs `helm upgrade` + post-upgrade health checks + tenant verification back-to-back. Approval gates remain only for the named STOP GATEs in the workflow (risk summary, helm upgrade itself, final report) and for any rollback action (always destructive-risky). + +**Front-load the interview.** Read the cluster + release notes + pre-flight checks up front and surface a single risk-summary screen with the upgrade plan, change-list, and any blockers. Approval of that screen is the only interview gate — subsequent phases (helm upgrade, post-upgrade checks) ask no further interview questions, but the named STOP GATEs and the destructive-action approvals listed in this document still require their own explicit approval. + +**Layer-pure operator output.** The skill never says "returning control to wizard" or any other orchestration commentary in the **operator-facing** summary. Whoever invoked the skill (a human directly, or a chain orchestrator) figures out what's next on their own. Internal documentation references are fine; `wizard` does not appear in text shown to the operator. + ## Workflow ```dot @@ -42,14 +50,22 @@ Request explicit user approval before each. Prior approval does NOT carry forward 3. **Deleting any resource** (old ConfigMaps, orphan HRs, helm release secrets). 4. **Patching `Tenant`, `TenantControlPlane`, or `sh.helm.release.v1.*` secrets** — see `references/known-failures.md` for why. +## Guardrails + +- NEVER run `kubectl` or `helm` without `--context $CTX` / `--kube-context $CTX`. Operators commonly have prod + staging configured in the same kubeconfig and `current-context` can flip between sessions. Bare commands silently target the wrong cluster. Step 0 pins `$CTX` once; every subsequent invocation reuses it. +- NEVER skip Step 1 release-notes analysis — generic "all green" doesn't catch regressions in changed areas. +- NEVER use `helm upgrade --reuse-values` for the cozy-installer chart — see Step 5 for why. 
+ ## Steps -### Step 0 — Identify versions +### Step 0 — Pin the context, then identify versions + +Pin `$CTX` to the cluster being upgraded before any other command. Read from `/.state.yaml` `cluster.context` when a wizard chain is in progress; otherwise show `kubectl config current-context` and ask the operator to confirm or pick a different one. Every subsequent `kubectl` and `helm` invocation in this skill passes `--context $CTX` / `--kube-context $CTX` explicitly — bare commands are forbidden, see Guardrails. ```bash -kubectl -n cozy-system get deployment cozystack-operator \ +kubectl --context $CTX --namespace cozy-system get deployment cozystack-operator \ -o jsonpath='{.spec.template.spec.containers[0].image}{"\n"}' -kubectl get packages.cozystack.io cozystack.cozystack-platform -o yaml # bundle + values +kubectl --context $CTX get packages.cozystack.io cozystack.cozystack-platform -o yaml # bundle + values ``` If cluster is on v0.x, stop — this skill does not cover the v0→v1 migration. @@ -87,14 +103,14 @@ Show user: current→target version, change-risk summary from Step 1, pre-flight Always run. Missing these annotations can delete `cozy-system` on upgrade. 
```bash -kubectl annotate namespace cozy-system helm.sh/resource-policy=keep --overwrite -kubectl annotate configmap -n cozy-system cozystack-version helm.sh/resource-policy=keep --overwrite +kubectl --context $CTX annotate namespace cozy-system helm.sh/resource-policy=keep --overwrite +kubectl --context $CTX --namespace cozy-system annotate configmap cozystack-version helm.sh/resource-policy=keep --overwrite ``` ### Step 5 — Upgrade ```bash -helm upgrade cozystack oci://ghcr.io/cozystack/cozystack/cozy-installer \ +helm --kube-context $CTX upgrade cozystack oci://ghcr.io/cozystack/cozystack/cozy-installer \ --version X.Y.Z \ --namespace cozy-system ``` @@ -102,14 +118,14 @@ helm upgrade cozystack oci://ghcr.io/cozystack/cozystack/cozy-installer \ **Do not use `--reuse-values`.** The `cozy-installer` chart pins the platform OCI repository in its default values; reusing old values would point the new operator at old package versions. Inspect the currently-installed user-overrides first, then re-apply only the ones the user actually set, explicitly with `--set` (e.g. `--set disableTelemetry=true`): ```bash -helm get values cozystack -n cozy-system +helm --kube-context $CTX get values cozystack --namespace cozy-system ``` ### Step 6 — Monitor ```bash -kubectl logs -n cozy-system deploy/cozystack-operator -f -bash /hack/check-readiness.sh -w 10 +kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator --follow +bash /hack/check-readiness.sh -w 10 --context $CTX ``` Expect: operator pod Running, `cozystack.cozystack-platform` Package `Ready=True`, all HRs converge within a few minutes (tenant charts may take longer). @@ -126,7 +142,7 @@ Show user: result (success/partial/failed), before→after version, HR/Package t ## Rollback -`helm rollback cozystack -n cozy-system` is possible but has caveats (data migrations don't reverse). Before rolling back, snapshot: `kubectl get packages.cozystack.io -A -o yaml > pre-rollback.yaml`. 
+`helm --kube-context $CTX rollback cozystack --namespace cozy-system` is possible but has caveats (data migrations don't reverse). Before rolling back, snapshot: `kubectl --context $CTX get packages.cozystack.io -A -o yaml > pre-rollback.yaml`. **Details, caveats, when not to roll back:** read `references/rollback.md`. @@ -142,7 +158,7 @@ High-blast-radius stuck states — stuck helm `uninstalling`, Kamaji datastore c |---|---| | `Package.Ready=False, ValidationFailed` | Release tightened `values.schema.json`; fix Package before proceeding | | HR `Ready=False, ExternalArtifact ... not found` | App removed in target version → orphan HR (known-failures #6) | -| `cozystack-operator` in CrashLoopBackOff | Stale CRD / RBAC; `kubectl logs -n cozy-system deploy/cozystack-operator --previous` | +| `cozystack-operator` in CrashLoopBackOff | Stale CRD / RBAC; `kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator --previous` | | HR `UninstallFailed, failed to delete release` | Stuck helm history (known-failures #1) | | TCP `INSTALLED VERSION` diverges from `VERSION` | Kamaji upgrade stuck (known-failures #4) | | `cozy-system` namespace gone | Missing `helm.sh/resource-policy=keep` (known-failures #7); restore from backup | diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/known-failures.md b/plugins/cozystack/skills/cluster-upgrade/references/known-failures.md similarity index 83% rename from skills/cozystack-upgrade/skills/cozystack-upgrade/references/known-failures.md rename to plugins/cozystack/skills/cluster-upgrade/references/known-failures.md index 1755351..675412a 100644 --- a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/known-failures.md +++ b/plugins/cozystack/skills/cluster-upgrade/references/known-failures.md @@ -25,13 +25,13 @@ Flux helm-controller attempted to uninstall the release (often because the paren flux suspend hr -n # 2. 
Delete the stuck version secret (the one with status=uninstalling) -kubectl -n delete secret sh.helm.release.v1..vN +kubectl --context $CTX -n delete secret sh.helm.release.v1..vN # 3. Relabel the previous version as deployed (find it: status was "superseded") -kubectl -n label secret sh.helm.release.v1..vM status=deployed --overwrite +kubectl --context $CTX -n label secret sh.helm.release.v1..vM status=deployed --overwrite # 4. If any intermediate "failed" versions exist, relabel them to "superseded" -kubectl -n label secret sh.helm.release.v1..vK status=superseded --overwrite +kubectl --context $CTX -n label secret sh.helm.release.v1..vK status=superseded --overwrite # 5. Resume and force reconcile flux resume hr -n @@ -64,23 +64,23 @@ If `Tenant/root.spec.etcd` is set to `false` (manually or via Package values dri ```bash # 1. Confirm the cause: Tenant/root has etcd: false -kubectl -n tenant-root get tenants.apps.cozystack.io root -o jsonpath='{.spec}{"\n"}' -kubectl -n tenant-root get hr etcd # NotFound or stuck uninstalling +kubectl --context $CTX -n tenant-root get tenants.apps.cozystack.io root -o jsonpath='{.spec}{"\n"}' +kubectl --context $CTX -n tenant-root get hr etcd # NotFound or stuck uninstalling # 2. Preserve data: verify etcd PVCs still exist (they should) -kubectl -n tenant-root get pvc | grep etcd # data-etcd-{0,1,2} +kubectl --context $CTX -n tenant-root get pvc | grep etcd # data-etcd-{0,1,2} # 3. If HR is stuck uninstalling, recover it first (see #1 above), then delete it cleanly # If the HR is NotFound, proceed: # 4. Patch Tenant to re-enable etcd -kubectl -n tenant-root patch tenants.apps.cozystack.io root --type=merge -p '{"spec":{"etcd":true}}' +kubectl --context $CTX -n tenant-root patch tenants.apps.cozystack.io root --type=merge -p '{"spec":{"etcd":true}}' # 5. Force reconcile the tenant HR so it regenerates the etcd HR flux reconcile hr tenant-root -n tenant-root --force # 6. 
Wait for etcd StatefulSet to be 3/3 ready (new pods rejoin existing PVC data) -kubectl -n tenant-root get sts etcd -w +kubectl --context $CTX -n tenant-root get sts etcd -w ``` **Follow-up:** after etcd is back, proceed to #3 below (datastore cert regeneration), because etcd reinstall regenerated the CA. @@ -110,17 +110,17 @@ tcp_ns= tcp= # 1. Delete the stale datastore-certificate secret so Kamaji regenerates it -kubectl -n "$tcp_ns" delete secret "${tcp}-datastore-certificate" +kubectl --context $CTX -n "$tcp_ns" delete secret "${tcp}-datastore-certificate" # 2. Trigger Kamaji reconcile -kubectl -n "$tcp_ns" annotate tenantcontrolplane "$tcp" \ +kubectl --context $CTX -n "$tcp_ns" annotate tenantcontrolplane "$tcp" \ kamaji.clastix.io/trigger-reconcile="$(date +%s)" --overwrite # 3. Verify the secret is regenerated with a recent creationTimestamp -kubectl -n "$tcp_ns" get secret "${tcp}-datastore-certificate" -o jsonpath='{.metadata.creationTimestamp}{"\n"}' +kubectl --context $CTX -n "$tcp_ns" get secret "${tcp}-datastore-certificate" -o jsonpath='{.metadata.creationTimestamp}{"\n"}' # 4. Restart tenant apiserver pods to pick up the new cert -kubectl -n "$tcp_ns" delete pod -l kamaji.clastix.io/name="$tcp" +kubectl --context $CTX -n "$tcp_ns" delete pod -l kamaji.clastix.io/name="$tcp" ``` If Kamaji does NOT regenerate the secret, likely case is #4 (upgrade phase blocking reconciliation). @@ -150,18 +150,18 @@ Break the chicken-and-egg by matching spec to installed version temporarily: ```bash # 1. Downgrade spec.kubernetes.version to match the installed version -kubectl -n patch tenantcontrolplane --type=merge \ +kubectl --context $CTX -n patch tenantcontrolplane --type=merge \ -p '{"spec":{"kubernetes":{"version":"v1.33.0"}}}' # match INSTALLED VERSION # 2. Wait for other phases to complete (datastore cert regen, etc.) 
-kubectl -n get pods -l kamaji.clastix.io/name= -w # 4/4 Running +kubectl --context $CTX -n get pods -l kamaji.clastix.io/name= -w # 4/4 Running # 3. Once tenant apiserver is up, restore original version -kubectl -n patch tenantcontrolplane --type=merge \ +kubectl --context $CTX -n patch tenantcontrolplane --type=merge \ -p '{"spec":{"kubernetes":{"version":"v1.33.8"}}}' # target version # 4. Watch Kamaji perform the upgrade -kubectl -n get tenantcontrolplane -w +kubectl --context $CTX -n get tenantcontrolplane -w ``` ## 5. Flux HR stalled `MissingRollbackTarget` @@ -209,12 +209,12 @@ The ApplicationDefinition / package was removed in the current Cozystack version ```bash # 1. Check for live workloads / user data behind the orphan -kubectl -n get pvc,pv -o wide | grep # preserve if data exists -kubectl -n get all -l app.kubernetes.io/name= +kubectl --context $CTX -n get pvc,pv -o wide | grep # preserve if data exists +kubectl --context $CTX -n get all -l app.kubernetes.io/name= # 2. Suspend the HR and delete it (PVCs retained) flux suspend hr -n -kubectl -n delete hr +kubectl --context $CTX -n delete hr # 3. PVCs remain so user data is preserved. Communicate this to the user: # "orphan HR deleted; PVC preserved for data recovery" @@ -237,8 +237,8 @@ The `helm.sh/resource-policy=keep` annotation was missing on the namespace and/o Always run Step 4 of the main skill: ```bash -kubectl annotate namespace cozy-system helm.sh/resource-policy=keep --overwrite -kubectl annotate configmap -n cozy-system cozystack-version helm.sh/resource-policy=keep --overwrite +kubectl --context $CTX annotate namespace cozy-system helm.sh/resource-policy=keep --overwrite +kubectl --context $CTX annotate configmap -n cozy-system cozystack-version helm.sh/resource-policy=keep --overwrite ``` ### Recovery @@ -247,7 +247,7 @@ Restore from backup. There is no clean in-cluster recovery for a deleted `cozy-s 1. 
Back up any remaining Package/PackageSource/HR specs you can still extract: ```bash - kubectl get packages.cozystack.io,packagesources.cozystack.io,hr -A -o yaml > rescue.yaml + kubectl --context $CTX get packages.cozystack.io,packagesources.cozystack.io,hr -A -o yaml > rescue.yaml ``` 2. Re-install cozystack-operator fresh (follows the bootstrap procedure, not the upgrade one). 3. Re-apply the Platform Package from rescue.yaml (manual review required; CRD schemas may have moved). diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/post-upgrade-checks.md b/plugins/cozystack/skills/cluster-upgrade/references/post-upgrade-checks.md similarity index 84% rename from skills/cozystack-upgrade/skills/cozystack-upgrade/references/post-upgrade-checks.md rename to plugins/cozystack/skills/cluster-upgrade/references/post-upgrade-checks.md index 6bd2810..896fe53 100644 --- a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/post-upgrade-checks.md +++ b/plugins/cozystack/skills/cluster-upgrade/references/post-upgrade-checks.md @@ -6,23 +6,23 @@ Read this when on Step 7. Run **general** checks always; add **targeted** checks ```bash # 1. Versions match target -kubectl -n cozy-system get deployment cozystack-operator \ +kubectl --context $CTX -n cozy-system get deployment cozystack-operator \ -o jsonpath='{.spec.template.spec.containers[0].image}{"\n"}' -kubectl -n cozy-system get configmap cozystack-version -o yaml 2>/dev/null +kubectl --context $CTX -n cozy-system get configmap cozystack-version -o yaml 2>/dev/null # 2. 
Full readiness (must be all-green) bash /hack/check-readiness.sh -kubectl get hr -A | grep -v True # empty -kubectl get packages.cozystack.io -A | grep -v True # empty -kubectl get pods -A --no-headers | awk '$4!="Running" && $4!="Completed"' # empty +kubectl --context $CTX get hr -A | grep -v True # empty +kubectl --context $CTX get packages.cozystack.io -A | grep -v True # empty +kubectl --context $CTX get pods -A --no-headers | awk '$4!="Running" && $4!="Completed"' # empty # 3. No new suspensions introduced by upgrade for k in helmreleases kustomizations; do - kubectl get "$k" -A -o json | jq -r ".items[] | select(.spec.suspend==true) | \"$k: \(.metadata.namespace)/\(.metadata.name)\"" + kubectl --context $CTX get "$k" -A -o json | jq -r ".items[] | select(.spec.suspend==true) | \"$k: \(.metadata.namespace)/\(.metadata.name)\"" done # 4. Tenant control planes reachable, versions match -kubectl get tenantcontrolplane -A +kubectl --context $CTX get tenantcontrolplane -A # 5. Storage backend still healthy alias linstor='kubectl exec -n cozy-linstor deploy/linstor-controller -ti -- linstor' @@ -62,7 +62,7 @@ for tcp_ns_name in $(kubectl get tenantcontrolplane -A -o jsonpath='{range .item ns="${tcp_ns_name%/*}"; name="${tcp_ns_name#*/}" echo "=== $ns/$name ===" tmp_kubeconfig=$(mktemp) - kubectl -n "$ns" get secret "${name}-admin-kubeconfig" -o jsonpath='{.data.admin\.conf}' | base64 -d > "$tmp_kubeconfig" + kubectl --context $CTX -n "$ns" get secret "${name}-admin-kubeconfig" -o jsonpath='{.data.admin\.conf}' | base64 -d > "$tmp_kubeconfig" KUBECONFIG="$tmp_kubeconfig" kubectl get nodes KUBECONFIG="$tmp_kubeconfig" kubectl get pods -A --no-headers | awk '$4!="Running" && $4!="Completed"' rm "$tmp_kubeconfig" diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/preflight-checks.md b/plugins/cozystack/skills/cluster-upgrade/references/preflight-checks.md similarity index 64% rename from 
skills/cozystack-upgrade/skills/cozystack-upgrade/references/preflight-checks.md rename to plugins/cozystack/skills/cluster-upgrade/references/preflight-checks.md index fd88001..105f537 100644 --- a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/preflight-checks.md +++ b/plugins/cozystack/skills/cluster-upgrade/references/preflight-checks.md @@ -6,8 +6,8 @@ Read this when on Step 2. Run every check. Any `False`, stuck, or suspended reso ```bash echo "KUBECONFIG: $KUBECONFIG" # confirm with user before proceeding -kubectl cluster-info -kubectl get nodes # all Ready, versions consistent +kubectl --context $CTX cluster-info +kubectl --context $CTX get nodes # all Ready, versions consistent ``` ## Cozystack readiness @@ -20,10 +20,10 @@ bash /hack/check-readiness.sh If you don't have the repo locally, the equivalent inline: ```bash -kubectl get packages.cozystack.io -A | grep -v True -kubectl get artifactgenerators.source.extensions.fluxcd.io -A | grep -v True -kubectl get externalartifacts.source.toolkit.fluxcd.io -A | grep -v True -kubectl get hr -A | grep -v True +kubectl --context $CTX get packages.cozystack.io -A | grep -v True +kubectl --context $CTX get artifactgenerators.source.extensions.fluxcd.io -A | grep -v True +kubectl --context $CTX get externalartifacts.source.toolkit.fluxcd.io -A | grep -v True +kubectl --context $CTX get hr -A | grep -v True ``` Any output = blocker. @@ -34,7 +34,7 @@ Suspended HRs will NOT apply the new chart on upgrade. 
Un-suspending is usually ```bash for k in helmreleases kustomizations gitrepositories helmrepositories ocirepositories buckets artifactgenerators; do - kubectl get "$k" -A -o json 2>/dev/null | \ + kubectl --context $CTX get "$k" -A -o json 2>/dev/null | \ jq -r ".items[] | select(.spec.suspend==true) | \"$k: \(.metadata.namespace)/\(.metadata.name)\"" done ``` @@ -42,9 +42,9 @@ done ## Workload health ```bash -kubectl get pods -A --no-headers | awk '$4!="Running" && $4!="Completed"' -kubectl get tenantcontrolplane -A # STATUS=Ready, VERSION==INSTALLED VERSION (no version drift) -kubectl get tenants.apps.cozystack.io -A # READY=True +kubectl --context $CTX get pods -A --no-headers | awk '$4!="Running" && $4!="Completed"' +kubectl --context $CTX get tenantcontrolplane -A # STATUS=Ready, VERSION==INSTALLED VERSION (no version drift) +kubectl --context $CTX get tenants.apps.cozystack.io -A # READY=True ``` ## Storage — LINSTOR @@ -63,17 +63,17 @@ alias ovn-appctl='kubectl -n cozy-kubeovn exec deploy/ovn-central -c ovn-central ovn-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound | grep -E "Role|Status" ovn-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound | grep -E "Role|Status" # Server count must match control-plane node count, no duplicate IPs -kubectl get node -o wide -l node-role.kubernetes.io/control-plane= +kubectl --context $CTX get node -o wide -l node-role.kubernetes.io/control-plane= ``` ## etcd (if tenants use kamaji with etcd datastore) ```bash -kubectl -n tenant-root get hr etcd # Ready=True -kubectl -n tenant-root get sts etcd # 3/3 Ready -kubectl get datastores.kamaji.clastix.io # endpoints resolve +kubectl --context $CTX -n tenant-root get hr etcd # Ready=True +kubectl --context $CTX -n tenant-root get sts etcd # 3/3 Ready +kubectl --context $CTX get datastores.kamaji.clastix.io # endpoints resolve # DNS resolution of etcd service from inside cluster: -kubectl -n tenant-root get svc etcd +kubectl --context $CTX -n 
tenant-root get svc etcd ``` If etcd is gone or HR is stuck, see known-failures.md #1 (stuck uninstall) and #2 (apiserver crashloop). @@ -83,7 +83,7 @@ If etcd is gone or HR is stuck, see known-failures.md #1 (stuck uninstall) and # Upgrade may bump resource limits. Make sure tenant namespaces aren't already at quota. ```bash -kubectl get resourcequota -A +kubectl --context $CTX get resourcequota -A ``` ## When to stop diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/release-notes-analysis.md b/plugins/cozystack/skills/cluster-upgrade/references/release-notes-analysis.md similarity index 100% rename from skills/cozystack-upgrade/skills/cozystack-upgrade/references/release-notes-analysis.md rename to plugins/cozystack/skills/cluster-upgrade/references/release-notes-analysis.md diff --git a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/rollback.md b/plugins/cozystack/skills/cluster-upgrade/references/rollback.md similarity index 71% rename from skills/cozystack-upgrade/skills/cozystack-upgrade/references/rollback.md rename to plugins/cozystack/skills/cluster-upgrade/references/rollback.md index 97ccb42..b1bb335 100644 --- a/skills/cozystack-upgrade/skills/cozystack-upgrade/references/rollback.md +++ b/plugins/cozystack/skills/cluster-upgrade/references/rollback.md @@ -21,28 +21,28 @@ Always capture current state first: ```bash mkdir -p pre-rollback-backup -kubectl get packages.cozystack.io -A -o yaml > pre-rollback-backup/packages.yaml -kubectl get packagesources.cozystack.io -A -o yaml > pre-rollback-backup/packagesources.yaml -kubectl get hr -A -o yaml > pre-rollback-backup/helmreleases.yaml -kubectl get configmap -n cozy-system -o yaml > pre-rollback-backup/cozy-system-cms.yaml -helm history cozystack -n cozy-system > pre-rollback-backup/helm-history.txt +kubectl --context $CTX get packages.cozystack.io -A -o yaml > pre-rollback-backup/packages.yaml +kubectl --context $CTX get packagesources.cozystack.io -A -o yaml > 
pre-rollback-backup/packagesources.yaml +kubectl --context $CTX get hr -A -o yaml > pre-rollback-backup/helmreleases.yaml +kubectl --context $CTX get configmap -n cozy-system -o yaml > pre-rollback-backup/cozy-system-cms.yaml +helm --kube-context $CTX history cozystack -n cozy-system > pre-rollback-backup/helm-history.txt ``` ## Rollback commands ```bash # 1. Identify target revision -helm history cozystack -n cozy-system +helm --kube-context $CTX history cozystack -n cozy-system # 2. Show user the target revision's chart version + revision number # STOP GATE — explicit user approval required # 3. Rollback -helm rollback cozystack -n cozy-system +helm --kube-context $CTX rollback cozystack -n cozy-system # 4. Monitor operator reconciliation -kubectl logs -n cozy-system deploy/cozystack-operator -f -kubectl get hr -A | grep -v True +kubectl --context $CTX logs -n cozy-system deploy/cozystack-operator -f +kubectl --context $CTX get hr -A | grep -v True ``` ## Post-rollback verification @@ -51,12 +51,12 @@ Same checks as post-upgrade (see `post-upgrade-checks.md`), with one addition: * ```bash # Compare versions -kubectl -n cozy-system get deployment cozystack-operator \ +kubectl --context $CTX -n cozy-system get deployment cozystack-operator \ -o jsonpath='{.spec.template.spec.containers[0].image}{"\n"}' # Should match the previous running version (before the failed upgrade) # Compare HR count -kubectl get hr -A --no-headers | wc -l +kubectl --context $CTX get hr -A --no-headers | wc -l # Should match the pre-upgrade HR count — sudden drop indicates HRs weren't restored ``` diff --git a/plugins/cozystack/skills/debug/SKILL.md b/plugins/cozystack/skills/debug/SKILL.md new file mode 100644 index 0000000..621d5aa --- /dev/null +++ b/plugins/cozystack/skills/debug/SKILL.md @@ -0,0 +1,238 @@ +--- +name: debug +description: Investigate and resolve a stuck or broken Cozystack install. 
Gathers symptoms via kubectl, classifies the failure (operator error / config drift / upstream bug / not-yet-supported), looks up the relevant cozystack docs to verify the operator did the right thing, searches the cozystack monorepo source for the failure path, applies a local fix or workaround when one exists, and on operator approval drafts an upstream issue with the diagnostic bundle. Never opens PRs and never opens issues silently — drafts get an explicit yes/no per filing. The `cozystack:wizard` auto-dispatches this skill whenever a downstream skill in the chain reports `failed_at` in `.state.yaml`. Also callable directly when an already-running cluster develops a problem. +argument-hint: "[--config-dir=] [--context=] [--target=] [--no-issue]" +--- + +# cozystack:debug + +Work in reasoning mode. Use the phrasing `cozystack:debug`. Announce phase transitions: `cozystack:debug Phase N — `. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (or read from `/.state.yaml` `operator_language` when the wizard chain is in progress). Code identifiers, commands, file paths, and any text destined for GitHub stay canonical. + +## What this skill does, in one sentence + +When something in a Cozystack install is broken or stuck, find out *what*, classify *whose fault*, fix what's fixable locally, and on approval prepare a clean upstream issue for what isn't. + +## Two invocation paths + +- **From `cozystack:wizard`** — automatically dispatched when any chain step writes `status..failed_at` to `/.state.yaml`. Inherits `config_dir`, `cluster.context`, and the failing skill's error string. +- **Direct** — operator invokes `/cozystack:debug` against an already-running cluster. Interviews for `--config-dir` (default `$PWD`), `--context` (default `kubectl config current-context`), and what hurts. 
+ +## Core principles + +- Match the operator's natural language. Read from `state.operator_language` (set by `cozystack:wizard` Phase 0) or, when invoked directly, detect from the operator's previous messages in the conversation. Use that language in every prompt, AskUserQuestion option, summary, and gate. Never ask "what language?" separately. Code identifiers, command examples, file paths, and any text that ends up in a GitHub issue body stay in their canonical form (usually English). +- Read first, mutate second. Phases 1–3 are read-only; Phase 4 is the first phase that changes anything. +- One valid path → just do it. When Phase 3 classification + Phase 4 action are unambiguous (operator error with a documented fix, config drift with a clear restore command), the skill applies the fix without an extra "ok to apply?" question — the symptom + classification + proposed action were already shown. Gates remain for (a) destructive workarounds (resource recreation, secret regeneration), (b) issue drafts that go public (Phase 5 explicit yes/no per filing). +- Front-load the interview. The only question the debug skill should ask is in Phase 4 when classification + proposed action are ready: a single screen with symptom, classification, root cause, proposed fix / workaround, and the issue-filing question (yes / no / show-draft-first). The operator either approves the lot or names what to change. Phases 1–3 (symptom gathering, doc check, classification) are read-only and run before that screen. +- Layer-pure operator output. The skill never says "returning control to wizard", "the wizard will retry the failing skill", or any other orchestration commentary in the **operator-facing** summary. Whoever invoked the skill (a human directly, or the wizard's auto-dispatch on `failed_at`) figures out what's next on their own. Internal SKILL.md references to `cozystack:wizard` are fine for documentation; `wizard` does not appear in any text shown to the operator. +- Classify before acting.
The same symptom can be operator error or upstream bug; the fix is different. +- Doc-check is not optional. Cozystack documentation is the operator's contract; verify the operator's setup against the relevant page **before** declaring upstream fault. +- One filing at a time. If multiple issues surface, finish one before starting the next — operators get confused fast. +- No PRs. v1 only drafts issue bodies. Patches that the operator wants to upstream are out of scope and stay manual. +- No silent filings. Every issue draft is shown for approval. If the operator doesn't have a GitHub account, no problem — the diagnostic bundle is still useful to them. + +## Phase 1 — Gather symptoms + +Find the failing surface. Sources in priority order: + +1. **State file** — if `/.state.yaml` exists and has `status..failed_at`, read `error` and `dispatched_at`. This is the wizard-auto-dispatch path. +2. **Operator-supplied target** — `--target=hr//` or `--target=pod//` or `--target=namespace/` narrows scope. +3. **Cluster scan** — when nothing's specified, run a broad sweep: + + ```bash + kubectl --context $CTX get hr -A | grep -v ' True ' + kubectl --context $CTX get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded + kubectl --context $CTX get events -A --sort-by=.lastTimestamp --field-selector=type=Warning + ``` + + Surface the first 10 anomalies and ask the operator which to investigate. 
+ +For each candidate failure, pull deeper: + +```bash +kubectl --context $CTX --namespace $NS describe hr $NAME | sed -n '/Status:/,$p' +kubectl --context $CTX --namespace $NS get events --sort-by=.lastTimestamp --field-selector involvedObject.kind=HelmRelease,involvedObject.name=$NAME +kubectl --context $CTX --namespace flux-system logs deploy/helm-controller --tail=200 | grep -i "$NAME" || true +kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator --tail=200 +``` + +For stuck pods: + +```bash +kubectl --context $CTX --namespace $NS describe pod $POD +kubectl --context $CTX --namespace $NS logs $POD --all-containers --tail=200 +``` + +Record findings as a structured **symptom record** the rest of the phases will reference. See `references/diagnostic-bundle.md` for the canonical format. + +## Phase 2 — Doc check (is the operator doing the right thing?) + +Pull the cozystack docs page relevant to the failing surface. Priority: + +1. **For an HR failure** — `https://cozystack.io/docs/v/operations/troubleshooting//` if it exists, else the install or component page for that namespace (`cozy-dashboard` → dashboard docs, `cozy-linstor` → storage section, etc.). +2. **For a Phase-N failure surfaced by the wizard** — the corresponding section in the install guide (`https://cozystack.io/docs/v/install/kubernetes//`). + +The local checkouts are the fastest source: + +- `~/git/github.com/cozystack/website/content/en/docs/v/...` +- `~/git/github.com/cozystack/cozystack/docs/changelogs/v.md` +- `~/git/github.com/cozystack/ansible-cozystack/CHANGELOG.rst` + +Compare the operator's actual state (`.state.yaml`, `cozystack-platform-package.yaml`, `inventory.yml`) against what the docs say should be set. Surface deltas: + +```text +doc check — cozy-dashboard HR stuck + +cozystack/docs/v1.3/install/kubernetes/generic.md line 160 says: + "Run kubectl patch tenants/root --type=merge --patch '{\"spec\":{\"ingress\":true}}' after Platform Package apply." 
+ +your state: + - cluster-install Phase 8 watch loop applied the patch inline at 17:25 UTC + - root Tenant CR shows .spec.ingress=true + - root-ingress-controller Service has EXTERNAL-IP 10.0.0.50 — Ready + +verdict: operator did the documented step. The failure is downstream. +``` + +This is the **operator-not-dumb** check. If the verdict is `operator did NOT do the documented step`, the fix is to do that step, not to file an upstream issue. + +## Phase 3 — Classify + +After the doc check, place the failure into one of four buckets: + +| Bucket | Signal | Action in Phase 4 | +|---|---|---| +| **operator error** | The docs prescribed step X, the operator's state shows X not done. | Walk operator through doing X. No issue. | +| **config drift** | Something on the cluster changed since install (manual `kubectl edit`, third-party operator, etc.) that breaks the documented invariant. | Restore the invariant. Maybe file a note in the operator's runbook; no upstream issue. | +| **upstream bug** | Operator followed docs, state matches what docs prescribe, but the system behaves contrary to documented behaviour. | Apply local workaround if known. Offer to draft an upstream issue. | +| **not-yet-supported** | Operator is asking the system to do something the docs explicitly say is unsupported (RHEL 10 ZFS, LVM thin in v1, etc.). | Print the unsupported-by-design note. Offer to draft a feature-request issue. | + +If the bucket is ambiguous after one round of investigation, **ask the operator** to clarify the path — don't guess. Two rounds of bad classification waste more time than one explicit question. + +## Phase 4 — Act + +### operator error +Print the missing step from the docs verbatim, with the exact command and an approval prompt. If the operator approves, run it. If not, stop and leave the step for them to run themselves. + +### config drift +Show the divergence (what doc said vs what state shows now), propose the corrective command, and ask for approval.
After the fix lands, suggest the operator add it to their cluster runbook. + +### upstream bug +Look up the failure in the cozystack source. See `references/source-search.md` for the grep pattern that maps HR names / error strings → file paths in the monorepo. Common shapes: + +- Error message from `cozystack-operator` logs → grep the operator Go source for the format string. +- Chart render failure → grep the failing chart's `templates/` for the broken template path; check `values.yaml` for the unset key. +- LinstorSatelliteConfiguration / piraeus issues → forward to `linstor:recover` skill instead of this one. + +If a local workaround exists (`cluster-install/references/known-failures.md` is the catalogue), apply it on approve. Track the upstream root cause separately: + +```text +workaround applied: +root cause: + +The workaround keeps your cluster running. The root cause is a real upstream bug. + +Open an issue in with the diagnostic bundle? + + - Yes — I'll prepare the issue body and the gh command for you to review. + - No — skip the upstream filing. + - Not sure — show me how the issue draft would look first. +``` + +### not-yet-supported +Print the relevant doc section that says "not supported". Operator can still ask to file a feature request: + +```text +This is not currently supported by Cozystack: + - + - + +Options: + - Use a supported alternative — + - Open a feature-request issue in cozystack/cozystack + - Drop the requirement +``` + +## Phase 5 — Issue draft (only on Phase 4 yes) + +Skip entirely if the operator said no or chose a local-only resolution. + +1. **Search existing issues first** — duplicates are noise: + + ```bash + gh issue list --repo cozystack/ --state all --search "" + ``` + + If something looks like a match, surface the existing issue number and ask: comment on the existing thread, or open a new one with cross-link? + +2. **Choose the repo** — see `references/upstream-routing.md`. 
The fast table: + + | Failure shape | Repo | + |---|---| + | Chart render / operator runtime / Package CR semantics | `cozystack/cozystack` | + | Install doc gap / contradicts reality | `cozystack/website` | + | ansible-cozystack playbook missing a step or broken on a distro | `cozystack/ansible-cozystack` | + | talm tool / chart preset | `cozystack/talm` | + | boot-to-talos | `cozystack/boot-to-talos` | + | piraeus-operator / LINSTOR / DRBD / kube-ovn upstream-of-cozystack | direct to the upstream-upstream repo, mention cozystack version in the body | + +3. **Render the issue body** from the symptom record + doc-check delta + diagnostic bundle reference. Templates per repo in `references/issue-templates.md`. Public-content rules apply: English, singular first person, no private cluster names, no internal tool names, no `@mentions`. + +4. **Print the exact `gh` command** but **do not execute it**: + + ```bash + gh issue create --repo cozystack/ \ + --title "" \ + --body-file /diagnostics-/issue-body.md + ``` + + Tell the operator if they don't have a GitHub account, the diagnostic bundle at `/diagnostics-/` is still useful — they can hand it to someone who does, or attach it to a Slack thread / support ticket. + +## Phase 6 — Update state and hand back + +If invoked from `cozystack:wizard`, write to `/.state.yaml`: + +```yaml +status: + debug: + completed_at: + target: // + classification: + action: + issue_repo: +``` + +The wizard's dispatch loop reads this and decides next step: + +- `classification: operator-error` + `action: resolved` → retry the originally-failing skill (e.g. re-run `cluster-install` Phase 8 wait). +- `classification: upstream-bug` + `action: workaround` → retry the originally-failing skill. +- `classification: upstream-bug` + `action: issue-drafted` (no workaround) → wizard pauses, offers Cancel / Skip the failing step. +- `classification: not-supported` → wizard offers Cancel. + +If invoked directly, print a one-page summary instead. 
+ +## Guardrails + +- NEVER open a GitHub issue automatically. Always show the draft body and the gh command, then wait for the operator. If they say no, drop it. +- NEVER open a pull request. v1 only drafts issue bodies. Patches that the operator wants to send upstream are a manual follow-up. +- NEVER skip the doc-check in Phase 2. Even if the failure looks obviously upstream, verify the operator's state against docs first — saves filing a duplicate or wrong issue. +- NEVER classify as "upstream bug" without naming the source location (file:line in a cozystack/* repo). "Something is wrong somewhere in cozystack" is not a classification. +- NEVER apply a destructive workaround without explicit approve. Per-step gate. +- NEVER include private infrastructure names in issue bodies. Replace with generic placeholders. +- ALWAYS prefer to extend the local `known-failures.md` catalogue when a new workaround surfaces — that's how the catalogue gets useful. +- ALWAYS leave the operator with a clear next step, even if the skill couldn't resolve anything. + +## References + +- `references/classification.md` — decision tree operator vs upstream + signals per bucket. +- `references/diagnostic-bundle.md` — symptom-record format + the bundle script (shares the script with `cluster-install/references/issue-templates.md`). +- `references/source-search.md` — how to grep the cozystack monorepo from a symptom (error string / chart name / namespace / operator log line). +- `references/upstream-routing.md` — per-repo routing for issue filings. +- `references/issue-templates.md` — per-repo issue body templates with placeholders. + +Cross-references: + +- `/cozystack:cluster-install`'s `known-failures` catalogue at `plugins/cozystack/skills/cluster-install/references/known-failures.md` — catalogue of known failures with workarounds; reuse before reinventing. +- `/linstor:recover` — for DRBD / LINSTOR storage failures, prefer that skill; it's more specialised. 
+- `/cozystack:wizard` — auto-dispatches this skill when downstream `failed_at` is set. diff --git a/plugins/cozystack/skills/debug/references/classification.md b/plugins/cozystack/skills/debug/references/classification.md new file mode 100644 index 0000000..837298d --- /dev/null +++ b/plugins/cozystack/skills/debug/references/classification.md @@ -0,0 +1,89 @@ +# Classification decision tree + +Phase 3 of `cozystack:debug` places every failure into one of four buckets. The bucket determines what Phase 4 does — local fix, workaround, upstream issue, or refuse — so getting this right matters. Use the signals below to classify; ask the operator when ambiguous rather than guess. + +## The four buckets + +### operator error + +**Signal**: a documented step is missing from the operator's actual state. + +Examples: + +- `tenants.apps.cozystack.io/root` has `.spec.ingress=false` and the install guide says to patch it to `true` after Platform Package apply. +- `cozystack_create_platform_package: true` was left at the ansible default but the operator wanted `cozystack:cluster-install` to handle Cozystack — they should have set it to `false`. +- Wrong `cozystack.apiServerHost` value — internal vs public IP confused. +- `cluster-domain` not set to `cozy.local` at k8s bootstrap. + +**How Phase 4 acts**: print the missing step from the docs verbatim, with the exact command. If the operator approves, run it. Don't open an upstream issue — the docs are correct, the gap is in execution. + +### config drift + +**Signal**: something on the cluster changed since install that broke a documented invariant. Operator-side change, not a Cozystack bug. + +Examples: + +- Manual `kubectl edit deployment/cozystack-operator` that removed `--apiserver-port=7445`. +- Third-party operator installed alongside cozystack that grabbed the `kubernetes` service externalIPs. +- `kubectl delete pod -n cozy-keycloak --all` to "fix" something, killing the long-running migration job. 
+- DNS records edited away from the configured `publishing.host`. + +**How Phase 4 acts**: show the divergence (doc-prescribed value vs current value), propose the corrective command, ask approve. After the fix lands, suggest adding the invariant to the operator's runbook. No upstream issue — cozystack didn't drift, the operator did. + +### upstream bug + +**Signal**: operator followed docs, state matches what docs prescribe, but the system behaves contrary to documented behaviour. + +Examples: + +- `helm install cozy-installer` succeeds, `cozystack-operator` is Available, but the operator's logs show `panic: nil pointer in Reconcile(): missing field X` — that's a chart or operator bug. +- `LinstorSatelliteConfiguration` shape exactly matches the CRD, but piraeus-operator's reconcile loop never creates the pool entry in LINSTOR. Source code shows the field is read but never passed downstream. +- Dashboard ingress is up, gatekeeper started, but the OIDC discovery request fails with a TLS error against keycloak's own cert. cert-manager Order is Ready. cozystack chart shipped without `--insecure-skip-verify` and there's no values key to enable it. + +**How Phase 4 acts**: look up the failure in cozystack source (see `source-search.md`), apply local workaround if known (see `cluster-install/references/known-failures.md`), draft upstream issue with the diagnostic bundle. The classification **must** name a source file:line — "something is wrong somewhere in cozystack" is not enough. + +### not-yet-supported + +**Signal**: operator is asking the system to do something the docs explicitly say is unsupported, or that's not in the supported matrix. + +Examples: + +- RHEL 10 / Rocky 10 / Alma 10 — OpenZFS RPMs don't exist for that family yet; cozystack/docs/v1.3/storage/disk-preparation.md only covers ZFS. +- LVM Thin pool on the LINSTOR backend — cozystack documents only ZFS. +- IPv6-only cluster — not in the supported matrix. 
+- kubeadm with a non-default cluster CIDR overlapping the cozystack `joinCIDR`. + +**How Phase 4 acts**: print the relevant doc section that says "not supported" and the link. Offer the operator a supported alternative + the option to file a feature-request issue in `cozystack/cozystack`. No local workaround — by definition there isn't one. + +## When ambiguous, ask + +Two rounds of bad classification waste more time than one explicit question. If after Phase 2 doc-check the bucket is unclear: + +```text +classification — ambiguous + +Symptom: +Doc check: vs + +This could be: + - operator error (you missed step X — likely if you didn't follow the documented order) + - config drift (something changed after install — likely if the install was working before) + - upstream bug (cozystack reading the field but never acting on it — would explain the symptom but unusual) + +What happened recently on this cluster? (last `kubectl edit`, addon installed, ansible re-run, etc.) +``` + +The answer almost always disambiguates. + +## Class-shaped signals reference + +| Symptom | Most likely bucket | Why | +|---|---|---| +| HR `Failing` with `dependency 'X' is not ready` for over 10 min | upstream bug or operator error | If X is a normal HR that just needs time → operator-error (impatience). If X is permanently broken → upstream. | +| Pod `CrashLoopBackOff` immediately after install | config drift or operator error | Almost always something wrong in the operator's values; rarely an upstream bug. | +| `cluster-install` Phase 5.5 `zpool create` fails with `permission denied` | operator error | Talos / Ubuntu nodes have the right tools but operator skipped node prep. | +| `cluster-install` Phase 8 watch loop reaches "all HRs Ready" but never observed the tenants/root CR appearing (patch not applied) | upstream bug | The tenants CRD or the cozystack-operator that creates the root Tenant never landed. 
Without the Tenant CR the inline patch has nothing to target; downstream tenant-root-ingress workloads never come up. Source check on cozystack-operator + tenants CRD installation order. | +| cert-manager Order pending → certificates not ready | config drift or operator error | DNS misconfigured or port 80 firewalled — Phase 4 question 7 domain gate should have caught this. | +| `cozystack-operator` panics on startup | upstream bug | The `cozystack-operator` binary crashes before it reads any operator-supplied values — definitionally upstream. | +| Storage class `data` exists but PVCs are Pending forever | upstream bug or config drift | Either piraeus-operator stopped reconciling (drift) or pool registration silently failed (upstream). Investigate. | +| `gh issue list` shows an open issue with the same error string | already known | Comment on the existing thread instead of opening a duplicate. | diff --git a/plugins/cozystack/skills/debug/references/diagnostic-bundle.md b/plugins/cozystack/skills/debug/references/diagnostic-bundle.md new file mode 100644 index 0000000..ee971b3 --- /dev/null +++ b/plugins/cozystack/skills/debug/references/diagnostic-bundle.md @@ -0,0 +1,127 @@ +# Symptom record + diagnostic bundle + +Two pieces of state Phase 1 produces: + +- **Symptom record** — a structured in-memory description of what's broken. Used by Phase 2 (doc check), Phase 3 (classify), and Phase 5 (issue body rendering). +- **Diagnostic bundle** — on-disk artefacts (logs, CR dumps, events) that go alongside any upstream issue. + +## Symptom record format + +The skill builds this in memory. Outside the diagnostic bundle (which carries a dump as `symptom.yaml`) it is not written to disk on its own; it's embedded in `.state.yaml` under `status.debug` when Phase 6 lands. + +```yaml +symptom: + surface: "hr" # hr / pod / namespace / cluster + namespace: "cozy-dashboard" + name: "dashboard" + observed_at: "2026-05-15T18:00:00Z" + condition: + type: "Ready" + status: "False" + reason: "InstallFailed" + message: "context deadline exceeded ..."
+ recent_events: + - "2026-05-15T17:50:00Z Warning ReconcileFailure: install retries exhausted (10/10)" + related_pods: + - name: "gatekeeper-xxx" + phase: "Running" + restarts: 47 + last_log: "Unable to fetch OIDC well-known: dial tcp keycloak.cluster.example.org:443: connection refused" + dependencies: + upstream_of: + - "cozy-fluxcd/flux-plunger" # depends on this HR + downstream_of: [] + cluster_context: + context: "cozystack-lab" + distribution: "k3s" + k8s_version: "v1.32.3+k3s1" + cozystack_installer_version: "v1.3.2" + platform_variant: "isp-full-generic" +``` + +This shape is what `references/issue-templates.md` body templates interpolate from. Keep field names stable. + +## Diagnostic bundle layout + +``` +<config-dir>/diagnostics-<ts>/ + README.md # one-page summary: symptom + classification + symptom.yaml # the in-memory record dumped to disk + cluster-info/ # output of kubectl cluster-info dump + namespaces/ + nodes.json + ... + nodes.yaml # kubectl get nodes -o yaml + helmreleases.yaml # kubectl get hr -A -o yaml + pods.txt # kubectl get pods -A -o wide + events.txt # kubectl get events -A --sort-by=.lastTimestamp + hr-<ns>-<name>.txt # kubectl describe hr, one file per not-Ready HR + operator.log # cozystack-operator last 2000 lines + helm-controller.log # helm-controller last 2000 lines + values.yaml # cozystack-platform-package.yaml at issue time (REDACTED) + state.yaml # .state.yaml at issue time (REDACTED) +<config-dir>/diagnostics-<ts>.tar.gz # the directory above as a single tarball, written next to it +``` + +## Bundle script + +Same body as `cluster-install/references/issue-templates.md` — copy verbatim to avoid drift; if the cluster-install version changes, propagate the change here.
+ +```bash +TS="$(TZ=UTC date +%Y%m%d-%H%M%S)" +DUMP="$CONFIG_DIR/diagnostics-${TS}" +mkdir -p "$DUMP" + +kubectl --context $CTX cluster-info dump --output-directory "$DUMP/cluster-info" +kubectl --context $CTX get nodes --output yaml > "$DUMP/nodes.yaml" +kubectl --context $CTX get hr --all-namespaces --output yaml > "$DUMP/helmreleases.yaml" +kubectl --context $CTX get pods --all-namespaces --output wide > "$DUMP/pods.txt" +kubectl --context $CTX get events --all-namespaces \ + --sort-by=.lastTimestamp > "$DUMP/events.txt" +kubectl --context $CTX --namespace cozy-system logs deploy/cozystack-operator \ + --tail=2000 > "$DUMP/operator.log" 2>&1 || true +kubectl --context $CTX --namespace flux-system logs deploy/helm-controller \ + --tail=2000 > "$DUMP/helm-controller.log" 2>&1 || true + +# Operator-supplied artefacts at the time of failure +cp "$CONFIG_DIR/cozystack-platform-package.yaml" "$DUMP/values.yaml" 2>/dev/null || true +cp "$CONFIG_DIR/.state.yaml" "$DUMP/state.yaml" 2>/dev/null || true + +# Per-failing-HR describe +for hr in $(kubectl --context $CTX get hr --all-namespaces \ + --output jsonpath='{range .items[?(@.status.conditions[?(@.type=="Ready" && @.status!="True")])]}{.metadata.namespace}/{.metadata.name} {end}'); do + ns=${hr%%/*}; name=${hr##*/} + kubectl --context $CTX --namespace "$ns" describe hr "$name" > "$DUMP/hr-${ns}-${name}.txt" +done + +# Redact sops-encrypted forms — re-encrypt if state.sops.enabled is true +if [ "$(yq '.sops.enabled // false' "$CONFIG_DIR/.state.yaml" 2>/dev/null)" = "true" ]; then + # The copies above came from already-encrypted on-disk files; nothing to do. + : +else + # Strip secrets from the plain copies before they go anywhere public. 
+ yq --inplace 'del(.spec.components.platform.values.authentication.oidc)' \ + "$DUMP/values.yaml" 2>/dev/null || true +fi + +tar --create --gzip --directory "$CONFIG_DIR" --file "$DUMP.tar.gz" \ + "diagnostics-${TS}" +echo "Bundle: $DUMP.tar.gz" +``` + +## Redaction rules + +The bundle goes to GitHub. Strip these before sharing: + +- Any `Opaque` / `kubernetes.io/tls` / `kubernetes.io/dockerconfigjson` Secret `data` fields. `cluster-info dump` includes Secrets; rewrite them to ``. +- Operator-supplied passwords in values (Keycloak admin, registry creds). +- IP addresses that identify a customer or internal subnet — replace with `` or generic RFC 5737 (`192.0.2.x`). +- Hostnames that identify a customer — replace with `cluster.example.com`. +- SSH key paths from `.state.yaml`. + +The `*.tar.gz` glob is in the `.gitignore` cozystack section so bundles never get accidentally committed. + +## Bundle path under the cluster config directory + +`/diagnostics-/` keeps every artefact with the cluster it describes. `*.tar.gz` is gitignored. Operators can `rsync` the directory to a colleague or attach the tarball to a support thread. + +If `` isn't writable (read-only mount, exotic setup), fall back to `/tmp/cozystack-diagnostics-/` and surface the path explicitly. diff --git a/plugins/cozystack/skills/debug/references/issue-templates.md b/plugins/cozystack/skills/debug/references/issue-templates.md new file mode 100644 index 0000000..174f9da --- /dev/null +++ b/plugins/cozystack/skills/debug/references/issue-templates.md @@ -0,0 +1,241 @@ +# Issue body templates per repo + +Phase 5 of `cozystack:debug` renders one of these templates into `/diagnostics-/issue-body.md`, then prints the `gh issue create` command for the operator to review. Public-content rules apply: English, singular first person, no private cluster names, no internal tool names (no `cozystack:debug` mention in the body — operator's perspective: "I hit this while running Cozystack v1.3.x"), no `@mentions`. 
+ +## Common preamble (every template) + +```markdown +**Cozystack version**: +**Kubernetes**: +**Platform variant**: +**Install path**: +**Nodes**: × on + +(Diagnostic bundle attached: `cozystack-diagnostics-.tar.gz` — logs redacted of secrets.) +``` + +## cozystack/cozystack — operator / chart / package CR bug + +```markdown +### What happened + + + +### Expected behaviour + + + +### Steps to reproduce + +1. Fresh v cluster, bootstrapped per `docs/v/install/kubernetes//`. +2. `helm upgrade --install cozy-installer oci://ghcr.io/cozystack/cozystack/cozy-installer --version --namespace kube-system --set cozystackOperator.variant= --set cozystack.apiServerHost=` +3. Apply Platform Package with `spec.variant: `. Full Package YAML attached at `values.yaml` in the bundle. +4. Observe `` — . + +### Source + +The failing path looks like: + +- `:` — + +(If unsure, leave this section as "I haven't pinpointed the exact source location — pointers welcome.") + +### Workaround + + + +### Logs and manifests + +Diagnostic bundle attached. 
Key files: + +- `helmreleases.yaml` — all HR statuses at the time of failure +- `operator.log` — cozystack-operator last 2000 lines +- `helm-controller.log` — helm-controller last 2000 lines +- `hr--.txt` — detailed describe of the failing HR +- `values.yaml` — the Package CR I applied (secrets redacted) +``` + +## cozystack/cozystack (feature request — missing value key) + +```markdown +### Use case + + + +### Current state + + + +### Proposed extension + + + +### Why not solve it locally + + + +### Workaround currently in use + + +``` + +## cozystack/website — doc gap + +```markdown +### Documentation page + +`https://cozystack.io/docs/v/install/kubernetes//` +(local path: `content/en/docs/v/install/kubernetes/.md`) + +### What I expected to find + + + +### What the page says + + + +### What I had to do instead + + + +### Suggested change + + +``` + +## cozystack/website — doc contradicts reality + +```markdown +### Documentation page + +`https://cozystack.io/docs/v/
//` +(local path: `content/en/docs/v/
/.md`) + +### What the page says + + + +### What the system does + + + +### Suggested change + + +``` + +## cozystack/ansible-cozystack — playbook gap + +```markdown +### Distribution / OS + + + +### Task that should exist but doesn't + +/install/kubernetes/generic.md` (or equivalent) the role does not automate, with quoted lines from the doc.> + +### Manual workaround + + + +### Suggested role change + + +``` + +## cozystack/ansible-cozystack — playbook broken on a distro + +```markdown +### Distribution / version + + + +### Symptom + + + +### Reproduction + +```bash +cd ~/git/github.com/cozystack/ansible-cozystack/examples/ +ansible-galaxy collection install --requirements-file requirements.yml +ansible-playbook --inventory inventory.yml site.yml +# Fails at task '' on '' +``` + +### Workaround + + + +### Suggested fix + + +``` + +## cozystack/talm — chart preset bug + +```markdown +### Symptom + + + +### Reproduction + +```bash +cd +talm init --preset cozystack --endpoint https://:6443 +# nodes/cp1.yaml is missing / contains when it should contain +``` + +### Source + +`~/git/github.com/cozystack/talm/charts/cozystack/templates/_helpers.tpl:` — + +### Suggested fix + + +``` + +## Upstream-upstream (piraeus / LINSTOR / Kube-OVN / Cilium / KubeVirt / cert-manager) + +```markdown +**Cross-link**: cozystack/cozystack#NNNN (if filed there too) +**Cozystack context**: Cozystack v, chart at `packages///charts//Chart.yaml` version . + +### What happened + + + +### Expected behaviour + + + +### Steps to reproduce + + + +### Environment + + +``` + +## Title shape per repo + +Keep titles factual, 60–80 chars, no marketing. + +| Repo | Title shape | +|---|---| +| cozystack/cozystack | `: in ` e.g. "dashboard: gatekeeper OIDC discovery times out in v1.3.2" | +| cozystack/website | `docs(
): ` e.g. "docs(install/generic): missing dm_thin_pool module note for LVM thin" | +| cozystack/ansible-cozystack | `: ` e.g. "examples/ubuntu: prepare-ubuntu.yml fails on RHEL 10 ZFS step" | +| cozystack/talm | `chart/cozystack: ` or `talm: ` | +| upstream-upstream | follow the project's own convention; check their CONTRIBUTING.md before filing. | + +## Don't include in the body + +- `cozystack:debug` skill name (operator-visible, not a Cozystack feature; would confuse maintainers). +- Any reference to "claude" / "AI" / the workflow that produced the report — irrelevant to the maintainer triage. +- Slack / Telegram screenshots (use the GitHub-flavored diagnostic bundle). +- Stack traces longer than 200 lines — put them in the bundle, reference by filename. diff --git a/plugins/cozystack/skills/debug/references/source-search.md b/plugins/cozystack/skills/debug/references/source-search.md new file mode 100644 index 0000000..9ed16a9 --- /dev/null +++ b/plugins/cozystack/skills/debug/references/source-search.md @@ -0,0 +1,146 @@ +# Searching cozystack monorepo from a symptom + +When Phase 3 classified as `upstream-bug`, Phase 4 must name a specific file:line in the cozystack source. "Something is wrong somewhere" doesn't help maintainers and doesn't justify an upstream issue. This document is the recipe for getting from a symptom to a source location. + +## Search roots + +Local checkouts (preferred — fast, no network, includes diffs against your local branch): + +- `~/git/github.com/cozystack/cozystack/` — main monorepo. Charts, operator Go code, platform Package templates. +- `~/git/github.com/cozystack/ansible-cozystack/` — ansible playbooks, role tasks, defaults. +- `~/git/github.com/cozystack/website/` — docs (Hugo content under `content/en/docs/`). +- `~/git/github.com/cozystack/talm/` — talm chart preset + binary source. +- `~/git/github.com/cozystack/boot-to-talos/` — bootstrap helper source. 
+ +If a checkout is missing, `cd ~/git/github.com/cozystack && git clone https://github.com/cozystack/.git` first. + +## Patterns by symptom shape + +### Chart render failure + +The HR status shows something like: + +```text +Reason: InstallFailed +Message: ... template: /templates/_helpers.tpl: error calling fail: + No nodes found with label 'node-role.kubernetes.io/control-plane=true'. +``` + +Grep for the `fail` string verbatim: + +```bash +grep -rn "No nodes found with label" ~/git/github.com/cozystack/cozystack/packages/system// +``` + +If the chart is vendored under `charts//`, search there too: + +```bash +grep -rn "No nodes found with label" ~/git/github.com/cozystack/cozystack/packages/system//charts/ +``` + +The error text usually lives in a `templates/_helpers.tpl` `fail` call. From there, walk back to the `if` block that decides when to fire. + +### Operator runtime error + +`cozystack-operator` logs an error from its Go source. Grep the Go code: + +```bash +# The exact format string from the log line, with %s placeholders replaced by literals. +# Example log: "failed to reconcile HelmRelease cozy-dashboard/dashboard: install timeout" +grep -rn "failed to reconcile HelmRelease" ~/git/github.com/cozystack/cozystack/pkg/ ~/git/github.com/cozystack/cozystack/cmd/ +``` + +If the operator panics, the stack trace gives you the file:line directly. Otherwise look for `fmt.Errorf` / `klog.Error` calls in the matching function. 
+ +### Package CR rejected by API server + +`kubectl apply` of `cozystack-platform-package.yaml` fails with: + +```text +The Package "cozystack.cozystack-platform" is invalid: spec.variant: Unsupported value: "isp-x" +``` + +The validation lives in the CRD or in admission webhooks: + +```bash +# CRD validation schema +grep -rn "spec.variant" ~/git/github.com/cozystack/cozystack/packages/core/installer/templates/ + +# Admission webhook code +grep -rn "variant" ~/git/github.com/cozystack/cozystack/internal/admission/ +``` + +### piraeus-operator / LINSTOR + +These are upstream-upstream — not in the cozystack monorepo. Forward to: + +- `https://github.com/piraeusdatastore/piraeus-operator` — the CRDs and reconciler. +- `https://github.com/LINBIT/linstor-server` — the LINSTOR API itself. +- `https://github.com/LINBIT/drbd` — kernel module. + +Cozystack pins versions in `packages/system/piraeus-operator/charts/piraeus/Chart.yaml` and `packages/system/linstor/charts//Chart.yaml`; report the exact upstream version in the issue body. + +For LINSTOR-specific failures, prefer `/linstor:recover` — it's more specialised than `cozystack:debug` for that area. + +### Kube-OVN / Cilium + +Upstream-upstream too: + +- `https://github.com/kubeovn/kube-ovn` +- `https://github.com/cilium/cilium` + +Cozystack vendors the charts under `packages/system/{kubeovn,cilium}/charts/`. Check `values.yaml` in those subtrees for cozystack-specific overrides before assuming the bug is in upstream. + +### Documentation contradicts reality + +Three possible outcomes: + +1. Docs wrong — file in `cozystack/website`. The relevant page is under `content/en/docs/v/...`. +2. Docs right, chart drifted — file in `cozystack/cozystack`. +3. Both wrong — file in both with cross-links. + +Use a side-by-side: `cat ~/git/github.com/cozystack/website/content/en/docs/v1.3/.md` vs the chart values / Go code. 
+ +## Helpful grep idioms + +```bash +# Find every fail() call in chart templates, mapped to the strings that surface to operators +grep -rn --include='*.tpl' --include='*.yaml' 'fail (printf' ~/git/github.com/cozystack/cozystack/packages/ + +# Find the cozystack value key that controls a given chart sub-value +grep -rn 'apiServerHost' ~/git/github.com/cozystack/cozystack/packages/core/ + +# Find the HelmRelease that fires a given chart +grep -rn 'chart: ' ~/git/github.com/cozystack/cozystack/packages/core/platform/templates/ + +# Find changelog entries for a release (sometimes documents the bug already) +grep -rn '' ~/git/github.com/cozystack/cozystack/docs/changelogs/v*.md +``` + +## When source search dead-ends + +If 15 minutes of grepping doesn't surface a source location: + +1. Re-classify. Maybe it's config drift after all. +2. Ask the operator if the cluster has any non-cozystack operators that might be relevant. +3. Open an exploratory issue in `cozystack/cozystack` with the diagnostic bundle and a request for a maintainer to point at the right code path. Better than silently struggling. + +## Recording the find + +The Phase 6 state write captures it: + +```yaml +status: + debug: + target: hr/cozy-dashboard/dashboard + classification: upstream-bug + source: + repo: cozystack/cozystack + file: packages/system/dashboard/charts/dashboard/templates/gatekeeper.yaml + line: 47 + summary: "gatekeeper container always dials https://keycloak.${HOST} without a TLS skip-verify switch; values key not exposed" + action: workaround + issue_repo: cozystack/cozystack +``` + +That file:line ends up in the issue body so maintainers know where to look. Without it, the issue is much harder to triage. 
diff --git a/plugins/cozystack/skills/debug/references/upstream-routing.md b/plugins/cozystack/skills/debug/references/upstream-routing.md new file mode 100644 index 0000000..97d11ca --- /dev/null +++ b/plugins/cozystack/skills/debug/references/upstream-routing.md @@ -0,0 +1,57 @@ +# Upstream routing — which repo gets the issue + +Phase 5 of `cozystack:debug` picks one repo per filing. Wrong-repo filings get closed without action and waste everyone's time. Use the table below; when in doubt, ask the operator. + +## Routing table + +| Failure shape | Repo | +|---|---| +| Chart render error (`templates/*.yaml` fail / missing key / wrong values) | `cozystack/cozystack` | +| cozystack-operator runtime error (panic, reconcile failure, wrong reconcile decision) | `cozystack/cozystack` | +| Package CR rejected by API server (CRD schema / admission webhook) | `cozystack/cozystack` | +| Platform Package template doesn't expose a value the operator needs | `cozystack/cozystack` (feature request) | +| Install doc gap (step missing) | `cozystack/website` | +| Install doc wrong (step exists but contradicts reality) | `cozystack/website` | +| Hardware / supported-matrix question | `cozystack/website` | +| ansible-cozystack playbook missing a task | `cozystack/ansible-cozystack` | +| ansible-cozystack playbook broken on a specific distro / Secure Boot host | `cozystack/ansible-cozystack` | +| ansible-cozystack defaults / variable confusion | `cozystack/ansible-cozystack` | +| talm chart preset missing a kernel module / extension | `cozystack/talm` | +| talm binary bug (parser, render error) | `cozystack/talm` | +| boot-to-talos bug (kexec failure, install mode broken on a hardware family) | `cozystack/boot-to-talos` | +| extractedprism bug | `lexfrei/extractedprism` — independent BSD-3 project; file there for proxy-specific bugs. See README "Third-party dependencies" for the dependency policy. 
| +| LINSTOR / piraeus-operator behaviour (storage pool not registered, DRBD up but no replicas) | upstream `piraeusdatastore/piraeus-operator` (mention cozystack version + chart version in the body) | +| LINSTOR API issue (`linstor sp l` returns garbage, controller crashes) | upstream `LINBIT/linstor-server` | +| Kube-OVN issue (OVN central crash, IP allocation bug) | upstream `kubeovn/kube-ovn` | +| Cilium issue (CNI not working, kube-proxy replacement broken) | upstream `cilium/cilium` | +| KubeVirt issue (VM not starting, libvirt errors) | upstream `kubevirt/kubevirt` | +| cert-manager issue (Challenge stuck, Issuer wrong) | upstream `cert-manager/cert-manager` | +| Helm or Flux issue (chart not installable, controller stuck) | upstream `helm/helm` / `fluxcd/helm-controller` | + +## How to decide between cozystack/cozystack and upstream-upstream + +For LINSTOR / Kube-OVN / Cilium / cert-manager — the question is **where the misbehaviour lives**: + +- If cozystack's chart values cause the bug (operator follows docs, but the values cozystack renders are wrong and confuse the upstream component) → `cozystack/cozystack`. Maintainers there can fix the rendered values. +- If cozystack passes correct values and upstream still misbehaves → upstream-upstream. Include cozystack version + chart version in the body so reviewers know the context. + +If unsure, file in `cozystack/cozystack` first. Cozystack maintainers know upstream-upstream code and will redirect or take it on themselves. + +## Filing in upstream-upstream + +When the issue is in piraeus-operator / LINSTOR / Kube-OVN / Cilium / KubeVirt / cert-manager directly: + +- Include cozystack release: `Cozystack v1.3.2 (cozy-installer chart 1.3.2)`. +- Include the upstream version cozystack vendors: read from `packages///Chart.yaml` `dependencies[].version` or from the upstream image tag in `values.yaml`.
+- Cross-link if you also opened (or are about to open) an issue in `cozystack/cozystack`: "Cross-link: cozystack/cozystack#NNNN". +- Don't open in upstream first and cozystack second. Cozystack maintainers are the operators' first line; they need to know about the issue too even if the fix lives upstream. + +## Don't open + +- General Kubernetes questions (kubeadm flag X doesn't work) — upstream `kubernetes/kubernetes` is wrong for that volume of churn; ask on the kubernetes slack first. +- "Why doesn't cozystack support X?" without a specific use case — write up the use case first, then a feature request. +- Issues where the bundle reveals private infrastructure that the operator hasn't redacted. Redact first, file second. + +## How the skill picks + +Phase 5 step 2 reads the symptom record's `classification` + `source.repo` (when set) and matches the table. When `source.repo` is empty but symptom signals point clearly to one repo, the skill proposes that repo and asks the operator to confirm. When signals are mixed (could be cozystack/cozystack or upstream-upstream), the skill asks explicitly. diff --git a/skills/cozy-external-app/skills/cozy-external-app/SKILL.md b/plugins/cozystack/skills/external-app-create/SKILL.md similarity index 98% rename from skills/cozy-external-app/skills/cozy-external-app/SKILL.md rename to plugins/cozystack/skills/external-app-create/SKILL.md index b8ad4dd..45d3699 100644 --- a/skills/cozy-external-app/skills/cozy-external-app/SKILL.md +++ b/plugins/cozystack/skills/external-app-create/SKILL.md @@ -1,10 +1,10 @@ --- -name: cozy-external-app +name: external-app-create description: Scaffold a new Cozystack external app package inside an external-apps repository. 
Generates the full chart skeleton (Chart.yaml, Makefile, values.yaml with cozyvalues-gen annotations, templates), registers it in core/platform (namespace, HelmRepository, HelmChart, HelmRelease, ApplicationDefinition), and wires dependency integration — supports managed CNPG Postgres clusters provisioned in-chart and external secret references for pre-existing services. Use when adding a new application (e.g. Immich, Gitea, Nextcloud) to an external-apps repo that follows the cozystack/external-apps-example layout. argument-hint: " [--depends-on=postgres,redis] [--operator=] [--repo-dir=]" --- -# cozy-external-app +# cozystack:external-app-create This skill scaffolds a new Cozystack external app package. It creates all files needed for the app to appear in the Cozystack dashboard and be deployable via the GitOps pipeline (GitRepository → Flux HelmRelease → ApplicationDefinition). @@ -12,11 +12,13 @@ This is a **generate-only** skill. It never applies anything to a cluster, never Work in reasoning mode. Follow the phases in order. When a step fails or is ambiguous, stop and ask — do not guess API shapes or secret names. -Use the phrasing "`cozy-external-app`" (not "the skill") in messages to the user, and state progress at each phase boundary. +Use the phrasing "`cozystack:external-app-create`" (not "the skill") in messages to the user, and state progress at each phase boundary. + +Match the operator's natural language detected from prior conversation messages — use it in prompts, AskUserQuestion options, summaries, and gates. Generated Helm template files, ApplicationDefinition values, and any text destined for git or the cozystack dashboard stay in their canonical form (usually English) per cozystack's public-content rules. ## Phase 1 — Parse arguments -`$ARGUMENTS` contains the free-form tail after `/cozy-external-app`. Extract: +`$ARGUMENTS` contains the free-form tail after `/cozystack:external-app-create`. 
Extract: - Positional `` — lowercase, hyphen-separated (e.g., `immich`, `my-app`). Required. - `--depends-on=` — comma-separated dependency names (e.g., `postgres`, `redis`). Default: none. @@ -117,7 +119,7 @@ For each dependency from Step 1, resolve a `$DEP_CONTRACT` from cozystack. Try t 3. **Live cluster** — when `kubectl` has a usable context (`kubectl config current-context` succeeds) AND the dep's ApplicationDefinition is installed: ```bash - kubectl get applicationdefinition --output yaml + kubectl --context $CTX get applicationdefinition --output yaml ``` Confirm the current context is the intended cozystack cluster before relying on this source (read-only operation, but still worth double-checking). This source is authoritative for *that specific cluster version* — if sources 1 or 2 disagree, prefer the live source and note the drift to the user. @@ -237,7 +239,7 @@ Build a plan document with five sections: Also list any **open items** explicitly: missing icon, missing upstream chart version, skipped dependency pattern, unresolved `$DEP_CONTRACT`. The user should not be surprised later. -Example for `/cozy-external-app gitea`: +Example for `/cozystack:external-app-create gitea`: ```text App : gitea (Kind: Gitea, Plural: giteas) diff --git a/skills/cozy-bump/skills/cozy-bump/SKILL.md b/plugins/cozystack/skills/package-bump/SKILL.md similarity index 95% rename from skills/cozy-bump/skills/cozy-bump/SKILL.md rename to plugins/cozystack/skills/package-bump/SKILL.md index e5f988f..2652041 100644 --- a/skills/cozy-bump/skills/cozy-bump/SKILL.md +++ b/plugins/cozystack/skills/package-bump/SKILL.md @@ -1,20 +1,22 @@ --- -name: cozy-bump +name: package-bump description: Bump a single package inside the cozystack monorepo (`packages/{apps,system,extra,core}//`). 
Detects the upstream source (vendored Helm chart, in-repo image build, or postgres-style enum), fetches the changelog between current and target versions, surfaces breaking changes / deprecated values / new required keys, applies adaptations, regenerates schema and ApplicationDefinition, runs `helm template` + `helm lint`, commits with a Conventional-Commit message, and optionally deploys the bumped version to a dev cluster via `cozyhr suspend` + `make apply` with a `ttl.sh` ephemeral image registry. Use when raising the upstream version of a cozystack-shipped component (e.g. bumping `apps/postgres` from 16.2 to 16.4, or refreshing a vendored subchart in `system/*`). argument-hint: " [--target-version=] [--no-deploy] [--registry=ttl.sh/] [--allow-dirty]" --- -# cozy-bump +# cozystack:package-bump This skill bumps the upstream version of a single package inside the cozystack monorepo (`~/git/github.com/cozystack/cozystack`, layout `packages/{apps,system,extra,core}//`). The bump is treated as a real review task — the changelog between the current and target versions is read, breaking changes and deprecations are surfaced, the package's own `values.yaml` and templates are adapted accordingly, and the result is verified locally before any commit. This skill **does** modify files inside the cozystack checkout, **does** create one signed-off commit, and **may** deploy the bumped version to a dev cluster on user approval. It **does not** push, open PRs, or touch production clusters. -Work in reasoning mode. Follow the phases in order. When a step fails or is ambiguous, stop and ask — do not guess upstream versions, image digests, or breaking-change semantics. Use the phrasing "`cozy-bump`" (not "the skill") in messages to the user, and state progress at each phase boundary. +Work in reasoning mode. Follow the phases in order. When a step fails or is ambiguous, stop and ask — do not guess upstream versions, image digests, or breaking-change semantics. 
Use the phrasing "`cozystack:package-bump`" (not "the skill") in messages to the user, and state progress at each phase boundary. + +Match the operator's natural language detected from prior conversation messages — use it in prompts, AskUserQuestion options, summaries, and gates. Code identifiers, commands, file paths, commit messages, and PR body drafts stay in their canonical form (usually English) per cozystack's public-content rules. ## Phase 1 — Parse arguments -`$ARGUMENTS` contains the free-form tail after `/cozy-bump`. Extract: +`$ARGUMENTS` contains the free-form tail after `/cozystack:package-bump`. Extract: - Positional `` — required. Either an absolute path, a path relative to the monorepo root (`packages/apps/postgres`), or a bare name (`postgres`) resolved against `packages/{apps,system,extra,core}//`. If multiple matches are found across types, ask the user via `AskUserQuestion` which one. `packages/library/` and `packages/tests/` are explicitly **out of scope** — those are internal helpers without an upstream version to track. If a bare-name match falls only into one of those, stop and explain. - `--target-version=` — explicit target (`16.4`, `v1.25.0`, `2.7.5`). If omitted, Phase 3 resolves the latest from upstream and asks the user to confirm. @@ -35,7 +37,7 @@ Bail early if any check fails. 3. **Working tree**: run `git -C "$REPO_ROOT" status --porcelain --untracked-files=no`. If output is non-empty and `$ALLOW_DIRTY` is unset, print a one-line summary and stop. Recommend `git stash` or `--allow-dirty`. 4. **Tools installed**: check that `yq` (v4 mikefarah), `jq`, `helm`, `docker buildx`, `cozyhr`, `cozyvalues-gen`, `kubectl`, `git`, `gh` are on `PATH` via `command -v`. Missing required tools → print install hints (link to each project's releases page) and stop. `gh` is required only for changelog scraping — if missing, warn and fall back to raw `curl` against the GitHub REST API in Phase 4. 
Note: every shell snippet in this skill is portable across macOS (BSD coreutils, GNU Make 3.81 default) and Linux (GNU coreutils). If you find yourself reaching for `make --eval`, `date --utc`, `date -u`, or any GNU-only flag, stop and rewrite using a portable form (e.g. `TZ=UTC date +...`, `make --makefile=- < /tmp/cozy-bump-releases.json -COUNT=$(jq --raw-output 'length' /tmp/cozy-bump-releases.json) + --json tagName,name,publishedAt > /tmp/cozystack-bump-releases.json +COUNT=$(jq --raw-output 'length' /tmp/cozystack-bump-releases.json) if [ "$COUNT" -ge "$GH_LIMIT" ]; then # COUNT == GH_LIMIT means we hit the cap (could be exactly that many releases, # could be more). Confirm by fetching the page beyond the cap — page number is @@ -130,10 +132,10 @@ For each tag in the window: ```bash gh release view --repo $UPSTREAM_OWNER/$UPSTREAM_REPO_NAME \ - --json tagName,name,body > /tmp/cozy-bump-release-.json + --json tagName,name,body > /tmp/cozystack-bump-release-.json ``` -Concatenate the `body` fields into `/tmp/cozy-bump-changelog.md` with `## ` headers between sections. +Concatenate the `body` fields into `/tmp/cozystack-bump-changelog.md` with `## ` headers between sections. If `gh release list` returns nothing for a project that uses tag-only releases (no GitHub Release objects), fall back to `git ls-remote --tags https://github.com/$UPSTREAM_OWNER/$UPSTREAM_REPO_NAME` plus reading `CHANGELOG.md` from the repo at each tag. If that also fails (sparse releases, no CHANGELOG), stop and ask the user where the changelog lives — never proceed without changelog evidence. @@ -142,10 +144,10 @@ If `gh release list` returns nothing for a project that uses tag-only releases ( Phase 3 already registered the upstream Helm repo unconditionally — this step can call `helm show values` directly. 
```bash -helm show values / --version $CURRENT_VERSION > /tmp/cozy-bump-values-current.yaml -helm show values / --version $TARGET_VERSION > /tmp/cozy-bump-values-target.yaml -diff --unified=0 /tmp/cozy-bump-values-current.yaml /tmp/cozy-bump-values-target.yaml \ - > /tmp/cozy-bump-values.diff +helm show values / --version $CURRENT_VERSION > /tmp/cozystack-bump-values-current.yaml +helm show values / --version $TARGET_VERSION > /tmp/cozystack-bump-values-target.yaml +diff --unified=0 /tmp/cozystack-bump-values-current.yaml /tmp/cozystack-bump-values-target.yaml \ + > /tmp/cozystack-bump-values.diff ``` The diff is the most reliable indicator of renamed/removed/added top-level keys. @@ -156,7 +158,7 @@ If the chart has `crds/` or `templates/` containing `kind: CustomResourceDefinit ### Step 4 — Changelog analysis -Scan `/tmp/cozy-bump-changelog.md` for these patterns (case-sensitive where indicated): +Scan `/tmp/cozystack-bump-changelog.md` for these patterns (case-sensitive where indicated): - Case-insensitive substring: `breaking change`, `breaking:`, `deprecat`, `removed`, `renamed`, `migration`, `action required`, `incompatible`, `no longer supported`. To cut noise from `removed`/`renamed`/`migration` (which match phrases like "no items removed"), filter to lines that also fall within 5 lines of a version header (`## vX.Y.Z`, `### X.Y.Z`, etc.) or include "in this release"/"this version"/"in vX.Y" tokens — those proximity filters keep the matches anchored to a release. - Case-sensitive: `BREAKING`, RFC 2119 conformance keywords (`MUST` / `MUST NOT` / `MUST be`, `SHOULD` / `SHOULD NOT`, `MAY` / `MAY NOT`, `REQUIRED`, `SHALL` / `SHALL NOT`). Lowercase `must` matches almost every changelog and drowns the signal — don't grep on it. 
@@ -181,7 +183,7 @@ Each adaptation entry is a row of: | `action` | `rename`, `remove`, `add`, `update`, `review` | | `old_value` | `oldKey: foo` | | `new_value` | `newKey: foo` | -| `evidence` | tag + one-line excerpt from `/tmp/cozy-bump-changelog.md` | +| `evidence` | tag + one-line excerpt from `/tmp/cozystack-bump-changelog.md` | If the changelog is silent on something the diffs reveal (e.g., a values key removed without notice), still create an entry with `evidence: "values diff (no changelog mention)"` — these are the fragile spots. @@ -192,7 +194,7 @@ If **no adaptations** are found and the changelog says "no breaking changes", re Assemble every decision so far into one consolidated plan. Show via `AskUserQuestion`: ```text -cozy-bump plan for packages/$PKG_TYPE/$PKG_NAME +cozystack:package-bump plan for packages/$PKG_TYPE/$PKG_NAME Pattern: $BUMP_PATTERN ($CURRENT_VERSION → $TARGET_VERSION) Upstream: https://github.com/$UPSTREAM_OWNER/$UPSTREAM_REPO_NAME @@ -297,8 +299,8 @@ Path: 2. If `^generate:` is present: ```bash set -o pipefail # ensure the if-test sees `make`'s exit status, not `tee`'s - if ! make --directory "$PKG_DIR" generate 2>&1 | tee /tmp/cozy-bump-generate.log; then - echo "make generate failed — aborting before any commit. See /tmp/cozy-bump-generate.log." >&2 + if ! make --directory "$PKG_DIR" generate 2>&1 | tee /tmp/cozystack-bump-generate.log; then + echo "make generate failed — aborting before any commit. See /tmp/cozystack-bump-generate.log." >&2 exit 1 fi ``` @@ -331,7 +333,7 @@ There is no "lying bump" failure in path 1: `Chart.yaml.appVersion` and `values. Hard gates. If any fails, **stop and report**, do not commit, do not deploy. ```bash -helm template $PKG_DIR --output-dir /tmp/cozy-bump-template-out +helm template $PKG_DIR --output-dir /tmp/cozystack-bump-template-out helm lint $PKG_DIR git -C "$REPO_ROOT" diff --stat $PKG_DIR ``` @@ -345,7 +347,7 @@ If `helm template` or `helm lint` fail, surface the error verbatim. 
The most com ## Phase 8 — Commit -One commit per `cozy-bump` invocation. Format: +One commit per `cozystack:package-bump` invocation. Format: ```text chore(packages/$PKG_TYPE/$PKG_NAME): bump $PKG_NAME to $TARGET_VERSION @@ -365,7 +367,7 @@ The `Assisted-By: Claude` trailer is mandatory per the cozystack project commit Run: ```bash -COMMIT_MSG=$(mktemp -t cozy-bump-msg.XXXXXX) +COMMIT_MSG=$(mktemp -t cozystack-bump-msg.XXXXXX) trap 'rm -f "$COMMIT_MSG"' EXIT cat > "$COMMIT_MSG" < Flux reconciles the chart from an `OCIRepository`. There is no `spec.url`/`spec.ref.branch` to edit — instead the source pulls a chart artifact from an OCI registry. To verify the bump on a real cluster: > - > 1. Build and push the cozystack platform artifact to an OCI registry you control (out of scope for `cozy-bump` — see the cozystack docs for the artifact-build workflow). + > 1. Build and push the cozystack platform artifact to an OCI registry you control (out of scope for `cozystack:package-bump` — see the cozystack docs for the artifact-build workflow). > 2. Repoint the cluster's `OCIRepository.spec.url` (and `spec.ref.tag` or `spec.ref.digest`) at your fork's artifact. > 3. Wait for Flux to reconcile, then watch the workloads roll out. > 4. Once verified, restore the original `OCIRepository` spec. @@ -671,7 +673,7 @@ If the user picked `dry-run` in Step 1, print every command that Steps 3–6 wou Print: ```text -cozy-bump complete +cozystack:package-bump complete Package: $PKG_TYPE/$PKG_NAME Bump: $CURRENT_VERSION → $TARGET_VERSION @@ -713,7 +715,7 @@ Read these on demand when reasoning about behavior. Quote line ranges; structure - `cozystack/cozystack/hack/package.mk` — `apply: check suspend` then `cozyhr apply -n $(NAMESPACE) $(NAME)`. The `apply` target already suspends. - `cozystack/cozystack/hack/common-envs.mk` — defaults for `REGISTRY` (`ghcr.io/cozystack/cozystack`), `TAG` (`git describe --tags`), `PUSH` (`1`), `BUILDX_ARGS` assembly. 
- `cozystack/cozyhr/` — `cozyhr suspend|resume|apply|diff|show -n NAMESPACE NAME [--context CTX] [--kubeconfig PATH]`. `suspend` toggles `spec.suspend: true` on the HelmRelease via merge-patch with Flux field ownership. -- `cozystack/ccp/skills/cozy-deploy/skills/cozy-deploy/SKILL.md` — the established cozystack pattern for `ttl.sh` ephemeral registries (UUID-based, 24h TTL), HelmRelease shape detection, and `kubectl set image` fallback for ExternalArtifact releases. `cozy-bump` Phase 9 borrows from it directly. +- `cozystack/ccp/plugins/cozystack/skills/package-deploy/SKILL.md` — the established cozystack pattern for `ttl.sh` ephemeral registries (UUID-based, 24h TTL), HelmRelease shape detection, and `kubectl set image` fallback for ExternalArtifact releases. `cozystack:package-bump` Phase 9 borrows from it directly. - `cozystack/cozystack/packages/apps/postgres/hack/update-versions.sh` — reference for Pattern C enum updaters. - `cozystack/cozystack/packages/system/backup-controller/Makefile` — reference for Pattern B image-build targets that write digests back into `values.yaml` via `yq --inplace`. - Conventional Commits: https://www.conventionalcommits.org/ diff --git a/skills/cozy-deploy/skills/cozy-deploy/SKILL.md b/plugins/cozystack/skills/package-deploy/SKILL.md similarity index 93% rename from skills/cozy-deploy/skills/cozy-deploy/SKILL.md rename to plugins/cozystack/skills/package-deploy/SKILL.md index b655a47..e9e7257 100644 --- a/skills/cozy-deploy/skills/cozy-deploy/SKILL.md +++ b/plugins/cozystack/skills/package-deploy/SKILL.md @@ -1,23 +1,25 @@ --- -name: cozy-deploy +name: package-deploy description: Deploy a Cozystack package to a dev cluster via make + cozyhr. Handles both fresh install and dev-loop iteration — builds a custom image, detects whether the HelmRelease uses ExternalArtifact (in which case local values.yaml is ignored and kubectl set image is required), applies the change, waits for rollout, and offers to resume Flux afterwards. 
Use when iterating on a PR branch and wanting the change to land on a running cluster for manual verification or a screenshot. argument-hint: " [--registry=] [--tag=] [--context=] [--namespace=] [--release=] [--skip-build] [--no-resume] [--keep-values]" --- -# cozy-deploy +# cozystack:package-deploy This skill deploys a single Cozystack package (`packages/system/` or `packages/apps/`) from the current checkout to a Kubernetes cluster, using the repo's own `make` targets and the `cozyhr` wrapper. It is designed for developer iteration against a dev cluster — **do not run it against production**. Touching a real cluster requires explicit confirmation at a gate below. Work in reasoning mode. Follow the phases in order. Skip steps only when the argument flags explicitly say so. When a step fails, stop and report — do not try to work around by disabling safety checks. -Use the phrasing "`cozy-deploy`" (not "the skill") in messages to the user, and state progress at each phase boundary. +Use the phrasing "`cozystack:package-deploy`" (not "the skill") in messages to the user, and state progress at each phase boundary. + +Match the operator's natural language detected from prior conversation messages — use it in prompts, AskUserQuestion options, summaries, and gates. Code identifiers, commands, file paths, and commit messages stay in their canonical form (usually English). ## Phase 1 — Parse arguments -`$ARGUMENTS` contains the free-form tail after `/cozy-deploy`. Extract: +`$ARGUMENTS` contains the free-form tail after `/cozystack:package-deploy`. Extract: - Positional `` — the directory name under `packages/system/` or `packages/apps/`. Required. -- `--registry=` — container registry to push to (e.g., `ghcr.io/lexfrei`). If unset, fall back to a private `ttl.sh/` path (Phase 4). +- `--registry=` — container registry to push to (e.g., `ghcr.io/` or your team registry). If unset, fall back to a private `ttl.sh/` path (Phase 4). - `--tag=` — image tag. 
Default: `latest` (since `TAG` in `hack/common-envs.mk` resolves to `latest` outside of a git tag). - `--context=` — `kubectl` context to target. Default: whatever `kubectl config current-context` returns (with confirmation in Phase 3). - `--namespace=` — Kubernetes namespace. Default: value of `NAMESPACE` in the package `Makefile`. @@ -89,7 +91,7 @@ REGISTRY=$REGISTRY TAG=$TAG PLATFORM=linux/amd64 BUILDER=multi PUSH=1 \ Notes: -- `PLATFORM` should be `linux/amd64` for `dev9`-like clusters; detect from `kubectl --context $CONTEXT get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}'` and set accordingly. +- `PLATFORM` should be `linux/amd64` for amd64 clusters; detect from `kubectl --context $CONTEXT get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}'` and set accordingly. - `BUILDER=multi` matches the name from Phase 2 — if the user already had a builder with a different name, use that. - `make image` will re-write the package's `values.yaml` in place (via `yq -i`) to inject the resolved `@sha256:`. Verify the diff: ```bash diff --git a/plugins/cozystack/skills/talos-bootstrap/SKILL.md b/plugins/cozystack/skills/talos-bootstrap/SKILL.md new file mode 100644 index 0000000..b09fe66 --- /dev/null +++ b/plugins/cozystack/skills/talos-bootstrap/SKILL.md @@ -0,0 +1,673 @@ +--- +name: talos-bootstrap +description: Bootstrap Talos Linux nodes into a Cozystack-ready cluster via talm. Default assumption is that the operator's nodes are already in Talos maintenance mode (the standard starting point — Talos boots from a nocloud/raw image and listens on :50000 awaiting machine-config). When that's the case the skill goes straight to `talm init` and `talm apply`, no OCI / PXE / ISO dance needed. When nodes are not yet imaged, the skill offers a boot-method picker (OCI Custom Image, PXE/iPXE, boot-to-talos, ISO) and walks the operator through getting them into maintenance mode first. 
Verifies the resulting cluster has the cozystack-tuned extensions (drbd, zfs, openvswitch) and the LVM filter before handing off to `cozystack:cluster-install`. +argument-hint: "[--config-dir=] [--skip-boot-method] [--talos-version=] [--cozystack-repo=] [--installer-profile-url=]" +--- + +# cozystack:talos-bootstrap + +Work in reasoning mode. Use the phrasing `cozystack:talos-bootstrap`. Announce phase transitions: `cozystack:talos-bootstrap Phase N — `. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (or read from `/.state.yaml` `operator_language` when the wizard chain is in progress). Treat the English text as a template for tone, structure, and content. Code identifiers, commands, file paths, and any text destined for GitHub stay canonical regardless of operator language. + +## Core principles + +- Match the operator's natural language. Read from `/.state.yaml` `operator_language` (set by `cozystack:wizard` Phase 0) or detect from prior messages when invoked directly. Use it in prompts, AskUserQuestion options, summaries, and gates. Code identifiers, commands, file paths, and GitHub-public text stay in their canonical form. +- One valid path → just do it. The skill executes `talm init` / `talm apply` / `talosctl bootstrap` / `talosctl kubeconfig` / verification automatically once Phase 5 (boot method) or Phase 4 (maintenance-mode probe) confirmed there's one valid forward direction. Gates remain for (a) multi-option choices (Phase 3 needs-help, Phase 5 boot method per provider, Phase 7 per-node review), (b) destructive operations (`talosctl reset` of an already-configured node), (c) the consolidated plan presentation. No "I'll wait for you to say done" gates — the skill verifies on its own schedule. +- Front-load the interview. 
**Every question the skill might ask in any phase is collected upfront**, before `talm init` runs: needs-OS-install (Phase 3), per-node boot method (Phase 5, for nodes the probe found unready), per-node install disk choice on multi-disk nodes, VIP for HA, custom installer schematic if not the default. Phases 1 (state) + 2 (workstation prep) + 4 (maintenance probe) are read-only lookups that run before any question fires. `intent_hints` from wizard Phase 0 pre-fills wherever it can. Phase 7 per-node review is one consolidated screen for all node configs, not one screen per node. Phases 6–12 then execute end-to-end without re-prompting. +- Layer-pure operator output. The skill never says "returning control to wizard", "the wizard will dispatch next", or any other orchestration commentary in the **operator-facing** summary. Whoever invoked the skill (a human running `/cozystack:talos-bootstrap` directly, or the wizard's dispatch loop) figures out what's next on their own — the wizard reads `.state.yaml` and decides; a human reads the printed `next:` hint at the bottom of the NOTES. Internal SKILL.md references to `cozystack:wizard` are fine for documentation, but `wizard` does not appear in any text shown to the operator. +- **Maintenance mode is the baseline.** Talos in maintenance mode is the standard entry point — listening on `:50000` awaiting machine-config. The skill drives `talm apply` directly against the maintenance API; no SSH, no OCI dance, no boot-to-talos invocation when nodes are already there. Only when nodes aren't imaged yet does the skill detour through a boot-method picker. +- Source of truth for everything Cozystack expects in Talos: + - Tuned image: `ghcr.io/cozystack/cozystack/talos:vX.Y.Z` — read the pinned tag from the installer profile in this resolution order: + 1. `--installer-profile-url=` override (rare; for testing). + 2. `--cozystack-repo=/packages/core/talos/images/talos/profiles/installer.yaml` override. + 3. 
`~/git/github.com/cozystack/cozystack/packages/core/talos/images/talos/profiles/installer.yaml` default. + 4. URL fallback `https://raw.githubusercontent.com/cozystack/cozystack/<tag>/packages/core/talos/images/talos/profiles/installer.yaml` when no local clone is present (`<tag>` is the cozystack tag from `state.cozystack.installer_version` or the latest from `git ls-remote --tags https://github.com/cozystack/cozystack` if not set). + - System extensions: drbd, zfs, openvswitch, plus firmware (amd-ucode / intel-ucode / intel-ice / etc.). + - Kernel modules in machine-config: drbd, zfs, spl, openvswitch, vfio_pci, vfio_iommu_type1. + - LVM filter in `/etc/lvm/lvm.conf`: `global_filter = [ "r|^/dev/drbd.*|", "r|^/dev/dm-.*|", "r|^/dev/zd.*|" ]`. + - talm preset: `cozystack` chart from `~/git/github.com/cozystack/talm/charts/cozystack`. +- Verify before declaring success. A green Apply is not enough — `kubectl get nodes Ready`, talos extensions present, LVM filter present. + +## Phase 1 — Read state + +Read `/.state.yaml`. Required: `config_dir`. Optional: `inventory.nodes` (the wizard fills it), `intent_hints` (Phase 0 free-form context may have set boot-method hints). + +When invoked outside a wizard chain, interview: + +- Node IPs / hostnames + role (cp/worker). Minimum 1 cp; 3 cp for HA via embedded etcd raft. +- Are they already in Talos maintenance mode? — yes/no. If unsure, the skill helps probe (Phase 4). + +Persist to `inventory.nodes`. + +## Phase 2 — Workstation prep + +Read-only checks on the operator's workstation: + +```bash +talosctl version --client +talm version 2>/dev/null +``` + +- `talosctl` is required. If missing → `brew install siderolabs/tap/talosctl` or `https://github.com/siderolabs/talos/releases`; refuse. +- `talm` is required (the skill drives `talm init` + `talm apply`). If missing → `brew install cozystack/tap/talm` or the cozystack/talm releases page; refuse. + +`boot-to-talos`, `helm`, `age` etc.
only matter for specific boot methods; checked in Phase 5 if that path is taken. + +## Phase 3 — Do you need help installing Talos? + +Default assumption: Talos is already installed on the nodes — either the operator brought up the boxes themselves (their own image pipeline, an existing Talos cluster they're re-purposing) or they followed the cozystack docs for OCI / bare-metal / wherever and the nodes are now sitting in maintenance mode. The skill's primary job is `talm init` + `talm apply` against ready nodes; the OS-install detour is opt-in only. + +Single AskUserQuestion: + +```text +Is Talos already installed on the nodes and are they in maintenance mode? Or do you need help installing it? + + 1. Already installed / I'll handle it — skip the boot-method picker. + (Recommended — most scenarios. The skill goes straight to the + maintenance-mode probe to confirm.) + 2. Need help installing the OS — skill walks through the boot-method + picker (OCI Custom Image / boot-to-talos / ISO / PXE). + 3. Not sure — let me check first — skill runs the maintenance-mode + probe (Phase 4) and decides based on the result, offering help + if any node turns out unreachable. +``` + +If the operator pre-filled `intent_hints.needs_os_install` in wizard Phase 0 (e.g. "I just have raw hardware, need to install Talos") → skip this question, pre-select option 2. + +Record `state.talos.needs_os_install` (true / false / unsure). + +## Phase 4 — Maintenance-mode probe (always run) + +Regardless of the Phase 3 answer, probe every node in inventory to ground-truth the state. The probe choice depends on Talos minor version because the insecure surface area changed between releases: + +- **Talos 1.12** (the version cozystack v1.3.x pins): `talosctl version --insecure` returns `API is not implemented in maintenance mode` — looks like a fail but the node IS in maintenance. `get machineconfig --insecure` returns `PermissionDenied`.
The only insecure-API surface that works reliably is `get disks` (read-only, machine-config-independent). +- **Talos 1.13+**: `get machineconfig --insecure` returns `PermissionDenied` (changed from 1.12's behaviour), but `version --insecure` works. `get disks --insecure` continues to work. + +The single probe that works across Talos 1.12 / 1.13 / 1.14: + +```bash +talosctl get disks --insecure --nodes "$NODE_IP" 2>&1 | head -5 +# Maintenance mode (any Talos version) → returns disk list (NODE / NAMESPACE / TYPE / ID / VERSION / SIZE rows) +# Already configured → returns the same disk list (still works, but Phase 3 cross-check via talosctl with the cluster's talosconfig tells us if config is loaded) +# No route to host → not yet imaged or wrong IP +# Connection refused → port 50000 not open / wrong IP +``` + +If `get disks` returns rows, the node speaks the Talos API at port 50000 — that's the maintenance-mode signal. To distinguish "maintenance" from "already configured", try `get machineconfig --insecure` afterward — if it returns `NotFound` / `PermissionDenied` the node is in maintenance; if it returns the config, the node is already provisioned (refuse to overwrite without `--reset`). + +Cross-check with `talosctl version --insecure --short` only on nodes you've already established are running Talos 1.13+ (e.g. via `get disks --insecure --node $IP --output yaml | yq '.spec.system_info.os_release'`). + +Aggregate the matrix: + +```text +maintenance-mode probe + + cp1 (10.0.0.10): ✓ in maintenance mode (Talos vX.Y.Z, no config) + cp2 (10.0.0.11): ✓ in maintenance mode + cp3 (10.0.0.12): ✗ already configured — refuse to overwrite without --reset + w1 (10.0.0.20): ✗ unreachable (timeout) — node not yet imaged or wrong IP +``` + +Outcomes: + +- **All nodes in maintenance mode** → Phase 5 boot-method picker is skipped entirely. Jump to Phase 6 (talm init). Operator's Phase 3 answer didn't matter — reality matched expectations. 
+- **Some nodes unreachable / not imaged**: + - Phase 3 said `needs_os_install: false` → reconcile: "you said Talos is already installed, but cp3 isn't responding on :50000. Re-check IP / firewall / provider? Or do you actually want help installing on those nodes?". Operator picks: re-check (re-run Phase 4), help (jump to Phase 5 for those nodes), skip the node, or cancel. + - Phase 3 said `needs_os_install: true` → Phase 5 boot-method picker for those nodes. + - Phase 3 said `unsure` → ask explicitly now that we have data: help with these nodes or you'll handle them yourself? +- **Some nodes already configured** (not maintenance, not unreachable — fully running) → refuse for those nodes. Operator decides: `talosctl reset` to wipe (destroys data), skip the node from this install, or cancel. Never silently overwrite. + +If `--skip-boot-method` was passed, refuse for any node not in maintenance mode — operator promised they're ready and the skill takes them at their word. + +## Phase 5 — Boot method (only when help is needed) + consolidated intake + +This is the **one** interview phase. After Phase 4 probe knows which nodes are ready and which aren't, the skill collects everything the rest of the flow needs in one pass and presents a consolidated summary. Later phases (talm init / talm apply / bootstrap / kubeconfig / verify) consume the collected answers without re-prompting. + +Slots filled here: + +- Per-node **boot method** (only for nodes the probe found unready). See provider tables below. +- Per-node **install disk** if the node has multiple candidates. Default is the largest unmounted disk; surface choice only on ambiguity. +- **VIP** for HA kube-apiserver if more than one CP node — operator-supplied or auto-detect from cloud LB if reachable. +- **Custom installer image** override (rare — default is the pinned cozystack-tuned tag from `~/git/github.com/cozystack/cozystack/packages/core/talos/images/talos/profiles/installer.yaml`). 
+- **Cluster name** for `talm init --endpoint` and the eventual `kubectl` context. Default from `state.intent_hints.cluster_name` or `cozystack-lab`. +- **Kubeconfig merge target** (when `/kubeconfig.yaml` already exists or operator wants merge into `~/.kube/config`). + +After collecting, present the consolidated summary: + +```text +cozystack:talos-bootstrap — collected values + +inventory: + cp1 (10.0.0.10): maintenance ✓ → talm apply (default disk: /dev/sda) + cp2 (10.0.0.11): maintenance ✓ → talm apply (default disk: /dev/sda) + cp3 (10.0.0.12): unreachable → boot method: OCI Custom Image + w1 (10.0.0.20): maintenance ✓ → talm apply (default disk: /dev/sda) + +cluster: + endpoint: https://10.0.0.10:6443 + vip: (none — single-CP would use cp1.host; with 3 CP set if desired) + cluster name: cozystack-lab + installer image: ghcr.io/cozystack/cozystack/talos:v1.13.0 (default) + kubeconfig: /kubeconfig.yaml (will overwrite if exists — operator can pick merge instead) + +options: + - Approve all — proceed to Phase 6 (talm init) and then execute end-to-end + - Edit + - Cancel +``` + +Skip entirely when Phase 4 said all nodes are already in maintenance mode, OR Phase 3 said `needs_os_install: false` and Phase 4 didn't surface contradictions. + +For nodes that need OS install, present per-provider options. The picker shape depends on `intent_hints.hardware_provider` from wizard Phase 0: + +**OCI / Oracle Cloud Infrastructure**: + +1. `OCI Custom Image (nocloud-amd64.raw.xz)` (Recommended) — operator downloads the `nocloud-amd64.raw.xz` from the cozystack-tuned release artefact, uploads to OCI Object Storage, imports as Custom Image, creates instances from it. The skill prints the artefact URL and the exact `oci` CLI commands; operator runs them. Once instances are launched and reach maintenance mode (typically 2–5 min), the skill re-runs the Phase 4 maintenance-mode probe. +2. 
`OCI shape already running, just not configured` — operator already launched instances from a Custom Image; they're in maintenance mode now. Re-run the Phase 4 maintenance-mode probe — should turn green. +3. `PXE / iPXE / matchbox` — rarely the right choice on OCI; needs a bastion in the same VCN. Surface a warning and the matchbox-on-OCI guide link if the operator insists. + +**Bare-metal**: + +1. `boot-to-talos` (Recommended for existing-Linux nodes) — operator already has Linux on the box; replaces it in place. The skill prints the exact `boot-to-talos -image ghcr.io/cozystack/cozystack/talos:<tag> -disk /dev/sda -mode install` command. +2. `ISO / USB` — operator downloads `metal-amd64.iso` from the cozystack-tuned release, boots from it. +3. `PXE / iPXE / matchbox` — for racks; the skill points at matchbox docs. + +**Other cloud (AWS / GCP / Azure / Hetzner / etc.)**: + +1. Provider-specific Custom Image (AMI / disk image / etc.) — different artefact per cloud; skill names the right one and the upload procedure. +2. `Already-launched instance in maintenance mode` — see OCI option 2. + +Per-node decisions go into `state.talos.boot_method.<node>`. The skill does **not** run the boot itself in v1 — operator runs the chosen path, then re-invokes the skill with `--skip-boot-method` once nodes are in maintenance mode. + +## Phase 6 — talm init (generate cluster-wide secrets + config) + +Run inside `$CONFIG_DIR` so all artefacts (`nodes/`, `secrets.yaml`, `talosconfig`) land alongside the rest of the cluster config: + +```bash +cd "$CONFIG_DIR" + +talm init \ + --preset cozystack \ + --endpoint "https://${CP1_IP}:6443" \ + --output-dir ./ +``` + +The `cozystack` preset bakes in everything from the Source-of-truth list above (image, extensions, kernel modules, LVM filter). The operator does not edit those by hand. + +`talm init` produces: + +- `secrets.yaml` — cluster-wide PKI, encryption keys (sops-encrypted if `state.sops.enabled` is true via the shared `.sops.yaml`). 
+- `templates/` — per-resource Talos machine-config templates (the cozystack preset). +- `talosconfig` — talos client config to talk to the cluster post-bootstrap. +- `values.yaml` — the input values the templates render against. **This is where certSANs lives.** + +Critically, `talm init` does **not** create `nodes/.yaml` — those are operator inputs the skill assembles in Phase 6.5. + +If `state.sops.enabled` is true, the `/.sops.yaml` from `cozystack:wizard` Phase 1.5 already covers `secrets.yaml` and `nodes/*.yaml`; talm respects it natively, no skill-side encrypt step needed. + +## Phase 6.3 — NAT-provider cert-SAN guardrail (BEFORE first talm apply) + +Skip when none of the NAT-provider signatures are present. The trigger is the same one Phase 4.5 research uses: + +``` +state.intent_hints.reach_mode == "public" +AND state.cozystack_intake.external_ips.strategy == "internal" +AND state.inventory.nodes[].public_ip is set on at least one node +``` + +This is the OCI / GCP-with-Cloud-NAT / AWS-with-EIP signature: the workstation reaches each node via a **public IP** that is rewritten by the cloud fabric to the node's **internal IP** before the packet hits the interface. The node's kernel — and Talos's machine-cert generator — only ever see the internal IP. After `talm apply`, Talos issues its API-server TLS certificate with `certSANs` populated from observed addresses: internal IPs + `127.0.0.1` + the explicit `--endpoint`. The workstation, dialing the public IP, gets a cert with no matching SAN. **TLS handshake fails. talosctl bootstrap cannot proceed. There is no insecure escape hatch.** Recovery from this point requires re-imaging the node — `talosctl reset` itself needs a valid TLS connection. + +This trap caught the same operator twice in a row. It is now closed before the first `talm apply` runs. 
+ +Auto-populate `values.yaml` with the full certSANs set the workstation will need to dial: + +```bash +# Collect from state.yaml +PUBLIC_IPS=$(yq '.inventory.nodes[] | select(.public_ip != null) | .public_ip' "$STATE_FILE") +INTERNAL_IPS=$(yq '.inventory.nodes[] | select(.internal_ip != null) | .internal_ip' "$STATE_FILE") +VIP=$(yq '.cluster.vip.shared_address // ""' "$STATE_FILE") +PER_NODE_VIPS=$(yq '.cluster.vip.per_node[]' "$STATE_FILE" 2>/dev/null || true) +API_HOST=$(yq '.cozystack_intake.publishing.host // ""' "$STATE_FILE") + +# Merge, dedupe, sort +{ + printf '%s\n' "127.0.0.1" "localhost" + printf '%s\n' $PUBLIC_IPS $INTERNAL_IPS $VIP $PER_NODE_VIPS + [ -n "$API_HOST" ] && printf 'api.%s\n' "$API_HOST" +} | awk 'NF' | sort -u > /tmp/cert-sans.txt + +# Patch values.yaml (yq edits in-place; preserves comments) +yq --inplace ' + .machine.certSANs = (load_str("/tmp/cert-sans.txt") | split("\n") | map(select(length > 0))) +' "$CONFIG_DIR/values.yaml" +``` + +Surface the result to the operator: + +```text +talos-bootstrap — NAT-provider cert-SAN guardrail + + detected signature: reach_mode=public + external_ips_strategy=internal + (OCI 1:1 NAT / GCP NAT / AWS EIP) + + auto-added to machine.certSANs: + 127.0.0.1 + localhost + 10.X.0.128 # node0 internal + 10.X.0.27 # node1 internal + 10.X.0.173 # node2 internal + 10.X.100.10 # VIP shared + 10.X.100.11 / 10.X.100.12 / 10.X.100.13 # per-node VIPs + 192.0.2.10 / 192.0.2.11 / 192.0.2.12 # workstation-visible public IPs + api.cluster.example.com # publishing.host + + why: without public IPs in certSANs, the workstation cannot reach the + apiserver after talm apply — the cert is valid only for IPs the kernel + sees, and the kernel never sees public IPs on NAT'd providers. Without + the apiserver, talosctl bootstrap cannot complete, and there is no + insecure escape from a wrong cert. +``` + +No gate — this is purely defensive. The operator can `--no-cert-san-guardrail` to opt out (e.g. 
they're using their own values.yaml with explicit SANs already). + +If the trigger signature is **not** present (operator on bare-metal with the public IP actually on the interface, or reach_mode=internal), skip this entire phase silently. + +## Phase 6.5 — Create per-node config stubs (modeline + body overlay) + +`talm init` writes `templates/`, `secrets.yaml`, `values.yaml`, and `talosconfig` but **does not** create `nodes/.yaml`. The skill creates them with both the modeline (which links to the rendering template) AND the body overlay (per-node values that get merged on top of the template render at `talm apply` time). + +`talm template -f node.yaml [--in-place]` does **not** preserve body overlay — the output is the rendered template plus the modeline, byte-identical to what the template alone would produce on Talos 1.12+ multidoc format. Body overlay (HostnameConfig, per-node LinkConfig with static IPv4 on the VIP link, etc.) gets stripped. So **do not** run `talm template --in-place` to fill the stub. Write the body overlay directly: + +```bash +mkdir -p "$CONFIG_DIR/nodes" + +for node_idx in "${!INVENTORY_NODE_NAMES[@]}"; do + node="${INVENTORY_NODE_NAMES[$node_idx]}" + f="$CONFIG_DIR/nodes/${node}.yaml" + [ -f "$f" ] && continue + + # Resolve per-node values from state + per_node_vip=$(yq ".cluster.vip.per_node.${node} // \"\"" "$STATE_FILE") + vip_link=$(yq '.cluster.vip.link // ""' "$STATE_FILE") + shared_vip=$(yq '.cluster.vip.shared_address // ""' "$STATE_FILE") + vip_subnet_mask=$(yq '.cluster.vip.subnet // ""' "$STATE_FILE" | sed 's|.*/||') + + cat > "$f" <> "$f" <> "$f" <.yaml` for review. Things to spot before apply: + +- `machine.install.image` should be `ghcr.io/cozystack/cozystack/talos:`. The cozystack preset sets this; surface if missing. +- `machine.kernel.modules` should list drbd / zfs / spl / openvswitch / vfio_pci / vfio_iommu_type1. +- For CP nodes: `machine.type: controlplane`, optional `machine.network.interfaces[].vip` for HA. 
+- `machine.install.disk` defaults to a heuristic — confirm it picks the right system disk on multi-disk nodes (operator can edit nodes/.yaml before apply). + +Options: + +- `Apply` — proceed to Phase 8. Skill applies to every node back-to-back without per-node prompts. +- `Edit .yaml` — operator names a specific node, opens it in `$EDITOR`, returns; skill re-shows the relevant config slice and asks again. + +The default is `Apply`. The per-node-approval path is gone — once the operator approved the config in this phase, every node gets `talm apply` in Phase 8 without a "ok for cp2?" intermission. + +## Phase 8 — talm apply (push config to nodes in maintenance mode) + +Per-node, drive talm against the maintenance API. **Always double-quote shell variables holding IPs / node lists** — `talosctl` and `talm` both pass `--nodes` as a space-separated list, so an unquoted `--nodes $IPS` with `IPS="1.2.3.4 5.6.7.8"` reads the second IP as a positional subcommand and fails with `unknown command 5.6.7.8`. + +```bash +talm apply \ + --talosconfig "$CONFIG_DIR/talosconfig" \ + --insecure \ + --nodes "$NODE_IP" \ + --mode=auto \ + --file "$CONFIG_DIR/nodes/$NODE_NAME.yaml" +``` + +`--insecure` because the node is in maintenance mode without a cert yet. `--mode=auto` lets talm pick the safest reboot mode (try → staged → reboot). After first apply, every node reboots into the final configuration and is no longer in maintenance mode — but **the first reboot does not reinstall Talos** (see "version drift" in Phase 11). The message `talm` prints — `Applied configuration without a reboot` — is correct: talm applied live, no reboot needed; the on-disk image is replaced lazily on the next `talosctl upgrade`, not on this apply. + +Capture talm's stdout/stderr verbatim. On any node failure, abort the batch (don't continue applying to other nodes — partial cluster is harder to debug than no cluster). 
+ +## Phase 9 — Bootstrap etcd (CP1 only, first time only) + +After CP1 has applied its config and rebooted into the final configuration: + +```bash +talosctl --talosconfig "$CONFIG_DIR/talosconfig" \ + --nodes "$CP1_IP" \ + bootstrap +``` + +This brings etcd online on CP1. Additional CPs (cp2, cp3) join automatically via the join token in the machine-config. Workers come up after the control plane is healthy. + +Wait for the cluster to come up: + +```bash +talosctl --talosconfig "$CONFIG_DIR/talosconfig" --nodes "$CP1_IP" \ + health --wait-timeout=15m +``` + +## Phase 10 — Fetch kubeconfig + +```bash +talosctl --talosconfig "$CONFIG_DIR/talosconfig" \ + --nodes "$CP1_IP" \ + kubeconfig "$CONFIG_DIR/kubeconfig.yaml" + +chmod 0600 "$CONFIG_DIR/kubeconfig.yaml" + +# If sops is enabled, encrypt at rest. +if [ "$(yq '.sops.enabled // false' "$CONFIG_DIR/.state.yaml")" = "true" ]; then + sops --encrypt --in-place "$CONFIG_DIR/kubeconfig.yaml" +fi +``` + +Verify: + +```bash +KUBECONFIG="$CONFIG_DIR/kubeconfig.yaml" kubectl get nodes +# Expect every inventory node Ready, versions matching. +``` + +## Phase 11 — Verify cozystack-tuned shape + +Use `talosctl` (via apid) — **not** `kubectl debug`. At end of `talos-bootstrap` there's no CNI yet (`cluster-install` installs Cilium); `kubectl debug node` will fail to schedule the debug pod. apid is always reachable on the nodes' Talos API port regardless of CNI state. + +Three checks per node: + +```bash +TALOSCONFIG="$CONFIG_DIR/talosconfig" +NODES=$(yq '.inventory.nodes[].host' "$CONFIG_DIR/.state.yaml" | paste -sd,) + +# Modules — read /proc/modules from apid +talosctl --talosconfig "$TALOSCONFIG" --nodes "$NODES" \ + read /proc/modules \ + | awk '{print $1}' \ + | grep -Ex 'drbd|zfs|openvswitch' + +# Cozystack-tuned extensions in the installer. 
talosctl streams one resource +# per line in jsonpath mode, so the kubectl-style {range} wrapper is not needed +# — but newline-separated output also means an empty stream silently passes +# a top-level `grep -E`. The per-extension `grep -qE` loop below catches that: +# every expected extension must produce at least one match, otherwise exit 1. +talosctl --talosconfig "$TALOSCONFIG" --nodes "$NODES" \ + get extensions --output jsonpath='{.spec.metadata.name}' \ + | sort -u \ + | tee /tmp/talos-ext.list + +for want in drbd zfs openvswitch; do + grep -qE "(^|/)${want}\$" /tmp/talos-ext.list \ + || { echo "missing extension: ${want}"; exit 1; } +done + +# LVM filter +talosctl --talosconfig "$TALOSCONFIG" --nodes "$NODES" \ + read /etc/lvm/lvm.conf \ + | grep -E '^\s*global_filter\s*=' \ + | grep -E 'drbd|zd|dm-' + +# Talos version uniformity — straight from apid +talosctl --talosconfig "$TALOSCONFIG" --nodes "$NODES" \ + version --short \ + | grep -E '^Tag:' | sort -u +``` + +If any check fails, the skill **does not silently mark `failed_at`** — it tries to reconcile via Phase 11.5 auto-upgrade first (the most common cause of missing extensions is operator-booted-from-base-Talos-image instead of cozystack-tuned). Only after Phase 11.5 fails does the skill write `status.talos-bootstrap.failed_at`. `cluster-install` will refuse a Talos cluster missing extensions. 
+ +## Phase 11.5 — Auto-upgrade to cozystack-tuned image (when Phase 11 fails on extensions/drift) + +Triggered when Phase 11 detected one of: + +- missing extensions (`drbd`, `zfs`, or `openvswitch` not in `talosctl get extensions`) +- running Talos version != pinned image version +- running image != cozystack-tuned (`ghcr.io/cozystack/cozystack/talos:*`) + +All three have the same root cause: nodes booted from a non-cozystack-tuned image (operator imported base Talos `nocloud-amd64.raw.xz` from the upstream Talos releases instead of the cozystack-tuned `nocloud-amd64.raw.xz` from cozystack releases; or boot-to-talos used its hardcoded default image; or somebody hand-edited a node config). `talm apply` does not reinstall Talos — the pinned `machine.install.image` only takes effect on the next `talosctl upgrade`. Phase 11.5 runs that upgrade. + +Resolve the pinned image: + +```bash +PINNED_IMAGE=$(yq '.machine.install.image' "$CONFIG_DIR/nodes/$CP1_NAME.yaml") +# Sanity check: must be a cozystack-tuned image +case "$PINNED_IMAGE" in + ghcr.io/cozystack/cozystack/talos:*) ;; + *) echo "REFUSE: pinned image $PINNED_IMAGE is not cozystack-tuned"; exit 1 ;; +esac +``` + +Surface the plan to the operator (this is a STOP GATE — `talosctl upgrade` is a per-node reboot): + +```text +talos-bootstrap — Phase 11.5 auto-upgrade to cozystack-tuned + + reason: + Phase 11 detected the running nodes are not on the cozystack-tuned + image. This happens when nodes booted from base Talos (e.g. OCI Custom + Image of upstream Talos vX.Y.Z) instead of the cozystack-tuned + artefact. talm apply did NOT reinstall — the pinned image only takes + effect on the next upgrade. 
+ + current state: + node0 running base Talos v1.12.6 pinned ghcr.io/cozystack/cozystack/talos:v1.12.6 + node1 running base Talos v1.12.6 pinned ghcr.io/cozystack/cozystack/talos:v1.12.6 + node2 running base Talos v1.12.6 pinned ghcr.io/cozystack/cozystack/talos:v1.12.6 + extensions missing: drbd, zfs, openvswitch + + plan: + Per-node, sequentially (not parallel — preserves cluster availability): + talosctl --talosconfig --nodes upgrade \ + --image $PINNED_IMAGE --preserve + + --preserve keeps user disks (any zfs pool stays). Each node reboots + once into the cozystack-tuned image. Total ~3 min × 3 nodes = ~9 min. + etcd quorum maintained (one node down at a time). + + options: + - Proceed (Recommended — required for cluster-install to accept this Talos cluster) + - Skip and fail Phase 11.5 — manual recovery needed + - Cancel +``` + +On `Proceed`, run per-node: + +```bash +for node_ip in $(yq '.inventory.nodes[].host' "$STATE_FILE"); do + talosctl --talosconfig "$TALOSCONFIG" --nodes "$node_ip" upgrade \ + --image "$PINNED_IMAGE" --preserve --wait + + # Wait for the node to come back Ready (post-reboot, the kubeconfig context is valid). + kubectl --context "$CTX" wait --for=condition=Ready "node/$(yq ".inventory.nodes[] | select(.host == \"$node_ip\") | .name" "$STATE_FILE")" --timeout=5m +done +``` + +After all nodes upgraded, re-run Phase 11 verification (recursively, max once). If it still fails, the cause is something other than image mismatch — write `failed_at` and surface the verification output. + +If the operator picks `Skip` — write `failed_at: "phase-11.5-skipped"` and exit. `cluster-install` Phase 3 will refuse. 
+ +### Talos version drift across nodes (sanity, not the upgrade trigger) + +After Phase 11.5 settles, verify uniformity across all CP nodes' configs — if a hand-edit drifted one node, surface it: + +```bash +for node_yaml in "$CONFIG_DIR/nodes/"*.yaml; do + this_image=$(yq '.machine.install.image' "$node_yaml") + [ "$this_image" != "$PINNED_IMAGE" ] && \ + echo "WARN: $(basename "$node_yaml") pins $this_image (expected $PINNED_IMAGE)" +done +``` + +This is informational — the per-node hand-edit case is rare and the operator owns it. + +## Phase 12 — Write state and hand off + +```yaml +cluster: + context: + kubeconfig: /kubeconfig.yaml + api_endpoint: https://:6443 + distribution: talos + k8s_version: + talos_version: +status: + talos-bootstrap: + completed_at: + talosconfig: /talosconfig +``` + +NOTES: + +```text +cozystack:talos-bootstrap — ready + +cluster: + context: + api: https://:6443 + nodes: Talos $TALOS_VERSION (cozystack-tuned) + extensions: drbd ✓ zfs ✓ openvswitch ✓ + lvm filter: ✓ + +artifacts under : + nodes/*.yaml # per-node machine-config — sops-encrypted in-tree when sops on, plain (safe to commit) when off + secrets.yaml # cluster PKI bundle — sops-encrypted in-tree when sops on, gitignored when off + talosconfig # talos client config — sops-encrypted in-tree when sops on, gitignored when off + kubeconfig.yaml # k8s client config — sops-encrypted in-tree when sops on, gitignored when off + talm.key # RECOVERY KEY — talm's own encryption layer over secrets.yaml. + # back up SEPARATELY (password manager or air-gapped store). + # without it, secrets.encrypted.yaml is unrecoverable — + # losing it equals losing the cluster's PKI. + +next: invoke /cozystack:cluster-install — it will resume from /.state.yaml +``` + +After Phase 12 surface a one-line reminder above the NOTES block: + +```text +recovery key: /talm.key — back this up SEPARATELY before +anything else. 
It is not encrypted by sops; losing it means +secrets.encrypted.yaml cannot be decrypted, and the cluster's PKI +cannot be regenerated without re-bootstrapping the whole cluster. +``` + +## Guardrails + +- NEVER `talm apply` on a node that's not in maintenance mode. Phase 4 probe catches this; on `--reset` from the operator the skill `talosctl reset`s explicitly with approval and a clear data-loss warning. Default refusal stands. +- NEVER skip Phase 11 verification — Cozystack expects the cozystack-tuned shape; an un-verified Talos cluster will fail in `cluster-install` Phase 3 anyway. +- NEVER cache the cozystack-tuned image tag — read it from the upstream installer profile each run. +- NEVER write `cluster.kubeconfig` to a path that already exists without offering rename / merge. +- ALWAYS run `talm init` (Phase 6), `talm apply` (Phase 8), `talosctl bootstrap` (Phase 9), `talosctl kubeconfig` (Phase 10), and the verification probes (Phase 11) automatically once the operator approved the plan in Phase 5 (boot method) or confirmed all nodes are in maintenance mode in Phase 4. There is no separate "operator says done" gate between these phases — it's one valid path forward, the skill executes it. +- ALWAYS write artefacts under `$CONFIG_DIR/`. nodes/, secrets.yaml, talosconfig, kubeconfig all live there. +- ALWAYS respect `.sops.yaml` when it exists — talm reads it natively for secrets.yaml and nodes/*.yaml; the skill encrypts kubeconfig.yaml manually after fetch. + +## References + +- `references/manual-steps.md` — boot-method-specific commands per provider (OCI Custom Image, bare-metal ISO / boot-to-talos / PXE) for Phase 5. + +External: + +- `https://cozystack.io/docs/v1.3/install/talos/` — primary install guide. +- `https://github.com/cozystack/talm` — talm chart + cozystack preset. +- `https://github.com/cozystack/boot-to-talos` — one-step replace-existing-OS helper. 
+- `https://factory.talos.dev/` — Talos Image Factory (for custom schematics if the cozystack-tuned image doesn't fit). +- `~/git/github.com/cozystack/cozystack/packages/core/talos/` — source of the OCI image build pipeline + extension list. diff --git a/plugins/cozystack/skills/talos-bootstrap/references/manual-steps.md b/plugins/cozystack/skills/talos-bootstrap/references/manual-steps.md new file mode 100644 index 0000000..c77b5fc --- /dev/null +++ b/plugins/cozystack/skills/talos-bootstrap/references/manual-steps.md @@ -0,0 +1,198 @@ +# Manual Talos bootstrap steps + +These are the commands `cozystack:talos-bootstrap` v1 hands to the operator to run. The skill substitutes IPs from `inventory` and the cozystack-tuned image tag from the live profile file, then waits for the operator to come back with "done". + +Reference docs: + +- `https://cozystack.io/docs/v1.3/install/talos/` +- `https://github.com/cozystack/boot-to-talos` +- `https://github.com/cozystack/talm` + +## Step 1 — Get the right Talos image on every node + +Two supported paths in v1; the operator picks one based on what they have: + +### Path A — boot-to-talos (existing Linux on the nodes) + +For each node currently running Linux (bare-metal, VPS, cloud VM): + +```bash +ssh root@ ' + curl -fsSL https://github.com/cozystack/boot-to-talos/releases/latest/download/boot-to-talos-linux-amd64 \ + -o /usr/local/bin/boot-to-talos + chmod +x /usr/local/bin/boot-to-talos + + boot-to-talos \ + -image ghcr.io/cozystack/cozystack/talos: \ + -disk /dev/sda \ + -mode install \ + -yes +' +``` + +`-mode install` writes Talos to `/dev/sda` and reboots into it. Pick `-mode boot` instead if Secure Boot is disabled and the operator wants a kexec'd Talos that doesn't touch the disk (useful for testing, but won't survive reboot). 
+ +### Path B — fresh ISO / PXE / cloud image + +Download the cozystack-tuned installer image, write to USB or serve via PXE: + +```bash +docker pull ghcr.io/cozystack/cozystack/talos: +# Extract the metal-amd64.raw.xz artefact from the OCI image — see Cozystack docs. +``` + +Cloud images: use the `nocloud` profile from `~/git/github.com/cozystack/cozystack/packages/core/talos/images/talos/profiles/nocloud.yaml` as a base. + +Either way: every node should boot into Talos with the cozystack-tuned extensions before Step 2. + +## Step 2 — Generate machine-config with talm + +```bash +mkdir -p ~/cozystack-cluster && cd ~/cozystack-cluster + +# Set the right cozystack preset version +helm repo add cozystack https://charts.cozystack.io +helm repo update + +# Generate cluster-wide secrets and per-node configs +talm init \ + --preset cozystack \ + --endpoint https://:6443 \ + --output-dir ./ +``` + +`talm init` walks the operator through: + +- Cluster name, network CIDRs (must match what `cluster-install` will use later — Pod 10.244.0.0/16 if kubeadm-compatible CIDRs, or whatever the operator picks). +- Node discovery — `talm` reaches out to maintenance Talos and reads NIC / disk / NUMA info. +- Output: per-node YAML files under `./nodes/.yaml` and a cluster-wide `secrets.yaml`. + +## Step 3 — Review and edit per-node configs + +```bash +$EDITOR nodes/cp1.yaml +$EDITOR nodes/cp2.yaml +$EDITOR nodes/w1.yaml +``` + +Things to verify (the `cozystack` preset sets sensible defaults, but verify anyway): + +- `machine.install.image` points at `ghcr.io/cozystack/cozystack/talos:`. +- `machine.kernel.modules` lists drbd / zfs / spl / openvswitch / vfio_pci / vfio_iommu_type1. +- `machine.files` contains `/etc/lvm/lvm.conf` overwrite with the cozystack global_filter. +- For CP nodes: `machine.type: controlplane`, with optional `machine.network.interfaces[].vip` for HA. 
+ +## Step 4 — Apply machine-config + +```bash +for n in cp1 cp2 cp3 w1; do + talm apply -f nodes/$n.yaml --mode=auto +done +``` + +`--mode=auto` lets talm pick the safest reboot mode per node (try / staged / reboot). On first apply every node will reboot into the final configuration. + +## Step 5 — Bootstrap etcd (CP1 only, first time only) + +```bash +talosctl --talosconfig ./talosconfig --nodes <cp1-ip> bootstrap +``` + +This brings up etcd and Kubernetes on CP1; the other CPs join automatically. + +## Step 6 — Fetch kubeconfig + +```bash +talosctl --talosconfig ./talosconfig --nodes <cp1-ip> kubeconfig ~/.kube/cozystack-lab.yaml +kubectl --kubeconfig ~/.kube/cozystack-lab.yaml get nodes +``` + +When all nodes show `Ready`, return to `cozystack:talos-bootstrap` and say "done". The skill runs Phase 11 verification and writes `status.talos-bootstrap.completed_at`. + +## Working nodes/.yaml shape (cozystack v1.12+) + +The cozystack talm preset auto-emits `HostnameConfig` and `LinkConfig` as v1alpha1 multidoc fragments, not legacy `machine.network.interfaces[]` / `machine.network.hostname` keys. Mixing the two shapes makes `talm template` fail with one of: + +```text +the multi-doc renderer cannot translate legacy machine.network.interfaces[] from the running MachineConfig. +Move the interfaces, vlans, and addresses below into per-node body overlays as v1.12 typed documents +(LinkConfig, VLANConfig, BondConfig, RouteConfig) +``` + +```text +static hostname is already set in v1alpha1 config — talm.discovered.hostname auto-emits HostnameConfig +``` + +Working anchor body for a node that needs a static hostname + a VLAN-tagged interface for the VIP overlay (typical cozystack OCI / metal setup): + +```yaml +# nodes/node0.yaml — multidoc, no legacy keys +machine: + # Required: which disk to install Talos to. 
+ install: + disk: /dev/sda +--- +apiVersion: v1alpha1 +kind: HostnameConfig +name: hostname +spec: + hostname: node0 +--- +apiVersion: v1alpha1 +kind: LinkConfig +name: ens5 +spec: + name: ens5 + up: true + mtu: 9000 + addresses: + - 10.17.100.10/24 +``` + +Three things that trip operators porting from legacy schema: + +- **No `machine.network.hostname`** — talm chart auto-emits `HostnameConfig` from the preset's `talm.discovered.hostname` (which the operator's `values.yaml` overrides per-node). Setting it again under `machine.network.*` is a duplicate. +- **No `machine.network.interfaces[]`** — every interface is its own `LinkConfig` document. `VLANConfig`, `BondConfig`, `RouteConfig` follow the same multidoc shape. +- **`apiVersion: v1alpha1`** on every multidoc fragment. Skipping it makes talm reject the document silently in some versions. + +The preset's `_helpers.tpl` is the source of truth; check it when in doubt. + +## talm flags: explicit -e / -n every time + +`talm apply` parses the `endpoints=[...]` modeline at the top of `nodes/.yaml`. `talm template -i` (insecure / pre-machineconfig) does **not** — it needs explicit `--endpoints` and `--nodes` flags, otherwise it fails with `failed to determine endpoints`. + +The skill always passes `-e`/`-n` explicitly rather than relying on modeline auto-resolution. Pattern: + +```bash +talm template \ + --insecure \ + --talosconfig "$CONFIG_DIR/talosconfig" \ + --endpoints "$NODE_IP" \ + --nodes "$NODE_IP" \ + --file "$CONFIG_DIR/nodes/$NODE_NAME.yaml" +``` + +If the talm error message says `failed to determine endpoints` and the operator already has reachable nodes, the answer is not `--offline` — it's explicit `-e/-n`. + +## TALOSCONFIG env doesn't persist between commands + +`export TALOSCONFIG=...` in one shell invocation doesn't carry over to the next. Skills running through Claude's Bash tool spawn a fresh shell per call, so the env variable disappears. 
+ +Always pass `--talosconfig "$CONFIG_DIR/talosconfig"` explicitly on every `talm` / `talosctl` invocation. The skill does this throughout; if you're running commands manually for debugging, do the same. + +## Common pitfalls + +- **Wrong image tag** — `talm` default points at upstream `ghcr.io/siderolabs/installer:`, **not** at the cozystack-tuned image. The `cozystack` preset overrides this; double-check `machine.install.image` if you used a different preset. +- **Cluster domain ≠ `cozy.local`** — Cozystack hard-codes it. talm cozystack preset sets it; verify `cluster.discovery.registries` if you customised. +- **HA quorum** — embedded etcd needs an odd CP count (1, 3, 5). Don't run two-CP "HA". +- **Floating IP** — the `cozystack` preset supports a Layer-2 VIP. Set it in `values.yaml` before `talm init` to avoid having to re-render configs. + +## When v2 of this skill ships + +v2 will: + +- Drive `boot-to-talos` per node over SSH. +- Drive `talm init` / `talm apply` programmatically. +- Manage the talosconfig file alongside `state.yaml`. + +Until then, this checklist is the contract. diff --git a/plugins/cozystack/skills/talos-reset/SKILL.md b/plugins/cozystack/skills/talos-reset/SKILL.md new file mode 100644 index 0000000..ccee4aa --- /dev/null +++ b/plugins/cozystack/skills/talos-reset/SKILL.md @@ -0,0 +1,311 @@ +--- +name: talos-reset +description: Cloud-provider recovery helper for Talos nodes in an unrecoverable state. Use when the API-TLS handshake is broken (cert-SAN trap caught before the guardrail, broken machine-config, lost talosconfig, accidental wipe). Wraps the provider CLI (oci / aws / gcloud / hcloud) to terminate the instance, preserve attached block volumes, VLAN/secondary VNIC attachments, NSG rules, then relaunch from the same cozystack-tuned image, re-attach the disks + VNICs, and hand the freshly-bootstrapped nodes back to `cozystack:talos-bootstrap` for re-bootstrap from maintenance mode. 
Does NOT touch already-installed Cozystack — that's `cozystack:debug` territory. Read-only against any cluster; mutations only via the provider CLI with explicit per-node operator approval. +argument-hint: "[--config-dir=] [--provider=oci|aws|gcp|hetzner] [--nodes=] [--preserve-disks] [--preserve-vnics]" +--- + +# cozystack:talos-reset + +Work in reasoning mode. Use the phrasing `cozystack:talos-reset`. Announce phase transitions: `cozystack:talos-reset Phase N — `. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (or read from `/.state.yaml` `operator_language` when the wizard chain is in progress). Code identifiers, commands, file paths, and any text destined for GitHub stay canonical regardless of operator language. + +## When to use this skill + +Use `cozystack:talos-reset` when **none** of the in-cluster paths work: + +- `talosctl reset` requires a valid TLS handshake — useless when cert-SAN is wrong, talosconfig is lost, or `apid` doesn't trust the workstation. +- `talosctl upgrade --image ` needs the same TLS path. +- `kubectl drain` + `kubectl delete node` only works if `kube-apiserver` is reachable. +- A cluster that never reached bootstrap (`talosctl bootstrap` failed) has no etcd, no apiserver, no kubectl path at all. + +When none of those apply, the node has to be re-imaged at the cloud-provider layer: terminate the instance, preserve the disks (so a re-bootstrap with `--preserve` keeps the zpool), and relaunch from the cozystack-tuned image. This skill orchestrates that without losing the operator's existing block volumes, VNIC attachments, or NSG rules. + +**Not for**: + +- Cozystack-side problems (HRs failing, dashboard unreachable, certs not issuing) → use `cozystack:debug`. 
+- Talos upgrades on a healthy cluster → use `talosctl upgrade --preserve` directly; no need for cloud-provider intervention. +- Fresh installs with no pre-existing state → use `cozystack:wizard` → `cozystack:talos-bootstrap`. + +## Core principles + +- Match the operator's natural language. Read `/.state.yaml` `operator_language` if available, detect otherwise. +- Provider CLI is the operator's tool. The skill prints exact `oci`, `aws`, `gcloud`, or `hcloud` commands; the operator runs them. The skill never directly invokes the provider CLI without operator approval — every mutation is shown first. +- Preserve by default. `--preserve-disks` and `--preserve-vnics` default true. The skill explicitly lists what it will preserve and what gets recreated. +- Layer-pure operator output. The skill never says "returning control to wizard" — whoever invoked the skill figures out next steps from the printed NOTES. +- One node at a time. Sequential terminate+relaunch preserves etcd quorum on 3+ CP clusters. For total cluster recovery (all CPs broken), the skill walks the operator through losing+regaining quorum explicitly. +- Verify before declaring success. Post-relaunch the node must reach Talos maintenance mode (port 50000 responds to `talosctl get disks --insecure`) AND retain the preserved disks (existing zpool surfaced by `talosctl get disks`). + +## Phase 1 — Read state and scope + +Read `/.state.yaml`. Required: `config_dir`, `inventory.nodes` (with per-node `public_ip` / `internal_ip` / `name`), `intent_hints.platform`. If invoked without a state file (operator hand-bootstrapped, never used the wizard), interview: + +- Provider (`oci` / `aws` / `gcp` / `hetzner`). +- Node list with provider-specific IDs (OCI: instance OCIDs; AWS: instance IDs; GCP: instance names + zone; Hetzner: server numbers). +- For each node: which block volumes / VNICs / NSG attachments to preserve. 
+ +If `--nodes` was passed, scope to that subset; otherwise scope to **all** nodes that match the failure signature (operator describes the symptom, skill infers which nodes are affected). + +Persist scope to `state.talos_reset.scope`. + +## Phase 2 — Workstation prep + +Verify the relevant provider CLI is installed and authenticated: + +```bash +# OCI +oci --version && oci iam region list --output table + +# AWS +aws --version && aws sts get-caller-identity + +# GCP +gcloud --version && gcloud auth list --filter=status:ACTIVE + +# Hetzner +hcloud version && hcloud context list +``` + +Refuse for any provider where the CLI is missing or not authenticated — point the operator at the install instructions. Do not attempt to wrap with `gh auth login` / `oci setup` / `aws configure` — credential setup is operator territory. + +## Phase 3 — Snapshot current cloud state + +Before terminating anything, capture the **exact** current state of every in-scope node so re-attach in Phase 6 can restore it: + +```bash +# Example: OCI +oci compute instance get --instance-id "$OCID" > "$CONFIG_DIR/talos-reset/$NODE_NAME/instance.json" +oci compute volume-attachment list --instance-id "$OCID" \ + > "$CONFIG_DIR/talos-reset/$NODE_NAME/volume-attachments.json" +oci compute vnic-attachment list --instance-id "$OCID" \ + > "$CONFIG_DIR/talos-reset/$NODE_NAME/vnic-attachments.json" +# Network security group memberships — fetched per-VNIC +for vnic_ocid in $(jq --raw-output '.data[].id' < "$CONFIG_DIR/talos-reset/$NODE_NAME/vnic-attachments.json"); do + oci network vnic get --vnic-id "$vnic_ocid" \ + > "$CONFIG_DIR/talos-reset/$NODE_NAME/vnic-${vnic_ocid##*.}.json" +done +``` + +Equivalent shapes for AWS / GCP / Hetzner — see `references/provider-cli.md` for the verbatim command sets. + +The snapshot directory `/talos-reset//` becomes the source of truth for Phase 6. Operator can audit `cat instance.json | jq '.data.shape'` before approving any destructive step. 
+ +## Phase 4 — Present the reset plan (STOP GATE 1) + +Surface the consolidated plan in a single screen: + +```text +cozystack:talos-reset — plan + + provider: oci + scope: 3 nodes (all CPs) + signature: cert-SAN trap detected — workstation cannot reach apiserver + + per-node plan (sequential, etcd quorum maintained: 2/3 stay up while 1 resets): + + node0 (instance ocid1.instance.oc1.iad.abc123) + preserve: block volume ocid1.volume.oc1.iad.xyz789 (254 GiB, zpool 'data') + secondary VNIC ocid1.vnic.oc1.iad.def456 (VLAN 4000, VIP-link static 10.X.100.11) + NSG memberships cozystack-cp (2 rules: 6443/tcp, 50000/tcp) + terminate: oci compute instance terminate --instance-id --preserve-boot-volume false + ↑ deletes ephemeral boot disk; data disk above is preserved separately + relaunch: oci compute instance launch \ + --shape VM.Standard3.Flex --shape-config '{...}' \ + --image-id \ + --launch-mode PARAVIRTUALIZED \ + --availability-domain --subnet-id \ + --hostname-label node0 \ + ... + reattach: oci compute volume-attachment attach \ + --instance-id --volume-id --type iscsi + oci compute vnic-attachment create \ + --instance-id --create-vnic-details file://vnic-secondary.json + verify: wait until 'talosctl get disks --insecure --nodes ' returns + AND existing zpool 'data' visible in disk list + + node1: + node2: + + what gets lost: + - ephemeral boot volume (Talos rootfs — replaced from cozystack-tuned image) + - any public IP NOT covered by a Reserved IP (OCI ephemeral IPs change on relaunch; + Phase 4.5 NAT-signature research in the next bootstrap will re-derive certSANs + from the NEW public IPs — old certs are discarded with the boot volume anyway) + + what gets preserved: + - data block volumes (and the zpool on them — re-bootstrap with --preserve) + - secondary VNIC attachments (VLAN VIP-link static IPs survive) + - NSG memberships + - block-volume IQN/auth — re-attach uses the same volume OCID + + options: + - Proceed (sequential, ~5 min per node, ~15 min 
total for 3 nodes) + - Edit plan — pick a different subset of nodes / preserve options + - Cancel +``` + +`Proceed` is the gate; the skill does NOT auto-execute any provider CLI until this answer. + +## Phase 5 — Terminate (per node, sequential) + +For each in-scope node, in inventory order: + +```bash +# 1. (CP only on multi-CP clusters) drain the node from the cluster's perspective if apiserver still works +if kubectl --context "$CTX" get node "$NODE_NAME" 2>/dev/null | grep -q Ready; then + kubectl --context "$CTX" drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --timeout=120s || true + kubectl --context "$CTX" delete node "$NODE_NAME" || true +fi + +# 2. Terminate the instance, NOT preserving the boot volume +oci compute instance terminate \ + --instance-id "$OCID" \ + --preserve-boot-volume false \ + --force +``` + +Wait for instance state to reach `TERMINATED` before proceeding: + +```bash +until [ "$(oci compute instance get --instance-id "$OCID" \ + --query 'data."lifecycle-state"' --raw-output 2>/dev/null)" = "TERMINATED" ]; do + sleep 10 +done +``` + +Surface termination confirmation explicitly — operator sees the lifecycle transition before relaunch starts. + +## Phase 6 — Relaunch + reattach (per node) + +```bash +# 1. Launch a new instance from the cozystack-tuned Custom Image +NEW_OCID=$(oci compute instance launch \ + --shape "$SHAPE" --shape-config "$SHAPE_CONFIG" \ + --image-id "$COZYSTACK_TUNED_IMAGE_OCID" \ + --launch-mode PARAVIRTUALIZED \ + --availability-domain "$AD" \ + --subnet-id "$PRIMARY_SUBNET" \ + --hostname-label "$NODE_NAME" \ + --metadata "$METADATA" \ + --query 'data.id' --raw-output) + +# 2. Wait for RUNNING + maintenance mode reachable +until [ "$(oci compute instance get --instance-id "$NEW_OCID" --query 'data."lifecycle-state"' --raw-output)" = "RUNNING" ]; do + sleep 10 +done + +# 3. 
Discover the NEW public IP and update state +NEW_PUBLIC_IP=$(oci compute instance list-vnics --instance-id "$NEW_OCID" \ + --query 'data[0]."public-ip"' --raw-output) +yq --inplace "(.inventory.nodes[] | select(.name == \"$NODE_NAME\") | .public_ip) = \"$NEW_PUBLIC_IP\"" "$STATE_FILE" + +# 4. Re-attach preserved data volume +oci compute volume-attachment attach \ + --instance-id "$NEW_OCID" \ + --volume-id "$PRESERVED_VOLUME_OCID" \ + --type iscsi --wait-for-state ATTACHED + +# 5. Re-attach secondary VNIC (carries VIP-link static IPv4) +oci compute vnic-attachment create \ + --instance-id "$NEW_OCID" \ + --create-vnic-details file://"$CONFIG_DIR/talos-reset/$NODE_NAME/vnic-secondary.json" +``` + +Provider-specific equivalents documented in `references/provider-cli.md`. + +## Phase 7 — Verify maintenance mode + preserved disks + +```bash +# Talos API responds (port 50000) — get disks works on Talos 1.12+ in maintenance +# Test talosctl's own exit status — piping into `head` would make the loop condition always succeed +until talosctl get disks --insecure --nodes "$NEW_PUBLIC_IP" >/dev/null 2>&1; do + echo "waiting for $NEW_PUBLIC_IP to reach maintenance mode..." + sleep 20 +done + +# Verify the preserved data disk + existing zpool are visible +talosctl get disks --insecure --nodes "$NEW_PUBLIC_IP" --output yaml \ + | yq 'select(.spec.size > 100000000000)' # filter for the data disk by size +# Expect: /dev/sdb with the zpool-signature recognisable. Talos doesn't auto-mount user disks, +# so the zpool isn't imported — that's fine, talos-bootstrap Phase 8 + cluster-install Phase 5.5 +# pre-existing-data check handles import. +``` + +If the preserved zpool data is gone (volume re-attached but blank), abort and surface — operator's preserved volume OCID was wrong, do NOT continue with the remaining nodes. + +## Phase 8 — Update state and hand off + +Write to `/.state.yaml`: + +```yaml +talos_reset: + scope: ["node0", "node1", "node2"] + completed_at: + preserved: + - node: node0 + volume: ocid1.volume.oc1.iad.xyz789 + vnics: ["ocid1.vnic.oc1.iad.def456"] + - ... 
+ ip_changes: + - node: node0 + old_public_ip: 198.51.100.10 + new_public_ip: 198.51.100.42 +status: + talos-reset: + completed_at: +``` + +Reset `status.talos-bootstrap` and `status.cluster-install` — those skills will need to re-run, but they should NOT re-do the heavy work on what's already on disk (zpool exists, talm.key + secrets.yaml exist): + +```yaml +status: + talos-bootstrap: + dispatched_at: null + completed_at: null + failed_at: null + cluster-install: + # left untouched if cluster-install had already run — its post-install state lives in the cluster, + # not on disk; after re-bootstrap the kubeconfig will reach the recovered cluster + ... +``` + +## Phase 9 — NOTES + handoff + +```text +cozystack:talos-reset — complete + + reset: 3 nodes (node0, node1, node2) + duration: 14 min + preserved: block volumes + secondary VNICs intact + ip changes: node0 198.51.100.10 → 198.51.100.42 + node1 198.51.100.11 → 198.51.100.43 + node2 198.51.100.12 → 198.51.100.44 + + state updates: + inventory.nodes[*].public_ip — updated to new public IPs + status.talos-bootstrap — reset (will re-run) + +next: invoke /cozystack:talos-bootstrap — it will detect maintenance mode + on the new instances, skip the boot-method picker, run talm init + + apply against the recovered nodes. zpool 'data' will be picked up + by cluster-install Phase 5.5 pre-existing-data check (no wipe needed). +``` + +## Guardrails + +- NEVER call any provider CLI mutation without operator approval. The skill prints the exact command and waits for `Proceed`. +- NEVER terminate more than one node at a time on multi-CP clusters — etcd quorum requires ⌊N/2⌋+1 nodes up (2 of 3, 3 of 5). Sequential reset preserves quorum. +- NEVER assume the new public IP equals the old. OCI / AWS / GCP ephemeral IPs change on relaunch unless a Reserved IP / EIP is in use. The skill updates `state.inventory.nodes[*].public_ip` from the new instance's actual VNIC data. 
+- NEVER reset a node whose data volume isn't part of a preserved volumes list — accidentally wiping a zpool is unrecoverable from this skill. +- NEVER touch in-cluster state (HelmReleases, namespaces, ConfigMaps). Cozystack-side problems are `cozystack:debug` territory. +- ALWAYS verify the preserved disk is intact AND the new instance reaches maintenance mode before continuing to the next node in the batch. +- ALWAYS surface what changes between old and new instance (public IP, instance OCID, anything else operator-visible). + +## References + +- `references/provider-cli.md` — verbatim CLI command sets per provider (oci / aws / gcloud / hcloud) for terminate, launch, volume-attach, vnic-attach. + +Cross-references: + +- `/cozystack:wizard` — the orchestrator. If `talos-reset` completed, wizard's `--resume` will re-dispatch `talos-bootstrap` and pick up from there. +- `/cozystack:talos-bootstrap` — re-bootstrap entry point post-reset. +- `/cozystack:debug` — for in-cluster failures that don't require cloud-layer reset. diff --git a/plugins/cozystack/skills/talos-reset/references/provider-cli.md b/plugins/cozystack/skills/talos-reset/references/provider-cli.md new file mode 100644 index 0000000..6990ab9 --- /dev/null +++ b/plugins/cozystack/skills/talos-reset/references/provider-cli.md @@ -0,0 +1,335 @@ +# Provider CLI reference for cozystack:talos-reset + +Per-provider command sets the skill emits in Phases 3–6. Each provider section lists: snapshot (Phase 3), terminate (Phase 5), relaunch and reattach (Phase 6). Operator runs them; skill never auto-executes without approval. + +All examples use `<placeholder>` for runtime-resolved values. The skill substitutes from `state.inventory.nodes[]`, snapshot files under `<config_dir>/talos-reset/<node>/`, and provider-specific lookups. 
+ +## OCI (Oracle Cloud Infrastructure) + +### Snapshot (Phase 3) + +```bash +NODE= +OCID= +SNAP="$CONFIG_DIR/talos-reset/$NODE" +mkdir -p "$SNAP" + +oci compute instance get --instance-id "$OCID" > "$SNAP/instance.json" +oci compute volume-attachment list --instance-id "$OCID" > "$SNAP/volume-attachments.json" +oci compute vnic-attachment list --instance-id "$OCID" > "$SNAP/vnic-attachments.json" + +for vnic_ocid in $(jq --raw-output '.data[].id' < "$SNAP/vnic-attachments.json"); do + oci network vnic get --vnic-id "$vnic_ocid" > "$SNAP/vnic-${vnic_ocid##*.}.json" +done +``` + +### Terminate (Phase 5) + +```bash +oci compute instance terminate \ + --instance-id "$OCID" \ + --preserve-boot-volume false \ + --force + +# Wait for TERMINATED +until [ "$(oci compute instance get --instance-id "$OCID" --query 'data."lifecycle-state"' --raw-output 2>/dev/null)" = "TERMINATED" ]; do + sleep 10 +done +``` + +### Relaunch (Phase 6) + +```bash +SHAPE=$(jq --raw-output '.data.shape' < "$SNAP/instance.json") +SHAPE_CONFIG=$(jq --compact-output '.data."shape-config"' < "$SNAP/instance.json") +AD=$(jq --raw-output '.data."availability-domain"' < "$SNAP/instance.json") +PRIMARY_SUBNET=$(jq --raw-output '.data."subnet-id" // (.data[0]."subnet-id")' < "$SNAP/vnic-attachments.json") + +NEW_OCID=$(oci compute instance launch \ + --shape "$SHAPE" \ + --shape-config "$SHAPE_CONFIG" \ + --image-id "$COZYSTACK_TUNED_IMAGE_OCID" \ + --launch-mode PARAVIRTUALIZED \ + --availability-domain "$AD" \ + --subnet-id "$PRIMARY_SUBNET" \ + --hostname-label "$NODE" \ + --wait-for-state RUNNING \ + --query 'data.id' --raw-output) + +NEW_PUBLIC_IP=$(oci compute instance list-vnics --instance-id "$NEW_OCID" \ + --query 'data[0]."public-ip"' --raw-output) +``` + +### Reattach (Phase 6) + +```bash +# Data volume (single preserved per node in the canonical cozystack layout) +PRESERVED_VOLUME=$(jq --raw-output '.data[] | select(."is-pv-encryption-in-transit-enabled" == false) | ."volume-id"' \ + < 
"$SNAP/volume-attachments.json" | head -1) + +oci compute volume-attachment attach \ + --instance-id "$NEW_OCID" \ + --volume-id "$PRESERVED_VOLUME" \ + --type iscsi --wait-for-state ATTACHED + +# Secondary VNIC (carries VIP-link static IPv4 on VLAN) +SECONDARY_VNIC=$(jq --compact-output '.data[] | select(."is-primary" == false)' \ + < "$SNAP/vnic-attachments.json" | head -1) + +if [ -n "$SECONDARY_VNIC" ]; then + VLAN_ID=$(jq --raw-output '."vlan-id"' <<<"$SECONDARY_VNIC") + cat > "$SNAP/vnic-secondary.json" <<EOF +{ "vlanId": "$VLAN_ID" } +EOF + oci compute vnic-attachment create \ + --instance-id "$NEW_OCID" \ + --create-vnic-details file://"$SNAP/vnic-secondary.json" +fi +``` + +OCI-specific gotchas: + +- The VIP-link static IPv4 on the secondary VNIC (`addresses: [<static-ip>]`) must be re-applied via Talos machine-config; the skill does NOT preserve the per-VNIC IP assignment, only the VLAN membership. + +## AWS (Elastic Compute Cloud) + +### Snapshot (Phase 3) + +```bash +NODE= +IID= +SNAP="$CONFIG_DIR/talos-reset/$NODE" +mkdir -p "$SNAP" + +aws ec2 describe-instances --instance-ids "$IID" > "$SNAP/instance.json" +aws ec2 describe-volumes --filters "Name=attachment.instance-id,Values=$IID" \ + > "$SNAP/volumes.json" +aws ec2 describe-network-interfaces \ + --filters "Name=attachment.instance-id,Values=$IID" \ + > "$SNAP/enis.json" +``` + +### Terminate (Phase 5) + +```bash +# Detach the data volume FIRST — AWS does not let you preserve a non-root volume +# attached to a terminated instance unless DeleteOnTermination is false. 
+DATA_VOLUMES=$(jq --raw-output '.Volumes[] | select(.Attachments[0].Device != "/dev/sda1" and .Attachments[0].Device != "/dev/xvda") | .VolumeId' < "$SNAP/volumes.json") + +for vol in $DATA_VOLUMES; do + aws ec2 detach-volume --volume-id "$vol" --force + aws ec2 wait volume-available --volume-ids "$vol" +done + +aws ec2 terminate-instances --instance-ids "$IID" +aws ec2 wait instance-terminated --instance-ids "$IID" +``` + +### Relaunch (Phase 6) + +```bash +INSTANCE_TYPE=$(jq --raw-output '.Reservations[0].Instances[0].InstanceType' < "$SNAP/instance.json") +SUBNET_ID=$(jq --raw-output '.Reservations[0].Instances[0].SubnetId' < "$SNAP/instance.json") +SG_IDS=$(jq --raw-output '[.Reservations[0].Instances[0].SecurityGroups[].GroupId] | join(",")' < "$SNAP/instance.json") +KEY_NAME=$(jq --raw-output '.Reservations[0].Instances[0].KeyName // ""' < "$SNAP/instance.json") +AZ=$(jq --raw-output '.Reservations[0].Instances[0].Placement.AvailabilityZone' < "$SNAP/instance.json") + +NEW_IID=$(aws ec2 run-instances \ + --image-id "$COZYSTACK_TUNED_AMI_ID" \ + --instance-type "$INSTANCE_TYPE" \ + --subnet-id "$SUBNET_ID" \ + --security-group-ids $SG_IDS \ + ${KEY_NAME:+--key-name "$KEY_NAME"} \ + --placement "AvailabilityZone=$AZ" \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$NODE}]" \ + --query 'Instances[0].InstanceId' --output text) + +aws ec2 wait instance-running --instance-ids "$NEW_IID" +NEW_PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$NEW_IID" \ + --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) +``` + +### Reattach (Phase 6) + +```bash +for vol_record in $(jq --compact-output '.Volumes[] | select(.Attachments[0].Device != "/dev/sda1" and .Attachments[0].Device != "/dev/xvda")' < "$SNAP/volumes.json"); do + VOL_ID=$(jq --raw-output '.VolumeId' <<<"$vol_record") + DEVICE=$(jq --raw-output '.Attachments[0].Device' <<<"$vol_record") + aws ec2 attach-volume --volume-id "$VOL_ID" --instance-id "$NEW_IID" --device 
"$DEVICE" +done + +# Secondary ENIs (typically not used in the canonical cozystack AWS layout — primary ENI +# carries everything via SG rules — but if present: +for eni_record in $(jq --compact-output '.NetworkInterfaces[] | select(.Attachment.DeviceIndex > 0)' < "$SNAP/enis.json"); do + ENI_ID=$(jq --raw-output '.NetworkInterfaceId' <<<"$eni_record") + DEV_IDX=$(jq --raw-output '.Attachment.DeviceIndex' <<<"$eni_record") + aws ec2 attach-network-interface --network-interface-id "$ENI_ID" \ + --instance-id "$NEW_IID" --device-index "$DEV_IDX" +done +``` + +AWS-specific gotchas: + +- `aws ec2 terminate-instances` deletes any volume with `DeleteOnTermination: true` (the root volume by default). Detach data volumes FIRST. +- Elastic IPs survive terminate but disassociate. Re-associate post-relaunch: `aws ec2 associate-address --instance-id "$NEW_IID" --allocation-id "$EIP_ALLOC_ID"`. +- ENI security-group memberships travel with the ENI on re-attach. + +## GCP (Google Compute Engine) + +### Snapshot (Phase 3) + +```bash +NODE= +ZONE= +SNAP="$CONFIG_DIR/talos-reset/$NODE" +mkdir -p "$SNAP" + +gcloud compute instances describe "$NODE" --zone="$ZONE" --format=json > "$SNAP/instance.json" +gcloud compute disks list --filter="users:zones/$ZONE/instances/$NODE" --format=json \ + > "$SNAP/disks.json" +``` + +### Terminate (Phase 5) + +```bash +# Detach data disks (keep them around — delete defaults to false on attached disks not marked autoDelete) +DATA_DISKS=$(jq --raw-output '.[] | select(.boot != true) | .name' < "$SNAP/disks.json") + +for disk in $DATA_DISKS; do + gcloud compute instances detach-disk "$NODE" --disk="$disk" --zone="$ZONE" +done + +gcloud compute instances delete "$NODE" --zone="$ZONE" --quiet +``` + +### Relaunch (Phase 6) + +```bash +MACHINE_TYPE=$(jq --raw-output '.machineType | split("/")[-1]' < "$SNAP/instance.json") +NETWORK=$(jq --raw-output '.networkInterfaces[0].network | split("/")[-1]' < "$SNAP/instance.json") +SUBNET=$(jq --raw-output 
'.networkInterfaces[0].subnetwork | split("/")[-1]' < "$SNAP/instance.json") +TAGS=$(jq --raw-output '.tags.items // [] | join(",")' < "$SNAP/instance.json") + +gcloud compute instances create "$NODE" \ + --zone="$ZONE" \ + --machine-type="$MACHINE_TYPE" \ + --image="$COZYSTACK_TUNED_IMAGE" \ + --image-project="$IMAGE_PROJECT" \ + --network="$NETWORK" \ + --subnet="$SUBNET" \ + ${TAGS:+--tags="$TAGS"} \ + --no-restart-on-failure + +NEW_PUBLIC_IP=$(gcloud compute instances describe "$NODE" --zone="$ZONE" \ + --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +``` + +### Reattach (Phase 6) + +```bash +for disk in $DATA_DISKS; do + gcloud compute instances attach-disk "$NODE" --disk="$disk" --zone="$ZONE" +done +``` + +GCP-specific gotchas: + +- `gcloud compute instances delete` with `--keep-disks=data` preserves all non-boot disks automatically. Less manual detach dance than AWS. +- Public IPs change on delete+create unless a Reserved IP was promoted. +- Cloud NAT'd VMs do not have a public IP on the interface — the Phase 4.5 NAT-signature research in the next bootstrap handles certSANs. 
+ +## Hetzner Cloud + +### Snapshot (Phase 3) + +```bash +NODE= +SRV= +SNAP="$CONFIG_DIR/talos-reset/$NODE" +mkdir -p "$SNAP" + +hcloud server describe "$SRV" --output json > "$SNAP/server.json" +hcloud volume list --selector="server.id=$SRV" --output json > "$SNAP/volumes.json" +``` + +### Terminate (Phase 5) + +```bash +DATA_VOLUMES=$(jq --raw-output '.[].id' < "$SNAP/volumes.json") + +for vol in $DATA_VOLUMES; do + hcloud volume detach "$vol" +done + +hcloud server delete "$SRV" +``` + +### Relaunch (Phase 6) + +```bash +SERVER_TYPE=$(jq --raw-output '.server_type.name' < "$SNAP/server.json") +LOCATION=$(jq --raw-output '.datacenter.location.name' < "$SNAP/server.json") +SSH_KEYS=$(jq --raw-output '[.public_net.ssh_keys[].name] | join(",")' < "$SNAP/server.json") + +NEW_SRV=$(hcloud server create \ + --name "$NODE" \ + --type "$SERVER_TYPE" \ + --location "$LOCATION" \ + --image "$COZYSTACK_TUNED_IMAGE_NAME" \ + ${SSH_KEYS:+--ssh-key "$SSH_KEYS"} \ + --output json | jq --raw-output '.server.id') + +NEW_PUBLIC_IP=$(hcloud server describe "$NEW_SRV" --output json \ + | jq --raw-output '.public_net.ipv4.ip') +``` + +### Reattach (Phase 6) + +```bash +for vol in $DATA_VOLUMES; do + hcloud volume attach "$vol" --server "$NEW_SRV" +done + +# Hetzner doesn't have native VLAN secondary VNICs — vSwitch attachments live at the Network layer. +# If the server was on a Network, re-attach: +NETWORK_ID=$(jq --raw-output '.private_net[0].network // empty' < "$SNAP/server.json") +if [ -n "$NETWORK_ID" ]; then + PRIVATE_IP=$(jq --raw-output '.private_net[0].ip' < "$SNAP/server.json") + hcloud server attach-to-network "$NEW_SRV" --network "$NETWORK_ID" --ip "$PRIVATE_IP" +fi +``` + +Hetzner-specific gotchas: + +- Dedicated servers (Robot) are a different product entirely — `hcloud` only manages Cloud servers. For dedicated, the reset path involves Hetzner Robot's web UI (no usable CLI) — out of scope for this skill in v1; surface a manual checklist instead. 
+- vSwitch + VLAN setup is configured at the Network level on Hetzner Cloud — re-attach preserves VLAN membership. +- Hetzner doesn't have an "ephemeral vs reserved IP" distinction the same way; the new server gets a new public IPv4. RobotLB (if used) needs `ROBOTLB_HCLOUD_TOKEN` re-targeted at the new server IDs after relaunch. + +## Generic / unknown provider + +If the operator is on a provider the skill doesn't have a CLI section for, refuse with: + +```text +talos-reset — provider not supported + + detected: $PROVIDER + supported: oci / aws / gcp / hetzner (Hetzner Cloud only, not Robot) + + This skill needs provider-specific CLI orchestration to terminate + + preserve disks + relaunch + reattach. Without that, the safe + path is manual: + + 1. Snapshot your instance's volume-attachment + VNIC config via your + provider's console / CLI of choice. + 2. Detach data volumes (do NOT delete on terminate). + 3. Terminate the instance. + 4. Create a new instance from the cozystack-tuned image. + 5. Re-attach the data volumes + any secondary VNICs. + 6. Update /.state.yaml inventory.nodes[*].public_ip + with the new IPs. + 7. Re-invoke /cozystack:talos-bootstrap. + + PRs welcome to add your provider — see plugins/cozystack/skills/talos-reset/references/provider-cli.md. +``` diff --git a/plugins/cozystack/skills/ubuntu-bootstrap/SKILL.md b/plugins/cozystack/skills/ubuntu-bootstrap/SKILL.md new file mode 100644 index 0000000..0b06b6e --- /dev/null +++ b/plugins/cozystack/skills/ubuntu-bootstrap/SKILL.md @@ -0,0 +1,376 @@ +--- +name: ubuntu-bootstrap +description: Bootstrap a Cozystack-ready Kubernetes cluster on Ubuntu/Debian nodes by wrapping the upstream cozystack/ansible-cozystack playbooks. 
Drives inventory interview, renders inventory.yml into the operator's cluster config directory, runs prepare-sudo.yml (Ubuntu 26.04+ sudo-rs workaround), prepare-ubuntu.yml (packages, kernel modules, sysctl, services, multipath blacklist, DRBD-DKMS for Secure Boot, ZFS, KubeVirt modules), and the k3s.orchestration collection (HA k3s with cozystack-compatible flags — flannel off, kube-proxy off, traefik off, cluster-domain cozy.local). Stops after k3s is up and kubeconfig is retrieved; the actual Cozystack install is done by `cozystack:cluster-install` in the next step. All artifacts (inventory.yml, kubeconfig, state) live in the cluster config directory so operators can manage them as code in their own git workflow. Use as the first chain step when `cozystack:wizard` picks "Bare-metal generic Linux, no k8s yet". +argument-hint: "[--config-dir=] [--ansible-cozystack-dir=] [--skip-sudo-rs-workaround] [--k3s-version=]" +--- + +# cozystack:ubuntu-bootstrap + +Work in reasoning mode. Use the phrasing `cozystack:ubuntu-bootstrap`. Announce phase transitions: `cozystack:ubuntu-bootstrap Phase N — `. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (or read from `/.state.yaml` `operator_language` when the wizard chain is in progress). Code identifiers, commands, file paths, and any text destined for GitHub stay canonical. Ansible task names from upstream playbooks come through in English by design — don't translate them. 
+ +## Why this is an ansible wrapper, not a hand-rolled SSH flow + +`cozystack/ansible-cozystack/examples/ubuntu/` already implements every node-prep step Cozystack needs — and that list is non-obvious and broad: 11 sysctl keys, 8 kernel modules across required / optional / DRBD / ZFS / KubeVirt categories, LINBIT PPA wiring for drbd-dkms on Secure Boot, multipath blacklist for `drbd*` devnodes, iptables flush for cloud images that REJECT non-SSH traffic, Ubuntu 26.04 sudo-rs workaround. A hand-rolled SSH shell skill drifts from this list the moment ansible-cozystack adds anything new. Wrapping the playbooks keeps `ubuntu-bootstrap` aligned by reference. + +The wrapper does **not** call the cozystack-install step from upstream `site.yml` — that lives in `cozystack:cluster-install` so the storage / extractedprism / OIDC / domain logic stays in one place. The skill renders an inventory with `cozystack_create_platform_package: false` to enforce this. + +## Cluster config directory + +Every artifact lives under `/`. Default resolution (read from `state.config_dir` written by `cozystack:wizard`): + +``` +/ + .gitignore # skill-managed: excludes secrets + state + .state.yaml # wizard chain state (read+write by every skill) + inventory.yml # ansible inventory rendered by this skill + kubeconfig.yaml # k3s kubeconfig after Phase 8 (server URL rewritten) +``` + +The skill never touches `git init` / `git add` / `git commit` — operating on git is the operator's call. The `.gitignore` is just a text file with sensible exclusions; harmless when there is no git repo, ready when the operator decides to make one. + +## Core principles + +- Match the operator's natural language. Read from `/.state.yaml` `operator_language` (set by `cozystack:wizard` Phase 0) or detect from prior messages when invoked directly. Use it in prompts, AskUserQuestion options, summaries, and gates. Code identifiers, commands, file paths, and GitHub-public text stay in their canonical form. 
Ansible task names from the upstream playbooks come through in English by design — don't translate them. +- One valid path → just do it. Once the operator approved the rendered `inventory.yml` in Phase 4, the skill runs `ansible-playbook prepare-sudo.yml`, `prepare-ubuntu.yml`, and `k3s.orchestration.site` back-to-back without per-playbook re-confirmation. Approval gates remain only for (a) inventory shape in Phase 4 (operator picks node count, roles, SSH user), (b) destructive recovery paths (resetting a half-installed cluster). The "approve OS prep on all nodes / approve per-node" choice in Phase 7 collapses to a single Apply — the operator already approved the inventory. +- Front-load the interview. **Every question the skill might ask in any phase lives in Phase 4** (inventory render): node IPs, roles, ssh_user, ssh_key path, k3s_version (with default), VIP if HA, `cozystack_flush_iptables` for cloud images, `cozystack_enable_zfs` / `cozystack_enable_drbd_dkms` overrides, alternate `cozystack_ubuntu_extra_packages` for non-stock kernels. `intent_hints` from wizard Phase 0 pre-fills wherever possible. Phases 6–8 (the three playbook runs) consume the inventory and never re-prompt; on any ansible failure the skill surfaces it and stops, but does not interactively ask which playbook step to skip mid-batch. +- Layer-pure operator output. The skill never says "returning control to wizard", "the wizard will dispatch next", or any other orchestration commentary in the **operator-facing** summary. Whoever invoked the skill (a human directly, or the wizard's dispatch loop) figures out what's next on their own. Internal SKILL.md references to `cozystack:wizard` are fine for documentation; `wizard` does not appear in any text shown to the operator. +- Operator-driven SSH via ansible. No direct shell-over-ssh from the skill. +- Per-step gate. Operator approves each `ansible-playbook` invocation before it runs. +- Idempotent re-run. 
Ansible itself is idempotent; the skill re-runs without harm. +- Stop after k3s is up. The next step is `cozystack:cluster-install`, not bundled here. +- Trust ansible's exit code. Non-zero = abort, surface the ansible output verbatim. +- Cluster config dir as state. Don't sprinkle artifacts across `/tmp`; the operator should be able to `tar`, `rsync`, or `git push` the directory and reproduce the cluster shape. + +## Phase 1 — Read state and locate ansible-cozystack + +Read `/.state.yaml`. Required keys: `config_dir`, `inventory.nodes`, `inventory.ssh_user`, `inventory.ssh_key`. If invoked without a state file (direct invocation, not via wizard), interview them inline. + +Find ansible-cozystack on the workstation: + +1. `--ansible-cozystack-dir=` — explicit override; verify it contains `examples/ubuntu/site.yml`. +2. `~/git/github.com/cozystack/ansible-cozystack/` — default checkout location. +3. If neither exists, instruct: + + ```bash + mkdir -p ~/git/github.com/cozystack + git clone https://github.com/cozystack/ansible-cozystack.git \ + ~/git/github.com/cozystack/ansible-cozystack + ``` + + Refuse until the directory is present (cloning over network is operator's call). + +Verify versions: + +```bash +ansible --version # >= 2.15 +ansible-galaxy --version +ssh -V +``` + +The skill does not install ansible — print the install command and refuse if missing. + +## Phase 2 — Initialise .gitignore + +If `/.gitignore` is missing or lacks the cozystack section, append the right block based on `state.sops.enabled` (from wizard Phase 1.5): + +**sops off** (default — wizard wrote `state.sops.enabled: false`): + +```gitignore +# === BEGIN cozystack === +.state.yaml +kubeconfig.yaml +inventory.yml +talosconfig +secrets.yaml +*.tar.gz +# === END cozystack === +``` + +**sops on**: + +```gitignore +# === BEGIN cozystack === +# Secret files are encrypted in place via sops — see .sops.yaml. +# Operator-side: do not commit your private age key. 
+*.tar.gz +# === END cozystack === +``` + +If the section already exists, leave it alone (re-running wizard --sops / --no-sops will rewrite it). + +## Phase 2.5 — Sops sanity check + +If `state.sops.enabled` is true, verify before any secret write: + +```bash +sops --version +test -f "$CONFIG_DIR/.sops.yaml" +``` + +Refuse to proceed if either fails — surface the missing piece and ask the operator to install `sops` and the configured age/PGP key, then re-run with `--sops` already on, or pass `--no-sops` to disable. Do **not** silently fall back to plain writes. + +The encrypt-after-write helper used in Phase 4 / Phase 9 below: + +```bash +maybe_encrypt() { + local file="$1" + if [ "$(yq '.sops.enabled // false' "$CONFIG_DIR/.state.yaml" 2>/dev/null)" = "true" ]; then + sops --encrypt --in-place "$file" + fi +} +``` + +## Phase 3 — Install ansible collections + +```bash +cd "$ANSIBLE_COZYSTACK_DIR/examples/ubuntu" +ansible-galaxy collection install --requirements-file requirements.yml +``` + +Surface which collections resolve (k3s.orchestration from git, cozystack.installer from git, plus ansible.posix / community.general / kubernetes.core / ansible.utils from Galaxy). Re-running is cheap — Galaxy short-circuits on already-installed versions. + +## Phase 4 — Render inventory.yml into config dir + +Write `/inventory.yml` from `state.inventory`. Do **not** edit the example's inventory in place — that's a template. + +```yaml +--- +cluster: + children: + server: + hosts: + # Internal IP as host key, public IP as ansible_host. + # ansible-cozystack auto-collects server host keys into kube-ovn MASTER_NODES. 
+ 10.0.0.10: + ansible_host: 203.0.113.10 + 10.0.0.11: + ansible_host: 203.0.113.11 + 10.0.0.12: + ansible_host: 203.0.113.12 + agent: + hosts: + 10.0.0.20: + ansible_host: 203.0.113.20 + + vars: + ansible_port: 22 + ansible_user: ubuntu + ansible_ssh_private_key_file: ~/.ssh/cozystack-lab + + # k3s.orchestration + k3s_version: "v1.32.3+k3s1" # pin explicitly; get.k3s.io floats otherwise + token: "<32-byte hex>" # openssl rand -hex 32 + api_endpoint: "10.0.0.10" + cluster_context: "cozystack-lab" + + # cozystack.installer (we install Cozystack later via cluster-install) + cozystack_api_server_host: "10.0.0.10" + cozystack_create_platform_package: false + + # Extra k3s flags (--tls-san for every CP IP and optional VIP) + cozystack_k3s_extra_args: "--tls-san=203.0.113.10 --tls-san=203.0.113.11 --tls-san=203.0.113.12" + + # OCI / cloud-image multi-master quirk — set true if running on cloud images + cozystack_flush_iptables: false +``` + +Critical knob: `cozystack_create_platform_package: false`. This tells the cozystack.installer role to skip Platform Package creation, leaving Cozystack uninstalled. The skill installs operator + Package via `cluster-install` in the next chain step. + +This is the **one** interview phase. After Phase 1 inventory interview the skill auto-fills everything the rest of the flow needs and presents a single consolidated summary; later phases (ansible-playbook prepare-sudo / prepare-ubuntu / k3s.orchestration.site / kubeconfig fetch) consume the answers without re-prompting. + +Slots filled here from `intent_hints` + defaults (operator edits only what they want different): + +- Inventory: nodes (host, role, name), ssh_user, ssh_key path, optional VIP. +- `k3s_version` (default pinned per `references/ansible-playbook.md` version matrix). +- `cozystack_flush_iptables` (default `true` when `intent_hints.hardware_provider` is a cloud / OCI; `false` otherwise). 
+- `cozystack_enable_zfs` (default `true`; off if `intent_hints.distribution` is RHEL 10 family). +- `cozystack_enable_drbd_dkms` (default `true` on Ubuntu Secure Boot hosts; off if `intent_hints.disable_secure_boot` was hinted). +- Cluster name / kubeconfig merge target (same as `talos-bootstrap`). +- `cozystack_root_host` and `cozystack_external_ips` — passed through to `cluster-install`, not actually used by ansible if `cozystack_create_platform_package: false`, but listed for completeness in case the operator wants to override the ansible defaults. + +Consolidated summary: + +```text +cozystack:ubuntu-bootstrap — collected values + +inventory: + ssh_user: ubuntu + ssh_key: ~/.ssh/cozystack-lab + cp1 (10.0.0.10): Ubuntu 24.04, ansible_host=203.0.113.10 + cp2 (10.0.0.11): Ubuntu 24.04, ansible_host=203.0.113.11 + cp3 (10.0.0.12): Ubuntu 24.04, ansible_host=203.0.113.12 + w1 (10.0.0.20): Ubuntu 24.04, ansible_host=203.0.113.20 + vip: (none — using cp1 host as tls-san) + +k3s: + version: v1.32.3+k3s1 (pinned for cozystack v1.3.x) + +knobs: + cozystack_flush_iptables: true (hardware_provider=oci hinted) + cozystack_enable_zfs: true + cozystack_enable_drbd_dkms: true + cluster name: cozystack-lab + kubeconfig: /kubeconfig.yaml (overwrite if exists; or pick merge) + +operations on Approve: + 1. ansible-galaxy collection install --requirements-file requirements.yml + 2. ansible -m ping (preflight) + 3. ansible-playbook prepare-sudo.yml (Ubuntu 26.04+ only) + 4. ansible-playbook prepare-ubuntu.yml (~10 min) + 5. ansible-playbook k3s.orchestration.site (~5 min) + 6. scp + rewrite kubeconfig.yaml + +options: + - Approve all — proceed end-to-end + - Edit + - Cancel +``` + +After Approve, if sops is enabled, run `maybe_encrypt "$CONFIG_DIR/inventory.yml"` — subsequent `ansible-playbook` invocations need to read the inventory, so the skill calls `sops --decrypt` into a temp file and passes `--inventory=` for each playbook run. 
The encrypted inventory.yml in the config dir stays the canonical form. With sops on, read `$CONFIG_DIR/inventory.yml` in the Phase 5–8 commands below as that decrypted temp copy; remove the temp copy when the run ends. + +## Phase 5 — SSH preflight via ansible + +```bash +cd "$ANSIBLE_COZYSTACK_DIR/examples/ubuntu" +ansible --inventory "$CONFIG_DIR/inventory.yml" all --module-name ping --one-line +``` + +Every node must respond `pong`. Failure → surface ansible's error verbatim and refuse to proceed (likely SSH key, passwordless sudo, or unreachable host). + +## Phase 6 — prepare-sudo.yml (Ubuntu 26.04+ sudo-rs workaround) + +Skip on `--skip-sudo-rs-workaround` or when every node reports `ansible_distribution_version < 26.04` (gather_facts result). Otherwise: + +```bash +ansible-playbook --inventory "$CONFIG_DIR/inventory.yml" prepare-sudo.yml +``` + +On any node failure, abort. Subsequent playbooks need sudo working. + +## Phase 7 — prepare-ubuntu.yml (OS prep) + +```bash +ansible-playbook --inventory "$CONFIG_DIR/inventory.yml" prepare-ubuntu.yml +``` + +Streams ansible output. This is the long step (5–15 min depending on apt mirror and node count). It installs packages, loads kernel modules, applies sysctl, enables services, writes the multipath blacklist, runs LINBIT PPA + drbd-dkms for Secure Boot hosts, configures ZFS / KubeVirt modules. See `references/ansible-playbook.md` for the full breakdown. + +On failure, the skill aborts and shows the failed task + stderr. Recovery is operator-side (fix apt mirror, MOK enrollment, etc.), followed by a re-run. + +## Phase 8 — k3s.orchestration.site (k3s install) + +```bash +ansible-playbook --inventory "$CONFIG_DIR/inventory.yml" k3s.orchestration.site +``` + +Upstream `k3s-io/k3s-ansible` playbook. Uses k3s flags from `prepare-ubuntu.yml`'s `cozystack_k3s_server_args` (flannel-backend none, traefik / servicelb / local-storage / metrics-server disabled, kube-proxy off, cluster-domain cozy.local, max-pods 220) plus extra `--tls-san` from inventory. + +HA shape: server group ≥ 1 CP (3 for HA via embedded etcd), agents for workers. The first server runs with `cluster-init`; the rest join it.
Token comes from `inventory.vars.token`. + +Exits when `kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml get nodes` shows all Ready on the first CP. + +## Phase 9 — Retrieve kubeconfig into config dir + +See `references/kubeconfig.md`. + +```bash +scp -i "$SSH_KEY" \ + "$SSH_USER@$CP1_HOST:/etc/rancher/k3s/k3s.yaml" \ + "$CONFIG_DIR/kubeconfig.yaml" + +# Rewrite 127.0.0.1 → CP1 public IP (or VIP if set) +sed -i.bak "s#https://127\.0\.0\.1:6443#https://${VIP:-$CP1_HOST}:6443#g" \ + "$CONFIG_DIR/kubeconfig.yaml" +rm -f "$CONFIG_DIR/kubeconfig.yaml.bak" +chmod 0600 "$CONFIG_DIR/kubeconfig.yaml" + +# Encrypt at rest if sops is enabled. +maybe_encrypt "$CONFIG_DIR/kubeconfig.yaml" +``` + +Offer optional merge into `~/.kube/config` via `kubectl kc add --file "$CONFIG_DIR/kubeconfig.yaml" --context-name "$CLUSTER_CONTEXT" --cover`. Default: keep standalone — `$CONFIG_DIR/kubeconfig.yaml` is gitignored, the operator controls when (and whether) to merge. + +Verify: + +```bash +KUBECONFIG="$CONFIG_DIR/kubeconfig.yaml" kubectl get nodes +``` + +Every inventory node must show `Ready`, versions matching. + +## Phase 10 — Write state and hand off + +Update `/.state.yaml` — if sops is enabled, decrypt-edit-encrypt: shell out to `sops --decrypt > tmp`, mutate `tmp`, `sops --encrypt --output tmp`, `rm tmp`. The skill never leaves a plain-text `.state.yaml` on disk between writes when sops is on. + +```yaml +cluster: + context: cozystack-lab + kubeconfig: /kubeconfig.yaml + api_endpoint: https://10.0.0.10:6443 + distribution: k3s + k8s_version: v1.32.3+k3s1 +inventory: + # refined from prepare-ubuntu.yml gather_facts: real hostnames / arch / kernel + nodes: [...] 
+status: + ubuntu-bootstrap: + completed_at: + ansible_cozystack_dir: ~/git/github.com/cozystack/ansible-cozystack + inventory_path: /inventory.yml +``` + +Print NOTES: + +```text +cozystack:ubuntu-bootstrap — ready + +cluster: + context: cozystack-lab + kubeconfig: /kubeconfig.yaml + api: https://10.0.0.10:6443 + distribution: k3s v1.32.3+k3s1 + nodes: 3 cp + 1 worker — all Ready + +artifacts on disk (under ): + inventory.yml # safe to commit — encrypted if sops on, gitignored otherwise + kubeconfig.yaml # encrypted if sops on, gitignored otherwise; chmod 0600 either way + .state.yaml # encrypted if sops on, gitignored otherwise; chain progress + +next: invoke /cozystack:cluster-install — it will resume from /.state.yaml + cluster-install installs extractedprism (kube-apiserver HA proxy), + the cozy-installer chart, and the Platform Package on this fresh cluster. +``` + +## Guardrails + +- NEVER edit `examples/ubuntu/inventory.yml` in place — that's a template. Always render to `/inventory.yml`. +- NEVER set `cozystack_create_platform_package: true` in the rendered inventory. Cozystack install belongs to `cluster-install`. +- NEVER install ansible or pip on the operator's workstation. Refuse if missing, print install command. +- NEVER batch-skip a playbook because "the previous run did it" — ansible is idempotent and re-verifies real state. +- NEVER `git init` / `git add` / `git commit` inside the config dir. Git operations are operator-side decisions. +- ALWAYS surface ansible's stderr / failed task verbatim. Don't paraphrase. +- ALWAYS pin `k3s_version` explicitly — `get.k3s.io` floats to the channel tip otherwise. +- ALWAYS verify `kubectl get nodes` Ready before writing `status.ubuntu-bootstrap.completed_at`. +- ALWAYS write artifacts under `/`, never `/tmp` — operators should be able to `git add inventory.yml` after the run. +- NEVER silently fall back to plain writes when `state.sops.enabled` is true but `sops` or `.sops.yaml` is missing. 
Refuse Phase 2.5 and tell the operator to install `sops` + the configured age/PGP key, or re-invoke with `--no-sops`. +- NEVER leave a plain-text `.state.yaml` or `kubeconfig.yaml` on disk between phases when sops is on. Decrypt to a temp file, mutate, re-encrypt, remove the temp file. + +## References + +- `references/inventory.md` — interview template, role rules, HA prerequisites. +- `references/ansible-playbook.md` — what each upstream playbook does, knob mapping (inventory variables → playbook behaviour), troubleshooting per playbook. +- `references/kubeconfig.md` — retrieval, server URL rewrite, merge via kubecm. + +Cross-references: + +- `/cozystack:cluster-install` — runs after this, installs Cozystack proper. +- `/cozystack:wizard` — the orchestrator that builds the chain. + +External: + +- `https://github.com/cozystack/ansible-cozystack` — upstream playbooks. +- `https://cozystack.io/docs/v1.3/install/kubernetes/generic/` — install guide this skill automates. +- `https://docs.k3s.io/` — upstream k3s docs. +- `https://github.com/k3s-io/k3s-ansible` — the `k3s.orchestration` ansible collection. diff --git a/plugins/cozystack/skills/ubuntu-bootstrap/references/ansible-playbook.md b/plugins/cozystack/skills/ubuntu-bootstrap/references/ansible-playbook.md new file mode 100644 index 0000000..8cf2ce1 --- /dev/null +++ b/plugins/cozystack/skills/ubuntu-bootstrap/references/ansible-playbook.md @@ -0,0 +1,119 @@ +# k3s install commands + +Exact bodies for the three install phases. The skill substitutes `$K3S_VERSION`, `$CP1_HOST`, `$NODE_TOKEN`, `$EXTRA_TLS_SAN_ARGS` from `state.inventory` and Phase-5 capture. 
+ +## Version pin + +| Cozystack | k3s | +| ----------- | ----------- | +| v1.3.x | `v1.32.3+k3s1` | +| v1.2.x | `v1.31.4+k3s1` | +| v1.1.x | `v1.30.5+k3s1` | +| v1.0.x | `v1.29.6+k3s1` | + +Default is the row matching the cozystack release in `state.cozystack.installer_version` (resolved later) or the latest from `https://github.com/k3s-io/k3s/releases` if the operator overrides. Always pass `INSTALL_K3S_VERSION=` explicitly — `get.k3s.io` floats to the channel tip otherwise. + +## First CP — `--cluster-init` + +```bash +curl -sfL https://get.k3s.io | \ + INSTALL_K3S_VERSION="$K3S_VERSION" \ + INSTALL_K3S_EXEC="server \ + --cluster-init \ + --flannel-backend=none \ + --disable=traefik \ + --disable=servicelb \ + --disable=local-storage \ + --disable=metrics-server \ + --disable-network-policy \ + --disable-kube-proxy \ + --cluster-domain=cozy.local \ + --tls-san=$CP1_HOST \ + $EXTRA_TLS_SAN_ARGS \ + --kubelet-arg=max-pods=220" \ + sh - +``` + +Flag rationale: + +- `--cluster-init` — initialise embedded etcd raft on this node; required for HA. Single-node sandbox still works with this flag. +- `--flannel-backend=none` — disables k3s's built-in CNI. Cozystack ships Cilium + Kube-OVN. +- `--disable=traefik` — Cozystack ships its own ingress (ingress-nginx). +- `--disable=servicelb` — Cozystack ships MetalLB inside the platform Package. +- `--disable=local-storage` — Cozystack ships LINSTOR + piraeus-operator. +- `--disable=metrics-server` — VictoriaMetrics stack covers metrics. +- `--disable-network-policy` — Cilium implements policies. +- `--disable-kube-proxy` — Cilium replaces kube-proxy. +- `--cluster-domain=cozy.local` — mandatory for Cozystack. Cluster-install Phase 2 refuses if this isn't set. +- `--tls-san=$CP1_HOST` — adds the CP's IP to the apiserver cert SAN list so clients dialling directly at the IP don't get a cert mismatch. +- `$EXTRA_TLS_SAN_ARGS` — one `--tls-san=` per CP in the inventory plus the VIP. Pre-computed before install. 
+- `--kubelet-arg=max-pods=220` — Cozystack's `talm` preset uses 512; for generic linux 220 is a safe floor without overflowing iptables hashing. + +## Additional CP — join existing etcd + +```bash +curl -sfL https://get.k3s.io | \ + INSTALL_K3S_VERSION="$K3S_VERSION" \ + K3S_TOKEN="$NODE_TOKEN" \ + INSTALL_K3S_EXEC="server \ + --server https://$CP1_HOST:6443 \ + --flannel-backend=none \ + --disable=traefik \ + --disable=servicelb \ + --disable=local-storage \ + --disable=metrics-server \ + --disable-network-policy \ + --disable-kube-proxy \ + --cluster-domain=cozy.local \ + --tls-san=$THIS_CP_HOST \ + $EXTRA_TLS_SAN_ARGS \ + --kubelet-arg=max-pods=220" \ + sh - +``` + +`--server` points at the first CP. After joining, the new server is also a member of the raft — kill CP1 and the cluster keeps running, provided quorum (≥ 2 of 3 etcd members) survives. + +`K3S_TOKEN` is `cat /var/lib/rancher/k3s/server/node-token` from CP1 (captured at the end of Phase 5). The skill keeps it in memory; **do not** write it to `state.yaml` (it's a node-level secret). + +## Agent worker + +```bash +curl -sfL https://get.k3s.io | \ + INSTALL_K3S_VERSION="$K3S_VERSION" \ + K3S_URL=https://$CP1_HOST:6443 \ + K3S_TOKEN="$NODE_TOKEN" \ + sh - +``` + +Agent install is much simpler — no `INSTALL_K3S_EXEC`, no flag list. The server's disable list is already in effect cluster-wide. + +If `inventory.vip` is set, `K3S_URL=https://$VIP:6443` is **also** valid and arguably safer (workers survive CP1 going away). v1 sticks with `CP1_HOST` for simplicity and prints a note. 
+ +## Service-readiness probe after each install + +```bash +ssh -i "$SSH_KEY" "$SSH_USER@$HOST" ' + systemctl is-active k3s || systemctl is-active k3s-agent + systemctl is-enabled k3s 2>/dev/null || systemctl is-enabled k3s-agent +' +``` + +Then, from CP1: + +```bash +ssh -i "$SSH_KEY" "$SSH_USER@$CP1_HOST" \ + 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl wait --for=condition=Ready node/$NEW_NODE --timeout=120s' +``` + +If `kubectl wait` times out → node didn't fully join (network, cert SAN mismatch, taint). Abort. + +## Common pitfalls + +- **`+k3s1` suffix vs container image tag** — kubelet reports a version such as `v1.32.3+k3s1`, but `+` is invalid in Docker image tags. Cozystack's linstor-scheduler hit this in old releases; the v1.0+ chart sanitises. Operator doesn't need to do anything — just be aware that `kubectl version` looks unusual. +- **`tls-san` after install** — adding a SAN later requires recreating the certs (`k3s certificate rotate`). Include every name/IP/VIP upfront. +- **Re-run after partial failure** — the `k3s` install script is idempotent if `/etc/systemd/system/k3s.service` already exists and is active. `cozystack:ubuntu-bootstrap` detects this in Phase 8 (k3s install) and skips with an "already installed at $VERSION — skipping" line. +- **Wrong cluster domain** — `--cluster-domain=cozy.local` is the only acceptable value for Cozystack. Without it, cluster-install Phase 2 refuses. + +## Why no `--write-kubeconfig-mode=0644` + +The skill fetches `/etc/rancher/k3s/k3s.yaml` over `scp` with `sudo` — file mode on the node stays `0600` (k3s default). Don't loosen it for convenience.
diff --git a/plugins/cozystack/skills/ubuntu-bootstrap/references/inventory.md b/plugins/cozystack/skills/ubuntu-bootstrap/references/inventory.md new file mode 100644 index 0000000..defa031 --- /dev/null +++ b/plugins/cozystack/skills/ubuntu-bootstrap/references/inventory.md @@ -0,0 +1,70 @@ +# Inventory interview + +Goal: produce a `state.inventory` block that downstream phases (SSH preflight, OS prep, k3s install) can iterate over without further questions. Keep it short — the operator picks defaults for everything when possible. + +## Questions + +1. **SSH user** — default `root`. Common alternatives `ubuntu` (cloud images) and `debian`. If the user isn't root, `sudo -n` must work (passwordless sudo). + +2. **SSH key path** — default `~/.ssh/id_ed25519`. Verify the file exists and is readable. Reject any path containing spaces or globs. + +3. **Nodes** — collect one row per node: + - `host` — IPv4 or DNS name. Validate format with a regex (RFC 1123 hostname or IPv4 dotted-quad). + - `role` — `cp` or `worker`. Recommended: 3 × cp, then workers as needed. + - `name` — optional override. If empty, infer from hostname at OS-prep time. + + Cap at 32 nodes in v1 (UX wall — the per-node loops get unmanageable). + +4. **Virtual IP (VIP)** — optional. Used as `--tls-san=$VIP` so the kube-apiserver certificate accepts traffic to the VIP, even though the VIP itself is managed externally (keepalived, kube-vip, cloud LB). If specified, every CP also gets `--tls-san=` plus the VIP. + +5. **k3s version** — defaults to `v1.32.3+k3s1` (verified against Cozystack v1.3.x). If the operator passes `--k3s-version=...`, validate it matches the format `v\d+\.\d+\.\d+\+k3s\d+`. + +## Role rules + +- At least 1 `cp` node — sandbox mode. +- HA quorum requires odd `cp` count ≥ 3. Two CPs is a split-brain anti-pattern; refuse it unless the operator explicitly overrides. +- Workers are optional in single-node sandbox (CP can run workloads). 
+ +## HA prerequisites + +The skill installs k3s with `--cluster-init` on the first CP (embedded etcd raft) and joins additional CPs via `--server https://$CP1_HOST:6443`. For HA to actually be HA, the operator must arrange: + +- A way to reach kube-apiserver after CP1 is gone. Three options the skill does NOT configure for the operator: + - A virtual IP (keepalived / kube-vip) — operator runs it on the CP nodes themselves. Surface the VIP question; if specified, add it to `tls-san`. + - An external L4 LB (HAProxy, cloud LB) in front of all CP IPs on port 6443. Operator configures it. + - DNS round-robin — basic, no health checks. Don't recommend in v1. +- Identical hardware / OS version across CPs (recommended). The skill does not enforce this, but surfaces a warning if `nodeInfo.osImage` differs. + +## Output shape + +The skill emits exactly this into `state.inventory`: + +```yaml +inventory: + ssh_user: "ubuntu" + ssh_key: "/Users/me/.ssh/cozystack-lab" + vip: "" # empty means no VIP + nodes: + - host: "10.0.0.10" + role: "cp" + name: "cp1" + - host: "10.0.0.11" + role: "cp" + name: "cp2" + - host: "10.0.0.12" + role: "cp" + name: "cp3" + - host: "10.0.0.20" + role: "worker" + name: "w1" +``` + +## Validation checklist before moving on to SSH preflight + +- All hosts unique (no duplicates). +- All names unique (after inference). +- Role counts sane (1 cp + 0..N workers, **or** an odd number ≥ 3 of cp + 0..N workers — refuse 2-cp unless the operator explicitly overrides). +- SSH key file exists, mode 0600 or 0400 (warn otherwise). +- VIP, if specified, is not in `nodes[].host`. + +A failed validation goes back to the interview, not forward into SSH preflight.
diff --git a/plugins/cozystack/skills/ubuntu-bootstrap/references/kubeconfig.md b/plugins/cozystack/skills/ubuntu-bootstrap/references/kubeconfig.md new file mode 100644 index 0000000..75a081d --- /dev/null +++ b/plugins/cozystack/skills/ubuntu-bootstrap/references/kubeconfig.md @@ -0,0 +1,86 @@ +# Kubeconfig retrieval and merge + +Phase 9 fetches `/etc/rancher/k3s/k3s.yaml` from the first CP node, rewrites the server URL from `https://127.0.0.1:6443` to the CP's IP (or the VIP), then either saves it standalone or merges it into the operator's existing `~/.kube/config`. + +## Standalone path + +```bash +TS="$STATE_TS" +TMP="/tmp/cozystack-install-$TS/k3s.yaml" +TARGET="$HOME/.kube/cozystack-lab.yaml" # ask operator for the target path + +scp -q -i "$SSH_KEY" "$SSH_USER@$CP1_HOST:/etc/rancher/k3s/k3s.yaml" "$TMP" + +# Rewrite the loopback URL. Pick the right server: +# - inventory.vip if set +# - else CP1_HOST +SERVER_URL="https://${VIP:-$CP1_HOST}:6443" +sed -i.bak "s#https://127\.0\.0\.1:6443#$SERVER_URL#g" "$TMP" +rm -f "${TMP}.bak" + +chmod 0600 "$TMP" +mv "$TMP" "$TARGET" +chmod 0600 "$TARGET" +``` + +Verify: + +```bash +KUBECONFIG="$TARGET" kubectl config get-contexts +KUBECONFIG="$TARGET" kubectl get nodes +``` + +Result: every inventory node in `Ready`. + +## Merge path (preferred when `kubecm` is installed) + +```bash +kubectl krew install kc # if not installed yet — once per workstation +kubectl kc add --file "$TMP" --context-name cozystack-lab --cover +``` + +`--cover` writes the merged result back to `~/.kube/config` instead of a sibling file. The new context is named `cozystack-lab` (or whatever the operator picked). + +Test the merged context: + +```bash +kubectl --context cozystack-lab get nodes +``` + +## Context naming conflict + +If `cozystack-lab` already exists in `~/.kube/config` (operator ran the skill before, then re-bootstrapped a different cluster with the same name), `kubectl kc add` may refuse, or overwrite the existing entry without warning.
Options the skill offers: + +- Pick a different name (suggest `cozystack-lab-2`, `cozystack-lab-$TS`). +- Delete the old context first: `kubectl kc delete cozystack-lab`. Surface the dropped cluster's `server` URL so the operator can confirm they really want to forget it. + +## What to store in `.state.yaml` + +After the merge / standalone path: + +```yaml +cluster: + context: cozystack-lab + kubeconfig: ~/.kube/config # if merged; else the standalone path + api_endpoint: https://10.0.0.10:6443 +``` + +Downstream skills (`cluster-install`) read `cluster.context` and trust it. They do not re-read the kubeconfig file from `.state.yaml`; the OS lookup chain (`$KUBECONFIG` env, then `~/.kube/config`) covers it. + +## When to leave it standalone + +For lab / multi-cluster workflows where the operator manages multiple clusters with sibling files (`~/.kube/cozystack-lab.yaml`, `~/.kube/cozystack-prod.yaml`), don't merge. Standalone keeps each cluster's lifecycle independent. + +The operator picks merge vs standalone at the Phase 9 gate. + +## Don't lose the node-token + +The kubeconfig is recoverable from CP1 as long as CP1 is alive (or any CP in HA). The node-token cannot be regenerated post-install — if the operator wipes `/var/lib/rancher/k3s/server/` they're rebuilding from scratch. The skill does not back it up automatically (it's a secret); if the operator wants belt-and-braces, suggest: + +```bash +ssh -i "$SSH_KEY" "$SSH_USER@$CP1_HOST" 'sudo cat /var/lib/rancher/k3s/server/node-token' \ + > ~/cozystack-lab-node-token.txt +chmod 0600 ~/cozystack-lab-node-token.txt +``` + +Surface this as an optional step at the end of Phase 8 (k3s install), **not** Phase 9 (no kubeconfig coupling).
diff --git a/plugins/cozystack/skills/wizard/SKILL.md b/plugins/cozystack/skills/wizard/SKILL.md new file mode 100644 index 0000000..7c6007f --- /dev/null +++ b/plugins/cozystack/skills/wizard/SKILL.md @@ -0,0 +1,567 @@ +--- +name: wizard +description: Use this as the entry point for installing Cozystack from scratch. Asks one question — Talos / Ubuntu / Existing cluster — then dispatches the right chain of skills. Routes are `talos-bootstrap → cluster-install` (Talos), `ubuntu-bootstrap → cluster-install` (Ubuntu/Debian; ubuntu-bootstrap wraps cozystack/ansible-cozystack), or `cluster-install` direct (existing self-managed or managed k8s). All artifacts (inventory.yml, kubeconfig, .state.yaml, cozystack-platform-package.yaml) live in a cluster config directory the operator picks, so the result is a directory they can manage as code in their own git workflow. Refuses for clusters that already run Cozystack and points the operator at `cozystack:cluster-upgrade`. +argument-hint: "[--config-dir=] [--target=] [--sops] [--no-sops] [--allow-ephemeral] [--resume]" +--- + +# cozystack:wizard + +Work in reasoning mode. Use the phrasing `cozystack:wizard` (not "the skill"). Announce phase transitions: `cozystack:wizard Phase N — `. + +This is the orchestrator. It does not perform mutations on its own — every concrete step is delegated to a downstream skill in the chain. The wizard interviews, builds the route, persists state to `/.state.yaml`, and hands off to the next skill in turn. + +> **Note on language in this SKILL.md** — every operator-facing prompt below is written in English for clarity. At runtime the skill matches the operator's natural language detected from prior conversation messages (per Core Principle "Match the operator's natural language" below). Treat the English text as a template for tone, structure, and content; the LLM translates while preserving meaning. 
Code identifiers, command examples, file paths, and any text destined for GitHub stay canonical (English) regardless of operator language. + +## Cluster config directory — the artifact contract + +Every cluster `cozystack:wizard` touches has a directory on the operator's workstation: + +``` +/ + .gitignore # skill-managed; excludes secrets + state + .state.yaml # chain progress (gitignored) + inventory.yml # ubuntu route — ansible inventory + kubeconfig.yaml # bootstrap output (gitignored) + nodes/ # talos route — per-node machine-config + talosconfig # talos route (gitignored) + cozystack-platform-package.yaml # cluster-install output + extractedprism-values.yaml # cluster-install Phase 5.6 +``` + +Git is **out of scope** — the skill never runs `git init` / `git add` / `git commit`. The `.gitignore` it writes is a plain text file with sensible exclusions; harmless without git, ready when the operator decides to use it. + +## Core principles + +- Match the operator's natural language. Detect from the conversation context — the language they wrote in for any prior turn. Use it in every interactive prompt, AskUserQuestion option label, summary, and gate message. Never ask "what language?" separately; just match. Code identifiers, command examples, file paths, and any text that goes to GitHub stay in their canonical form (usually English). When the wizard writes `state.operator_language`, every downstream skill reads it from there. +- One valid path → just do it. Don't gate on an approval question when there's only one sensible next step. Keep the AskUserQuestion shape for (a) multi-option choices the operator actually makes, (b) destructive operations (`talosctl reset`, `zpool destroy`, `kubectl delete` on prod-context, anything in a `prod`-labelled context), (c) plan presentation before a long batch. Otherwise proceed. "I'll wait for you to say done, ok?" is not a gate; it's friction. Just check. +- Front-load the interview. 
**Every question the skill might ask in any later phase is asked once up front** in a single intake phase, before any execution. Run all read-only lookups first (cluster probes, node enumeration, file checks), merge with `intent_hints` from Phase 0, then present **one consolidated summary** with every slot filled (defaults marked Recommended) and quick-edit affordances (`Approve / Edit / Cancel`). Fall back to a discrete AskUserQuestion only when both lookup AND hints failed to fill a slot. Later phases consume the collected answers and never re-prompt — per-node storage devices, per-node boot methods, sops opt-in, kubeconfig merge target, all of it lives in the intake. +- Operator-facing only. Talk to the operator, never to the cluster directly. +- One state file per cluster, lives in `/.state.yaml`. Every downstream skill reads and updates it. +- Idempotent transitions. `--resume` re-reads state and continues from the next not-yet-completed step. +- Never silently choose a route. The route is always shown and approved before the first delegation. +- Three routes only (after the v1 refactor). Anything more nuanced lives in the downstream skill, not in the wizard. + +## Phase 0 — Free-form context + +Before any structured question, ask the operator to describe the situation in their own words: + +```text +Before we dive into structured questions — tell me, in your own words, +what you want to install and what's already in place. + +Anything you mention here I'll use to pre-fill the rest of the interview so +you don't have to repeat yourself. Examples of useful things to share: + + - Hardware / hosting: bare-metal, cloud VMs, specific provider, count of nodes. + - OS: Talos / Ubuntu / something else. + - Existing Kubernetes: yes/no, which distribution. + - Constraints: GPU workload, no public domain, air-gapped, etc. + - Previous attempts: anything that failed before that we should know about. + +If you're not sure of details, that's fine — say what you know. 
+``` + +Surface this question in the operator's natural language (see Core principles above). Don't translate the prompt into English if the operator was writing in Russian; ask in their language and accept the answer in their language. + +Parse the free-form answer for hints. Common extracts: + +| What operator said | Phase used | Pre-fill | +|---|---|---| +| "3 Hetzner servers running Ubuntu 24.04" | Phase 2 target | `bare-metal-ubuntu`; Phase 4 inventory size 3 | +| "we have k3s already running" | Phase 2 target | `existing` (and distribution=k3s); skip the bootstrap step in chain | +| "Talos on bare-metal, 5 nodes" | Phase 2 target | `bare-metal-talos`; 5 nodes | +| "GPU workload, nvidia 4090" | Phase 2 OS hint | hint: pick Ubuntu over Talos (NVIDIA driver paths simpler) | +| "no domain yet" | Phase 4 publishing.host | suggest nip.io path | +| "tried before, dashboard never came up" | Phase 5 | start with `/cozystack:debug` after Phase 2 picks the cluster | +| "I want to use my existing kubeconfig at ~/.kube/lab" | Phase 1 | use `$PWD` of that file's dir; Phase 2 → existing target | +| "this is for production, we need HA" | Phase 4 | warn on single-CP; insist on 3+ CP | + +Record extracted hints to `state.intent_summary` (a short paragraph the operator confirmed) and `state.intent_hints` (the parsed key/value pairs): + +```yaml +intent_summary: "Operator wants to install Cozystack on 3 Hetzner Ubuntu 24.04 servers, no GPU, no public domain (will use nip.io). Has tried before, dashboard never came up." +intent_hints: + target: "bare-metal-ubuntu" + node_count: 3 + distribution_hint: "k3s" + domain_hint: "nip.io" + prior_failure: "cozy-dashboard/dashboard never reached Ready" +``` + +Don't over-extract. If the operator's free-form answer is vague, summarise what's there and move on — the structured questions cover the rest.
If the operator says "I don't know, just walk me through it" — record `intent_summary` as that phrase and skip the hints; rely entirely on Phase 2 onwards. + +After parsing, echo the summary back to the operator **inline** with the next question — do **not** demand a separate confirmation. A single "looks right?" between every wizard phase compounds to 5–6 yes/no checks per session, which is friction the operator already complained about. The single explicit approval point is Phase 4 consolidated intake review, where every extracted hint and every collected slot get verified at once. + +```text +got it: + +> + +Next: where should I keep cluster config for this install? + + 1. $PWD (current directory) + 2. ... +``` + +Adjust phrasing per language. The operator's answer to Phase 1's question moves the chain forward; if they want to fix the parsed summary, they can say so in free form and the skill re-parses without a "no/yes" gate. + +## Phase 1 — Pick the cluster config directory + +Ask the operator where the cluster config goes: + +```text +Where should cozystack:wizard keep cluster config for this install? + + 1. $PWD (current directory) + 2. $PWD/ (subdirectory under current directory) + 3. Other — type a path + 4. Scratch dir under $TMPDIR (will be lost on reboot — test runs only) + +Operator hint: options 1–3 want somewhere git-able. The skill won't touch git +itself, but all artifacts written there are designed to be committed. +Option 4 is for sandbox / throwaway runs where surviving the next reboot +doesn't matter. The skill warns once and proceeds. +``` + +If option 2 (subdir) — also ask for cluster name (default `cozystack-lab`). If option 3 — validate the path is writable; offer to mkdir. + +Resolve to absolute path. Record as `state.config_dir`. + +If `/.state.yaml` already exists: + +- If `--resume` was passed — go straight to Phase 5 dispatch from the next pending step. 
+- Else, if the state file is `< 24h` old, offer `Resume previous session (target=$T, route=$R, $X/$Y steps complete) / Start fresh (wipe .state.yaml only — other files stay) / Cancel`. +- Else (older than 24h, or `Start fresh`) — overwrite `.state.yaml` only, leave other files alone. + +Initialise `.gitignore` if missing or lacks the cozystack section (see ubuntu-bootstrap Phase 2 for the exact block). The exact content of the cozystack section depends on whether sops is enabled in Phase 1.5 — leave a placeholder until then. + +## Phase 1.5 — Sops opt-in + +Skip if `--no-sops` was passed. Run if `--sops` was passed; otherwise ask: + +```text +Wrap secrets with sops? (encrypted-in-tree instead of gitignored) + +Cozystack:wizard writes several files that contain (or may contain) secrets: + - kubeconfig.yaml (TLS certs + tokens) + - .state.yaml (collected values; may carry tokens, SSH key path) + - inventory.yml (Ubuntu route; k3s join token, maybe ansible_become_password) + - cozystack-platform-package.yaml (any operator-supplied creds in values) + - extractedprism-values.yaml (rare; included for symmetry) + +Talos artefacts (talosconfig, secrets.yaml under nodes/) are already covered +by talm — it respects .sops.yaml when present in the same directory. + +Without sops: secret files stay gitignored (operator stores them outside git). +With sops: secret files are encrypted in place after every write with your + age public key; the encrypted forms can be committed safely. + Decryption is automatic when sops finds your age private key + (~/.config/sops/age/keys.txt or $SOPS_AGE_KEY_FILE). + +Options: + - Yes, enable sops (Recommended for shared cluster configs in git) + - No, use .gitignore (Recommended for solo workflows) +``` + +On `No` — write the cozystack `.gitignore` section that excludes `kubeconfig.yaml`, `.state.yaml`, `inventory.yml`, `talosconfig`, `secrets.yaml`. Record `state.sops = {enabled: false}`. Continue to Phase 2. + +On `Yes`: + +1. 
**Verify tools** — `sops --version` and `age --version` must succeed. If either is missing, refuse opt-in with the install pointer (`brew install sops age` on macOS; `https://github.com/getsops/sops/releases` and `https://github.com/FiloSottile/age#installation` otherwise). Operator can re-run with `--no-sops` to skip. + +2. **Resolve age key** — search in this order, surface each result to the operator and confirm which one to use: + + 1. `<config-dir>/.sops.yaml` — if present, read `creation_rules[].age` recipients. + 2. `$SOPS_AGE_KEY_FILE` env var — if set, read the file's `# public key: age1...` comment. + 3. `~/.config/sops/age/keys.txt` — standard sops/age location. + 4. None of the above — print a generation command and ask the operator to approve it: + + ```bash + mkdir -p ~/.config/sops/age + age-keygen --output ~/.config/sops/age/keys.txt + chmod 0600 ~/.config/sops/age/keys.txt + ``` + + Show the resulting public key + a warning: **the private key in `~/.config/sops/age/keys.txt` is your only way to decrypt these files. Back it up before committing anything encrypted.** + + Ask the operator to confirm the public key shown is correct (paste-back the `age1...` string). Do not proceed on guesswork. + +3. **Write `<config-dir>/.sops.yaml`** with creation_rules covering the cozystack secret-files (see `references/sops.md` for the exact block). If the file already existed, merge — don't overwrite — preserving any custom rules the operator added. + +4. **Record state**: + + ```yaml + sops: + enabled: true + recipients: ["age1abc...", "age1def..."] # everyone who can decrypt + config_path: "<config-dir>/.sops.yaml" + ``` + +5. **Write the cozystack `.gitignore` section** without the secret-file lines (they're encrypted-in-tree now).
When sops is on, `talosconfig`, `secrets.yaml`, `kubeconfig.yaml`, `.state.yaml`, `inventory.yml`, `cozystack-platform-package.yaml`, `extractedprism-values.yaml`, and `nodes/*.yaml` are all encrypted by talm or by the skill's own `maybe_encrypt` helper — they're commit-friendly and stay out of `.gitignore`. Only `*.tar.gz` (diagnostic bundles) stays ignored. See `references/sops.md` for the per-file decision matrix. + +## Phase 2 — Target (the only real question) + +If `state.intent_hints.target` was already extracted in Phase 0, **do not ask for a separate "right?" confirmation here**. The Phase 0 echo already showed the operator the parsed `intent_summary` inline; asking the same question again ("from your description it looks like X — right?") is friction. The Phase 4 consolidated intake review is the single confirmation point — every extracted hint, the inferred target, and every collected slot land there, and the operator approves the whole picture once. + +If `state.intent_hints.target` was **not** extracted (operator was vague at Phase 0), ask a single AskUserQuestion: + +```text +What's the target environment? + + 1. Bare-metal Talos — nodes will run Talos Linux, no Cozystack yet + 2. Bare-metal Ubuntu / Debian — nodes run Ubuntu/Debian, no Kubernetes yet + (ubuntu-bootstrap wraps cozystack/ansible-cozystack) + 3. Existing Kubernetes cluster — kubectl works, no Cozystack yet + (includes managed: EKS / GKE / AKS / DOKS) + 4. Existing Cozystack cluster — already installed, want to upgrade + +Recommended-by-load hints (operator can override): + - General-purpose / VMs / databases → Talos (rec) + - GPU / custom userspace → Ubuntu (rec) +``` + +For target 4 (existing Cozystack), **refuse** and point at `cozystack:cluster-upgrade`. Do not enter the chain. + +For target 3 (existing k8s), do a quick read-only probe: + +- Show `kubectl --context $CTX get nodes -o wide` and `kubectl --context $CTX get ns cozy-system --ignore-not-found`.
+- If `cozy-system` namespace exists and holds pods → refuse, point at `cozystack:cluster-upgrade`. + +Record `state.target`. + +## Phase 3 — Route build + +Map target to a chain: + +| Target | Chain | +| ----------- | ----------- | +| Bare-metal Talos | `talos-bootstrap` → `cluster-install` | +| Bare-metal Ubuntu / Debian | `ubuntu-bootstrap` → `cluster-install` | +| Existing Kubernetes (self-managed or managed) | `cluster-install` | +| Existing Cozystack | refuse → `cozystack:cluster-upgrade` | + +Record `state.route`. Print the chain with rough time estimates and which skill owns which side-effects: + +```text +cozystack:wizard route + + target: bare-metal-ubuntu + chain: + 1. cozystack:ubuntu-bootstrap (~30–60 min) + └─ wraps cozystack/ansible-cozystack — OS prep, drbd-dkms / ZFS / KubeVirt modules, + k3s install with cozystack-compatible flags, kubeconfig retrieval + 2. cozystack:cluster-install (~30–60 min) + └─ ZFS pool per node, extractedprism, cozy-installer chart, Platform Package, + tenant root ingress patch, wait HRs + + config dir: /Users/me/cozystack-lab + state file: /Users/me/cozystack-lab/.state.yaml + + options: Continue / Edit (change target) / Cancel +``` + +## Phase 4 — Full intake (everything policy-decidable up front) + +Front-load is non-negotiable. Every slot that an operator can decide **before the cluster exists** is collected here, in one consolidated screen, and written to `.state.yaml` under `inventory`, `cluster`, and `cozystack_intake`. Downstream skills read these on entry and only ask the operator again for slots that need post-bootstrap discovery (actual device paths from a running Talos node, KubeOVN label mismatch, etc.) or for STOP GATEs that are destructive by nature. + +The intake has two layers — **bootstrap-stage** (depends on `route`) and **cozystack-stage** (route-independent policy). Both are filled in this phase. 
+ +### Bootstrap-stage slots (per route) + +For `talos-bootstrap` first: + +- **Node list**: IPs/hostnames, role (cp/worker), optional name. +- **Reach mode**: how the workstation reaches nodes — `public` (workstation has public network access to nodes' public IPs), `internal` (workstation is inside the VCN/segment, reaches private IPs), or `vip` (only the VIP is reachable; nodes are behind NAT). Determines the IP set used for `talosctl --nodes`. +- **CP endpoint**: VIP / single-CP IP / external LB IP. Used for `talm init --cluster-endpoint` and lands in machine-config. +- **VIP details** if applicable: VIP address, link to advertise on (`ens5`, `eth1`, …), subnet, MTU if non-default. **Per-node VIP-link static addresses** — on cloud providers where the VIP-carrying interface in maintenance mode lacks an IPv4 (OCI VLAN secondary interfaces are the canonical case), record one address per node in the same `/N` as the VIP subnet (e.g. `10.17.100.11/24` for node0, `10.17.100.12/24` for node1, …). Without these, `talm template` renders a `LinkConfig` with no `addresses:` block and node1+ never reach the VIP — etcd join hangs in `Preparing`. The wizard explicitly asks "does the VIP link have an IPv4 in maintenance mode?" and offers `Auto-allocate from VIP subnet (skip the first N reserved for the VIP)` as the default. Recorded as `intent_hints.vip.per_node[]`. +- **Boot method per node** (only when needs-OS-install): OCI custom image / boot-to-talos / iPXE / "already in maintenance mode" / "already configured — reuse". +- **`talosctl` + `talm` on workstation**: confirm presence. + +For `ubuntu-bootstrap` first: + +- `inventory.nodes` (host, role cp/worker), `inventory.ssh_user`, `inventory.ssh_key`. Recommended HA — 3 CP nodes (embedded etcd raft). + +For `cluster-install` first (existing k8s target): + +- `cluster.context` — default `kubectl config current-context` with confirmation. 
+ +### Cozystack-stage policy slots (route-independent, **always asked**) + +These were historically asked by `cluster-install` Phase 4 *after the cluster existed* — that re-prompted the operator for things they already knew up front. They are policy choices, not discovery-dependent, so they belong here: + +- **Bundles** (multiSelect): `system`, `paas`, `iaas`, `naas`. Default per variant overlay (`isp-full` includes all four). The variant is derived from `state.target` + `intent_hints.workload_class`. +- **Storage layout preference** (Talos / Ubuntu routes only): for each node that will provide storage, ask the layout when more than one is plausible — `single` (Recommended for dev), `mirror` (Recommended for prod on 2-disk nodes), `raidz` (3+ disks). Default `single`. Exact device paths are deferred to `cluster-install` Phase 5.5 because they require post-boot probing; the **preference** is recorded here and Phase 5.5 just picks the largest unmounted disk(s) that fit the layout. ZFS pool name and LINSTOR storage-pool name default `data` and are rarely changed. +- **Publishing host kind**: `nip.io` (Recommended for sandbox; no DNS configuration needed) vs `custom-fqdn` (operator owns the domain). When `custom-fqdn` — collect the FQDN and run the **domain-ownership gate** here, not in `cluster-install` (the operator commits to configuring DNS for it before Phase 8 of cluster-install reaches dashboard/keycloak HRs). +- **External IPs strategy**: on Talos / Ubuntu routes, when nodes have distinct `InternalIP` and `ExternalIP` (typical on OCI, GCP with external NAT, AWS with EIPs), ask: + - `internal` (Recommended for NAT-fronted clouds) — Service `externalIPs` are the nodes' VCN-internal addresses. NAT on the provider fabric (OCI 1:1 NAT, GCP NAT'd public IP, AWS EIP) rewrites packets before they reach the node, so the kernel only ever sees the internal IP; Cilium's externalIPs BPF must match on that. 
Picking public IPs here is the single most painful misconfiguration on these providers — symptom is `Connection refused` (kernel RST) on `https://dashboard./` even though every HelmRelease is Ready. + - `external` (use external/public IPs) — only when the nodes' public IPs are routed onto the interface (bare-metal, Hetzner with direct public IP, dedicated server with no NAT). + - `explicit` — operator supplies the list (MetalLB pool, external LB VIP). + - The wizard probes `kubectl get nodes -o jsonpath` on Talos / Ubuntu routes after Phase 5 bootstrap completes to confirm the chosen IPs are present on the cluster's node objects, but the *choice* is asked here. +- **Cert solver**: `http01` (Recommended — works with public DNS + port 80) vs `dns01` (works on internal networks, requires DNS-provider creds wired into cert-manager values). On nip.io always `http01`. On `custom-fqdn` default `http01`, ask if operator wants `dns01`. +- **Exposed services** (multiSelect): `api`, `dashboard`, `vm-exportproxy`, `cdi-uploadproxy`. Default `api,dashboard`. +- **Networking CIDRs** (collapsed by default; ask only if operator explicitly wants non-defaults). **Read defaults from the cozystack source** — do not hardcode. The skill greps for the canonical values in this resolution order: + + 1. `~/git/github.com/cozystack/cozystack/packages/core/platform/values.yaml` (operator's local clone) + 2. `~/git/github.com/cozystack/cozystack/packages/core/kubeovn/values.yaml` (older layouts) + 3. 
URL fallback `https://raw.githubusercontent.com/cozystack/cozystack/<version-tag>/packages/core/platform/values.yaml` (HTTP fetch, cached for 24 h alongside `state.research_cache`) + + ```bash + POD_CIDR=$(yq '.kubeovn.podCIDR // .cozystack.podCIDR // ""' "$COZYSTACK_VALUES") + SVC_CIDR=$(yq '.kubeovn.serviceCIDR // .cozystack.serviceCIDR // ""' "$COZYSTACK_VALUES") + JOIN_CIDR=$(yq '.kubeovn.joinCIDR // ""' "$COZYSTACK_VALUES") + ``` + + cozystack v1.3.3 ships `podCIDR: 10.244.0.0/16`, `serviceCIDR: 10.96.0.0/16`, `joinCIDR: 100.64.0.0/16`. Hardcoded `10.42.x` / `10.43.x` (k3s defaults) would mismatch and break Cilium pod-network reconciliation. + + Surface the resolution to the operator with the source path so they know which file informed the default: + + ```text + podCIDR: 10.244.0.0/16 (default from cozystack@v1.3.3/packages/core/platform/values.yaml:42) + serviceCIDR: 10.96.0.0/16 (default from same) + joinCIDR: 100.64.0.0/16 (default from same) + ``` + + Most operators take the defaults; on Edit, validate the new CIDR doesn't overlap with host networks the cluster sees. +- **API server endpoint**: default `https://api.<publishing-host>:6443`. Explain it lands in client kubeconfigs. +- **extractedprism opt-out**: default `enabled` on the generic variant; always disabled on Talos (KubePrism is built in). On generic, the operator can flip to `--no-extractedprism` and supply a single CP IP / VIP / external LB IP themselves. + +### Consolidated review + +Render every slot above on a single screen. Defaults are marked `(default)`. The operator answers `Approve all` to proceed, or `Edit <slot>` to fix one and re-render.
+ +```text +cozystack:wizard — collected values for the whole chain + +target: bare-metal-talos +chain: cozystack:talos-bootstrap → cozystack:cluster-install +operator language: ru +config dir: /Users/me/cozystack-lab +sops: disabled + +bootstrap (talos route): + nodes: 3 (all cp+workload) + node0 10.17.100.11/24 (VIP-link static) — public 158.101.x internal 10.17.0.128 + node1 10.17.100.12/24 — public 129.158.x internal 10.17.0.27 + node2 10.17.100.13/24 — public 157.151.x internal 10.17.0.173 + reach mode: public (workstation talks to nodes' public IPs via talosctl) + cp endpoint: https://10.17.100.10:6443 (shared VIP) + vip link: ens5 10.17.100.0/24 + boot method: already-in-maintenance-mode (all 3 nodes) + +cozystack-stage policy: + bundles: system, paas, iaas, naas (isp-full) + storage layout (per node): node0 single, node1 single, node2 single (Recommended for 1-disk nodes) + zpool / linstor name: data / data + publishing host: 10-17-0-128.nip.io (kind: nip.io — DNS auto-resolves) + external IPs strategy: internal (10.17.0.128, 10.17.0.27, 10.17.0.173) + ↑ OCI 1:1 NAT — public IPs would not match Cilium externalIPs + cert solver: http01 + exposed services: api, dashboard + api server endpoint: https://api.10-17-0-128.nip.io:6443 + extractedprism: disabled (Talos — KubePrism built in) + networking CIDRs: podCIDR=10.244.0.0/16 serviceCIDR=10.96.0.0/16 joinCIDR=100.64.0.0/16 (defaults read from cozystack@v1.3.3/packages/core/platform/values.yaml) + +options: + - Approve all — write .state.yaml, proceed to Phase 5 dispatch + - Edit <slot> — e.g. Edit external IPs strategy, Edit publishing host + - Cancel +``` + +Write the approved values to `<config-dir>/.state.yaml` under `inventory`, `cluster`, and the new top-level `cozystack_intake` (see `references/state-schema.md`). Downstream skills read these on entry and skip questions whose answers are already there. The cluster-install Phase 4 then becomes a fast-path: re-render the same summary read-only, ask `Approve / Edit <slot>` once, and proceed.
+ +## Phase 4.5 — Active research on the specific combination (read-only, skeptical) + +Before Phase 5 dispatch, run a focused research pass against the concrete combination the operator picked: `target` × `intent_hints.platform` × `cozystack_intake.installer_variant` × `state.cluster.k8s_version` / `talos_version` × the specific bundles. The goal is to surface **known landmines for this combination** so the operator hears about them before execution starts, not after a 30-minute install fails on something well-documented. + +This phase is **read-only and time-boxed**. It does not mutate state, does not contact the target cluster, and finishes in under ~2 minutes. If research stalls or returns nothing useful, the phase completes silently — its absence does not block the install. + +### Why this is at runtime, not bundled + +Skills do not ship a static knowledge base of "known issues per platform". That data ages quickly (Cozystack releases monthly; Talos releases monthly; provider behaviour changes), and operators install across versions and providers the skill author never tested. Instead the wizard does the research **at the time of install**, against the actual versions in play, with current docs and current upstream issue trackers. + +The skill's job is to know **how to look** and what to **skeptically verify** — not to memorise answers. + +### Sources to consult, in order of trust + +1. **Operator's local clones if present** — `~/git/github.com/cozystack/website/content/en/docs/v{MAJOR.MINOR}/`, `~/git/github.com/cozystack/cozystack/` (for source-of-truth on chart values, release notes), `~/git/github.com/cozystack/talm/`. Grep for the platform / scenario terms (`oci`, `hetzner`, the chosen variant). Local clones are highest trust — they're the actual docs for the version the operator picked, not a possibly-stale model recollection. +2. 
**Upstream issue trackers** — `gh issue list --repo cozystack/cozystack --state open --search " "`, same for `cozystack/talm`, `piraeusdatastore/piraeus-operator`. Open issues at install time are landmines the operator can avoid by adjusting their plan. +3. **Web search** — only when local clones and issue trackers don't cover the combination. Search forms: + - `"cozystack v1.3" "" issue` — find blog reports and discussions + - `"talm" "" maintenance mode` — provider-specific Talos quirks + - `"piraeus" " "` — DRBD/kernel-module compatibility +4. **The operator's own prior conversation** — `intent_summary` may have mentioned a previous failed attempt; reference that explicitly in the research scope. + +### Skeptical verification rules — non-negotiable + +Web search and the model's prior knowledge are both prone to hallucination on niche infrastructure topics. Every claim the research surfaces must be backed by a **traceable source** before it gets shown to the operator: + +- **No "I recall that…"** — if the source isn't a URL, a file path, or a GitHub issue number, the claim does not appear in the findings. +- **Cite the source line** — `cluster-install/references/provider-pitfalls.md:42`, `cozystack/cozystack#1234`, `https://docs.cozystack.io/v1.3/...`. The operator should be able to click through. +- **Recency check** — if the source is a blog post older than 12 months and the cited version is older than the operator's, flag it: "This may be stale — the bug it describes was fixed in vX.Y.Z; only relevant if your install is on the older path." +- **Reproduce-or-not flag** — distinguish "documented in upstream docs" (high confidence) from "reported once in a blog post, not yet in docs" (low confidence). Different actionability. +- **No "I'd expect …" or "It might also affect …"** — if a finding is speculation, drop it. Speculative landmines waste operator attention. 
+ +### What to look for, per axis + +When `intent_hints.platform` is set, scope research to that platform: + +- **`oci`** — VLAN/secondary-VNIC interface quirks (VIP placement, IPv4 in maintenance mode); 1:1 NAT effect on `Service.externalIPs`; OCI's required `paravirtualized` launch mode for QCOW2 custom images. +- **`hetzner`** — RobotLB vs MetalLB (L2 ARP doesn't cross hosts); vSwitch + VLAN setup; rescue-mode + GRUB serial-console artefacts; Secure Boot interaction with rescue-image installs. +- **`aws-with-eip`** — Elastic IP not on interface (similar to OCI NAT); NLB proxy-protocol headers in TCP streams. +- **`gcp-with-nat`** — same as OCI for Cloud-NAT'd public IPs. +- **`bare-metal`** — Secure Boot + unsigned `.ko` (piraeus DRBD runtime compile); MOK enrollment; kernel-lockdown rejection of modules. + +When `target: bare-metal-talos`, additionally scope: + +- talm + `intent_hints.cozystack_version` compatibility (`boot-to-talos` hardcoded default Talos versions can diverge from what cozystack expects). +- Talos PSA `baseline` vs `kubectl debug node` (k8s 1.25+ blocks privileged debug pods on default namespaces). +- system-extension namespacing (zfs / drbd userspace not on host rootfs). + +When `cozystack_intake.bundles` includes `system`: + +- root Tenant ingress patch needed (race against tenants CRD landing). +- StorageClasses not auto-created on v1.3.x (gone in v1.4+). + +When `cozystack_intake.installer_variant: generic` and `extractedprism.enabled: true`: + +- chart version vs cozystack-installer version matrix. + +This is a starting checklist, not exhaustive — the research pass is allowed to surface anything else the search turns up, as long as the verification rules above hold. + +### Output shape + +Append the findings to the consolidated summary the operator already approved in Phase 4. 
Re-show as a separate section before transitioning to Phase 5: + +```text +cozystack:wizard — known landmines for your specific combination + + target: bare-metal-talos × oci × cozystack v1.3.3 × talos v1.13.0 + + 1. [HIGH] OCI 1:1 NAT — Service.externalIPs must be VCN-internal + source: cozystack docs v1.3/install/providers/oracle-cloud.md + why: OCI fabric rewrites public IPs to internal before the kernel + sees them; Cilium externalIPs BPF won't match on public IPs. + wizard auto-applied: cozystack_intake.external_ips.strategy = internal + + 2. [HIGH] VIP link on OCI VLAN secondary lacks IPv4 in maintenance mode + source: cozystack/cozystack# + own retro + why: talm template renders LinkConfig without addresses; etcd join + hangs. Wizard collected intent_hints.vip.per_node addresses + to auto-patch nodes/*.yaml before talm apply. + wizard auto-applied: 3 per-node /24 addresses recorded + + 3. [MEDIUM] cozystack v1.3.3 does not create StorageClasses automatically + source: cozystack/cozystack@v1.3.3 packages/system/linstor/ + templates/storageclass.yaml.disabled + why: tenants CRD spec.storageClasses lands only in v1.4. + mitigation: cluster-install Phase 8.6 will apply local + replicated SCs. + + 4. [LOW / stale-check] "OCI dashboard install fails after 5 min" + source: blog post 2024-08-12 — describes v1.2 behaviour + why: superseded by inline Tenant ingress patch in v1.3.x; flagged + only because keywords matched. + + options: + - Acknowledged — proceed to Phase 5 dispatch + - Pause — let me investigate one of these before continuing + - Edit Phase 4 values — go back and change the plan based on a finding +``` + +The operator's `Acknowledged` is the gate into Phase 5. `Pause` stops the wizard with the cluster config dir intact; operator can re-invoke with `--resume`. `Edit Phase 4 values` jumps back to Phase 4 with the offending slot pre-focused. 
+ +If the research pass returns **no findings** (clean combination, nothing flagged), print one quiet line: `Phase 4.5 research: no known landmines for this combination.` and proceed automatically — no gate, no friction. + +### Caching + +If `state.research_cache` exists and is `< 24 h` old AND `cozystack_intake` hasn't changed since the cache was written, skip the research pass and use the cached findings. The combination of (cozystack_version, talos_version, platform, variant, bundles) is the cache key. `--resume` honours the cache; explicit `--rerun-research` forces a fresh pass. + +## Phase 5 — Dispatch loop + +For each skill in `state.route`: + +1. Mark `state.status.<skill>` = `dispatched_at: <timestamp>` and write `.state.yaml`. +2. Print to the operator: + + ```text + cozystack:wizard — handing off to /cozystack:<skill> + + config dir: <config-dir> + state file: <config-dir>/.state.yaml + ``` + +3. **Instruct Claude in the next message** to invoke `/cozystack:<skill>` and wait for its summary. Use this exact phrasing: `Next, invoke /cozystack:<skill> — it will resume from <config-dir>/.state.yaml.` + + **Do not set a ScheduleWakeup fallback** before dispatching the downstream skill. Each downstream skill owns its own task-notification lifecycle (Phase 8 watch loop, ansible run, talm apply — they notify Claude when done). A wizard-side wakeup fires regardless of whether the dispatched work finished, costs a turn re-checking state that hasn't changed, and confuses the dispatch decision (was the wake from the wakeup, or from the skill completing?). Trust the downstream notification. If a downstream skill is genuinely stuck without notifying, the operator surfaces it interactively — wakeup-as-deadman-switch is the wrong mechanism here. +4. After Claude returns from the downstream skill, re-read `.state.yaml`. Expect exactly one of `state.status.<skill>.completed_at` or `state.status.<skill>.failed_at` to be set. If `failed_at` is set, **automatically dispatch `/cozystack:debug`** to investigate and (when possible) resolve before deciding next step.
If **neither** is set, treat that as a broken contract (see Guardrails) — synthesise `failed_at: "contract-violation: skill returned without writing status"` and dispatch debug anyway. + + The debug skill writes `status.debug.{classification, action, ...}`. After it returns: + + - `action: resolved` (operator-error or config-drift was fixed) — retry the originally-failing skill: clear its `failed_at`, set `dispatched_at` afresh, hand off again. + - `action: workaround` — same retry path. Workaround is in place; the originally-failing skill should succeed now. + - `action: issue-drafted` without a workaround — wizard pauses, surfaces the issue draft path and the `gh` command the operator can run later, then offers `Skip the failing step (cluster will be incomplete) / Cancel install`. + - `action: no-action` — debug couldn't determine the cause. Offer `Retry the failing skill / Skip / Cancel`. + +5. On success (`completed_at` set), continue to the next entry. + +The loop is the wizard's main job — sit between every downstream skill, verify the state transition, decide the next step. 
+ +## Phase 6 — Final summary + +After all skills in the chain report success, print a chain-level NOTES block: + +```text +cozystack:wizard — install complete + +config dir: /Users/me/cozystack-lab +route taken: + ✓ cozystack:ubuntu-bootstrap completed 17:25 UTC (20:25 MSK) + ✓ cozystack:cluster-install completed 18:40 UTC (21:40 MSK) + +cluster: + context: cozystack-lab + kubeconfig: /Users/me/cozystack-lab/kubeconfig.yaml + api: https://10.0.0.10:6443 + +cozystack: + installer variant: generic # cozy-installer chart variant + platform variant: isp-full # platform-package overlay + bundles: system, paas, iaas, naas + dashboard: https://dashboard.10-0-0-50.nip.io + +artifacts under <config-dir> (safe to commit, see .gitignore for exclusions): + inventory.yml + cozystack-platform-package.yaml + extractedprism-values.yaml + +next: + list tenants: kubectl --context cozystack-lab get tenants.apps.cozystack.io -A + upgrade later: /cozystack:cluster-upgrade +``` + +The detailed per-skill NOTES (storage breakdown, Keycloak admin path, etc.) were printed by each downstream skill at its own final phase; the wizard summary intentionally stays high-level. + +## Guardrails + +- NEVER perform cluster mutations directly. Every mutation lives in a downstream skill. +- NEVER skip the route approval gate in Phase 3. +- NEVER hand off to the next skill without verifying the previous one wrote `completed_at` to `.state.yaml`. +- NEVER ignore a returned skill that wrote **neither** `completed_at` **nor** `failed_at`. That's a broken contract — surface it explicitly: `cozystack:wizard — skill /cozystack:<skill> returned but did not write status.<skill>.completed_at or .failed_at; treating as failed_at='contract-violation' and dispatching /cozystack:debug.` Don't silently retry; the missing write means the skill exited in an unknown state and the next dispatch decision can't be made safely. +- NEVER silently overwrite an existing state file from a different session — always ask resume / fresh / cancel.
+- NEVER `git init` / `git add` / `git commit` inside the config dir. Git ops are operator-side. +- AVOID picking a config dir under `/tmp` or another ephemeral location by default — the artifacts are meant to survive. Allow it only when (a) `--allow-ephemeral` was passed, (b) the operator explicitly named a `/tmp/...` path AND signalled test/scratch intent (keywords like "test", "scratch", "throwaway", "ephemeral", "discard after", or the equivalents in their natural language), or (c) Phase 1's 4th option "scratch dir under $TMPDIR (will be lost on reboot)" was chosen. Always surface a one-line warning when ephemeral path is in use. +- ALWAYS read `.state.yaml` after every downstream invocation — don't assume the skill succeeded just because Claude returned. +- ALWAYS leave the operator with a clear next-step command if the chain pauses (failure, manual checklist in talos-bootstrap, etc.). + +## References + +- `references/routes.md` — target × chain decision table with the reasoning. +- `references/state-schema.md` — `.state.yaml` narrative shape, which fields each skill reads and writes. +- `references/state.schema.json` — machine-readable JSON Schema for `.state.yaml`. Skills validate before writes; `tools/check-refs.sh` can be extended to validate state files in fixtures. Operators / contributors can drive yaml-language-server with `# yaml-language-server: $schema=https://raw.githubusercontent.com/cozystack/ccp/main/plugins/cozystack/skills/wizard/references/state.schema.json` at the top of `.state.yaml`. +- `references/sops.md` — sops opt-in (Phase 1.5): file-by-file decision matrix, age key resolution, `.sops.yaml` block, encrypt-after-write pattern, talm interaction, on↔off toggling. + +Downstream skills: + +- `/cozystack:talos-bootstrap` — Talos node prep (v1: manual-steps handoff). +- `/cozystack:ubuntu-bootstrap` — Ubuntu / Debian bootstrap via ansible-cozystack wrapper. +- `/cozystack:cluster-install` — Cozystack on a ready Kubernetes cluster. 
+- `/cozystack:debug` — auto-dispatched by Phase 5 when any chain step writes `failed_at`. Investigates, classifies (operator error / config drift / upstream bug / not-supported), applies workaround when possible, drafts upstream issue on approval. diff --git a/plugins/cozystack/skills/wizard/references/routes.md b/plugins/cozystack/skills/wizard/references/routes.md new file mode 100644 index 0000000..a3f5abd --- /dev/null +++ b/plugins/cozystack/skills/wizard/references/routes.md @@ -0,0 +1,35 @@ +# Routes: target × chain + +After the v1 refactor the wizard supports exactly **three** routes plus one refusal. Anything more nuanced lives in the downstream skill, not in the wizard interview. + +| Target | Chain | Notes | +| ----------- | ----------- | ----------- | +| Bare-metal Talos | `talos-bootstrap` → `cluster-install` | Talos image factory + talm + manifests dir — handled by `talos-bootstrap` v1 stub (guided checklist; full automation is a follow-up). | +| Bare-metal Ubuntu / Debian | `ubuntu-bootstrap` → `cluster-install` | `ubuntu-bootstrap` is an ansible wrapper around upstream `cozystack/ansible-cozystack/examples/ubuntu/`. Covers OS prep, drbd-dkms / ZFS / KubeVirt, k3s install. | +| Existing Kubernetes (self-managed or managed) | `cluster-install` | Same skill handles vanilla self-managed (kubeadm / k3s / RKE2 already in place) and managed (EKS / GKE / AKS / DOKS). `cluster-install` picks the right installer variant — `generic` or `hosted` — from cluster lookup. | +| Existing Cozystack | **refuse** → `cozystack:cluster-upgrade` | `cozy-system` namespace already holds pods; this is an upgrade scenario, not an install. | + +## Recommended-by-load hints + +Surface these in the Phase 2 question text so click-ops operators have a hint, but never force the choice: + +- **General-purpose / VMs / databases** → Talos (Recommended for prod). Immutable, predictable kernel modules, all cozystack extensions baked into the cozystack-tuned image. 
+- **GPU workloads / custom userspace drivers** → Ubuntu. NVIDIA / AMD driver paths are more turnkey on Ubuntu than on Talos; custom kernel modules outside Talos's extension catalog need a generic Linux. +- **Cloud / managed Kubernetes** → Existing cluster. Provider runs the control plane; cozystack runs only the workload-layer (`hosted` variant — no LINSTOR, no KubeVirt). + +## Refusal flows + +- **Existing Cozystack** — wizard prints: "This cluster already runs Cozystack. Run `/cozystack:cluster-upgrade` to upgrade. If you want to wipe and reinstall, delete `cozy-system` namespace and any `package.cozystack.io` CRs manually first, then re-run wizard." +- **Managed k8s where Phase 3 of `cluster-install` would refuse** — managed providers (EKS et al.) don't allow `kubectl debug node` reliably; `cluster-install` notices and runs in hosted-variant mode that doesn't need node-level mutations. +- **Unsupported target** — Windows, k8s versions below the floor in `cluster-install/references/requirements.md`, etc. — wizard refuses early. + +## Why the OS-axis question, not a workload-axis question + +An earlier draft asked "what workload?" first (general / GPU / VMs / databases) and inferred OS. That added a layer of indirection without removing any question — the operator still ended up picking Talos vs Ubuntu eventually, just one screen later. The current shape (one OS-axis question with workload hints in the prompt text) is shorter for the operator and exactly as informative. + +## What the wizard does NOT decide + +- `apiServerHost` (Talos KubePrism vs extractedprism vs operator-supplied) — `cluster-install` Phase 4. +- Storage backend (ZFS, no choice) — `cluster-install` Phase 4 + Phase 5.5. +- DNS / cert-manager solver — `cluster-install` Phase 4. +- k3s / kubeadm / RKE2 — wizard picks k3s via `ubuntu-bootstrap` (v1). Other distributions are out of scope until there's a real reason. 
diff --git a/plugins/cozystack/skills/wizard/references/sops.md b/plugins/cozystack/skills/wizard/references/sops.md new file mode 100644 index 0000000..052ad41 --- /dev/null +++ b/plugins/cozystack/skills/wizard/references/sops.md @@ -0,0 +1,144 @@ +# sops opt-in + +`cozystack:wizard` Phase 1.5 offers to encrypt secret files in the cluster config directory with sops + age. This document is the source of truth for which files get encrypted, how the age key is resolved, and how operators decrypt / edit after the fact. + +## Why sops + +Without sops, the wizard's `.gitignore` keeps secret files out of git. That's fine for solo workflows but leaves the operator without a reproducible record — disaster recovery means copying files from a workstation that might not exist later. + +With sops, the encrypted forms can be committed safely. Decryption is automatic when sops finds the age private key. Restore = `git clone` + having the age key handy. + +The skill never invents secrets at runtime — every value written is either supplied by the operator or generated by an upstream tool (k3s token, talos PKI, etc.). sops just protects them at rest. + +## File-by-file decision matrix + +| File | Owner skill | Encrypt when sops on? | Why | +|---|---|---|---| +| `talosconfig` | talos-bootstrap | by talm via `.sops.yaml` | talm respects the shared `.sops.yaml` and emits encrypted output natively; the skill doesn't double-encrypt. | +| `nodes/.yaml` | talos-bootstrap | by talm via `.sops.yaml` | Same — talm output. | +| `secrets.yaml` (talos bundle) | talos-bootstrap | by talm via `.sops.yaml` | Same. | +| `kubeconfig.yaml` | ubuntu-bootstrap / talos-bootstrap | yes — skill encrypts after write | Contains TLS client cert + key; high-value secret. | +| `.state.yaml` | every skill | yes — skill encrypts after write | Carries collected inputs (tokens, host lists, SSH key paths). Less sensitive than kubeconfig but still operator state. 
| +| `inventory.yml` | ubuntu-bootstrap | yes — skill encrypts after write | Carries k3s `token` (joining nodes can use it to compromise the cluster) and possibly `ansible_become_password`. | +| `cozystack-platform-package.yaml` | cluster-install | yes — skill encrypts after write | May embed operator-supplied creds via `components.*.values.*.password` etc. | +| `extractedprism-values.yaml` | cluster-install | yes — skill encrypts after write | No secrets today, included for symmetry — turning sops off and on shouldn't change which files are commit-friendly. | + +The `.gitignore` cozystack section adjusts accordingly: when sops on, only items genuinely outside scope (private age key, ad-hoc `*.tar.gz` diagnostic bundles) stay ignored; the secret files above are encrypted-in-tree and tracked. + +## Age key resolution + +Phase 1.5 looks in this order, presents each find to the operator, asks them to confirm which key to use: + +1. **`/.sops.yaml`** — if a `.sops.yaml` already exists in the config dir, read its `creation_rules[].age` recipients. This covers re-running the wizard on a directory that was set up earlier. + +2. **`$SOPS_AGE_KEY_FILE`** — environment variable, common in CI. Read the public key from the file's `# public key: age1...` comment. + +3. **`~/.config/sops/age/keys.txt`** — the sops/age default. Same `# public key:` comment. + +4. **None found** — print the generation command and ask the operator to approve: + + ```bash + mkdir -p ~/.config/sops/age + age-keygen --output ~/.config/sops/age/keys.txt + chmod 0600 ~/.config/sops/age/keys.txt + ``` + + Surface the resulting public key + a hard warning: the private key in `~/.config/sops/age/keys.txt` is the only way to decrypt these files. Back it up to a password manager / hardware token / offline copy **before** committing anything encrypted to a shared repo. Losing the key = losing the cluster's bootstrap secrets. + +The skill **does not** copy the private key into the config dir. 
Operators who want a per-cluster key generate it themselves and add the public key to `.sops.yaml`; the private key always lives outside the cluster config directory. + +## `.sops.yaml` block the wizard writes + +```yaml +creation_rules: + - path_regex: "(^|.*/)kubeconfig\\.yaml$" + age: &cozystack_recipients + - age1abc... + - path_regex: "(^|.*/)\\.state\\.yaml$" + age: *cozystack_recipients + - path_regex: "(^|.*/)inventory\\.yml$" + age: *cozystack_recipients + - path_regex: "(^|.*/)cozystack-platform-package\\.yaml$" + age: *cozystack_recipients + - path_regex: "(^|.*/)extractedprism-values\\.yaml$" + age: *cozystack_recipients + - path_regex: "(^|.*/)nodes/.*\\.yaml$" + age: *cozystack_recipients + - path_regex: "(^|.*/)talosconfig$" + age: *cozystack_recipients + - path_regex: "(^|.*/)secrets\\.yaml$" + age: *cozystack_recipients +``` + +The talos-related entries cover talm's outputs so talm's own `.sops.yaml` lookup finds matching rules and emits encrypted artefacts directly. + +If a `.sops.yaml` already existed (operator-managed), the wizard **merges** rather than overwriting — preserves any custom rules. Easiest merge is to read the existing file, dedupe by `path_regex`, append the cozystack rules that aren't already there. + +## Encrypt-after-write pattern in downstream skills + +Every skill that writes a secret file does: + +```bash +# After plain-text write: +if [ "$(yq '.sops.enabled // false' "$CONFIG_DIR/.state.yaml")" = "true" ]; then + sops --encrypt --in-place "$CONFIG_DIR/<file>" +fi +``` + +Or the equivalent inline check via the Skill tool's own state-reading logic. The encrypt is **not** idempotent — `sops --encrypt` exits non-zero ("file already encrypted") when the input already carries top-level `sops` metadata, so run it only immediately after a plain-text write, never on a file that is already encrypted.
+ +## Decrypt / edit workflow for operators + +Direct decrypt: + +```bash +sops --decrypt <config-dir>/kubeconfig.yaml > /tmp/kubeconfig.yaml +KUBECONFIG=/tmp/kubeconfig.yaml kubectl get nodes +``` + +Or open the file in sops's editor mode: + +```bash +sops <config-dir>/kubeconfig.yaml # opens $EDITOR with the decrypted form +``` + +For ad-hoc decryption of specific values without writing a plain file: + +```bash +sops --decrypt --extract '["clusters"][0]["cluster"]["server"]' <config-dir>/kubeconfig.yaml +``` + +Re-running the wizard on a sops-enabled dir does **not** require the operator to decrypt — sops reads the encrypted file natively when the skill calls `yq` / `kubectl` through sops integration, or the skill can shell out to `sops --decrypt` for its own reads. The exact decrypt mechanism is per-skill and documented in each skill's SKILL.md. + +## Switching from sops-off to sops-on (and back) + +Off → on: + +1. Re-run `/cozystack:wizard --sops` on the same config dir. +2. Phase 1.5 sees `state.sops.enabled = false` (or absent), goes through the opt-in flow. +3. Existing secret files in the config dir are encrypted in place via `sops --encrypt --in-place` over each. +4. `.gitignore` is updated — secret-file lines removed. + +On → off: + +1. Re-run `/cozystack:wizard --no-sops` on the same config dir. +2. Phase 1.5 sees `state.sops.enabled = true`, decrypts every secret file via `sops --decrypt --in-place`. +3. `.gitignore` is updated — secret-file lines added back. +4. `.sops.yaml` stays in place (so future opt-in finds the same recipients); operator can `git rm` it if they want it gone entirely. + +## Talm interaction + +talm reads `.sops.yaml` from its working directory (which is `<config-dir>` for `cozystack:talos-bootstrap` v1). Talm's own secrets handling: + +- `talm gen secrets` writes `secrets.yaml` encrypted if `.sops.yaml` covers it. +- `talm gen config` / `talm apply` work against encrypted `nodes/<name>.yaml` files transparently.
+ +`cozystack:talos-bootstrap` does not call `sops` itself for talos artefacts — it relies on talm's native handling. The wizard's `.sops.yaml` block above includes the right `path_regex` patterns so talm's lookup matches. + +## What sops does NOT cover + +- Private key safety — operator's responsibility (backup, rotation, hardware-token integration). +- Recipient management for multi-operator teams — the wizard adds one recipient (the operator's age public key). Adding a teammate's key means editing `.sops.yaml` and re-encrypting via `sops updatekeys`. That's a manual step today; no skill automates it yet. +- Key rotation — `age-keygen` produces a new keypair, `sops updatekeys` re-wraps every encrypted file with the new recipient list, then the operator commits the result. Not automated yet. + +If any of those becomes a real friction point in practice, that's the trigger to write a dedicated cozystack sops-helper skill. For now sops opt-in is bring-your-own-key, single-recipient by default. diff --git a/plugins/cozystack/skills/wizard/references/state-schema.md b/plugins/cozystack/skills/wizard/references/state-schema.md new file mode 100644 index 0000000..f772620 --- /dev/null +++ b/plugins/cozystack/skills/wizard/references/state-schema.md @@ -0,0 +1,302 @@ +# .state.yaml schema + +Single source of truth for what each skill in the chain reads and writes. Stored at `/.state.yaml`. Append-only in spirit — skills add their own sections, but don't overwrite sections owned by another skill (`status.` is the only field the wizard updates between dispatches). + +## Top-level keys + +```yaml +created_at: "2026-05-15T17:00:00Z" # ISO-8601 UTC, written once by wizard +session_id: "20260515-170000" # filename-friendly UTC timestamp +config_dir: "/Users/me/cozystack-lab" # absolute path, written by wizard Phase 1 +intent_summary: "..." 
# free-form recap from wizard Phase 0 +intent_hints: {...} # parsed key/values from wizard Phase 0 +operator_language: "ru" # detected language code (ISO 639-1), for skill prompts +target: "bare-metal-ubuntu" # bare-metal-talos / bare-metal-ubuntu / existing +route: ["ubuntu-bootstrap", "cluster-install"] +sops: {...} # secret-encryption settings — Phase 1.5 +inventory: {...} # bootstrap-only +cluster: {...} # filled after bootstrap or upfront +cozystack_intake: {...} # operator's policy decisions — wizard Phase 4 +cozystack: {...} # discovery + execution outcome — cluster-install +status: {...} # per-skill outcomes +``` + +## `intent_summary` and `intent_hints` + +Written by wizard Phase 0 from the operator's free-form answer to "tell me what you're doing and what's already in place". + +```yaml +intent_summary: "Operator wants to install Cozystack on 3 Hetzner Ubuntu 24.04 servers, no GPU, no public domain (will use nip.io). Has tried before, dashboard never came up." +intent_hints: + target: "bare-metal-ubuntu" # used to pre-fill Phase 2 target + node_count: 3 # used to size inventory in Phase 4 + distribution_hint: "k3s" # ubuntu-bootstrap default + domain_hint: "nip.io" # cluster-install Phase 4 publishing.host + prior_failure: "cozy-dashboard/dashboard never reached Ready" + # signals to start chain with debug + hardware_provider: "hetzner" # informational + workload_class: "general" # general / gpu / vms / databases + platform: "metal" # talos platform — metal / nocloud / aws / oci / azure / gcp + # used by talos-bootstrap to pick the right installer profile + cloud_hint: "oci" # free-form short tag for downstream routing + # (OCI custom-image instructions, AWS AMI flow, etc.) 
+ reach_mode: "public" # how workstation reaches nodes — public / internal / vip + # determines IP set used in talosctl --nodes + cp_endpoint: "https://10.17.100.10:6443" # VIP / single CP IP / external LB — used for + # talm init --cluster-endpoint and in machine-config + vip: # only when cp_endpoint is a VIP + address: "10.17.100.10" + link: "ens5" + subnet: "10.17.100.0/24" + mtu: 9000 +``` + +The list is open-ended — any structured key the operator's free-form answer maps to is fair game. Downstream skills read `intent_hints` to skip questions whose answers are already known. + +## `operator_language` + +Detected from the operator's free-form Phase 0 answer (or first message before that). ISO 639-1 code: + +- `ru` — Russian +- `en` — English +- `de` — German +- etc. + +Every skill matches this language in its prompts, AskUserQuestion option labels, summaries, and gate messages. Code identifiers, command examples, file paths, and any text destined for GitHub stay in canonical form (usually English). Skills do not ask the operator which language to use — `operator_language` is filled once in Phase 0 and reused. + +## `sops` + +Written by wizard Phase 1.5. Every downstream skill reads `sops.enabled` before each secret-file write and runs `sops --encrypt --in-place ` after the plain write if true. + +```yaml +sops: + enabled: true + recipients: # everyone who can decrypt; usually one operator + maybe a shared team key + - "age1abc..." + config_path: "/.sops.yaml" # absolute path to the .sops.yaml + # When enabled is false, no other field matters and skills skip the encrypt step. +``` + +If `sops.enabled` is missing or false, the skills behave as before — secret files land in plain text and `.gitignore` keeps them out of commits. See `references/sops.md`. + +## `inventory` + +Only present when the chain starts with a bootstrap skill. Written by the wizard during Phase 4 interview; refined by the bootstrap skill if it learns more (e.g. 
real hostnames after `hostnamectl`). + +```yaml +inventory: + ssh_user: "ubuntu" + ssh_key: "/Users/me/.ssh/cozystack-lab" + nodes: + - host: "10.0.0.10" # external / ansible_host + internal_ip: "10.0.0.10" # used as inventory key for ansible-cozystack + role: "cp" + name: "cp1" + - host: "10.0.0.11" + internal_ip: "10.0.0.11" + role: "cp" + name: "cp2" + - host: "10.0.0.12" + internal_ip: "10.0.0.12" + role: "cp" + name: "cp3" + - host: "10.0.0.20" + internal_ip: "10.0.0.20" + role: "worker" + name: "w1" + vip: "" # optional virtual IP; empty = use cp1.host as tls-san +``` + +Constraints: + +- `nodes[].role` is `cp` or `worker`. +- For HA: at least 3 `cp` nodes (embedded etcd raft). +- For single-node sandbox: one `cp`, zero or more workers. + +## `cluster` + +Written by whichever bootstrap skill ran (`talos-bootstrap` or `ubuntu-bootstrap`) **or** filled directly by the wizard for the "existing k8s" target. + +```yaml +cluster: + context: "cozystack-lab" + kubeconfig: "/Users/me/cozystack-lab/kubeconfig.yaml" + api_endpoint: "https://10.0.0.10:6443" + distribution: "k3s" # k3s / talos / kubeadm / rke2 / managed-eks / ... + k8s_version: "v1.32.3+k3s1" +``` + +`cluster.context` is what every downstream `kubectl` invocation uses as `--context`. If the operator already had a kubeconfig with the same context name, the bootstrap skill renames the freshly-fetched one (e.g. `cozystack-lab-2`) and surfaces the rename. + +## `cozystack_intake` + +Operator's policy decisions collected by **wizard Phase 4**, read by `cluster-install` Phase 4 to skip re-prompting. Every slot below was historically asked by `cluster-install` *after the cluster existed* — moving them into `wizard` is the front-load contract. Discovery-dependent values (actual device paths, KubeOVN label-value mismatches) are still resolved by `cluster-install` against the live cluster. 
+ +```yaml +cozystack_intake: + # Variant + bundle selection — kept SEPARATE because they map to different + # cozy-installer chart inputs: + # --set cozystackOperator.variant= (chooses cozy-installer behaviour) + # the chart then loads packages/core/platform/values-.yaml as overlay + # Both names come from upstream cozystack files; do not invent values. + # Real upstream platform_variant overlays in cozystack v1.3.x: + # default, isp-full, isp-full-generic, isp-hosted + # Typical pairing: + # installer_variant=talos ↔ platform_variant=isp-full + # installer_variant=generic ↔ platform_variant=isp-full-generic + # installer_variant=hosted ↔ platform_variant=isp-hosted + bundles: ["system", "paas", "iaas", "naas"] + installer_variant: "talos" # generic / talos / hosted (derived from target + workload_class) + platform_variant: "isp-full" # default / isp-full / isp-full-generic / isp-hosted + + # Storage layout preference (Talos / Ubuntu routes; "hosted" target skips) + storage_pref: + layout_per_node: + node0: "single" # single / mirror / raidz + node1: "single" + node2: "single" + zpool_name: "data" + linstor_pool_name: "data" + + # Networking — defaults READ from cozystack source values.yaml at wizard Phase 4 + # (not hardcoded). See wizard SKILL.md Phase 4 for the resolution order. 
+ network: + pod_cidr: "10.244.0.0/16" + service_cidr: "10.96.0.0/16" + join_cidr: "100.64.0.0/16" + defaults_source: "cozystack@v1.3.3/packages/core/platform/values.yaml" + + # Publishing — host, certs, exposure + publishing: + host: "10-17-0-128.nip.io" + host_kind: "nip.io" # nip.io / custom-fqdn + domain_ownership_confirmed: true # always true for nip.io; explicit operator confirm for custom-fqdn + cert_solver: "http01" # http01 / dns01 + exposed_services: ["api", "dashboard"] + api_server_endpoint: "https://api.10-17-0-128.nip.io" + + # External IPs strategy — what Service.externalIPs gets populated with + external_ips: + strategy: "internal" # internal / external / explicit + explicit: [] # populated only when strategy=explicit + # Reason recorded so operator and reviewer can see why this choice was made: + reason: "OCI 1:1 NAT — public IPs would not match Cilium externalIPs BPF" + + # kube-apiserver HA + extractedprism: + enabled: true # default for generic; auto-false for talos / hosted + api_host_override: "" # set only when operator passed --no-extractedprism with --api-host= +``` + +Constraints: + +- `external_ips.strategy: internal` is the safe default whenever `Node.status.addresses` has both `InternalIP` and `ExternalIP` that differ. Picking `external` on such providers (OCI, GCP+NAT, AWS+EIP) silently breaks Cilium externalIPs matching — symptom is RST on dashboard ingress even with all HRs Ready. +- `publishing.host_kind: custom-fqdn` requires `domain_ownership_confirmed: true` (explicit operator confirm); the wizard refuses to proceed otherwise. +- `storage_pref.layout_per_node` is a *preference* keyed by node name from `inventory.nodes[].name`. The actual device paths are resolved by `cluster-install` Phase 5.5 against the running cluster (post-bootstrap, devices are discoverable). +- `extractedprism.enabled: false` on `installer_variant: generic` requires `extractedprism.api_host_override` to be a non-empty IP / VIP / external LB endpoint. 
+ +## `cozystack` + +Written by `cluster-install`. Mirrors what gets serialised to `/cozystack-platform-package.yaml`. + +```yaml +cozystack: + installer_variant: "talos" # generic / talos / hosted — same key as cozystack_intake + platform_variant: "isp-full" # default / isp-full / isp-full-generic / isp-hosted — upstream overlay name + bundles: ["system", "paas", "iaas", "naas"] + api_server_host: "127.0.0.1" # or CP1_IP / VIP / "" (hosted) + api_server_port: "7445" + api_server_source: "extractedprism (default)" + storage: + backend: "zfs" # zfs (only supported) + linstor_pool: "data" + nodes: + - name: "cp1" + devices: ["/dev/nvme1n1"] + layout: "single" + zpool: "data" + - name: "cp2" + devices: ["/dev/nvme1n1", "/dev/nvme2n1"] + layout: "mirror" + zpool: "data" + - ... + publishing: + host: "10-0-0-50.nip.io" + api_endpoint: "https://api.10-0-0-50.nip.io" + external_ips: ["10.0.0.50"] + exposure: "externalIPs" + cert_solver: "http01" +``` + +## `status` + +Per-skill state machine. The wizard owns this section. + +```yaml +status: + ubuntu-bootstrap: + dispatched_at: "2026-05-15T17:05:00Z" + completed_at: "2026-05-15T17:25:12Z" + cluster-install: + dispatched_at: "2026-05-15T17:25:30Z" + failed_at: "2026-05-15T17:32:15Z" + error: "STOP GATE 1: br_netfilter missing on node cp2" +``` + +Each skill writes exactly one of `completed_at` or `failed_at` plus an `error` string when failing. The wizard never edits these fields after a skill writes them — only sets `dispatched_at` before handing off and reads `completed_at` / `failed_at` after. 
+ +## Skill responsibilities + +| Skill | Reads | Writes | +| ----------- | ----------- | ----------- | +| `wizard` | everything (verifies progress) | `created_at`, `session_id`, `config_dir`, `target`, `route`, initial `inventory`, initial `cluster.context` (for existing-k8s target), `cozystack_intake.*`, `status..dispatched_at` | +| `talos-bootstrap` | `inventory`, `target`, `config_dir`, `intent_hints.vip.per_node` | `cluster.*`, `status.talos-bootstrap.*` | +| `ubuntu-bootstrap` | `inventory`, `target`, `config_dir` | `cluster.*`, refines `inventory.nodes[].name`, `status.ubuntu-bootstrap.*` | +| `cluster-install` | `cluster.*`, `inventory.nodes[].host`, `cozystack_intake.*`, `config_dir` | `cozystack.*`, `status.cluster-install.*` | +| `debug` | everything (auto-dispatched on any `failed_at`) | `status.debug.*` with `target`, `classification`, `action`, optional `source.{repo,file,line}`, `issue_repo` | + +## `status.debug` + +Written by `cozystack:debug` after Phase 6: + +```yaml +status: + debug: + dispatched_at: "2026-05-15T18:00:00Z" + completed_at: "2026-05-15T18:15:00Z" + target: "hr/cozy-dashboard/dashboard" + classification: "upstream-bug" # operator-error | config-drift | upstream-bug | not-supported + action: "workaround" # resolved | workaround | issue-drafted | no-action + source: # populated only when classification=upstream-bug + repo: "cozystack/cozystack" + file: "packages/system/dashboard/charts/dashboard/templates/gatekeeper.yaml" + line: 47 + summary: "gatekeeper container always dials https://keycloak.${HOST} without TLS skip-verify" + issue_repo: "" # set when operator approved an issue filing + issue_body_path: "" # path to the rendered issue body if drafted +``` + +The wizard's Phase 5 dispatch loop reads `action` to decide what comes next: + +- `resolved` or `workaround` → retry the originally-failing skill (clear its `failed_at`, set fresh `dispatched_at`, hand off again). 
+- `issue-drafted` without a workaround → wizard pauses, offers Skip / Cancel. +- `no-action` → wizard offers Retry / Skip / Cancel. + +## On-disk artifacts (siblings of `.state.yaml`) + +All paths under `<config-dir>/`: + +| File | Written by | Gitignored | Purpose | +|---|---|---|---| +| `.gitignore` | wizard (Phase 1) | no | Excludes secrets + state. Markers `# === BEGIN cozystack ===` / `# === END cozystack ===` let operator add their own rules around the cozystack block. | +| `.state.yaml` | every skill | yes | Chain progress + collected values. Skills share values by reading this file rather than passing them to each other. | +| `inventory.yml` | `ubuntu-bootstrap` Phase 4 | no | Ansible inventory rendered from `state.inventory`. Safe to commit only when sops-encrypted — it carries the k3s join `token` (see `sops.md`). | +| `kubeconfig.yaml` | `ubuntu-bootstrap` Phase 9 / `talos-bootstrap` Phase 6 | yes | Bootstrap output; chmod 0600. | +| `nodes/cp*.yaml` | `talos-bootstrap` (v1: operator-edited from talm template) | no | Per-node Talos machine-config. Safe to commit. | +| `talosconfig` | `talos-bootstrap` | yes | talos client config; contains certs. | +| `cozystack-platform-package.yaml` | `cluster-install` Phase 4 | no | The Package CR. Safe to commit. | +| `extractedprism-values.yaml` | `cluster-install` Phase 5.6 | no | Endpoints list + chart values. Safe to commit. | + +## Resuming + +`/cozystack:wizard --resume` (with `--config-dir` either explicit or inferred from `$PWD`) skips Phase 2/3/4 interviews and goes straight to Phase 5 dispatch from the next not-yet-completed step in `route`. The wizard re-prints the route with completed steps marked ✓ so the operator knows where the chain stands.
diff --git a/plugins/cozystack/skills/wizard/references/state.schema.json b/plugins/cozystack/skills/wizard/references/state.schema.json new file mode 100644 index 0000000..e9e1e8a --- /dev/null +++ b/plugins/cozystack/skills/wizard/references/state.schema.json @@ -0,0 +1,255 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/cozystack/ccp/plugins/cozystack/skills/wizard/references/state.schema.json", + "title": ".state.yaml — Cozystack wizard chain state", + "description": "Single source of truth for what each skill in the chain reads and writes. Lives at /.state.yaml. See state-schema.md for narrative descriptions of each section's purpose; this schema is the machine-readable contract.", + "type": "object", + "additionalProperties": true, + "required": ["created_at", "session_id", "config_dir", "target", "route"], + "properties": { + "created_at": { + "type": "string", + "format": "date-time", + "description": "ISO-8601 UTC. Written once by wizard Phase 1." + }, + "session_id": { + "type": "string", + "pattern": "^[0-9]{8}-[0-9]{6}$", + "description": "Filename-friendly UTC timestamp (YYYYMMDD-HHMMSS)." + }, + "config_dir": { + "type": "string", + "description": "Absolute path. Written by wizard Phase 1." + }, + "operator_language": { + "type": "string", + "pattern": "^[a-z]{2}(-[A-Z]{2})?$", + "description": "ISO 639-1 language code (optional region). Detected from Phase 0 free-form answer." + }, + "intent_summary": { + "type": "string", + "description": "Free-form recap from wizard Phase 0." + }, + "intent_hints": { + "type": "object", + "additionalProperties": true, + "description": "Parsed key/value pairs extracted from Phase 0. 
Open-ended — any structured key the operator's free-form maps to is fair game.", + "properties": { + "target": { "enum": ["bare-metal-talos", "bare-metal-ubuntu", "existing"] }, + "node_count": { "type": "integer", "minimum": 1 }, + "platform": { "enum": ["oci", "hetzner", "aws-with-eip", "gcp-with-nat", "bare-metal", "other"] }, + "reach_mode": { "enum": ["public", "internal", "vip"] }, + "needs_os_install": { "type": "boolean" }, + "cp_endpoint": { "type": "string" }, + "vip": { + "type": "object", + "properties": { + "address": { "type": "string", "format": "ipv4" }, + "link": { "type": "string" }, + "subnet": { "type": "string" }, + "mtu": { "type": "integer" }, + "per_node": { + "type": "object", + "additionalProperties": { "type": "string", "format": "ipv4" } + } + } + } + } + }, + "target": { + "enum": ["bare-metal-talos", "bare-metal-ubuntu", "existing"] + }, + "route": { + "type": "array", + "items": { "enum": ["talos-bootstrap", "ubuntu-bootstrap", "cluster-install"] }, + "minItems": 1 + }, + "sops": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "recipients": { "type": "array", "items": { "type": "string" } }, + "config_path": { "type": "string" } + }, + "required": ["enabled"] + }, + "inventory": { + "type": "object", + "properties": { + "ssh_user": { "type": "string" }, + "ssh_key": { "type": "string" }, + "nodes": { + "type": "array", + "items": { + "type": "object", + "required": ["host", "role"], + "properties": { + "host": { "type": "string" }, + "internal_ip": { "type": "string" }, + "public_ip": { "type": "string" }, + "role": { "enum": ["cp", "worker"] }, + "name": { "type": "string" } + } + } + } + } + }, + "cluster": { + "type": "object", + "properties": { + "context": { "type": "string" }, + "kubeconfig": { "type": "string" }, + "api_endpoint": { "type": "string" }, + "distribution": { "enum": ["k3s", "talos", "kubeadm", "rke2", "managed-eks", "managed-gke", "managed-aks", "managed-doks", "other"] }, + 
"k8s_version": { "type": "string" }, + "talos_version": { "type": "string" }, + "vip": { + "type": "object", + "properties": { + "shared_address": { "type": "string", "format": "ipv4" }, + "link": { "type": "string" }, + "subnet": { "type": "string" }, + "per_node": { + "type": "object", + "additionalProperties": { "type": "string", "format": "ipv4" } + } + } + } + } + }, + "cozystack_intake": { + "type": "object", + "description": "Operator's policy decisions collected by wizard Phase 4. Read by cluster-install Phase 4 to skip re-prompting.", + "properties": { + "bundles": { + "type": "array", + "items": { "enum": ["system", "paas", "iaas", "naas"] } + }, + "installer_variant": { "enum": ["generic", "talos", "hosted"] }, + "platform_variant": { "enum": ["default", "isp-full", "isp-full-generic", "isp-hosted"] }, + "storage_pref": { + "type": "object", + "properties": { + "layout_per_node": { + "type": "object", + "additionalProperties": { "enum": ["single", "mirror", "raidz"] } + }, + "zpool_name": { "type": "string" }, + "linstor_pool_name": { "type": "string" } + } + }, + "network": { + "type": "object", + "properties": { + "pod_cidr": { "type": "string" }, + "service_cidr": { "type": "string" }, + "join_cidr": { "type": "string" }, + "defaults_source": { "type": "string" } + } + }, + "publishing": { + "type": "object", + "properties": { + "host": { "type": "string" }, + "host_kind": { "enum": ["nip.io", "custom-fqdn"] }, + "domain_ownership_confirmed": { "type": "boolean" }, + "cert_solver": { "enum": ["http01", "dns01"] }, + "exposed_services": { + "type": "array", + "items": { "enum": ["api", "dashboard", "vm-exportproxy", "cdi-uploadproxy"] } + }, + "api_server_endpoint": { "type": "string" } + } + }, + "external_ips": { + "type": "object", + "properties": { + "strategy": { "enum": ["internal", "external", "explicit"] }, + "explicit": { "type": "array", "items": { "type": "string" } }, + "reason": { "type": "string" } + }, + "required": ["strategy"] + }, + 
"extractedprism": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "api_host_override": { "type": "string" } + } + } + } + }, + "cozystack": { + "type": "object", + "description": "Discovery + execution outcome written by cluster-install.", + "properties": { + "installer_variant": { "enum": ["generic", "talos", "hosted"] }, + "installer_version": { "type": "string", "description": "Canonical git-tag form, e.g. v1.3.3. cluster-install strips the v before passing to OCI helm --version." }, + "platform_variant": { "enum": ["default", "isp-full", "isp-full-generic", "isp-hosted"] }, + "bundles": { "type": "array", "items": { "type": "string" } }, + "api_server_host": { "type": "string" }, + "api_server_port": { "type": "string" }, + "api_server_source": { "type": "string" }, + "storage": { + "type": "object", + "properties": { + "backend": { "const": "zfs" }, + "linstor_pool": { "type": "string" }, + "nodes": { + "type": "array", + "items": { + "type": "object", + "required": ["name", "zpool"], + "properties": { + "name": { "type": "string" }, + "devices": { "type": "array", "items": { "type": "string" } }, + "layout": { "enum": ["single", "mirror", "raidz"] }, + "zpool": { "type": "string" }, + "linstor_pool": { "type": "string" } + } + } + } + } + }, + "publishing": { + "type": "object", + "properties": { + "host": { "type": "string" }, + "api_endpoint": { "type": "string" }, + "external_ips": { "type": "array", "items": { "type": "string" } }, + "exposure": { "enum": ["externalIPs", "loadBalancer"] }, + "cert_solver": { "enum": ["http01", "dns01"] } + } + } + } + }, + "research_cache": { + "type": "object", + "description": "Cache key for Phase 4.5 active research. 24h TTL keyed on the cozystack_intake combination.", + "properties": { + "cached_at": { "type": "string", "format": "date-time" }, + "key": { "type": "string", "description": "Hash of (cozystack_version, talos_version, platform, variant, bundles)." 
}, + "findings": { "type": "array" } + } + }, + "status": { + "type": "object", + "description": "Per-skill state machine owned by the wizard. Each skill writes exactly one of completed_at OR failed_at plus error string when failing.", + "additionalProperties": { + "type": "object", + "properties": { + "dispatched_at": { "type": "string", "format": "date-time" }, + "completed_at": { "type": "string", "format": "date-time" }, + "failed_at": { "type": "string", "format": "date-time" }, + "error": { "type": "string" }, + "error_detail": { "type": "string" } + }, + "oneOf": [ + { "required": ["completed_at"] }, + { "required": ["failed_at"] }, + { "required": ["dispatched_at"], "not": { "anyOf": [{ "required": ["completed_at"] }, { "required": ["failed_at"] }] } } + ] + } + } + } +} diff --git a/plugins/linstor/.claude-plugin/plugin.json b/plugins/linstor/.claude-plugin/plugin.json new file mode 100644 index 0000000..d8f9742 --- /dev/null +++ b/plugins/linstor/.claude-plugin/plugin.json @@ -0,0 +1,9 @@ +{ + "name": "linstor", + "version": "1.0.0", + "description": "LINSTOR / DRBD operations bundle for Kubernetes. Skills, invoked as linstor:<skill>: recover (diagnose and recover broken DRBD resources — handles StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors, and other common failure modes). 
Useful on any Kubernetes cluster that runs piraeus-operator / LINSTOR, not just on Cozystack.", + "author": { + "name": "Cozystack", + "url": "https://github.com/cozystack" + } +} diff --git a/skills/drbd-recovery/skills/drbd-recovery/SKILL.md b/plugins/linstor/skills/recover/SKILL.md similarity index 87% rename from skills/drbd-recovery/skills/drbd-recovery/SKILL.md rename to plugins/linstor/skills/recover/SKILL.md index 85f5e31..fdf4e27 100644 --- a/skills/drbd-recovery/skills/drbd-recovery/SKILL.md +++ b/plugins/linstor/skills/recover/SKILL.md @@ -1,9 +1,9 @@ --- -name: drbd-recovery +name: recover description: Diagnose and recover DRBD/LINSTOR storage issues in Kubernetes clusters — handles StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors, and other common failure modes. Use when `linstor r l --faulty` shows broken resources or nodes have `drbd.linbit.com/lost-quorum` taints. --- -# DRBD/LINSTOR Recovery Skill +# linstor:recover Specialized skill for diagnosing and recovering DRBD/LINSTOR storage issues in Kubernetes clusters. @@ -19,6 +19,10 @@ Specialized skill for diagnosing and recovering DRBD/LINSTOR storage issues in K ## Core Principles +0. **Match the operator's natural language.** Detect from prior conversation messages. Use that language in every prompt, AskUserQuestion option, summary, and gate. Never ask "what language?" separately. Code identifiers, `linstor` / `drbdadm` commands, file paths, and GitHub-public text stay in their canonical form (usually English). +0a. **One valid path → just do it.** When the diagnostic graph picks one resource to fix and one operation that's not destructive (re-attach, re-promote, refresh), the skill runs it without "ok to do this?" friction. The dangerous-operations gate at principle 8 still applies — `linstor node lost`, deleting the last replica, `drbdadm down` on InUse/Primary, `--discard-my-data` with one diskful copy, `drbdadm create-md --force` — those always ask. 
Safe single-path operations don't. +0b. **Front-load the interview.** Read the full diagnostic graph first (`linstor r l --faulty`, `linstor n l`, `drbdadm status` on each affected node, `linstor error-reports list`), then present a single recovery plan with the ordered operations, classifications (safe / dangerous), and the dangerous-operation approvals batched into one approval screen. Operator either approves the lot or names what to skip. Phases of the recovery run uninterrupted against the collected approvals. +0c. **Layer-pure operator output.** The skill never says "returning control to wizard" or makes any orchestration commentary in the **operator-facing** summary. linstor:recover is a standalone skill that nobody auto-dispatches today, but even if it ever gets called from a chain, whoever invoked it figures out what's next on their own. 1. **Work one resource at a time.** On mass incidents, resist the urge to fix everything at once. Serial, monotonic recovery is safer. 2. **Always verify on the node itself.** LINSTOR's view can be stale or wrong. `drbdadm status` on the satellite is the source of truth. 3. **Preserve UpToDate replicas.** Never touch the source-of-truth replica first. Fix broken copies by working outward from the healthy one. @@ -44,7 +48,7 @@ linstor r l --faulty | grep -oP '(UpToDate|Outdated|Inconsistent|StandAlone|Conn Check node taints: ```bash -kubectl get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key' +kubectl --context $CTX get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key' ``` Find which resources block quorum: @@ -64,7 +68,7 @@ linstor r lv -r <resource> ```bash # Enter satellite on the problem node: -kubectl exec -ti -n cozy-linstor ds/linstor-satellite.<node> -c linstor-satellite -- bash +kubectl --context $CTX exec -ti -n cozy-linstor ds/linstor-satellite.<node>
-c linstor-satellite -- bash # Check DRBD kernel state (most reliable): drbdadm status diff --git a/skills/cozy-bump/.claude-plugin/plugin.json b/skills/cozy-bump/.claude-plugin/plugin.json deleted file mode 100644 index 046c690..0000000 --- a/skills/cozy-bump/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "cozy-bump", - "version": "1.0.0", - "description": "Bump a cozystack monorepo package — detects upstream source, fetches changelog between current and target versions, surfaces breaking changes / deprecations / new required keys, adapts values.yaml and templates, regenerates schema and ApplicationDefinition, runs helm template + lint, commits with Conventional-Commit message, and optionally deploys to a dev cluster via cozyhr suspend + make apply with ttl.sh ephemeral image registry", - "author": { - "name": "Cozystack", - "url": "https://github.com/cozystack" - } -} diff --git a/skills/cozy-deploy/.claude-plugin/plugin.json b/skills/cozy-deploy/.claude-plugin/plugin.json deleted file mode 100644 index 1beac27..0000000 --- a/skills/cozy-deploy/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "cozy-deploy", - "version": "1.0.0", - "description": "Deploy a Cozystack package to a dev cluster via make + cozyhr — handles fresh install and dev-loop iteration with ExternalArtifact support", - "author": { - "name": "Cozystack", - "url": "https://github.com/cozystack" - } -} diff --git a/skills/cozy-external-app/.claude-plugin/plugin.json b/skills/cozy-external-app/.claude-plugin/plugin.json deleted file mode 100644 index b045f07..0000000 --- a/skills/cozy-external-app/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "cozy-external-app", - "version": "1.0.0", - "description": "Scaffold a new Cozystack external app package — generates chart skeleton, ApplicationDefinition, and handles dependency integration (e.g. 
Immich → Postgres) via managed CNPG clusters or external secret references", - "author": { - "name": "Cozystack", - "url": "https://github.com/cozystack" - } -} diff --git a/skills/cozystack-upgrade/.claude-plugin/plugin.json b/skills/cozystack-upgrade/.claude-plugin/plugin.json deleted file mode 100644 index 1e5c65c..0000000 --- a/skills/cozystack-upgrade/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "cozystack-upgrade", - "version": "1.0.1", - "description": "Guided upgrade of a running Cozystack v1.x cluster to a newer v1.x patch or minor version — release-notes analysis, prechecks, stop gates, helm upgrade, targeted post-upgrade verification, known failure recovery", - "author": { - "name": "Cozystack", - "url": "https://github.com/cozystack" - } -} diff --git a/skills/drbd-recovery/.claude-plugin/plugin.json b/skills/drbd-recovery/.claude-plugin/plugin.json deleted file mode 100644 index c7424bc..0000000 --- a/skills/drbd-recovery/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "drbd-recovery", - "version": "1.0.0", - "description": "Diagnose and recover DRBD/LINSTOR storage issues in Kubernetes clusters — handles StandAlone, DELETING, Inconsistent, Diskless, quorum loss, bitmap errors, and other common failure modes. Use when `linstor r l --faulty` shows broken resources or nodes have `drbd.linbit.com/lost-quorum` taints.", - "author": { - "name": "Cozystack", - "url": "https://github.com/cozystack" - } -} diff --git a/tools/check-refs.sh b/tools/check-refs.sh new file mode 100755 index 0000000..5116002 --- /dev/null +++ b/tools/check-refs.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash +# Cross-reference validator for the CCP plugin tree. +# +# Catches the class of bug that landed six blockers in one branch-review: +# a string in one file no longer matching reality in another file +# (renamed skill, deleted reference doc, marketplace description out of sync). +# +# Checks performed: +# 1. 
Every `references/<name>.md` mentioned in a SKILL.md exists on disk.
+#   2. Every `/<plugin>:<skill>` or `cozystack:<skill>` / `linstor:<skill>`
+#      mention in any SKILL.md or reference doc resolves to a real directory
+#      under `plugins/<plugin>/skills/<skill>/`.
+#   3. Every plugin's `description` (in both `.claude-plugin/plugin.json` and
+#      the matching `plugins[]` entry in `.claude-plugin/marketplace.json`)
+#      mentions every skill name that exists under `plugins/<plugin>/skills/`.
+#   4. Every kubectl / helm command line in plugins/**/*.md passes --context / --kube-context.
+#   5. No denylisted private cluster name appears in plugins/, README.md or CLAUDE.md.
+# Exit code: 0 on success, 1 on any violation, 2 if jq is missing.
+# Run locally before commit, and as a CI gate (.github/workflows/validate.yml).
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+if ! command -v jq >/dev/null 2>&1; then
+  echo "FATAL: jq is required" >&2
+  exit 2
+fi
+
+errors=0
+
+err() {
+  echo "ERROR: $*" >&2
+  errors=$((errors + 1))
+}
+
+# ---------------------------------------------------------------- Check 1: references/<name>.md exist
+echo "==> Check 1: references/<name>.md mentions resolve"
+while IFS= read -r skill_md; do
+  skill_dir="$(dirname "$skill_md")"
+  # grep references/...md mentions; tolerate backticks and bare; strip trailing .md
+  # mentions look like `references/<name>.md` or references/<name>.md
+  while IFS= read -r ref; do
+    # ref like "references/foo.md"
+    target="$skill_dir/$ref"
+    if [ ! -f "$target" ]; then
+      err "$skill_md references missing file: $ref (looked at $target)"
+    fi
+  done < <(
+    # Match `references/<name>.md` ONLY when preceded by start-of-line,
+    # whitespace, backtick, or open-paren — never as a tail of a longer
+    # path like `cluster-install/references/foo.md` (that's a cross-skill
+    # reference, not a sibling-references claim).
+    grep -hoE '(^|[[:space:]`(])references/[a-zA-Z0-9_-]+\.md' "$skill_md" 2>/dev/null \
+      | sed -E 's#^[[:space:]`(]##' \
+      | sort -u
+  )
+done < <(find plugins -name SKILL.md -type f)
+
+# ---------------------------------------------------------------- Check 2: plugin:skill mentions resolve
+echo "==> Check 2: /<plugin>:<skill> mentions resolve to real directories"
+
+# Build a set of known plugin:skill identifiers from the filesystem.
+known_skills=$(mktemp)
+trap 'rm -f "$known_skills"' EXIT
+for plugin_dir in plugins/*/; do
+  plugin=$(basename "$plugin_dir")
+  if [ -d "$plugin_dir/skills" ]; then
+    for skill_dir in "$plugin_dir"skills/*/; do
+      [ -d "$skill_dir" ] || continue
+      skill=$(basename "$skill_dir")
+      echo "$plugin:$skill" >> "$known_skills"
+    done
+  fi
+done
+
+# Find every plugin:skill mention in any text file under plugins/ and check.
+while IFS= read -r mention; do
+  if ! grep -Fxq "$mention" "$known_skills"; then
+    # Best-effort: first file mentioning it; `|| true` so set -e/pipefail can't abort the run here.
+    offending=$(grep -rlF "$mention" plugins/ README.md CLAUDE.md 2>/dev/null | head -1 || true)
+    err "Unknown skill identifier '$mention' (mentioned in ${offending:-?})"
+  fi
+done < <(
+  # Look for /cozystack:..., /linstor:..., bare cozystack:..., linstor:... mentions.
+  # Also catch markdown-bold like **cozystack:foo** by stripping ** before grep.
+  {
+    grep -rhEo '/?(cozystack|linstor):[a-zA-Z0-9_-]+' plugins/ README.md CLAUDE.md 2>/dev/null \
+      | sed -E 's#^/##; s#\*+##g' \
+      | grep -E '^(cozystack|linstor):' \
+      || true
+  } | sort -u
+)
+
+# ---------------------------------------------------------------- Check 3: descriptions mention every skill
+echo "==> Check 3: plugin descriptions list every skill"
+for plugin_dir in plugins/*/; do
+  plugin=$(basename "$plugin_dir")
+  plugin_json="$plugin_dir.claude-plugin/plugin.json"
+  [ -f "$plugin_json" ] || continue
+
+  # collect skill names from filesystem
+  skills_on_disk=()
+  if [ -d "$plugin_dir/skills" ]; then
+    while IFS= read -r skill_dir; do
+      skills_on_disk+=("$(basename "$skill_dir")")
+    done < <(find "$plugin_dir/skills" -mindepth 1 -maxdepth 1 -type d | sort)
+  fi
+  [ "${#skills_on_disk[@]}" -gt 0 ] || continue
+
+  plugin_desc=$(jq -r '.description' "$plugin_json")
+  marketplace_desc=$(jq -r --arg name "$plugin" \
+    '.plugins[] | select(.name == $name) | .description' \
+    .claude-plugin/marketplace.json)
+
+  for skill in "${skills_on_disk[@]}"; do
+    if ! printf '%s' "$plugin_desc" | grep -Fq "$skill"; then
+      err "plugin.json for '$plugin' description omits skill '$skill'"
+    fi
+    if ! printf '%s' "$marketplace_desc" | grep -Fq "$skill"; then
+      err "marketplace.json description for plugin '$plugin' omits skill '$skill'"
+    fi
+  done
+done
+
+# ---------------------------------------------------------------- Check 4: kubectl / helm --context discipline
+# Every `kubectl ` or `helm ` invocation in any plugins/**/*.md must pass
+# --context $CTX (kubectl) or --kube-context $CTX (helm), unless it's
+# explicitly context-less (current-context probe, client-version probe,
+# auth can-i which is by definition cluster-bound but operator-driven), OR
+# the line is tagged `# noverify-context` for genuinely prose mentions.
+echo "==> Check 4: kubectl / helm calls pass --context / --kube-context"
+
+# Allow-list: substrings that, when present on the same line, exempt it.
+# Most cover non-cluster-bound subcommands (helm repo / pull / show / template /
+# lint do not need a kube-context; kubectl config does not target a cluster).
+# `--kubeconfig ` is an equivalent of `--context` and is allowed too.
+declare -a allow_substr=(
+  # kubectl non-cluster
+  'kubectl config'            # current-context, get-contexts, use-context
+  'kubectl version --client'
+  'kubectl auth can-i'        # operator's own auth check, no cluster mutation
+  'kubectl krew'              # plugin install
+  'kubectl kc '               # kubecm plugin
+  # kubectl with --kubeconfig (equivalent to --context, allow)
+  '--kubeconfig '
+  # helm non-cluster subcommands — registry / repo / template-level
+  'helm repo '
+  'helm search '
+  'helm pull '
+  'helm push '
+  'helm show '
+  'helm template '
+  'helm lint '
+  'helm dep'
+  'helm package '
+  'helm registry '
+  'helm verify '
+  'helm version'
+  'helm env'
+  # explicit per-line override
+  'noverify-context'
+)
+
+# Helm subcommands that ARE cluster-bound — only flag bare helm for these.
+helm_cluster_re='helm (install|upgrade|uninstall|rollback|status|get|list|history|test)( |$)'
+
+while IFS= read -r line_with_path; do
+  # path:lineno:content (grep -H below guarantees the path prefix even for a one-file batch)
+  file="${line_with_path%%:*}"
+  rest="${line_with_path#*:}"
+  lineno="${rest%%:*}"
+  content="${rest#*:}"
+
+  skip=0
+  for allow in "${allow_substr[@]}"; do
+    if printf '%s' "$content" | grep -Fq -- "$allow"; then skip=1; break; fi
+  done
+  [ "$skip" -eq 1 ] && continue
+
+  # kubectl path: any `kubectl` invocation that survived allowlist needs --context
+  if printf '%s' "$content" | grep -qE '^[[:space:]]*kubectl '; then
+    if ! printf '%s' "$content" | grep -qE -- '--context([^A-Za-z0-9_-]|$)'; then
+      err "$file:$lineno bare kubectl without --context: $(printf '%s' "$content" | sed -E 's/^ +//' | cut -c1-100)"
+    fi
+  fi
+  # helm path: only flag cluster-bound subcommands
+  if printf '%s' "$content" | grep -qE "$helm_cluster_re"; then
+    if ! printf '%s' "$content" | grep -qE -- '--kube-context([^A-Za-z0-9_-]|$)'; then
+      err "$file:$lineno bare helm without --kube-context: $(printf '%s' "$content" | sed -E 's/^ +//' | cut -c1-100)"
+    fi
+  fi
+done < <(
+  # Only catch actual command invocations — line starts (optionally after
+  # whitespace) with `kubectl ` or `helm `. This intentionally ignores:
+  #   - inline prose: "`kubectl apply` fails with ..."
+  #   - markdown table cells: "| kubectl get nodes | ..."
+  #   - comments: "# kubectl get pods"
+  #   - shell pipelines on continuation lines (rare; can be tagged with
+  #     noverify-context if needed)
+  find plugins -name '*.md' -type f -print0 \
+    | xargs -0 grep -HnE '^[[:space:]]*(kubectl|helm) ' \
+    | grep -vE '^[^:]+:[0-9]+:[[:space:]]*#'
+)
+
+# ---------------------------------------------------------------- Check 5: private cluster names denylist
+echo "==> Check 5: no private cluster names in plugin / public content"
+
+# Built-in denylist; extensible via CCP_PRIVATE_NAMES (comma-separated).
+default_private="dev6,dev9,dev17,instories,homelab"
+private_names="${CCP_PRIVATE_NAMES:-$default_private}"
+
+IFS=',' read -ra denylist <<< "$private_names"
+for name in "${denylist[@]}"; do
+  name="$(echo "$name" | tr -d '[:space:]')"
+  [ -z "$name" ] && continue
+  # Word-boundary search to avoid substring false positives; -H keeps the file: prefix on every hit.
+  while IFS= read -r hit; do
+    # Skip matches inside the validator itself (where the denylist is
+    # defined) — checked by path prefix.
+    file="${hit%%:*}"
+    case "$file" in
+      tools/check-refs.sh) continue ;;
+    esac
+    err "private cluster name '$name' in $hit"
+  done < <(
+    find plugins README.md CLAUDE.md -type f \( -name '*.md' -o -name '*.json' \) -print0 2>/dev/null \
+      | xargs -0 grep -HnE "(^|[^A-Za-z0-9_])${name}([^A-Za-z0-9_]|$)" 2>/dev/null \
+      || true
+  )
+done
+
+# ---------------------------------------------------------------- Result
+if [ "$errors" -gt 0 ]; then
+  echo ""
+  echo "FAIL: $errors cross-reference violation(s)." >&2
+  exit 1
+fi
+
+echo ""
+echo "OK: all cross-references valid."