Skip to content
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ E2E_CONF_FILE_SOURCE ?= $(shell pwd)/test/e2e/config/cloudscale.yaml
E2E_CONF_FILE ?= $(shell pwd)/test/e2e/config/cloudscale.generated.yaml
E2E_ARTIFACTS_FOLDER ?= $(shell pwd)/_artifacts
E2E_TEMPLATES := test/e2e/data/infrastructure-cloudscale
GINKGO_TIMEOUT ?= 2h
GINKGO_TIMEOUT ?= 3h
GINKGO_NODES ?= 1
SKIP_RESOURCE_CLEANUP ?= false
USE_EXISTING_CLUSTER ?= false
Expand Down Expand Up @@ -147,7 +147,8 @@ generate-e2e-config: ## Generate e2e config from template by resolving environme

.PHONY: test-e2e
test-e2e: TAG = $(E2E_TAG)
test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests
test-e2e: KUBETEST_CONFIGURATION = ./data/kubetest/conformance-fast.yaml
test-e2e: $(GINKGO) generate-e2e-templates generate-e2e-config docker-build docker-push ## Run all e2e tests (uses conformance-fast; for full conformance run test-e2e-conformance separately)
$(GINKGO) -v --trace --tags=e2e \
--nodes=$(GINKGO_NODES) \
--timeout=$(GINKGO_TIMEOUT) \
Expand Down
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,16 @@ clusterctl generate cluster my-cluster \

| Flavor | Network | CP Endpoint | Node Connectivity | Extra Env Vars | Notes |
|---------------------------|---------------------------|-----------------------|-------------------|---------------------------|----------------------|
| *(default)* | Managed (`10.100.0.0/24`) | Public LB (DualStack) | Public + cluster | — | |
| *(default)* | Managed (`172.18.0.0/24`) | Public LB (DualStack) | Public + cluster | — | |
| `fip` | Pre-Existing | Floating IP (IPv4) | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | |
| `public-lb-private-nodes` | Pre-Existing + NAT | Public LB | Private only | `CLOUDSCALE_NETWORK_UUID` | Requires NAT gateway |
| `pre-existing-network` | Pre-Existing | Public LB (DualStack) | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | |

The default `networks[].cidr` is `172.18.0.0/24` so that it does not overlap with the default Cilium
cluster-pool IPAM range `10.0.0.0/8`. If you override `networks[].cidr` with a range inside
`10.0.0.0/8`, make sure to configure your CNI's IP range so that the two do not overlap. Overlapping
ranges may break, for example, the control-plane LB's health checks.

## Development

This is a kubebuilder-scaffolded project. For new APIs, Webhooks, etc. [kubebuilder](https://book.kubebuilder.io/)
Expand Down Expand Up @@ -119,14 +124,14 @@ filtering and are split into suites of increasing cost, scheduled accordingly:

| Suite | Label | Description | ~Duration | Schedule | Make target |
|-------------------------|---------------------------|------------------------------------------------------------------------------------------|-----------|----------|------------------------------------|
| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | < 5 min | Nightly | `test-e2e-lifecycle` |
| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | < 10 min | Weekly | `test-e2e-ha` |
| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.34 → v1.35) | < 10 min | Weekly | `test-e2e-upgrade` |
| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | < 15 min | Weekly | `test-e2e-self-hosted` |
| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | < 10 min | Weekly | `test-e2e-md-remediation` |
| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | < 10 min | Weekly | `test-e2e-pre-existing-networking` |
| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | < 60 min | Weekly | `test-e2e-conformance-fast` |
| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | < 120 min | Biweekly | `test-e2e-conformance` |
| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | ~5 min | Nightly | `test-e2e-lifecycle` |
| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | ~8 min | Weekly | `test-e2e-ha` |
| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.34 → v1.35) | ~25 min | Weekly | `test-e2e-upgrade` |
| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | ~13 min | Weekly | `test-e2e-self-hosted` |
| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | ~6 min | Weekly | `test-e2e-md-remediation` |
| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | ~30 min | Weekly | `test-e2e-pre-existing-networking` |
| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | ~55 min | Weekly | `test-e2e-conformance-fast` |
| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | ~120 min | Biweekly | `test-e2e-conformance` |

Durations are approximate and taken from a real CI run; conformance duration varies with cluster size.

Expand Down
6 changes: 0 additions & 6 deletions api/v1beta2/cloudscalecluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,6 @@ type LoadBalancerSpec struct {
// +optional
Network string `json:"network,omitempty"`

// IPFamily specifies the IP family for the LB VIP address(es).
// +kubebuilder:validation:Enum=IPv4;IPv6;DualStack
// +kubebuilder:default=DualStack
// +optional
IPFamily IPFamily `json:"ipFamily,omitempty"`

// HealthMonitor configures the load balancer health monitor.
// +optional
HealthMonitor HealthMonitorSpec `json:"healthMonitor,omitempty"`
Expand Down
54 changes: 35 additions & 19 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"crypto/tls"
"flag"
"fmt"
"net/http"
"os"
"time"

Expand Down Expand Up @@ -70,6 +71,8 @@ func main() {
var probeAddr string
var secureMetrics bool
var enableHTTP2 bool
var clusterConcurrency int
var machineConcurrency int
var watchFilter string
var tlsOpts []func(*tls.Config)
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
Expand All @@ -89,6 +92,10 @@ func main() {
flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.")
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
flag.IntVar(&clusterConcurrency, "cluster-concurrency", 1,
"Maximum concurrent reconciles for CloudscaleCluster controller (1-4)")
flag.IntVar(&machineConcurrency, "machine-concurrency", 1,
"Maximum concurrent reconciles for CloudscaleMachine controller (1-10)")
flag.StringVar(&watchFilter, "watch-filter", "",
fmt.Sprintf("Label value that the controller watches to reconcile cluster-api objects. Label key is always %s. "+
"If unspecified, the controller watches for all cluster-api objects.", clusterv1.WatchLabel))
Expand All @@ -100,6 +107,17 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

if clusterConcurrency < 1 || clusterConcurrency > 4 {
setupLog.Error(
fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag")
os.Exit(1)
}
if machineConcurrency < 1 || machineConcurrency > 10 {
setupLog.Error(
fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag")
os.Exit(1)
}

// if the enable-http2 flag is false (the default), http/2 should be disabled
// due to its vulnerabilities. More specifically, disabling http/2 will
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
Expand Down Expand Up @@ -174,16 +192,6 @@ func main() {
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "cloudscale.infrastructure.cluster.x-k8s.io",
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
// when the Manager ends. This requires the binary to immediately end when the
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
// speeds up voluntary leader transitions as the new leader don't have to wait
// LeaseDuration time first.
//
// In the default scaffold provided, the program ends immediately after
// the manager stops, so would be fine to enable this option. However,
// if you are doing or is intended to do any operation such as perform cleanups
// after the manager stops then its usage might be unsafe.
// LeaderElectionReleaseOnCancel: true,
})
if err != nil {
Expand All @@ -193,8 +201,12 @@ func main() {

ctx := ctrl.SetupSignalHandler()

// Create a shared HTTP transport for all cloudscale API clients.
// This enables connection pooling and HTTP/2 multiplexing across reconciles.
transport := cloudscale.NewTransport()

// Fetch region information for controllers and webhooks
regionInfo, flavorInfo, err := fetchAPIInfo()
regionInfo, flavorInfo, err := fetchAPIInfo(transport)
if err != nil {
setupLog.Error(err, "unable to fetch API information")
os.Exit(1)
Expand All @@ -203,17 +215,21 @@ func main() {
setupLog.Info("fetched flavor information", "flavors", len(flavorInfo.GetAllFlavors()))

if err := (&controller.CloudscaleClusterReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
WatchFilter: watchFilter,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
WatchFilter: watchFilter,
Transport: transport,
MaxConcurrentReconciles: clusterConcurrency,
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleCluster")
os.Exit(1)
}
if err := (&controller.CloudscaleMachineReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
WatchFilter: watchFilter,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
WatchFilter: watchFilter,
Transport: transport,
MaxConcurrentReconciles: machineConcurrency,
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachine")
os.Exit(1)
Expand Down Expand Up @@ -263,7 +279,7 @@ func main() {

// fetchAPIInfo fetches region and flavor information from cloudscale.ch API.
// Requires CLOUDSCALE_API_TOKEN environment variable.
func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) {
func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) {
token := os.Getenv("CLOUDSCALE_API_TOKEN")
if token == "" {
return nil, nil, fmt.Errorf("CLOUDSCALE_API_TOKEN environment variable is required")
Expand All @@ -272,7 +288,7 @@ func fetchAPIInfo() (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

client := cloudscale.NewClient(token)
client := cloudscale.NewClient(token, transport)

var regionInfo *cloudscale.RegionInfo
var flavorInfo *cloudscale.FlavorInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,19 +139,6 @@ spec:
minimum: 1
type: integer
type: object
ipFamily:
allOf:
- enum:
- IPv4
- IPv6
- DualStack
- enum:
- IPv4
- IPv6
- DualStack
default: DualStack
description: IPFamily specifies the IP family for the LB VIP address(es).
type: string
network:
description: |-
Network places the LB VIP on a private network (internal LB).
Expand Down
11 changes: 11 additions & 0 deletions config/e2e/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Kustomize overlay (config/e2e): reuses everything from ../default and applies
# one patch file to the manager Deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Base configuration this overlay builds on.
resources:
- ../default

# manager_concurrency_patch.yaml is applied only to the Deployment selected
# by the target below (kind Deployment, name controller-manager).
# NOTE(review): if ../default adds a name prefix, confirm this target name
# still selects the manager Deployment.
patches:
- path: manager_concurrency_patch.yaml
target:
kind: Deployment
name: controller-manager
6 changes: 6 additions & 0 deletions config/e2e/manager_concurrency_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# JSON-patch operations that append two flags to the args of the first
# container (index 0) in the pod template. The trailing "-" in each path
# means "append to the end of the array".
# NOTE(review): "add" at ".../args/-" presumably requires the args list to
# already exist on that container — confirm against the base manifest.

# Append --cluster-concurrency=4.
- op: add
path: /spec/template/spec/containers/0/args/-
value: --cluster-concurrency=4
# Append --machine-concurrency=10.
- op: add
path: /spec/template/spec/containers/0/args/-
value: --machine-concurrency=10
79 changes: 76 additions & 3 deletions internal/cloudscale/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ limitations under the License.
package cloudscale

import (
"context"
"errors"
"net"
"net/http"
"net/url"
"os"
"strings"
"time"

cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
"golang.org/x/oauth2"
Expand All @@ -40,9 +44,61 @@ type Client struct {
Flavors FlavorService
}

func NewClient(token string) *Client {
const (
// ReadTimeout is the context timeout for Get/List API calls.
ReadTimeout = 10 * time.Second

// WriteTimeout is the context timeout for Create/Update API calls.
// Creates can take 60s+ under API load.
WriteTimeout = 2 * time.Minute

// DeleteTimeout is the context timeout for Delete API calls.
DeleteTimeout = 1 * time.Minute
)

// NewTransport creates an http.Transport configured for the cloudscale.ch API.
// The returned transport should be created once and shared across all clients
// to benefit from connection pooling and HTTP/2 multiplexing.
func NewTransport() *http.Transport {
return &http.Transport{
DialContext: (&net.Dialer{
Timeout: 5 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,

TLSHandshakeTimeout: 5 * time.Second,

// needs to be set because we also set DialContext
ForceAttemptHTTP2: true,
HTTP2: &http.HTTP2Config{
SendPingTimeout: 5 * time.Second,
PingTimeout: 3 * time.Second,
},

IdleConnTimeout: 90 * time.Second,
MaxIdleConns: 50,
MaxIdleConnsPerHost: 50,
MaxConnsPerHost: 0,
}
}

// NewClient creates a cloudscale.ch API client using the given token and
// shared transport. The transport should be created once via NewTransport()
// and reused across clients. Each client gets its own oauth2 token injection
// but shares the underlying connection pool.
//
// No global HTTP timeout is set on the client. Instead, callers must use
// context.WithTimeout with ReadTimeout, WriteTimeout, or DeleteTimeout
// for each API call.
func NewClient(token string, transport *http.Transport) *Client {
tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token})
httpClient := oauth2.NewClient(context.Background(), tokenSource)

httpClient := &http.Client{
Transport: &oauth2.Transport{
Source: tokenSource,
Base: transport,
},
}
sdkClient := cloudscalesdk.NewClient(httpClient)

return &Client{
Expand Down Expand Up @@ -82,3 +138,20 @@ func IsFloatingIPNoPublicInterface(err error) bool {
}
return strings.Contains(err.Error(), "does not have a public interface with an IPv4 address")
}

// IsTimeoutError reports whether err indicates the HTTP request timed out
// before receiving a response.
func IsTimeoutError(err error) bool {
if err == nil {
return false
}

var urlErr *url.Error
if errors.As(err, &urlErr) {
return urlErr.Timeout()
}
if errors.Is(err, os.ErrDeadlineExceeded) {
return true
}
return false
}
50 changes: 50 additions & 0 deletions internal/cloudscale/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package cloudscale

import (
"fmt"
"net/url"
"os"
"testing"

cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8"
Expand Down Expand Up @@ -46,3 +48,51 @@ func TestIsNotFound(t *testing.T) {
})
}
}

// TestIsTimeoutError runs IsTimeoutError over a table of errors: nil, plain
// and wrapped *url.Error values with and without a deadline cause, a bare
// os.ErrDeadlineExceeded, a generic error and an SDK ErrorResponse.
func TestIsTimeoutError(t *testing.T) {
	const serversURL = "https://api.example.com/v1/servers"

	cases := []struct {
		name string
		err  error
		want bool
	}{
		{
			name: "nil error returns false",
			err:  nil,
			want: false,
		},
		{
			name: "url.Error with Timeout=true returns true",
			err:  &url.Error{Op: "Post", URL: serversURL, Err: os.ErrDeadlineExceeded},
			want: true,
		},
		{
			name: "url.Error with Timeout=false returns false",
			err:  &url.Error{Op: "Get", URL: serversURL, Err: fmt.Errorf("connection refused")},
			want: false,
		},
		{
			name: "wrapped url.Error with Timeout=true returns true",
			err:  fmt.Errorf("outer: %w", &url.Error{Op: "Post", URL: serversURL, Err: os.ErrDeadlineExceeded}),
			want: true,
		},
		{
			name: "os.ErrDeadlineExceeded returns true",
			err:  os.ErrDeadlineExceeded,
			want: true,
		},
		{
			name: "generic error returns false",
			err:  fmt.Errorf("some other error"),
			want: false,
		},
		{
			name: "ErrorResponse with 500 returns false",
			err:  &cloudscalesdk.ErrorResponse{StatusCode: 500},
			want: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			NewWithT(t).Expect(IsTimeoutError(tc.err)).To(Equal(tc.want))
		})
	}
}
Loading