Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions hcloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/syself/hetzner-cloud-controller-manager/internal/annotation"
"github.com/syself/hetzner-cloud-controller-manager/internal/credentials"
"github.com/syself/hetzner-cloud-controller-manager/internal/hcops"
robotclient "github.com/syself/hetzner-cloud-controller-manager/internal/robot/client"
hrobot "github.com/syself/hrobot-go"
"github.com/syself/hrobot-go/models"
corev1 "k8s.io/api/core/v1"
Expand All @@ -44,7 +45,15 @@ type testEnv struct {
Server *httptest.Server
Mux *http.ServeMux
Client *hcloud.Client
RobotClient hrobot.RobotClient
RobotClient robotclient.Client
}

type testRobotClient struct {
hrobot.RobotClient
}

func (c testRobotClient) ServerGetListForceRefresh() ([]models.Server, error) {
return c.ServerGetList()
}

func (env *testEnv) Teardown() {
Expand All @@ -70,7 +79,7 @@ func newTestEnv() testEnv {
Server: server,
Mux: mux,
Client: client,
RobotClient: robotClient,
RobotClient: testRobotClient{RobotClient: robotClient},
}
}

Expand Down
35 changes: 34 additions & 1 deletion hcloud/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package hcloud
import (
"context"
"fmt"
"sync"

"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/syself/hetzner-cloud-controller-manager/internal/legacydatacenter"
Expand All @@ -44,12 +45,22 @@ type instances struct {
robotClient robotclient.Client
addressFamily addressFamily
networkID int64

robotMissMu sync.Mutex
// robotMissByName counts repeated misses for young bare-metal nodes by name.
robotMissByName map[string]int
}

var errServerNotFound = fmt.Errorf("server not found")

func newInstances(client *hcloud.Client, robotClient robotclient.Client, addressFamily addressFamily, networkID int64) *instances {
return &instances{client, robotClient, addressFamily, networkID}
return &instances{
client: client,
robotClient: robotClient,
addressFamily: addressFamily,
networkID: networkID,
robotMissByName: make(map[string]int),
}
}

// lookupServer attempts to locate the corresponding hcloud.Server or models.Server (robot server) for a given v1.Node.
Expand Down Expand Up @@ -95,11 +106,33 @@ func (i *instances) lookupServer(
if err != nil {
return nil, nil, false, fmt.Errorf("failed to get robot server %q: %w", string(node.Name), err)
}
i.trackRobotServerByNameMiss(node, bmServer)
}
}
return hcloudServer, bmServer, isHCloudServer, nil
}

// trackRobotServerByNameMiss remembers repeated misses for young bare-metal nodes and
// emits a warning on the second miss to surface unexpected stale-cache behavior.
func (i *instances) trackRobotServerByNameMiss(node *corev1.Node, bmServer *models.Server) {
if node == nil || node.Name == "" {
return
}

i.robotMissMu.Lock()
defer i.robotMissMu.Unlock()

if bmServer != nil || !isYoungNode(node) {
delete(i.robotMissByName, string(node.Name))
return
}

i.robotMissByName[string(node.Name)]++
if i.robotMissByName[string(node.Name)] == 2 {
klog.Warningf("young node %q still missing in robot after %d lookup misses", node.Name, i.robotMissByName[string(node.Name)])
}
}

func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) {
const op = "hcloud/instancesv2.InstanceExists"
metrics.OperationCalled.WithLabelValues(op).Inc()
Expand Down
146 changes: 146 additions & 0 deletions hcloud/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,24 @@ limitations under the License.
package hcloud

import (
"bytes"
"context"
"encoding/json"
"net"
"net/http"
"reflect"
"strings"
"testing"
"time"

"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/hetznercloud/hcloud-go/v2/hcloud/schema"
"github.com/syself/hetzner-cloud-controller-manager/internal/robot/client/cache"
"github.com/syself/hrobot-go/models"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"
)

// TestInstances_InstanceExists also tests [lookupServer]. The other tests
Expand Down Expand Up @@ -173,6 +178,147 @@ func TestInstances_InstanceExists(t *testing.T) {
}
}

func TestInstances_InstanceExistsRobotServerCreatedAfterCacheFill(t *testing.T) {
env := newTestEnv()
defer env.Teardown()

resetEnv := Setenv(t,
"ROBOT_USER_NAME", "user",
"ROBOT_PASSWORD", "pass",
"CACHE_TIMEOUT", "1h",
)
defer resetEnv()

// servers backs the Robot list response and is mutated during the test.
servers := make([]models.Server, 0, 2)
servers = append(servers, models.Server{
ServerIP: "123.123.123.123",
ServerIPv6Net: "2a01:f48:111:4221::",
ServerNumber: 321,
Name: "bm-existing",
})
env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) {
responses := make([]models.ServerResponse, 0, len(servers))
for _, server := range servers {
responses = append(responses, models.ServerResponse{Server: server})
}
json.NewEncoder(w).Encode(responses)
})

robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot")
if err != nil {
t.Fatalf("Unexpected error creating cached robot client: %v", err)
}

instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0)
// creationTime keeps the test nodes inside the young-node refresh window.
creationTime := metav1.NewTime(time.Now())

// Warm the cache while bm-new does not exist yet.
exists, err := instances.InstanceExists(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "bm-existing",
CreationTimestamp: creationTime,
},
})
if err != nil {
t.Fatalf("Unexpected error warming cache: %v", err)
}
if !exists {
t.Fatal("Expected bm-existing to exist")
}

servers = append(servers, models.Server{
ServerIP: "123.123.123.124",
ServerIPv6Net: "2a01:f48:111:4222::",
ServerNumber: 322,
Name: "bm-new",
})

exists, err = instances.InstanceExists(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "bm-new",
CreationTimestamp: creationTime,
},
})
if err != nil {
t.Fatalf("Unexpected error for bm-new: %v", err)
}
if !exists {
t.Fatal("Expected bm-new to exist after it was created")
}
}

func TestInstances_InstanceExistsRobotServerLogsSecondYoungNodeMiss(t *testing.T) {
env := newTestEnv()
defer env.Teardown()

resetEnv := Setenv(t,
"ROBOT_USER_NAME", "user",
"ROBOT_PASSWORD", "pass",
"CACHE_TIMEOUT", "1h",
)
defer resetEnv()

env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) {
json.NewEncoder(w).Encode([]models.ServerResponse{
{
Server: models.Server{
ServerIP: "123.123.123.123",
ServerIPv6Net: "2a01:f48:111:4221::",
ServerNumber: 321,
Name: "bm-existing",
},
},
})
})

robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot")
if err != nil {
t.Fatalf("Unexpected error creating cached robot client: %v", err)
}

instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0)
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "bm-new",
CreationTimestamp: metav1.NewTime(time.Now()),
},
}

state := klog.CaptureState()
defer state.Restore()

// logs captures klog output so the warning can be asserted directly.
var logs bytes.Buffer
klog.LogToStderr(false)
klog.SetOutput(&logs)

exists, err := instances.InstanceExists(context.TODO(), node)
if err != nil {
t.Fatalf("Unexpected error on first miss: %v", err)
}
if exists {
t.Fatal("Expected bm-new to be missing on first lookup")
}
klog.Flush()
if strings.Contains(logs.String(), "still missing in robot") {
t.Fatal("Did not expect warning log on first miss")
}

exists, err = instances.InstanceExists(context.TODO(), node)
if err != nil {
t.Fatalf("Unexpected error on second miss: %v", err)
}
if exists {
t.Fatal("Expected bm-new to be missing on second lookup")
}
klog.Flush()
if !strings.Contains(logs.String(), `young node "bm-new" still missing in robot after 2 lookup misses`) {
t.Fatalf("Expected warning log after second miss, got %q", logs.String())
}
}

func TestInstances_InstanceShutdown(t *testing.T) {
env := newTestEnv()
defer env.Teardown()
Expand Down
36 changes: 31 additions & 5 deletions hcloud/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"regexp"
"strings"
"time"

"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/syself/hetzner-cloud-controller-manager/internal/hcops"
Expand All @@ -30,6 +31,9 @@ import (
corev1 "k8s.io/api/core/v1"
)

// youngRobotServerLookupWindow limits forced Robot refreshes to newly created nodes.
var youngRobotServerLookupWindow = 10 * time.Minute

func getHCloudServerByName(ctx context.Context, c *hcloud.Client, name string) (*hcloud.Server, error) {
const op = "hcloud/getServerByName"
metrics.OperationCalled.WithLabelValues(op).Inc()
Expand Down Expand Up @@ -71,13 +75,18 @@ func getRobotServerByName(c robotclient.Client, node *corev1.Node) (server *mode
return nil, fmt.Errorf("%s: %w", op, err)
}

for i, s := range serverList {
if s.Name == node.Name {
server = &serverList[i]
}
server = findRobotServerByName(serverList, string(node.Name))
if server != nil || !isYoungNode(node) {
return server, nil
}

return server, nil
serverList, err = c.ServerGetListForceRefresh()
if err != nil {
hcops.HandleRateLimitExceededError(err, node)
return nil, fmt.Errorf("%s: refresh for young node: %w", op, err)
}

return findRobotServerByName(serverList, string(node.Name)), nil
}

func getRobotServerByID(c robotclient.Client, id int, node *corev1.Node) (s *models.Server, e error) {
Expand Down Expand Up @@ -116,6 +125,23 @@ func getRobotServerByID(c robotclient.Client, id int, node *corev1.Node) (s *mod
return server, nil
}

func findRobotServerByName(serverList []models.Server, name string) *models.Server {
for i, s := range serverList {
if s.Name == name {
return &serverList[i]
}
}
return nil
}

func isYoungNode(node *corev1.Node) bool {
if node == nil || node.CreationTimestamp.IsZero() {
return false
}

return time.Since(node.CreationTimestamp.Time) <= youngRobotServerLookupWindow
}

func isHCloudServerByName(name string) bool {
return !strings.HasPrefix(name, hostNamePrefixRobot)
}
Expand Down
4 changes: 4 additions & 0 deletions internal/mocks/robot.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ func (m *RobotClient) ServerGetList() ([]models.Server, error) {
return getRobotServers(args, 0), args.Error(1)
}

func (m *RobotClient) ServerGetListForceRefresh() ([]models.Server, error) {
return m.ServerGetList()
}

func (m *RobotClient) SetCredentials(_, _ string) error {
args := m.Called()
return args.Error(3)
Expand Down
Loading
Loading