diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index f0ff06969..d092c8d90 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -34,6 +34,7 @@ import ( "github.com/syself/hetzner-cloud-controller-manager/internal/annotation" "github.com/syself/hetzner-cloud-controller-manager/internal/credentials" "github.com/syself/hetzner-cloud-controller-manager/internal/hcops" + robotclient "github.com/syself/hetzner-cloud-controller-manager/internal/robot/client" hrobot "github.com/syself/hrobot-go" "github.com/syself/hrobot-go/models" corev1 "k8s.io/api/core/v1" @@ -44,7 +45,15 @@ type testEnv struct { Server *httptest.Server Mux *http.ServeMux Client *hcloud.Client - RobotClient hrobot.RobotClient + RobotClient robotclient.Client +} + +type testRobotClient struct { + hrobot.RobotClient +} + +func (c testRobotClient) ServerGetListForceRefresh() ([]models.Server, error) { + return c.ServerGetList() } func (env *testEnv) Teardown() { @@ -70,7 +79,7 @@ func newTestEnv() testEnv { Server: server, Mux: mux, Client: client, - RobotClient: robotClient, + RobotClient: testRobotClient{RobotClient: robotClient}, } } diff --git a/hcloud/instances.go b/hcloud/instances.go index 4a943a4e3..6b95bb043 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -19,6 +19,7 @@ package hcloud import ( "context" "fmt" + "sync" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/syself/hetzner-cloud-controller-manager/internal/legacydatacenter" @@ -44,12 +45,22 @@ type instances struct { robotClient robotclient.Client addressFamily addressFamily networkID int64 + + robotMissMu sync.Mutex + // robotMissByName counts repeated misses for young bare-metal nodes by name. + robotMissByName map[string]int } var errServerNotFound = fmt.Errorf("server not found") func newInstances(client *hcloud.Client, robotClient robotclient.Client, addressFamily addressFamily, networkID int64) *instances { - return &instances{client, robotClient, addressFamily, networkID} + return &instances{ + client: client, + robotClient: robotClient, + addressFamily: addressFamily, + networkID: networkID, + robotMissByName: make(map[string]int), + } } // lookupServer attempts to locate the corresponding hcloud.Server or models.Server (robot server) for a given v1.Node. @@ -95,11 +106,33 @@ func (i *instances) lookupServer( if err != nil { return nil, nil, false, fmt.Errorf("failed to get robot server %q: %w", string(node.Name), err) } + i.trackRobotServerByNameMiss(node, bmServer) } } return hcloudServer, bmServer, isHCloudServer, nil } +// trackRobotServerByNameMiss remembers repeated misses for young bare-metal nodes and +// emits a warning on the second miss to surface unexpected stale-cache behavior. +func (i *instances) trackRobotServerByNameMiss(node *corev1.Node, bmServer *models.Server) { + if node == nil || node.Name == "" { + return + } + + i.robotMissMu.Lock() + defer i.robotMissMu.Unlock() + + if bmServer != nil || !isYoungNode(node) { + delete(i.robotMissByName, string(node.Name)) + return + } + + i.robotMissByName[string(node.Name)]++ + if i.robotMissByName[string(node.Name)] == 2 { + klog.Warningf("young node %q still missing in robot after %d lookup misses", node.Name, i.robotMissByName[string(node.Name)]) + } +} + func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceExists" metrics.OperationCalled.WithLabelValues(op).Inc() diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 21c2f56bf..c6089e943 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -17,19 +17,24 @@ limitations under the License. package hcloud import ( + "bytes" "context" "encoding/json" "net" "net/http" "reflect" + "strings" "testing" + "time" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" + "github.com/syself/hetzner-cloud-controller-manager/internal/robot/client/cache" "github.com/syself/hrobot-go/models" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog/v2" ) // TestInstances_InstanceExists also tests [lookupServer]. The other tests @@ -173,6 +178,147 @@ func TestInstances_InstanceExists(t *testing.T) { } } +func TestInstances_InstanceExistsRobotServerCreatedAfterCacheFill(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + resetEnv := Setenv(t, + "ROBOT_USER_NAME", "user", + "ROBOT_PASSWORD", "pass", + "CACHE_TIMEOUT", "1h", + ) + defer resetEnv() + + // servers backs the Robot list response and is mutated during the test. + servers := make([]models.Server, 0, 2) + servers = append(servers, models.Server{ + ServerIP: "123.123.123.123", + ServerIPv6Net: "2a01:f48:111:4221::", + ServerNumber: 321, + Name: "bm-existing", + }) + env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) { + responses := make([]models.ServerResponse, 0, len(servers)) + for _, server := range servers { + responses = append(responses, models.ServerResponse{Server: server}) + } + json.NewEncoder(w).Encode(responses) + }) + + robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot") + if err != nil { + t.Fatalf("Unexpected error creating cached robot client: %v", err) + } + + instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0) + // creationTime keeps the test nodes inside the young-node refresh window. + creationTime := metav1.NewTime(time.Now()) + + // Warm the cache while bm-new does not exist yet. + exists, err := instances.InstanceExists(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bm-existing", + CreationTimestamp: creationTime, + }, + }) + if err != nil { + t.Fatalf("Unexpected error warming cache: %v", err) + } + if !exists { + t.Fatal("Expected bm-existing to exist") + } + + servers = append(servers, models.Server{ + ServerIP: "123.123.123.124", + ServerIPv6Net: "2a01:f48:111:4222::", + ServerNumber: 322, + Name: "bm-new", + }) + + exists, err = instances.InstanceExists(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bm-new", + CreationTimestamp: creationTime, + }, + }) + if err != nil { + t.Fatalf("Unexpected error for bm-new: %v", err) + } + if !exists { + t.Fatal("Expected bm-new to exist after it was created") + } +} + +func TestInstances_InstanceExistsRobotServerLogsSecondYoungNodeMiss(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + resetEnv := Setenv(t, + "ROBOT_USER_NAME", "user", + "ROBOT_PASSWORD", "pass", + "CACHE_TIMEOUT", "1h", + ) + defer resetEnv() + + env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode([]models.ServerResponse{ + { + Server: models.Server{ + ServerIP: "123.123.123.123", + ServerIPv6Net: "2a01:f48:111:4221::", + ServerNumber: 321, + Name: "bm-existing", + }, + }, + }) + }) + + robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot") + if err != nil { + t.Fatalf("Unexpected error creating cached robot client: %v", err) + } + + instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0) + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bm-new", + CreationTimestamp: metav1.NewTime(time.Now()), + }, + } + + state := klog.CaptureState() + defer state.Restore() + + // logs captures klog output so the warning can be asserted directly. + var logs bytes.Buffer + klog.LogToStderr(false) + klog.SetOutput(&logs) + + exists, err := instances.InstanceExists(context.TODO(), node) + if err != nil { + t.Fatalf("Unexpected error on first miss: %v", err) + } + if exists { + t.Fatal("Expected bm-new to be missing on first lookup") + } + klog.Flush() + if strings.Contains(logs.String(), "still missing in robot") { + t.Fatal("Did not expect warning log on first miss") + } + + exists, err = instances.InstanceExists(context.TODO(), node) + if err != nil { + t.Fatalf("Unexpected error on second miss: %v", err) + } + if exists { + t.Fatal("Expected bm-new to be missing on second lookup") + } + klog.Flush() + if !strings.Contains(logs.String(), `young node "bm-new" still missing in robot after 2 lookup misses`) { + t.Fatalf("Expected warning log after second miss, got %q", logs.String()) + } +} + func TestInstances_InstanceShutdown(t *testing.T) { env := newTestEnv() defer env.Teardown() diff --git a/hcloud/util.go b/hcloud/util.go index e031ca6dc..7eadc2bc4 100644 --- a/hcloud/util.go +++ b/hcloud/util.go @@ -21,6 +21,7 @@ import ( "fmt" "regexp" "strings" + "time" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/syself/hetzner-cloud-controller-manager/internal/hcops" @@ -30,6 +31,9 @@ import ( corev1 "k8s.io/api/core/v1" ) +// youngRobotServerLookupWindow limits forced Robot refreshes to newly created nodes. +var youngRobotServerLookupWindow = 10 * time.Minute + func getHCloudServerByName(ctx context.Context, c *hcloud.Client, name string) (*hcloud.Server, error) { const op = "hcloud/getServerByName" metrics.OperationCalled.WithLabelValues(op).Inc() @@ -71,13 +75,18 @@ func getRobotServerByName(c robotclient.Client, node *corev1.Node) (server *mode return nil, fmt.Errorf("%s: %w", op, err) } - for i, s := range serverList { - if s.Name == node.Name { - server = &serverList[i] - } + server = findRobotServerByName(serverList, string(node.Name)) + if server != nil || !isYoungNode(node) { + return server, nil } - return server, nil + serverList, err = c.ServerGetListForceRefresh() + if err != nil { + hcops.HandleRateLimitExceededError(err, node) + return nil, fmt.Errorf("%s: refresh for young node: %w", op, err) + } + + return findRobotServerByName(serverList, string(node.Name)), nil } func getRobotServerByID(c robotclient.Client, id int, node *corev1.Node) (s *models.Server, e error) { @@ -116,6 +125,23 @@ func getRobotServerByID(c robotclient.Client, id int, node *corev1.Node) (s *mod return server, nil } +func findRobotServerByName(serverList []models.Server, name string) *models.Server { + for i, s := range serverList { + if s.Name == name { + return &serverList[i] + } + } + return nil +} + +func isYoungNode(node *corev1.Node) bool { + if node == nil || node.CreationTimestamp.IsZero() { + return false + } + + return time.Since(node.CreationTimestamp.Time) <= youngRobotServerLookupWindow +} + func isHCloudServerByName(name string) bool { return !strings.HasPrefix(name, hostNamePrefixRobot) } diff --git a/internal/mocks/robot.go b/internal/mocks/robot.go index f2b1783e2..9b7dafef5 100644 --- a/internal/mocks/robot.go +++ b/internal/mocks/robot.go @@ -14,6 +14,10 @@ func (m *RobotClient) ServerGetList() ([]models.Server, error) { return getRobotServers(args, 0), args.Error(1) } +func (m *RobotClient) ServerGetListForceRefresh() ([]models.Server, error) { + return m.ServerGetList() +} + func (m *RobotClient) SetCredentials(_, _ string) error { args := m.Called() return args.Error(3) diff --git a/internal/robot/client/cache/client.go b/internal/robot/client/cache/client.go index 29d02b961..481f83293 100644 --- a/internal/robot/client/cache/client.go +++ b/internal/robot/client/cache/client.go @@ -90,22 +90,9 @@ func NewCachedRobotClient(rootDir string, httpClient *http.Client, baseURL strin func (c *cacheRobotClient) ServerGet(id int) (*models.Server, error) { if c.shouldSync() { - list, err := c.robotClient.ServerGetList() - if err != nil { + if _, err := c.sync(); err != nil { return nil, err } - - // populate list - c.l = list - - // remove all entries from map and populate it freshly - c.m = make(map[int]*models.Server) - for i, server := range list { - c.m[server.ServerNumber] = &list[i] - } - - // set time of last update - c.lastUpdate = time.Now() } server, found := c.m[id] @@ -119,27 +106,17 @@ func (c *cacheRobotClient) ServerGet(id int) (*models.Server, error) { func (c *cacheRobotClient) ServerGetList() ([]models.Server, error) { if c.shouldSync() { - list, err := c.robotClient.ServerGetList() - if err != nil { - return list, err - } - - // populate list - c.l = list - - // remove all entries from map and populate it freshly - c.m = make(map[int]*models.Server) - for i, server := range list { - c.m[server.ServerNumber] = &list[i] - } - - // set time of last update - c.lastUpdate = time.Now() + return c.sync() } return c.l, nil } +// ServerGetListForceRefresh bypasses the timeout check and reloads the cache from Robot. +func (c *cacheRobotClient) ServerGetListForceRefresh() ([]models.Server, error) { + return c.sync() +} + func (c *cacheRobotClient) shouldSync() bool { // map is nil means we have no cached value yet if c.m == nil { @@ -161,3 +138,24 @@ func (c *cacheRobotClient) SetCredentials(username, password string) error { c.m = nil return nil } + +func (c *cacheRobotClient) sync() ([]models.Server, error) { + list, err := c.robotClient.ServerGetList() + if err != nil { + return list, err + } + + // populate list + c.l = list + + // remove all entries from map and repopulate it from the current list + c.m = make(map[int]*models.Server) + for i, server := range list { + c.m[server.ServerNumber] = &list[i] + } + + // set time of last update + c.lastUpdate = time.Now() + + return c.l, nil +} diff --git a/internal/robot/client/interface.go b/internal/robot/client/interface.go index 06ecc9bf3..ee086f8ba 100644 --- a/internal/robot/client/interface.go +++ b/internal/robot/client/interface.go @@ -5,5 +5,6 @@ import "github.com/syself/hrobot-go/models" type Client interface { ServerGet(id int) (*models.Server, error) ServerGetList() ([]models.Server, error) + ServerGetListForceRefresh() ([]models.Server, error) SetCredentials(username, password string) error }