From 48f2d5005657390f1050ba449cc63377b5ed06d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Wed, 27 May 2026 18:51:30 +0200 Subject: [PATCH] Implement NX-OS reprovisioning via NX-API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the defunct GNMI-based reprovisioning with NXAPI JSON-RPC calls. The new implementation issues two requests: 1. boot poap enable + copy running-config startup-config (batched with stop-on-error rollback) 2. reload (separate request that tolerates transport errors because the device goes down before responding) Add nxapi.IsTransportError to distinguish network-level errors (EOF, timeout, connection reset) from logical errors (RPCError, HTTPError), enabling callers of disruptive commands to tolerate expected connection drops. Remove the now unused BootPOAP GNMI type. Signed-off-by: Felix Kästner --- internal/provider/cisco/nxos/provider.go | 43 +++--- .../provider/cisco/nxos/reprovision_test.go | 122 ++++++++++++++++++ internal/provider/cisco/nxos/system.go | 8 -- internal/transport/nxapi/nxapi.go | 17 +++ internal/transport/nxapi/nxapi_test.go | 81 ++++++++++++ 5 files changed, 246 insertions(+), 25 deletions(-) create mode 100644 internal/provider/cisco/nxos/reprovision_test.go diff --git a/internal/provider/cisco/nxos/provider.go b/internal/provider/cisco/nxos/provider.go index 09302ca23..a79ad2743 100644 --- a/internal/provider/cisco/nxos/provider.go +++ b/internal/provider/cisco/nxos/provider.go @@ -70,13 +70,14 @@ type Provider struct { nxapi *nxapi.Client } +// timeout is the default timeout for all HTTP/gRPC requests made by the provider. +const timeout = 30 * time.Second + func NewProvider() provider.Provider { return &Provider{} } func (p *Provider) Connect(ctx context.Context, conn *deviceutil.Connection) (err error) { - // timeout is the default timeout for all HTTP/gRPC requests made by the provider. - const timeout = 30 * time.Second p.conn, err = grpcext.NewClient(conn, grpcext.WithDefaultTimeout(timeout)) if err != nil { return fmt.Errorf("failed to create grpc connection: %w", err) @@ -141,23 +142,31 @@ func (p *Provider) FactoryReset(ctx context.Context, conn *deviceutil.Connection return FactoryReset(ctx, p.conn) } -func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) (reterr error) { - if err := p.Connect(ctx, conn); err != nil { - return err +func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) error { + c := *conn + c.Address = netip.MustParseAddrPort(conn.Address).String() + client, err := nxapi.NewClient(&c, timeout) + if err != nil { + return fmt.Errorf("failed to create nxapi client: %w", err) } - defer func() { - if err := p.Disconnect(ctx, conn); err != nil { - reterr = errors.Join(reterr, err) - } - }() - // This is currently defunct on NX-OS, as enabling POAP requires a `copy running-config startup-config` which we - // cannot issue via GNMI - // TODO add once NXAPI client is available - poap := BootPOAP("enable") - if err := p.client.Update(ctx, &poap); err != nil { - return err + + _, err = client.Do(ctx, nxapi.NewRequest( + "boot poap enable", + "copy running-config startup-config", + ).WithRollback(nxapi.Stop)) + if err != nil { + return fmt.Errorf("failed to prepare device for reprovisioning: %w", err) } - return Reboot(ctx, p.conn) + + // Reboot is issued as a separate request because it actually restarts + // the device. The connection will drop before a response is received, + // so transport errors are expected and tolerated. + _, err = client.Do(ctx, nxapi.NewRequest("reload")) + if err != nil && !nxapi.IsTransportError(err) { + return fmt.Errorf("failed to reboot device: %w", err) + } + + return nil } func (p *Provider) ListPorts(ctx context.Context) ([]provider.DevicePort, error) { diff --git a/internal/provider/cisco/nxos/reprovision_test.go b/internal/provider/cisco/nxos/reprovision_test.go new file mode 100644 index 000000000..6c9dba26e --- /dev/null +++ b/internal/provider/cisco/nxos/reprovision_test.go @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package nxos + +import ( + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "slices" + "testing" + + "github.com/ironcore-dev/network-operator/internal/deviceutil" +) + +func TestReprovision(t *testing.T) { + t.Run("success with connection drop on reload", func(t *testing.T) { + var requests [][]string + called := 0 + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var cmds []struct { + Params struct { + Cmd string `json:"cmd"` + } `json:"params"` + } + if err := json.NewDecoder(r.Body).Decode(&cmds); err != nil { + t.Fatalf("failed to decode request: %v", err) + } + + batch := make([]string, len(cmds)) + for i, c := range cmds { + batch[i] = c.Params.Cmd + } + requests = append(requests, batch) + called++ + + if called == 1 { + w.Header().Set("Content-Type", "application/json-rpc") + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + + // Simulate device going down by closing connection abruptly. + hijacker, ok := w.(http.Hijacker) + if !ok { + t.Fatal("server does not support hijacking") + } + conn, _, err := hijacker.Hijack() + if err != nil { + t.Fatalf("hijack failed: %v", err) + } + conn.Close() + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err != nil { + t.Fatalf("Reprovision returned unexpected error: %v", err) + } + + if len(requests) != 2 { + t.Fatalf("expected 2 NXAPI requests, got %d", len(requests)) + } + if !slices.Equal(requests[0], []string{"boot poap enable", "copy running-config startup-config"}) { + t.Errorf("prep batch = %v", requests[0]) + } + if !slices.Equal(requests[1], []string{"reload"}) { + t.Errorf("reload request = %v", requests[1]) + } + }) + + t.Run("prep batch RPC error fails", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json-rpc") + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Invalid command"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from prep batch, got nil") + } + }) + + t.Run("reload RPC error fails", func(t *testing.T) { + called := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called++ + w.Header().Set("Content-Type", "application/json-rpc") + if called == 1 { + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Permission denied"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from reload RPC failure, got nil") + } + }) +} diff --git a/internal/provider/cisco/nxos/system.go b/internal/provider/cisco/nxos/system.go index 009c22f1d..b1d8999a7 100644 --- a/internal/provider/cisco/nxos/system.go +++ b/internal/provider/cisco/nxos/system.go @@ -68,14 +68,6 @@ func (*FirmwareVersion) XPath() string { return "System/showversion-items/nxosVersion" } -var _ gnmiext.DataElement = (*BootPOAP)(nil) - -type BootPOAP string - -func (*BootPOAP) XPath() string { - return "/System/boot-items/poap" -} - type BootTime UnixTime func (*BootTime) XPath() string { diff --git a/internal/transport/nxapi/nxapi.go b/internal/transport/nxapi/nxapi.go index 5881ecbfd..427e27f9c 100644 --- a/internal/transport/nxapi/nxapi.go +++ b/internal/transport/nxapi/nxapi.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "net" "net/http" "net/url" "time" @@ -257,3 +258,19 @@ type HTTPError struct { func (e *HTTPError) Error() string { return fmt.Sprintf("nxapi: non-2xx status code: %d - %s", e.Code, string(e.Body)) } + +// IsTransportError reports whether err is a network-level transport error +// (connection reset, timeout, EOF) as opposed to a logical error returned +// by the NX-API endpoint (RPCError, HTTPError). This is useful for callers +// that issue disruptive commands (e.g. reboot) where the device going down +// mid-request is expected. +func IsTransportError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + var netErr net.Error + return errors.As(err, &netErr) +} diff --git a/internal/transport/nxapi/nxapi_test.go b/internal/transport/nxapi/nxapi_test.go index 0b6c73a32..53e17f25b 100644 --- a/internal/transport/nxapi/nxapi_test.go +++ b/internal/transport/nxapi/nxapi_test.go @@ -9,8 +9,10 @@ import ( "encoding/json" "errors" "fmt" + "io" "net/http" "net/http/httptest" + "net/url" "strings" "testing" @@ -295,3 +297,82 @@ func TestDo(t *testing.T) { }) } } + +func TestIsTransportError(t *testing.T) { + tests := []struct { + desc string + err error + want bool + }{ + { + desc: "nil is not a transport error", + err: nil, + want: false, + }, + { + desc: "RPCError is not a transport error", + err: &RPCError{Code: -32602, Message: "Invalid params"}, + want: false, + }, + { + desc: "RPCErrors is not a transport error", + err: RPCErrors{&RPCError{Code: -32602, Message: "Invalid params"}}, + want: false, + }, + { + desc: "HTTPError is not a transport error", + err: &HTTPError{Code: 401, Body: []byte("unauthorized")}, + want: false, + }, + { + desc: "generic error is not a transport error", + err: errors.New("some logic error"), + want: false, + }, + { + desc: "io.EOF is a transport error", + err: io.EOF, + want: true, + }, + { + desc: "io.ErrUnexpectedEOF is a transport error", + err: io.ErrUnexpectedEOF, + want: true, + }, + { + desc: "wrapped io.EOF is a transport error", + err: fmt.Errorf("request failed: %w", io.EOF), + want: true, + }, + { + desc: "net.Error is a transport error", + err: &netError{msg: "i/o timeout"}, + want: true, + }, + { + desc: "url.Error wrapping net.Error is a transport error", + err: &url.Error{Op: "Post", URL: "http://x/ins", Err: &netError{msg: "i/o timeout"}}, + want: true, + }, + { + desc: "wrapped net.Error is a transport error", + err: fmt.Errorf("read tcp: %w", &netError{msg: "connection reset by peer"}), + want: true, + }, + } + for _, test := range tests { + t.Run(test.desc, func(t *testing.T) { + got := IsTransportError(test.err) + if got != test.want { + t.Errorf("IsTransportError(%v) = %t, want %t", test.err, got, test.want) + } + }) + } +} + +// netError is a mock net.Error for testing. +type netError struct{ msg string } + +func (e *netError) Error() string { return e.msg } +func (e *netError) Timeout() bool { return false } +func (e *netError) Temporary() bool { return false }