feat(blobmanager/s3accesspoint): optional managed session policy via PolicyArns (#3145)

jiparis · web-flow · commit 97a2ccc76661 · 2026-05-22T11:26:05.000+02:00
diff --git a/pkg/blobmanager/s3accesspoint/backend.go b/pkg/blobmanager/s3accesspoint/backend.go
@@ -30,6 +30,7 @@ import (
 	"github.com/aws/aws-sdk-go-v2/service/s3"
 	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
 	"github.com/aws/aws-sdk-go-v2/service/sts"
+	ststypes "github.com/aws/aws-sdk-go-v2/service/sts/types"
 	"github.com/aws/smithy-go"
 	pb "github.com/chainloop-dev/chainloop/app/artifact-cas/api/cas/v1"
 	robotaccount "github.com/chainloop-dev/chainloop/internal/robotaccount/cas"
@@ -47,6 +48,13 @@ const (
 // be useless against an AP policy condition.
 var ErrMissingRequestingOrg = errors.New("s3accesspoint: requesting org missing from claims")
 
+// stsAssumer is the subset of *sts.Client that the credentials provider
+// actually uses. Keeping the dependency at interface-level lets tests
+// inject a fake without spinning up a real AWS config.
+type stsAssumer interface {
+	AssumeRole(ctx context.Context, in *sts.AssumeRoleInput, optFns ...func(*sts.Options)) (*sts.AssumeRoleOutput, error)
+}
+
 // Backend is the per-tenant uploader/downloader. One *Backend instance is
 // bound to one access point; the actual AWS credentials are minted
 // per-request via STS using the org UUID found in the request context.
@@ -56,7 +64,7 @@ type Backend struct {
 	// stsClient is built once at construction using the pod's ambient
 	// IAM identity. The credential chain (IRSA → IMDS → env →
 	// ~/.aws/credentials) picks up the identity automatically.
-	stsClient *sts.Client
+	stsClient stsAssumer
 
 	// s3Client uses a custom CredentialsProvider that mints a scoped
 	// session per request (cached in-process per requesting-org so back-
@@ -261,7 +269,7 @@ func (b *Backend) CheckWritePermissions(ctx context.Context) error {
 // reusing the temporary credentials across consecutive calls until the
 // expiration window approaches.
 type sessionCredentialsProvider struct {
-	stsClient *sts.Client
+	stsClient stsAssumer
 
 	// ambientCreds is the SDK-default credentials provider captured from
 	// awsCfg at construction time. Only consulted when
@@ -296,21 +304,34 @@ func (p *sessionCredentialsProvider) Retrieve(ctx context.Context) (aws.Credenti
 		return p.ambientCreds.Retrieve(ctx)
 	}
 
-	// Session policy intersects with the base role's permissions and
-	// pins this session to the caller's AP ARN. Cross-tenant defense
-	// against a tampered AccessPointARN in the secret blob lives in the
-	// AP's resource policy (aws:userid StringEquals on the role session
-	// name minted from the request-context org UUID), not here — keeping
-	// the inline policy small leaves headroom in STS's packed-policy
-	// budget for tags inherited from the caller principal.
-	sessionPolicy := buildSessionPolicy(p.creds.AccessPointARN)
-
-	out, err := p.stsClient.AssumeRole(ctx, &sts.AssumeRoleInput{
+	// The session policy intersects with the base role's permissions and
+	// pins this session to the caller's AP. Cross-tenant defense against
+	// a tampered AccessPointARN in the secret blob lives in the AP's
+	// resource policy (aws:userid StringEquals on the role session name
+	// minted from the request-context org UUID), not here.
+	//
+	// When the operator has provisioned a managed IAM policy and
+	// recorded its ARN in SessionPolicyARN, reference it via PolicyArns
+	// instead of inlining a JSON document. Only the ARN counts against
+	// STS's packed-policy budget that way, leaving more headroom for
+	// session tags inherited from the caller principal (IRSA / Pod
+	// Identity). When SessionPolicyARN is empty we fall back to the
+	// inline default — a missing ARN must NOT degrade to an unscoped
+	// session that inherits the full BaseRoleARN permissions.
+	input := &sts.AssumeRoleInput{
 		RoleArn:         aws.String(p.creds.BaseRoleARN),
 		RoleSessionName: aws.String(roleSessionName(info.OrgID)),
-		Policy:          aws.String(sessionPolicy),
 		DurationSeconds: aws.Int32(int32(SessionDuration.Seconds())),
-	})
+	}
+	if p.creds.SessionPolicyARN != "" {
+		input.PolicyArns = []ststypes.PolicyDescriptorType{
+			{Arn: aws.String(p.creds.SessionPolicyARN)},
+		}
+	} else {
+		input.Policy = aws.String(buildSessionPolicy(p.creds.AccessPointARN))
+	}
+
+	out, err := p.stsClient.AssumeRole(ctx, input)
 	if err != nil {
 		return aws.Credentials{}, fmt.Errorf("sts:AssumeRole for org %s: %w", info.OrgID, err)
 	}
diff --git a/pkg/blobmanager/s3accesspoint/backend_test.go b/pkg/blobmanager/s3accesspoint/backend_test.go
@@ -19,8 +19,11 @@ import (
 	"bytes"
 	"context"
 	"testing"
+	"time"
 
 	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/sts"
+	ststypes "github.com/aws/aws-sdk-go-v2/service/sts/types"
 	pb "github.com/chainloop-dev/chainloop/app/artifact-cas/api/cas/v1"
 	robotaccount "github.com/chainloop-dev/chainloop/internal/robotaccount/cas"
 	jwtmiddleware "github.com/go-kratos/kratos/v2/middleware/auth/jwt"
@@ -174,6 +177,83 @@ func (c *countingCredsProvider) Retrieve(_ context.Context) (aws.Credentials, er
 	return c.creds, nil
 }
 
+// fakeSTSAssumer captures the AssumeRoleInput passed by the credentials
+// provider so tests can lock down the AssumeRole call shape (inline
+// policy vs PolicyArns) without making a real AWS call.
+type fakeSTSAssumer struct {
+	lastInput *sts.AssumeRoleInput
+}
+
+func (f *fakeSTSAssumer) AssumeRole(_ context.Context, in *sts.AssumeRoleInput, _ ...func(*sts.Options)) (*sts.AssumeRoleOutput, error) {
+	f.lastInput = in
+	return &sts.AssumeRoleOutput{
+		Credentials: &ststypes.Credentials{
+			AccessKeyId:     aws.String("AKFAKE"),
+			SecretAccessKey: aws.String("secret"),
+			SessionToken:    aws.String("token"),
+			Expiration:      aws.Time(time.Now().Add(time.Hour)),
+		},
+	}, nil
+}
+
+// TestAssumeRoleInput_DefaultsToInlinePolicy locks down the fallback
+// path: with an empty SessionPolicyARN the AssumeRole call must carry an
+// inline Policy and must NOT pass PolicyArns — otherwise an upgrade
+// that forgets to set the new field would silently degrade to a session
+// scoped only by the BaseRoleARN's identity policies.
+func TestAssumeRoleInput_DefaultsToInlinePolicy(t *testing.T) {
+	t.Parallel()
+
+	fake := &fakeSTSAssumer{}
+	p := &sessionCredentialsProvider{
+		stsClient: fake,
+		creds: &Credentials{
+			AccessPointARN: "arn:aws:s3:us-east-1:111:accesspoint/ap-a",
+			BaseRoleARN:    "arn:aws:iam::111:role/r",
+		},
+	}
+	ctx := jwtmiddleware.NewContext(context.Background(),
+		&robotaccount.Claims{OrgID: "org-A", StoredSecretID: "foo", BackendType: "BT", Role: robotaccount.Uploader})
+
+	_, err := p.Retrieve(ctx)
+	require.NoError(t, err)
+	require.NotNil(t, fake.lastInput, "stub STS must have been invoked")
+
+	require.NotNil(t, fake.lastInput.Policy, "empty SessionPolicyARN must produce an inline Policy")
+	assert.Contains(t, *fake.lastInput.Policy, "arn:aws:s3:us-east-1:111:accesspoint/ap-a/object/*")
+	assert.Empty(t, fake.lastInput.PolicyArns, "inline path must not also pass PolicyArns")
+}
+
+// TestAssumeRoleInput_UsesPolicyArnsWhenSet locks down the opt-in path:
+// a configured SessionPolicyARN must reach STS via PolicyArns, and the
+// inline Policy must be omitted so we don't double-count against the
+// packed-policy budget.
+func TestAssumeRoleInput_UsesPolicyArnsWhenSet(t *testing.T) {
+	t.Parallel()
+
+	const managedARN = "arn:aws:iam::111:policy/chainloop-cas-session"
+	fake := &fakeSTSAssumer{}
+	p := &sessionCredentialsProvider{
+		stsClient: fake,
+		creds: &Credentials{
+			AccessPointARN:   "arn:aws:s3:us-east-1:111:accesspoint/ap-a",
+			BaseRoleARN:      "arn:aws:iam::111:role/r",
+			SessionPolicyARN: managedARN,
+		},
+	}
+	ctx := jwtmiddleware.NewContext(context.Background(),
+		&robotaccount.Claims{OrgID: "org-A", StoredSecretID: "foo", BackendType: "BT", Role: robotaccount.Uploader})
+
+	_, err := p.Retrieve(ctx)
+	require.NoError(t, err)
+	require.NotNil(t, fake.lastInput, "stub STS must have been invoked")
+
+	assert.Nil(t, fake.lastInput.Policy, "PolicyArns path must NOT also send an inline Policy")
+	require.Len(t, fake.lastInput.PolicyArns, 1, "expected exactly one PolicyArn descriptor")
+	require.NotNil(t, fake.lastInput.PolicyArns[0].Arn)
+	assert.Equal(t, managedARN, *fake.lastInput.PolicyArns[0].Arn)
+}
+
 // --- helpers -----------------------------------------------------------
 
 // newTestBackend constructs a fully wired *Backend that uses static dummy
diff --git a/pkg/blobmanager/s3accesspoint/provider.go b/pkg/blobmanager/s3accesspoint/provider.go
@@ -34,6 +34,13 @@
 //     (${apARN}/object/*), keeping the inline document small so STS's
 //     packed-policy budget isn't consumed by chainloop before tags
 //     inherited from the caller principal are even accounted for.
+//  4. Optionally, a customer-managed IAM policy referenced via
+//     Credentials.SessionPolicyARN. When set, the per-request AssumeRole
+//     call passes that ARN through PolicyArns instead of an inline Policy
+//     document; only the ~60-char ARN counts against the packed-policy
+//     budget, leaving more headroom for caller-inherited session tags.
+//     The managed policy is the action allowlist in that mode, so it
+//     MUST be at least as restrictive as the inline default.
 //
 // The session name MUST come from the request context, not from the secret
 // blob: a secrets-store compromise alone must not let an attacker reroute
@@ -49,6 +56,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/aws/aws-sdk-go-v2/aws/arn"
 	backend "github.com/chainloop-dev/chainloop/pkg/blobmanager"
 	"github.com/chainloop-dev/chainloop/pkg/credentials"
 )
@@ -113,6 +121,15 @@ type Credentials struct {
 	// accounts without a config change. Required unless DevModeEnvVar is
 	// set on the running binary.
 	BaseRoleARN string
+	// SessionPolicyARN is optional. When non-empty, the per-request
+	// AssumeRole call passes this ARN via PolicyArns instead of an
+	// inline Policy document, trimming chainloop's contribution to
+	// STS's packed-policy budget down to the ARN string itself. The
+	// IAM policy at this ARN MUST be at least as restrictive as the
+	// inline default (s3:GetObject + s3:PutObject scoped to the AP's
+	// /object/* prefix). When empty, the backend falls back to the
+	// inline session policy.
+	SessionPolicyARN string
 }
 
 func (c *Credentials) Validate() error {
@@ -136,6 +153,21 @@ func (c *Credentials) Validate() error {
 			return fmt.Errorf("%w: base_role_arn %q is not a valid IAM role ARN", backend.ErrValidation, c.BaseRoleARN)
 		}
 	}
+	// SessionPolicyARN is optional. When set it must be a syntactically
+	// valid IAM managed policy ARN — anything else (an S3 ARN, a role
+	// ARN, a policy ARN with no name, a role ARN that happens to embed
+	// ":policy/" in its path) would be silently rejected by STS at
+	// request time, surfacing as opaque errors deep in the upload path.
+	// Fail loudly here instead.
+	if c.SessionPolicyARN != "" {
+		a, err := arn.Parse(c.SessionPolicyARN)
+		if err != nil ||
+			a.Service != "iam" ||
+			!strings.HasPrefix(a.Resource, "policy/") ||
+			a.Resource == "policy/" {
+			return fmt.Errorf("%w: session_policy_arn %q is not a valid IAM managed policy ARN", backend.ErrValidation, c.SessionPolicyARN)
+		}
+	}
 	return nil
 }
 
diff --git a/pkg/blobmanager/s3accesspoint/provider_test.go b/pkg/blobmanager/s3accesspoint/provider_test.go
@@ -67,6 +67,58 @@ func TestCredentials_Validate(t *testing.T) {
 			mutate:  func(c *Credentials) { c.BaseRoleARN = "not-an-arn" },
 			wantErr: "not a valid IAM role ARN",
 		},
+		{
+			name:    "session policy arn empty is allowed",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "" },
+			wantErr: "",
+		},
+		{
+			name:    "session policy arn valid",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam::123456789012:policy/chainloop-cas-session" },
+			wantErr: "",
+		},
+		{
+			name:    "session policy arn pointing at s3 ARN rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:s3:::some-bucket" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn pointing at role rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam::123456789012:role/some-role" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn garbage rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "not-an-arn-at-all" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn for AWS-managed policy accepted",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam::aws:policy/AdministratorAccess" },
+			wantErr: "",
+		},
+		{
+			// previously the loose substring check would accept this because
+			// the string both starts with arn:aws:iam:: and contains :policy/
+			name:    "session policy arn embedded in role path rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam::123456789012:role/foo:policy/bar" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn with empty name rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam::123456789012:policy/" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn missing service rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws::us-east-1:123456789012:policy/foo" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
+		{
+			name:    "session policy arn structurally malformed rejected",
+			mutate:  func(c *Credentials) { c.SessionPolicyARN = "arn:aws:iam" },
+			wantErr: "not a valid IAM managed policy ARN",
+		},
 	}
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {