From f7ba5da0b5f702acfa1f3e9efbf3a926d1d9e72c Mon Sep 17 00:00:00 2001
From: Mihir Dixit <dixitmihir1@gmail.com>
Date: Fri, 1 May 2026 19:23:16 +0000
Subject: [PATCH] cmd/alertmanager: add --config.auto-reload-interval flag

Add a --config.auto-reload-interval flag that starts a background
goroutine polling the SHA256 checksum of the config file at the given
interval. When a change is detected the goroutine writes to the existing
webReload channel, the same path used by SIGHUP and POST /-/reload, so
no new reload logic is required.

The flag defaults to 0s (disabled). Any positive duration enables the
watcher. When the process shuts down the watcher exits cleanly via
context cancellation.

SHA256 polling is used instead of fsnotify because Kubernetes kubelet
uses AtomicWriter to update ConfigMap and Secret mounts via symlink
swaps, which causes fsnotify to miss updates after the first one.
Checksum polling is also what Prometheus's auto-reload-config feature
(--config.auto-reload-interval) does for its own configuration file.

Fixes #5195

Signed-off-by: Mihir Dixit <dixitmihir1@gmail.com>
---
 cmd/alertmanager/main.go      |  96 ++++++++++++++++++
 cmd/alertmanager/main_test.go | 186 ++++++++++++++++++++++++++++++++++
 docs/configuration.md         |  16 +++
 docs/management_api.md        |   4 +
 4 files changed, 302 insertions(+)

diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go
index 8e9961194c..d6f6004c40 100644
--- a/cmd/alertmanager/main.go
+++ b/cmd/alertmanager/main.go
@@ -15,6 +15,8 @@ package main
 
 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
 	"errors"
 	"fmt"
 	"log/slog"
@@ -140,6 +142,7 @@ func run() int {
 
 	var (
 		configFile                  = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
+		configAutoReloadInterval    = kingpin.Flag("config.auto-reload-interval", "Interval for checking and automatically reloading the Alertmanager configuration file. Set to 0 to disable.").Default("0s").Duration()
 		dataDir                     = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
 		retention                   = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
 		maintenanceInterval         = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
@@ -659,6 +662,14 @@ func run() int {
 	signal.Notify(hup, syscall.SIGHUP)
 	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
 
+	// Start the auto-reload watcher if the interval is non-zero.
+	if *configAutoReloadInterval > 0 {
+		logger.Info("Auto-reload enabled: checking for configuration changes", "interval", *configAutoReloadInterval, "file", *configFile)
+		watcherCtx, cancelWatcher := context.WithCancel(context.Background())
+		defer cancelWatcher()
+		go runConfigWatcher(watcherCtx, *configFile, *configAutoReloadInterval, webReload, logger)
+	}
+
 	for {
 		select {
 		case <-hup:
@@ -680,6 +691,91 @@ func run() int {
 	}
 }
 
+// configFileChecksum reads the whole file at path and returns the SHA256 digest
+// of its contents as a lowercase hex string. A read error is returned wrapped.
+func configFileChecksum(path string) (string, error) {
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return "", fmt.Errorf("reading config file for checksum: %w", err)
+	}
+	sum := sha256.Sum256(b)
+	return hex.EncodeToString(sum[:]), nil
+}
+
+// runConfigWatcher re-hashes configFile once per interval and, when the digest
+// differs from the last accepted one, sends a reply channel on reloadCh and
+// waits for the result. It returns when ctx is cancelled. interval must be > 0.
+func runConfigWatcher(
+	ctx context.Context,
+	configFile string,
+	interval time.Duration,
+	reloadCh chan<- chan error,
+	logger *slog.Logger,
+) {
+	// Hash the file once up front so the first tick compares against a real
+	// baseline instead of triggering a reload unconditionally.
+	lastChecksum, err := configFileChecksum(configFile)
+	hasChecksum := err == nil
+	if err != nil {
+		// Log but don't abort: the first successful read in the loop seeds the
+		// baseline. NOTE(review): presumes startup already loaded the file.
+		logger.Warn("Auto-reload: failed to compute initial config checksum", "file", configFile, "err", err)
+	}
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			logger.Info("Auto-reload: watcher stopped", "file", configFile)
+			return
+		case <-ticker.C:
+			checksum, err := configFileChecksum(configFile)
+			if err != nil {
+				logger.Warn("Auto-reload: failed to read config file", "file", configFile, "err", err)
+				continue // keep lastChecksum as is; the next tick retries the read
+			}
+
+			if !hasChecksum {
+				// No baseline yet (initial read failed): adopt this digest, don't reload.
+				lastChecksum = checksum
+				hasChecksum = true
+				continue
+			}
+
+			if checksum == lastChecksum {
+				continue // content unchanged since the last accepted digest
+			}
+
+			logger.Info("Auto-reload: config file changed, reloading", "file", configFile)
+
+			// Hand a fresh reply channel to whoever receives from reloadCh, then
+			// wait for the outcome on it. Both steps also watch ctx.Done() so
+			// that cancellation cannot leave this goroutine blocked.
+			errCh := make(chan error)
+			select {
+			case reloadCh <- errCh:
+			case <-ctx.Done():
+				return
+			}
+			select {
+			case err := <-errCh:
+				if err != nil {
+					logger.Error("Auto-reload: reload failed", "file", configFile, "err", err)
+					// Leave lastChecksum stale so each later tick retries the reload.
+					continue
+				}
+			case <-ctx.Done():
+				return
+			}
+
+			lastChecksum = checksum
+			logger.Info("Auto-reload: config reload successful", "file", configFile)
+		}
+	}
+}
+
 // clusterWait returns a function that inspects the current peer state and returns
 // a duration of one base timeout for each peer with a higher ID than ourselves.
 func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
diff --git a/cmd/alertmanager/main_test.go b/cmd/alertmanager/main_test.go
index 28f21bdc2b..4987778282 100644
--- a/cmd/alertmanager/main_test.go
+++ b/cmd/alertmanager/main_test.go
@@ -14,8 +14,13 @@
 package main
 
 import (
+	"context"
 	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
 	"testing"
+	"time"
 
 	"github.com/prometheus/common/promslog"
 	"github.com/stretchr/testify/require"
@@ -95,3 +100,184 @@ func TestExternalURL(t *testing.T) {
 		})
 	}
 }
+
+func TestConfigFileChecksum_ReturnsConsistentHash(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("content: a"), 0o644))
+
+	sum1, err := configFileChecksum(path)
+	require.NoError(t, err)
+	sum2, err := configFileChecksum(path) // same file, not modified since sum1
+	require.NoError(t, err)
+
+	require.Equal(t, sum1, sum2, "same content should produce same checksum")
+}
+
+func TestConfigFileChecksum_DifferentContentDifferentHash(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+
+	require.NoError(t, os.WriteFile(path, []byte("content: a"), 0o644))
+	sumA, err := configFileChecksum(path)
+	require.NoError(t, err)
+
+	require.NoError(t, os.WriteFile(path, []byte("content: b"), 0o644)) // same path; only the final byte differs
+	sumB, err := configFileChecksum(path)
+	require.NoError(t, err)
+
+	require.NotEqual(t, sumA, sumB, "different content must produce different checksum")
+}
+
+func TestConfigFileChecksum_MissingFileReturnsError(t *testing.T) {
+	_, err := configFileChecksum("/nonexistent/path/alertmanager.yml") // presumably absent on any test host
+	require.Error(t, err)
+}
+
+func TestRunConfigWatcher_NoReloadWhenFileUnchanged(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("route:\n  receiver: default\n"), 0o644))
+
+	reloadCh := make(chan chan error, 1)
+	ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond)
+	defer cancel()
+
+	go runConfigWatcher(ctx, path, 50*time.Millisecond, reloadCh, slog.Default())
+
+	// Wait out the 150ms context; at a 50ms interval that is roughly 2-3 ticks.
+	<-ctx.Done()
+
+	// Nothing may be queued on reloadCh: an unchanged file must not trigger a reload.
+	require.Empty(t, reloadCh, "no reload expected when file is unchanged")
+}
+
+func TestRunConfigWatcher_TriggersReloadOnChange(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))
+
+	reloadCh := make(chan chan error, 1)
+	ctx := t.Context()
+
+	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
+
+	// Give the goroutine time to start and take its baseline checksum of "original".
+	time.Sleep(50 * time.Millisecond)
+
+	// Overwrite the file with different content.
+	require.NoError(t, os.WriteFile(path, []byte("changed"), 0o644))
+
+	// The watcher should notice on a later 30ms tick and send a reply channel.
+	select {
+	case errCh := <-reloadCh:
+		errCh <- nil // Reply the way a successful reload would.
+	case <-time.After(300 * time.Millisecond):
+		t.Fatal("timed out waiting for reload signal after file change")
+	}
+}
+
+func TestRunConfigWatcher_DoesNotRetriggerAfterSuccessfulReload(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))
+
+	reloadCh := make(chan chan error, 2) // room for a spurious second request so the watcher's send would not block
+	ctx := t.Context()
+
+	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
+
+	time.Sleep(50 * time.Millisecond) // let the goroutine start and take its baseline checksum
+
+	// Overwrite the file exactly once.
+	require.NoError(t, os.WriteFile(path, []byte("changed"), 0o644))
+
+	// Receive the expected reload request and report success.
+	select {
+	case errCh := <-reloadCh:
+		errCh <- nil
+	case <-time.After(300 * time.Millisecond):
+		t.Fatal("expected first reload not received")
+	}
+
+	// Wait 150ms (about five 30ms ticks); the file is unchanged, so nothing more should arrive.
+	time.Sleep(150 * time.Millisecond)
+
+	require.Empty(t, reloadCh, "no second reload expected after successful reload of same content")
+}
+
+func TestRunConfigWatcher_RetriesAfterFailedReload(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))
+
+	reloadCh := make(chan chan error, 2)
+	ctx := t.Context()
+
+	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
+
+	time.Sleep(50 * time.Millisecond)
+
+	require.NoError(t, os.WriteFile(path, []byte("invalid-config"), 0o644))
+
+	// First request: answer with an error, as a rejected configuration would.
+	select {
+	case errCh := <-reloadCh:
+		errCh <- fmt.Errorf("simulated parse error")
+	case <-time.After(300 * time.Millisecond):
+		t.Fatal("expected first reload attempt not received")
+	}
+
+	// Second request: the watcher keeps its old checksum after a failure, so a later tick must ask again.
+	select {
+	case errCh := <-reloadCh:
+		errCh <- nil // This attempt is reported as successful.
+	case <-time.After(300 * time.Millisecond):
+		t.Fatal("expected retry reload attempt not received after failed reload")
+	}
+}
+
+func TestRunConfigWatcher_HandlesUnreadableFile(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))
+
+	reloadCh := make(chan chan error, 1)
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
+
+	time.Sleep(50 * time.Millisecond)
+
+	// Delete the file so that every later checksum read fails.
+	require.NoError(t, os.Remove(path))
+
+	// A failed read only logs a warning; nothing must be sent on reloadCh before the context expires.
+	<-ctx.Done()
+	require.Empty(t, reloadCh, "no reload expected when file is unreadable")
+}
+
+func TestRunConfigWatcher_ExitsWhenContextCancelled(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "alertmanager.yml")
+	require.NoError(t, os.WriteFile(path, []byte("content"), 0o644))
+
+	reloadCh := make(chan chan error, 1)
+	ctx, cancel := context.WithCancel(context.Background())
+
+	done := make(chan struct{})
+	go func() {
+		runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
+		close(done)
+	}()
+
+	cancel() // Cancel right away, possibly before the watcher's first tick.
+
+	select {
+	case <-done:
+		// runConfigWatcher returned, so the goroutine did not leak.
+	case <-time.After(500 * time.Millisecond):
+		t.Fatal("watcher goroutine did not exit after context cancellation")
+	}
+}
diff --git a/docs/configuration.md b/docs/configuration.md
index 72cce8c2df..f331581262 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -20,6 +20,22 @@ is not well-formed, the changes will not be applied and an error is logged.
 A configuration reload is triggered by sending a `SIGHUP` to the process or
 sending an HTTP POST request to the `/-/reload` endpoint.
 
+## Auto-reload
+
+To have Alertmanager automatically reload its configuration when the file
+changes on disk, set `--config.auto-reload-interval` to a positive duration.
+Alertmanager then checksums the file named by `--config.file` at that interval
+and reloads when the content differs. The feature is off by default (`0s`).
+
+```bash
+alertmanager --config.file=alertmanager.yml --config.auto-reload-interval=30s
+```
+
+This is especially useful in Kubernetes, where ConfigMaps and Secrets are
+updated through mounted files. The reload follows the same code path as
+`SIGHUP` and `POST /-/reload`, so an invalid configuration is rejected and
+logged without affecting the running one; the reload is retried every interval.
+
 ## Limits
 
 Alertmanager supports a number of configurable limits via command-line flags.
diff --git a/docs/management_api.md b/docs/management_api.md
index 23f8e9af45..0b72413b01 100644
--- a/docs/management_api.md
+++ b/docs/management_api.md
@@ -35,3 +35,7 @@ POST /-/reload
 This endpoint triggers a reload of the Alertmanager configuration file.
 
 An alternative way to trigger a configuration reload is by sending a `SIGHUP` to the Alertmanager process.
+
+Configuration can also be reloaded automatically on file change using the
+`--config.auto-reload-interval` flag. See the
+[configuration documentation](configuration.md) for details.