File tree Expand file tree Collapse file tree
src/main/java/com/cloud/agent/properties
scripts/vm/hypervisor/kvm Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -310,6 +310,22 @@ iscsi.session.cleanup.enabled=false
310310# This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
311311# reboot.host.and.alert.management.on.heartbeat.timeout=true
312312
313+ # Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
314+ # write fails persistently. Supersedes the legacy boolean setting
315+ # 'reboot.host.and.alert.management.on.heartbeat.timeout' when set to a non-default value.
316+ #
317+ # Allowed values:
318+ # reboot - immediate sysrq-trigger reboot (default; original behavior; also applied to any unrecognized value)
319+ # graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly
320+ # restart-agent - restart cloudstack-agent only; running VMs are preserved
321+ # log-only - log + alert; take no automatic action (admin must investigate)
322+ #
323+ # The 'graceful-reboot', 'restart-agent', and 'log-only' actions are recommended
324+ # for setups using LINSTOR/DRBD or any local storage with replication, where
325+ # transient I/O contention can cause a heartbeat write to time out without the
326+ # host actually being unhealthy.
327+ # kvm.heartbeat.fence.action=reboot
328+
313329# Enables manually setting CPU's topology on KVM's VM.
314330# enable.manually.setting.cpu.topology.on.kvm.vm=true
315331
Original file line number Diff line number Diff line change @@ -598,6 +598,25 @@ public class AgentProperties{
598598 public static final Property <Boolean > REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
599599 = new Property <>("reboot.host.and.alert.management.on.heartbeat.timeout" , true );
600600
601+ /**
602+ * Action taken by the KVM agent's storage heartbeat scripts (kvmheartbeat.sh / kvmspheartbeat.sh)
603+ * when a heartbeat write fails persistently. Allowed values:
604+ * <ul>
605+ * <li>{@code reboot} (default) — immediate sysrq-trigger reboot; original behavior; also applied to any unrecognized value</li>
606+ * <li>{@code graceful-reboot} — {@code systemctl reboot} instead of sysrq, lets VMs stop cleanly</li>
607+ * <li>{@code restart-agent} — restart cloudstack-agent only; running VMs preserved</li>
608+ * <li>{@code log-only} — log + alert, no automatic action</li>
609+ * </ul>
610+ * The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
611+ * local storage, where transient I/O contention can cause a heartbeat write to time out
612+ * without the host actually being unhealthy.<br>
613+ * Read by the heartbeat shell scripts directly from agent.properties.<br>
614+ * Data type: String.<br>
615+ * Default value: {@code reboot}
616+ */
617+ public static final Property <String > KVM_HEARTBEAT_FENCE_ACTION
618+ = new Property <>("kvm.heartbeat.fence.action" , "reboot" );
619+
601620 /**
602621 * Enables manually setting CPU's topology on KVM's VM. <br>
603622 * Data type: Boolean.<br>
Original file line number Diff line number Diff line change @@ -156,11 +156,43 @@ then
156156 exit 0
157157elif [ " $cflag " == " 1" ]
158158then
159- /usr/bin/logger -t heartbeat " kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
160- sync &
161- sleep 5
162- echo b > /proc/sysrq-trigger
163- exit $?
159+ # Read fence action from agent.properties (default: reboot for backward compatibility).
160+ # Allowed values: reboot | graceful-reboot | restart-agent | log-only (any other value falls back to reboot)
161+ AGENT_PROPS=" /etc/cloudstack/agent/agent.properties"
162+ FENCE_ACTION=" reboot"
163+ if [ -r " $AGENT_PROPS " ]; then
164+ val=$( grep -E ' ^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' " $AGENT_PROPS " | tail -n 1 | cut -d= -f2- | tr -d ' [:space:]' )
165+ [ -n " $val " ] && FENCE_ACTION=" $val "
166+ fi
167+
168+ case " $FENCE_ACTION " in
169+ log-only)
170+ /usr/bin/logger -t heartbeat " kvmheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
171+ exit 0
172+ ;;
173+ restart-agent)
174+ /usr/bin/logger -t heartbeat " kvmheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
175+ sync &
176+ sleep 2
177+ systemctl restart cloudstack-agent
178+ exit $?
179+ ;;
180+ graceful-reboot)
181+ /usr/bin/logger -t heartbeat " kvmheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
182+ sync &
183+ sleep 5
184+ systemctl reboot
185+ exit $?
186+ ;;
187+ reboot|* )
188+ # Original behavior: immediate kernel-level reboot via sysrq-trigger
189+ /usr/bin/logger -t heartbeat " kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
190+ sync &
191+ sleep 5
192+ echo b > /proc/sysrq-trigger
193+ exit $?
194+ ;;
195+ esac
164196else
165197 write_hbLog
166198 exit $?
Original file line number Diff line number Diff line change @@ -58,9 +58,41 @@ deleteVMs() {
5858
5959if [ " $cflag " == " 1" ]
6060then
61- /usr/bin/logger -t heartbeat " kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
62- sync &
63- sleep 5
64- echo b > /proc/sysrq-trigger
65- exit $?
61+ # Read fence action from agent.properties (default: reboot for backward compatibility).
62+ # Allowed values: reboot | graceful-reboot | restart-agent | log-only (any other value falls back to reboot)
63+ AGENT_PROPS=" /etc/cloudstack/agent/agent.properties"
64+ FENCE_ACTION=" reboot"
65+ if [ -r " $AGENT_PROPS " ]; then
66+ val=$( grep -E ' ^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' " $AGENT_PROPS " | tail -n 1 | cut -d= -f2- | tr -d ' [:space:]' )
67+ [ -n " $val " ] && FENCE_ACTION=" $val "
68+ fi
69+
70+ case " $FENCE_ACTION " in
71+ log-only)
72+ /usr/bin/logger -t heartbeat " kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
73+ exit 0
74+ ;;
75+ restart-agent)
76+ /usr/bin/logger -t heartbeat " kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
77+ sync &
78+ sleep 2
79+ systemctl restart cloudstack-agent
80+ exit $?
81+ ;;
82+ graceful-reboot)
83+ /usr/bin/logger -t heartbeat " kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
84+ sync &
85+ sleep 5
86+ systemctl reboot
87+ exit $?
88+ ;;
89+ reboot|* )
90+ # Original behavior: immediate kernel-level reboot via sysrq-trigger
91+ /usr/bin/logger -t heartbeat " kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
92+ sync &
93+ sleep 5
94+ echo b > /proc/sysrq-trigger
95+ exit $?
96+ ;;
97+ esac
6698fi
You can’t perform that action at this time.
0 commit comments