forked from zstackio/zstack
-
Notifications
You must be signed in to change notification settings - Fork 0
<fix>[host]: ZSTAC-85091 keep in-flight migrate dest mem reserve on recalc #4365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
zstack-robot-2
wants to merge
4
commits into
5.5.28
Choose a base branch
from
sync/jin.ma/fix/ZSTAC-85091
base: 5.5.28
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
adf5d9c
<fix>[host]: keep in-flight migrate mem reserve on recalc
MaJin1996 a1ec5e8
<fix>[host]: filter inflight reserve by dest host on recalc
MaJin1996 780d53c
<test>[host]: make recalc in-flight reserve IT deterministic
MaJin1996 571138c
<test>[host]: real-migrate IT for in-flight reserve recalc
MaJin1996 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
204 changes: 204 additions & 0 deletions
204
...stack/test/integration/kvm/capacity/RecalculateHostCapacityKeepInflightReserveCase.groovy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,204 @@ | ||
| package org.zstack.test.integration.kvm.capacity | ||
|
|
||
| import org.springframework.http.HttpEntity | ||
| import org.zstack.core.cloudbus.CloudBus | ||
| import org.zstack.core.db.Q | ||
| import org.zstack.header.allocator.HostAllocatorConstant | ||
| import org.zstack.header.allocator.HostCapacityVO | ||
| import org.zstack.header.allocator.HostCapacityVO_ | ||
| import org.zstack.header.host.RecalculateHostCapacityMsg | ||
| import org.zstack.kvm.KVMGlobalConfig | ||
| import org.zstack.sdk.HostInventory | ||
| import org.zstack.sdk.VmInstanceInventory | ||
| import org.zstack.test.integration.kvm.KvmTest | ||
| import org.zstack.testlib.EnvSpec | ||
| import org.zstack.testlib.SubCase | ||
| import org.zstack.utils.Utils | ||
| import org.zstack.utils.data.SizeUnit | ||
| import org.zstack.utils.logging.CLogger | ||
|
|
||
| import static org.zstack.kvm.KVMConstant.KVM_MIGRATE_VM_PATH | ||
|
|
||
| /** | ||
| * ZSTAC-85091: while a VM is live-migrating, the destination host already has | ||
| * its memory reserved (HostCapacityVO.availableMemory decremented) but the VM | ||
| * row still points at the source host. The periodic RecalculateHostCapacity | ||
| * recomputed availableMemory = total - sum(landed Running VMs), which erased | ||
| * the in-flight reservation and let the scheduler double-book the memory, | ||
| * causing OOM on big VMs. This case migrates a VM for real, and at the | ||
| * in-flight moment (intercepted on the KVM migrate agent path) triggers a | ||
| * recalculation and asserts the destination's available memory is not raised. | ||
| */ | ||
| class RecalculateHostCapacityKeepInflightReserveCase extends SubCase { | ||
| private static final CLogger logger = Utils.getLogger(RecalculateHostCapacityKeepInflightReserveCase.class) | ||
|
|
||
| EnvSpec env | ||
| CloudBus bus | ||
|
|
||
| // Values captured inside the migrate simulator hook. They start at -1 so the | ||
| // test body can tell that the hook never ran (it asserts destAvailInflight >= 0). | ||
| long destTotal = -1 | ||
| long destAvailInflight = -1 | ||
| long destAvailAfterRecalc = -1 | ||
|
|
||
| /** Selects {@code KvmTest.springSpec} as the Spring configuration for this case. */ | ||
| @Override | ||
| void setup() { | ||
| useSpring(KvmTest.springSpec) | ||
| } | ||
|
|
||
| /** | ||
| * Declares the environment for this case: one instance offering (8 gigabytes of | ||
| * memory, 4 CPUs), one 20-gigabyte disk offering, an sftp backup storage holding | ||
| * a single image, and a zone whose KVM cluster contains two hosts, "src" and | ||
| * "dst", each declared with 64 gigabytes of total memory. The cluster is attached | ||
| * to an NFS primary storage and an L2 network with one L3 network. A single VM | ||
| * named "vm" is created on host "src". | ||
| */ | ||
| @Override | ||
| void environment() { | ||
| env = env { | ||
| instanceOffering { | ||
| name = "instanceOffering" | ||
| memory = SizeUnit.GIGABYTE.toByte(8) | ||
| cpu = 4 | ||
| } | ||
|
|
||
| diskOffering { | ||
| name = "diskOffering" | ||
| diskSize = SizeUnit.GIGABYTE.toByte(20) | ||
| } | ||
|
|
||
| sftpBackupStorage { | ||
| name = "sftp" | ||
| url = "/sftp" | ||
| username = "root" | ||
| password = "password" | ||
| hostname = "localhost" | ||
|
|
||
| image { | ||
| name = "image" | ||
| url = "http://zstack.org/download/test.qcow2" | ||
| } | ||
| } | ||
|
|
||
| zone { | ||
| name = "zone" | ||
| cluster { | ||
| name = "cluster" | ||
| hypervisorType = "KVM" | ||
|
|
||
| // Two hosts with the same declared total memory: the VM is placed on | ||
| // "src" (see useHost below) and the test migrates it to "dst". | ||
| kvm { | ||
| name = "src" | ||
| managementIp = "127.0.0.1" | ||
| username = "root" | ||
| password = "password" | ||
| totalMem = SizeUnit.GIGABYTE.toByte(64) | ||
| } | ||
| kvm { | ||
| name = "dst" | ||
| managementIp = "127.0.0.2" | ||
| username = "root" | ||
| password = "password" | ||
| totalMem = SizeUnit.GIGABYTE.toByte(64) | ||
| } | ||
|
|
||
| attachPrimaryStorage("nfs") | ||
| attachL2Network("l2") | ||
| } | ||
|
|
||
| nfsPrimaryStorage { | ||
| name = "nfs" | ||
| url = "localhost:/nfs_ps" | ||
| } | ||
|
|
||
| l2NoVlanNetwork { | ||
| name = "l2" | ||
| physicalInterface = "eth0" | ||
|
|
||
| l3Network { | ||
| name = "l3" | ||
| ip { | ||
| startIp = "192.168.100.10" | ||
| endIp = "192.168.100.100" | ||
| netmask = "255.255.255.0" | ||
| gateway = "192.168.100.1" | ||
| } | ||
| } | ||
| } | ||
|
|
||
| attachBackupStorage("sftp") | ||
| } | ||
|
|
||
| // The only VM in the environment; it starts on "src". | ||
| vm { | ||
| name = "vm" | ||
| useInstanceOffering("instanceOffering") | ||
| useImage("image") | ||
| useL3Networks("l3") | ||
| useRootDiskOffering("diskOffering") | ||
| useHost("src") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Creates the declared environment, looks up the CloudBus bean used to send the | ||
| * recalculation message, and runs the single scenario of this case. | ||
| */ | ||
| @Override | ||
| void test() { | ||
| env.create { | ||
| bus = bean(CloudBus.class) | ||
| recalcMustKeepInflightMigrateReserve() | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Migrates "vm" to "dst" and, from inside the KVM migrate simulator hook, records | ||
| * the destination host's total and available memory, sends a | ||
| * RecalculateHostCapacityMsg for that host, and records the available memory again. | ||
| * | ||
| * The case passes only when the hook fired, the in-flight reservation was visible | ||
| * at the hook (available differs from total), and the available memory read after | ||
| * the recalculation is neither below the reserved value nor back up at the total. | ||
| */ | ||
| void recalcMustKeepInflightMigrateReserve() { | ||
| VmInstanceInventory vm = env.inventoryByName("vm") as VmInstanceInventory | ||
| HostInventory dst = env.inventoryByName("dst") as HostInventory | ||
|
|
||
| KVMGlobalConfig.MIGRATE_AUTO_CONVERGE.updateValue(false) | ||
|
|
||
| // At this point in the migrate workflow the VM is already Migrating and | ||
| // the destination host capacity has been reserved. Capture the reserved | ||
| // availability, then run a recalculation and capture the result. | ||
| env.afterSimulator(KVM_MIGRATE_VM_PATH) { rsp, HttpEntity<String> entity -> | ||
| destTotal = capValue(dst.uuid, HostCapacityVO_.totalMemory) | ||
| destAvailInflight = capValue(dst.uuid, HostCapacityVO_.availableMemory) | ||
|
|
||
| // RecalculateHostCapacityMsg has no reply handler (internal periodic | ||
| // task, no SDK Action), so it is sent asynchronously rather than with a | ||
| // blocking call that would wait for a reply that never comes. | ||
| RecalculateHostCapacityMsg msg = new RecalculateHostCapacityMsg() | ||
| msg.setHostUuid(dst.uuid) | ||
| bus.makeLocalServiceId(msg, HostAllocatorConstant.SERVICE_ID) | ||
| bus.send(msg) | ||
|
|
||
| // NOTE(review): the ">= 0" check presumably holds on the very first read, | ||
| // so this block does not by itself wait for the recalculation to finish. | ||
| // Confirm the message has been handled by the time the value is sampled. | ||
| retryInSecs(10, 1) { | ||
| destAvailAfterRecalc = capValue(dst.uuid, HostCapacityVO_.availableMemory) | ||
| assert destAvailAfterRecalc >= 0 | ||
| } | ||
| return rsp | ||
| } | ||
|
|
||
| migrateVm { | ||
| vmInstanceUuid = vm.uuid | ||
| hostUuid = dst.uuid | ||
| } | ||
|
|
||
| logger.warn(String.format("ZSTAC-85091 in-flight capture: destTotal=%d destAvailInflight=%d destAvailAfterRecalc=%d", | ||
| destTotal, destAvailInflight, destAvailAfterRecalc)) | ||
|
|
||
| assert destAvailInflight >= 0 : "migrate simulator hook never fired, captured nothing" | ||
|
|
||
| // available == total at the capture means the reservation was not visible at | ||
| // this hook point, so the checks below would prove nothing. Fail with a clear | ||
| // message instead of returning early: a silent return lets this regression | ||
| // case pass without verifying the behavior it exists to protect. | ||
| assert destAvailInflight != destTotal : \ | ||
| "migrate reservation not visible at in-flight capture: " + | ||
| "available=${destAvailInflight} == total=${destTotal}; cannot verify that recalc keeps it" | ||
|
|
||
| assert destAvailAfterRecalc >= destAvailInflight : \ | ||
| "recalc lowered available below reserved: after=${destAvailAfterRecalc} reserved=${destAvailInflight}" | ||
| assert destAvailAfterRecalc < destTotal : \ | ||
| "RecalculateHostCapacity erased in-flight migrate reservation on dest host: " + | ||
| "availableAfterRecalc=${destAvailAfterRecalc} == total=${destTotal}, " + | ||
| "reservedInflight=${destAvailInflight} (ZSTAC-85091 OOM root cause)" | ||
| } | ||
|
|
||
| /** | ||
| * Reads a single column of the HostCapacityVO row whose uuid equals hostUuid. | ||
| * | ||
| * @param hostUuid uuid of the host whose capacity row is queried | ||
| * @param field the HostCapacityVO_ attribute to select | ||
| * @return the selected value, converted to long | ||
| */ | ||
| private long capValue(String hostUuid, def field) { | ||
| def hostCapacityQuery = Q.New(HostCapacityVO.class).eq(HostCapacityVO_.uuid, hostUuid) | ||
| def selected = hostCapacityQuery.select(field).findValue() | ||
| return selected | ||
| } | ||
|
|
||
| /** Deletes the environment created for this case. */ | ||
| @Override | ||
| void clean() { | ||
| env.delete() | ||
| } | ||
| } |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🎯 Functional Correctness | 🟠 Major | 🏗️ Heavy lift
不要用“任意历史记录存在”来判断当前迁移目的主机。
`VmSchedHistoryVO` 是历史表，这里的 `isExists()` 只要命中某个 Migrating VM 过去留下的一条 `destHostUuid = hostUuid` 记录，就会把该 host 当成当前目的地。VM 发生过多次迁移时，旧记录仍会命中，导致当前并非目的地的 host 也被持续冻结，和这次修复“只冻结实际目的主机”的目标相反。这里至少要收敛到该 VM 最新的一条调度历史，或当前迁移 attempt 对应的记录。
🤖 Prompt for AI Agents