Skip to content

[AMDGPU] Support for nested add in GVS pattern matching #186910

Open
shiltian wants to merge 1 commit into main from
users/shiltian/support-for-nested-gvs-pattern
Open

[AMDGPU] Support for nested add in GVS pattern matching #186910
shiltian wants to merge 1 commit into main from
users/shiltian/support-for-nested-gvs-pattern

Conversation

@shiltian
Copy link
Contributor

@shiltian shiltian commented Mar 16, 2026

This pattern appears to be pretty common after the straight-line strength reduction pass.

Fixes ROCM-20181.

@shiltian
Copy link
Contributor Author

shiltian commented Mar 16, 2026

This stack of pull requests is managed by sgh.

@llvmbot
Copy link
Member

llvmbot commented Mar 16, 2026

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

Fixes ROCM-20181.


Patch is 485.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/186910.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+108)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+135)
  • (modified) llvm/test/CodeGen/AMDGPU/acc-ldst.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll (+41-15)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll (+16-21)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+504-480)
  • (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+1140-2682)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index cc2058a5a1d4a..11d48cd1ed811 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2056,6 +2056,114 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
       }
     }
 
+    // Try to fold an outer offset into a nested i32 voffset:
+    //   add(add(sgpr, ext(i32 vgpr)), const64)
+    //     -> base = sgpr, voffset = add(i32_vgpr, const32)
+    //   add(add(sgpr, ext(i32 vgpr_a)), ext(i32 vgpr_b))
+    //     -> base = sgpr, voffset = add(i32_vgpr_a, i32_vgpr_b)
+    // Valid only when the i32 addition provably does not overflow.
+    if (!SAddr) {
+      bool IsSigned = Subtarget->hasSignedGVSOffset();
+
+      for (unsigned I = 0; I < 2 && !SAddr; ++I) {
+        SDValue OuterOp = Addr.getOperand(I);
+        SDValue InnerAddr = Addr.getOperand(1 - I);
+
+        if (!InnerAddr->isAnyAdd())
+          continue;
+
+        SDValue OuterI32;
+        KnownBits OuterKnown(32);
+        int64_t SplitImmOffset = 0;
+        int64_t ConstVal = 0;
+        bool IsConst = false;
+
+        if (auto *C = dyn_cast<ConstantSDNode>(OuterOp)) {
+          int64_t OuterConst = C->getSExtValue();
+          if (OuterConst <= 0)
+            continue;
+
+          ConstVal = OuterConst;
+          if (NeedIOffset) {
+            const SIInstrInfo *TII = Subtarget->getInstrInfo();
+            std::tie(SplitImmOffset, ConstVal) = TII->splitFlatOffset(
+                OuterConst, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+          }
+
+          if (!(IsSigned ? isInt<32>(ConstVal) : isUInt<32>(ConstVal)))
+            continue;
+
+          IsConst = true;
+          OuterKnown = KnownBits::makeConstant(APInt(32, ConstVal));
+        } else {
+          OuterI32 = matchExtFromI32orI32(OuterOp, IsSigned, CurDAG);
+          if (!OuterI32)
+            continue;
+          OuterKnown = CurDAG->computeKnownBits(OuterI32);
+        }
+
+        for (unsigned J = 0; J < 2; ++J) {
+          SDValue MaybeBase = InnerAddr.getOperand(J);
+          SDValue MaybeExt = InnerAddr.getOperand(1 - J);
+
+          if (MaybeBase->isDivergent())
+            continue;
+
+          SDValue InnerI32 = matchExtFromI32orI32(MaybeExt, IsSigned, CurDAG);
+          if (!InnerI32)
+            continue;
+
+          KnownBits InnerKnown = CurDAG->computeKnownBits(InnerI32);
+
+          bool NoOverflow = false;
+          if (IsSigned) {
+            bool MinOF = false;
+            bool MaxOF = false;
+            (void)InnerKnown.getSignedMinValue().sadd_ov(
+                OuterKnown.getSignedMinValue(), MinOF);
+            (void)InnerKnown.getSignedMaxValue().sadd_ov(
+                OuterKnown.getSignedMaxValue(), MaxOF);
+            NoOverflow = !MinOF && !MaxOF;
+          } else {
+            bool OF = false;
+            (void)InnerKnown.getMaxValue().uadd_ov(OuterKnown.getMaxValue(),
+                                                   OF);
+            NoOverflow = !OF;
+          }
+
+          if (!NoOverflow)
+            continue;
+
+          SDLoc SL(N);
+          SAddr = MaybeBase;
+
+          if (IsConst && ConstVal == 0) {
+            VOffset = InnerI32;
+          } else {
+            SDValue AddOp =
+                IsConst ? getMaterializedScalarImm32(ConstVal, SL) : OuterI32;
+            if (Subtarget->hasAddNoCarryInsts()) {
+              SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+              VOffset = SDValue(
+                  CurDAG->getMachineNode(AMDGPU::V_ADD_U32_e64, SL, MVT::i32,
+                                         {InnerI32, AddOp, Clamp}),
+                  0);
+            } else {
+              SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+              SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+              VOffset =
+                  SDValue(CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, SL,
+                                                 VTs, {InnerI32, AddOp, Clamp}),
+                          0);
+            }
+          }
+
+          ImmOffset = SplitImmOffset;
+          break;
+        }
+      }
+    }
+
     if (SAddr) {
       Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
       return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 81e224355411b..89e7aa0a0a323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5978,8 +5978,143 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
                  }}};
       }
     }
+
+    // Try to fold an outer offset into a nested i32 voffset:
+    //   G_PTR_ADD(G_PTR_ADD(sgpr, ext(i32 vgpr)), const64)
+    //     -> base = sgpr, voffset = add(i32_vgpr, const32)
+    //   G_PTR_ADD(G_PTR_ADD(sgpr, ext(i32 vgpr_a)), ext(i32 vgpr_b))
+    //     -> base = sgpr, voffset = add(i32_vgpr_a, i32_vgpr_b)
+    // Valid only when the i32 addition provably does not overflow.
+    {
+      bool IsSigned = Subtarget->hasSignedGVSOffset();
+      Register OuterOffset = AddrDef->MI->getOperand(2).getReg();
+      Register InnerPtrAdd =
+          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+      auto InnerDef = getDefSrcRegIgnoringCopies(InnerPtrAdd, *MRI);
+
+      if (InnerDef && InnerDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+        Register MaybeBase =
+            getSrcRegIgnoringCopies(InnerDef->MI->getOperand(1).getReg(), *MRI);
+        Register InnerOffset = InnerDef->MI->getOperand(2).getReg();
+
+        if (isSGPR(MaybeBase)) {
+          if (Register InnerI32 =
+                  matchExtendFromS32OrS32(InnerOffset, IsSigned)) {
+            Register OuterI32;
+            KnownBits OuterKnown(32);
+            int64_t SplitImmOffset = 0;
+            int64_t ConstVal = 0;
+            bool IsConst = false;
+            bool OuterMatched = false;
+
+            auto OuterConst =
+                getIConstantVRegValWithLookThrough(OuterOffset, *MRI);
+            if (OuterConst && OuterConst->Value.getSExtValue() > 0) {
+              int64_t OuterConstVal = OuterConst->Value.getSExtValue();
+              ConstVal = OuterConstVal;
+              if (NeedIOffset) {
+                std::tie(SplitImmOffset, ConstVal) =
+                    TII.splitFlatOffset(OuterConstVal, AMDGPUAS::GLOBAL_ADDRESS,
+                                        SIInstrFlags::FlatGlobal);
+              }
+
+              if (IsSigned ? isInt<32>(ConstVal) : isUInt<32>(ConstVal)) {
+                IsConst = true;
+                OuterKnown = KnownBits::makeConstant(APInt(32, ConstVal));
+                OuterMatched = true;
+              }
+            } else if ((OuterI32 =
+                            matchExtendFromS32OrS32(OuterOffset, IsSigned))) {
+              OuterKnown = VT->getKnownBits(OuterI32);
+              OuterMatched = true;
+            }
+
+            if (OuterMatched) {
+              KnownBits InnerKnown = VT->getKnownBits(InnerI32);
+
+              bool NoOverflow = false;
+              if (IsSigned) {
+                bool MinOF, MaxOF;
+                (void)InnerKnown.getSignedMinValue().sadd_ov(
+                    OuterKnown.getSignedMinValue(), MinOF);
+                (void)InnerKnown.getSignedMaxValue().sadd_ov(
+                    OuterKnown.getSignedMaxValue(), MaxOF);
+                NoOverflow = !MinOF && !MaxOF;
+              } else {
+                bool OF;
+                (void)InnerKnown.getMaxValue().uadd_ov(OuterKnown.getMaxValue(),
+                                                       OF);
+                NoOverflow = !OF;
+              }
+
+              if (NoOverflow) {
+                MachineInstr *MI = Root.getParent();
+                MachineBasicBlock *MBB = MI->getParent();
+                const DebugLoc &DL = MI->getDebugLoc();
+
+                Register VOffsetReg;
+                if (IsConst && ConstVal == 0) {
+                  VOffsetReg = InnerI32;
+                } else {
+                  Register AddOpReg;
+                  if (IsConst) {
+                    AddOpReg =
+                        MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32),
+                            AddOpReg)
+                        .addImm(ConstVal);
+                  } else {
+                    AddOpReg = OuterI32;
+                  }
+
+                  VOffsetReg =
+                      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+                  if (STI.hasAddNoCarryInsts()) {
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ADD_U32_e64),
+                            VOffsetReg)
+                        .addReg(InnerI32)
+                        .addReg(AddOpReg)
+                        .addImm(0);
+                  } else {
+                    Register UnusedCarry =
+                        MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64),
+                            VOffsetReg)
+                        .addDef(UnusedCarry, RegState::Dead)
+                        .addReg(InnerI32)
+                        .addReg(AddOpReg)
+                        .addImm(0);
+                  }
+                }
+
+                if (NeedIOffset)
+                  return {{
+                      [=](MachineInstrBuilder &MIB) { MIB.addReg(MaybeBase); },
+                      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffsetReg); },
+                      [=](MachineInstrBuilder &MIB) {
+                        MIB.addImm(SplitImmOffset);
+                      },
+                      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+                  }};
+                return {{
+                    [=](MachineInstrBuilder &MIB) { MIB.addReg(MaybeBase); },
+                    [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffsetReg); },
+                    [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+                }};
+              }
+            }
+          }
+        }
+      }
+    }
   }
 
+  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+  // drop this.
+  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
+    return std::nullopt;
+
   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
   // drop this.
   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 4258d1d4bd874..7f3603435d147 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -63,7 +63,7 @@ bb:
 ; GCN-LABEL: {{^}}test_load_store:
 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-NOT:     v_accvgpr
-; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_load_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -171,7 +171,7 @@ bb:
 ; GCN-NOT:     v_accvgpr_write
 ; GCN:         v_mfma_f32_32x32x1f32
 ; GCN-NOT:     v_accvgpr_read
-; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN-COUNT-16: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
index bf7cce9877f86..407940631edc0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
@@ -10,14 +10,14 @@ declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr,
 define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -32,7 +32,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b8_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -45,14 +45,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -67,7 +67,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -80,14 +80,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -102,7 +102,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -115,14 +115,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -137,7 +137,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -150,7 +150,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -163,7 +163,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -176,7 +176,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_...
[truncated]

@shiltian shiltian force-pushed the users/shiltian/support-for-nested-gvs-pattern branch from 106c22e to 53a5c83 Compare March 16, 2026 23:01
@github-actions
Copy link

github-actions bot commented Mar 16, 2026

🐧 Linux x64 Test Results

  • 192289 tests passed
  • 4931 tests skipped

✅ The build succeeded and all tests passed.

@github-actions
Copy link

github-actions bot commented Mar 16, 2026

🪟 Windows x64 Test Results

  • 132363 tests passed
  • 3009 tests skipped

✅ The build succeeded and all tests passed.

@shiltian shiltian force-pushed the users/shiltian/support-for-nested-gvs-pattern branch 2 times, most recently from cc3eded to 060079d Compare March 17, 2026 00:53
@shiltian shiltian force-pushed the users/shiltian/support-for-nested-gvs-pattern branch from 060079d to 044d40d Compare March 17, 2026 00:55
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants