Wrapper for Blocksparse CuTensor code by kmp5VT · Pull Request #3057 · JuliaGPU/CUDA.jl

kmp5VT · 2026-03-16T21:27:43Z

Hi,

This PR adds a wrapper type and functions for accessing the newly introduced block-sparse cuTENSOR backend. Right now the code is expert-level, i.e. users need to either write a type that converts their object to a CuTensorBS, or provide the low-level operations required by the cuTENSOR kernels themselves. I am still writing a test, but the code is fully operational.

Thanks,
Karl

…code

…or support now

…ensor

… to make it a union type of CuTensorBS and AbstractArray?

github-actions · 2026-03-16T21:28:21Z

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.

diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make 
+## For now call contract in ITensor and rely on UnallocatedArrays to make
 ## C in a dry-run of the contraction.
 # function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
 #     tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
 using LinearAlgebra
 
 function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
-   contract!(α, 
-            A, A.inds, CUTENSOR_OP_IDENTITY,
-            B, B.inds, CUTENSOR_OP_IDENTITY, 
-            β,
-            C, C.inds, CUTENSOR_OP_IDENTITY, 
-            CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
-   return C
-end
\ No newline at end of file
+    contract!(
+        α,
+        A, A.inds, CUTENSOR_OP_IDENTITY,
+        B, B.inds, CUTENSOR_OP_IDENTITY,
+        β,
+        C, C.inds, CUTENSOR_OP_IDENTITY,
+        CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+    )
+    return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
         @nospecialize(beta::Number),
         @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
         opOut::cutensorOperator_t;
-        jit::cutensorJitMode_t=JIT_MODE_NONE,
-        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
-        algo::cutensorAlgo_t=ALGO_DEFAULT,
-        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
-        plan::Union{CuTensorPlan, Nothing}=nothing)
+        jit::cutensorJitMode_t = JIT_MODE_NONE,
+        workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+        algo::cutensorAlgo_t = ALGO_DEFAULT,
+        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+        plan::Union{CuTensorPlan, Nothing} = nothing
+    )
 
     actual_plan = if plan === nothing
-        plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
-                         jit, workspace, algo, compute_type)
+        plan_contraction(
+            A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+            jit, workspace, algo, compute_type
+        )
     else
         plan
     end
 
     contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-    
+
     if plan === nothing
-    CUDA.unsafe_free!(actual_plan)
+        CUDA.unsafe_free!(actual_plan)
     end
 
     return C
@@ -33,12 +36,14 @@ end
 
 ## This function assumes A, B, and C are Arrays of pointers to CuArrays.
 ## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
-                   @nospecialize(alpha::Number),
-                   @nospecialize(A::AbstractArray),
-                   @nospecialize(B::AbstractArray),
-                   @nospecialize(beta::Number),
-                   @nospecialize(C::AbstractArray))
+function contractBS!(
+        plan::CuTensorPlan,
+        @nospecialize(alpha::Number),
+        @nospecialize(A::AbstractArray),
+        @nospecialize(B::AbstractArray),
+        @nospecialize(beta::Number),
+        @nospecialize(C::AbstractArray)
+    )
     scalar_type = plan.scalar_type
 
     # Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
     A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
     B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
     C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-    
-    cutensorBlockSparseContract(handle(), plan, 
-                                            Ref{scalar_type}(alpha), A_ptrs, B_ptrs, 
-                                            Ref{scalar_type}(beta),  C_ptrs, C_ptrs, 
-                                            plan.workspace, sizeof(plan.workspace), stream())
+
+    cutensorBlockSparseContract(
+        handle(), plan,
+        Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+        Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+        plan.workspace, sizeof(plan.workspace), stream()
+    )
     synchronize(stream())
     return C
 end
@@ -60,21 +67,22 @@ function plan_contraction(
         @nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
         @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
         opOut::cutensorOperator_t;
-        jit::cutensorJitMode_t=JIT_MODE_NONE,
-        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
-        algo::cutensorAlgo_t=ALGO_DEFAULT,
-        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+        jit::cutensorJitMode_t = JIT_MODE_NONE,
+        workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+        algo::cutensorAlgo_t = ALGO_DEFAULT,
+        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+    )
 
     !is_unary(opA)    && throw(ArgumentError("opA must be a unary op!"))
     !is_unary(opB)    && throw(ArgumentError("opB must be a unary op!"))
     !is_unary(opC)    && throw(ArgumentError("opC must be a unary op!"))
     !is_unary(opOut)  && throw(ArgumentError("opOut must be a unary op!"))
-    
+
     descA = CuTensorBSDescriptor(A)
     descB = CuTensorBSDescriptor(B)
     descC = CuTensorBSDescriptor(C)
     # for now, D must be identical to C (and thus, descD must be identical to descC)
-    
+
     modeA = collect(Cint, Ainds)
     modeB = collect(Cint, Binds)
     modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
 
 
     desc = Ref{cutensorOperationDescriptor_t}()
-    cutensorCreateBlockSparseContraction(handle(),
-    desc, 
-    descA, modeA, opA,
-    descB, modeB, opB,
-    descC, modeC, opC,
-    descC, modeC, actual_compute_type)
+    cutensorCreateBlockSparseContraction(
+        handle(),
+        desc,
+        descA, modeA, opA,
+        descB, modeB, opB,
+        descC, modeC, opC,
+        descC, modeC, actual_compute_type
+    )
 
     plan_pref = Ref{cutensorPlanPreference_t}()
     cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
 
-    plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+    plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
     # cutensorDestroyOperationDescriptor(desc[])
     cutensorDestroyPlanPreference(plan_pref[])
     return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
     ## This expects a Vector{Tuple(Int)} right now
     nonzero_block_coords
 
-    function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray}, 
-        blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+    function CuTensorBS{T, N}(
+            nonzero_data::Vector{<:CuArray},
+            blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+        ) where {T <: Number, N}
         CuArrayT = eltype(nonzero_data)
         @assert eltype(CuArrayT) == T
         # @assert ndims(CuArrayT) == N
         @assert length(block_extents) == N
-        new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+        return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
     end
 end
 
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}}, 
-    blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
-    CuTensorBS{T,length(block_extents)}(nonzero_data, 
-    blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+        nonzero_data::Vector{<:CuArray{T}},
+        blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+    ) where {T <: Number}
+    return CuTensorBS{T, length(block_extents)}(
+        nonzero_data,
+        blocks_per_mode, block_extents, nonzero_block_coords, inds
+    )
 end
 # array interface
 function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
 Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
 
 function block_extents(T::CuTensorBS)
-    extents = Vector{Int64}() 
-    
+    extents = Vector{Int64}()
+
     for ex in T.block_extents
         extents = vcat(extents, ex...)
     end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
     handle::cutensorBlockSparseTensorDescriptor_t
     # inner constructor handles creation and finalizer of the descriptor
     function CuTensorBSDescriptor(
-        numModes,
-        numNonZeroBlocks,
-        numSectionsPerMode,
-        extent,
-        nonZeroCoordinates,
-        stride,
-        eltype)
+            numModes,
+            numNonZeroBlocks,
+            numSectionsPerMode,
+            extent,
+            nonZeroCoordinates,
+            stride,
+            eltype
+        )
 
         desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
-        cutensorCreateBlockSparseTensorDescriptor(handle(), desc, 
-        numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
-        stride, eltype)
+        cutensorCreateBlockSparseTensorDescriptor(
+            handle(), desc,
+            numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+            stride, eltype
+        )
 
         obj = new(desc[])
         finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
 end
 
 function CuTensorBSDescriptor(
-    numModes,
-    numNonZeroBlocks,
-    numSectionsPerMode,
-    extent,
-    nonZeroCoordinates,
-    eltype)
+        numModes,
+        numNonZeroBlocks,
+        numSectionsPerMode,
+        extent,
+        nonZeroCoordinates,
+        eltype
+    )
 
     return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
 end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
 Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
 
 function unsafe_destroy!(obj::CuTensorBSDescriptor)
-    cutensorDestroyBlockSparseTensorDescriptor(obj)
+    return cutensorDestroyBlockSparseTensorDescriptor(obj)
 end
 
 ## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
     numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
     numSectionsPerMode = collect(Int32, A.blocks_per_mode)
     extent = block_extents(A)
-    nonZeroCoordinates =  Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+    nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
     st = strides(A)
-    dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+    dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
 
     ## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
-    CuTensorBSDescriptor(numModes, numNonZeroBlocks, 
-    numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+    return CuTensorBSDescriptor(
+        numModes, numNonZeroBlocks,
+        numSectionsPerMode, extent, nonZeroCoordinates, dataType
+    )
 end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index b33560b72..4e7ba168d 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
     @gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
                                                           plan::cutensorPlan_t,
                                                           alpha::Ptr{Cvoid},
-                                                          A::Ptr{CuPtr{Cvoid}},
-                                                          B::Ptr{CuPtr{Cvoid}},
+        A::Ptr{CuPtr{Cvoid}},
+        B::Ptr{CuPtr{Cvoid}},
                                                           beta::Ptr{Cvoid},
-                                                          C::Ptr{CuPtr{Cvoid}},
-                                                          D::Ptr{CuPtr{Cvoid}},
-                                                          workspace::CuPtr{Cvoid},
+        C::Ptr{CuPtr{Cvoid}},
+        D::Ptr{CuPtr{Cvoid}},
+        workspace::CuPtr{Cvoid},
                                                           workspaceSize::UInt64,
                                                           stream::cudaStream_t)::cutensorStatus_t
 end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
     end
 end
 
-eltypes_compact = [
-    (Float32, Float32, Float32, Float32),
-    (ComplexF32, ComplexF32, ComplexF32, Float32),
-     (Float64, Float64, Float64, Float64),
-     (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
-    ## There are many unsupported types because this is a new functionality
-    ## So I will test with Float32 and ComplexF32 only
-    @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
-        ## i = [20,20,25]
-        ## k = [10,10,15]
-        ## l = [30,30,35]
-        ## A = Tensor(k,i,l)
-        ## Nonzero blocks are 
-        ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
-        A = Vector{CuArray{eltyA, 3}}()
-        for k in [10,15]
-            for i in [20,25]
-                for l in [30,35]
-                    push!(A, CuArray(ones(eltyA, k,i,l)))
+    eltypes_compact = [
+        (Float32, Float32, Float32, Float32),
+        (ComplexF32, ComplexF32, ComplexF32, Float32),
+        (Float64, Float64, Float64, Float64),
+        (ComplexF64, ComplexF64, ComplexF64, Float64),
+    ]
+    @testset "Blocksparse Contraction" begin
+        ## There are many unsupported types because this is a new functionality
+        ## So I will test with Float32 and ComplexF32 only
+        @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+            ## i = [20,20,25]
+            ## k = [10,10,15]
+            ## l = [30,30,35]
+            ## A = Tensor(k,i,l)
+            ## Nonzero blocks are
+            ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+            A = Vector{CuArray{eltyA, 3}}()
+            for k in [10, 15]
+                for i in [20, 25]
+                    for l in [30, 35]
+                        push!(A, CuArray(ones(eltyA, k, i, l)))
+                    end
                 end
             end
-        end
 
-        ## B = Tensor(k,l)
-        ## Nonzero blocks are
-        ## [1,1], [2,3]
-        B = Array{CuArray{eltyB, 2}}(
-            [CuArray(randn(eltyB, 10, 30)),
-            CuArray(randn(eltyB, 10, 35))])
-
-        ## C = Tensor(i)
-        ## Nonzero blocks are 
-        ## [1,], [3,]
-        C = Vector{CuArray{eltyC, 1}}(
-            [CuArray(zeros(eltyC, 20)),
-            CuArray(zeros(eltyC, 25))]
-        )
-        
-        cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3], 
-        [(10,10,15), (20,20,25),  (30,30,35)], 
-        [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
-        [1,3,2])
-        cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
-        [(10,10,15), (30,30,35)],
-        [(1,1),(2,3)], [1,2], )
-        cuTenC = cuTENSOR.CuTensorBS(C, [3],
-        [(20,20,25)],[(1,),(3,)], [3])
-
-        mul!(cuTenC, cuTenA, cuTenB, 1, 0)
-        ## C[1] = A[1,1,1] * B[1,1]
-        @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
-        ## C[3] = A[1,3,1] * B[1,1]
-        @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+            ## B = Tensor(k,l)
+            ## Nonzero blocks are
+            ## [1,1], [2,3]
+            B = Array{CuArray{eltyB, 2}}(
+                [
+                    CuArray(randn(eltyB, 10, 30)),
+                    CuArray(randn(eltyB, 10, 35)),
+                ]
+            )
+
+            ## C = Tensor(i)
+            ## Nonzero blocks are
+            ## [1,], [3,]
+            C = Vector{CuArray{eltyC, 1}}(
+                [
+                    CuArray(zeros(eltyC, 20)),
+                    CuArray(zeros(eltyC, 25)),
+                ]
+            )
+
+            cuTenA = cuTENSOR.CuTensorBS(
+                A, [3, 3, 3],
+                [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+                [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+                [1, 3, 2]
+            )
+            cuTenB = cuTENSOR.CuTensorBS(
+                B, [3, 3],
+                [(10, 10, 15), (30, 30, 35)],
+                [(1, 1), (2, 3)], [1, 2],
+            )
+            cuTenC = cuTENSOR.CuTensorBS(
+                C, [3],
+                [(20, 20, 25)], [(1,), (3,)], [3]
+            )
+
+            mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+            ## C[1] = A[1,1,1] * B[1,1]
+            @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+            ## C[3] = A[1,3,1] * B[1,1]
+            @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+        end
     end
-end
 
 end

kmp5VT · 2026-03-16T21:33:27Z

There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I hit a runtime error when trying to convert arrays of CuArrays into Ptr{Ptr{Cvoid}}. I think this is because CUDA.jl does not expect an array of CuArrays, and so the Julia-side unsafe_convert failed. This is not yet ready to merge.

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

codecov · 2026-03-17T02:38:40Z

Codecov Report

❌ Patch coverage is 0% with 92 lines in your changes missing coverage. Please review.
✅ Project coverage is 16.41%. Comparing base (5a141fe) to head (680f7ff).
⚠️ Report is 1 commit behind head on master.

Files with missing lines	Patch %	Lines
lib/cutensor/src/blocksparse/types.jl	0.00%	50 Missing ⚠️
lib/cutensor/src/blocksparse/operations.jl	0.00%	39 Missing ⚠️
lib/cutensor/src/blocksparse/interfaces.jl	0.00%	3 Missing ⚠️

Additional details and impacted files

@@            Coverage Diff             @@
##           master    #3057      +/-   ##
==========================================
- Coverage   16.57%   16.41%   -0.16%     
==========================================
  Files         120      123       +3     
  Lines        9586     9678      +92     
==========================================
  Hits         1589     1589              
- Misses       7997     8089      +92

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:

❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

github-actions

CUDA.jl Benchmarks

Details

Benchmark suite	Current: `f6bd83a`	Previous: `22a3b2c`	Ratio
`array/accumulate/Float32/1d`	`100654` ns	`101179` ns	`0.99`
`array/accumulate/Float32/dims=1`	`76488.5` ns	`76332` ns	`1.00`
`array/accumulate/Float32/dims=1L`	`1585634.5` ns	`1585653` ns	`1.00`
`array/accumulate/Float32/dims=2`	`143807.5` ns	`143537.5` ns	`1.00`
`array/accumulate/Float32/dims=2L`	`657780` ns	`657611` ns	`1.00`
`array/accumulate/Int64/1d`	`118415` ns	`118480` ns	`1.00`
`array/accumulate/Int64/dims=1`	`80053` ns	`79696` ns	`1.00`
`array/accumulate/Int64/dims=1L`	`1694561` ns	`1706346` ns	`0.99`
`array/accumulate/Int64/dims=2`	`155872` ns	`156785` ns	`0.99`
`array/accumulate/Int64/dims=2L`	`962013` ns	`961500` ns	`1.00`
`array/broadcast`	`20151` ns	`20238` ns	`1.00`
`array/construct`	`1230.25` ns	`1254.35` ns	`0.98`
`array/copy`	`18017` ns	`18020` ns	`1.00`
`array/copyto!/cpu_to_gpu`	`214317` ns	`212483.5` ns	`1.01`
`array/copyto!/gpu_to_cpu`	`283467` ns	`281275` ns	`1.01`
`array/copyto!/gpu_to_gpu`	`10772` ns	`10671` ns	`1.01`
`array/iteration/findall/bool`	`135454` ns	`134806` ns	`1.00`
`array/iteration/findall/int`	`150788` ns	`149640` ns	`1.01`
`array/iteration/findfirst/bool`	`81312` ns	`81104` ns	`1.00`
`array/iteration/findfirst/int`	`83536.5` ns	`83006` ns	`1.01`
`array/iteration/findmin/1d`	`87117` ns	`83919` ns	`1.04`
`array/iteration/findmin/2d`	`117060` ns	`116144` ns	`1.01`
`array/iteration/logical`	`199068.5` ns	`197753` ns	`1.01`
`array/iteration/scalar`	`66508` ns	`67066` ns	`0.99`
`array/permutedims/2d`	`52271` ns	`52114` ns	`1.00`
`array/permutedims/3d`	`52323` ns	`52672` ns	`0.99`
`array/permutedims/4d`	`51254` ns	`51236` ns	`1.00`
`array/random/rand/Float32`	`13399` ns	`13391` ns	`1.00`
`array/random/rand/Int64`	`24921` ns	`25096` ns	`0.99`
`array/random/rand!/Float32`	`8705.666666666666` ns	`9978` ns	`0.87`
`array/random/rand!/Int64`	`21748` ns	`21790` ns	`1.00`
`array/random/randn/Float32`	`36984.5` ns	`43502` ns	`0.85`
`array/random/randn!/Float32`	`30804` ns	`30851` ns	`1.00`
`array/reductions/mapreduce/Float32/1d`	`34371` ns	`34466` ns	`1.00`
`array/reductions/mapreduce/Float32/dims=1`	`39968.5` ns	`48755` ns	`0.82`
`array/reductions/mapreduce/Float32/dims=1L`	`51502` ns	`51181` ns	`1.01`
`array/reductions/mapreduce/Float32/dims=2`	`56453` ns	`56334` ns	`1.00`
`array/reductions/mapreduce/Float32/dims=2L`	`69257` ns	`69218` ns	`1.00`
`array/reductions/mapreduce/Int64/1d`	`42013.5` ns	`41729` ns	`1.01`
`array/reductions/mapreduce/Int64/dims=1`	`41741` ns	`43165.5` ns	`0.97`
`array/reductions/mapreduce/Int64/dims=1L`	`86937` ns	`87029.5` ns	`1.00`
`array/reductions/mapreduce/Int64/dims=2`	`59147` ns	`59231` ns	`1.00`
`array/reductions/mapreduce/Int64/dims=2L`	`84663` ns	`84493` ns	`1.00`
`array/reductions/reduce/Float32/1d`	`34553` ns	`34454` ns	`1.00`
`array/reductions/reduce/Float32/dims=1`	`39546.5` ns	`48965` ns	`0.81`
`array/reductions/reduce/Float32/dims=1L`	`51569.5` ns	`51071` ns	`1.01`
`array/reductions/reduce/Float32/dims=2`	`56571.5` ns	`56187` ns	`1.01`
`array/reductions/reduce/Float32/dims=2L`	`69904` ns	`69359.5` ns	`1.01`
`array/reductions/reduce/Int64/1d`	`42565` ns	`41960` ns	`1.01`
`array/reductions/reduce/Int64/dims=1`	`42029` ns	`43285` ns	`0.97`
`array/reductions/reduce/Int64/dims=1L`	`86852` ns	`86905` ns	`1.00`
`array/reductions/reduce/Int64/dims=2`	`59436` ns	`58992` ns	`1.01`
`array/reductions/reduce/Int64/dims=2L`	`84430` ns	`84104` ns	`1.00`
`array/reverse/1d`	`17564` ns	`17615` ns	`1.00`
`array/reverse/1dL`	`68187` ns	`68235` ns	`1.00`
`array/reverse/1dL_inplace`	`65614` ns	`65656` ns	`1.00`
`array/reverse/1d_inplace`	`8381.333333333334` ns	`10268.666666666666` ns	`0.82`
`array/reverse/2d`	`20639` ns	`20767` ns	`0.99`
`array/reverse/2dL`	`72749` ns	`72880` ns	`1.00`
`array/reverse/2dL_inplace`	`65690` ns	`65670` ns	`1.00`
`array/reverse/2d_inplace`	`9751` ns	`9888` ns	`0.99`
`array/sorting/1d`	`2735249` ns	`2736190` ns	`1.00`
`array/sorting/2d`	`1069013` ns	`1068430` ns	`1.00`
`array/sorting/by`	`3304195` ns	`3304193` ns	`1.00`
`cuda/synchronization/context/auto`	`1125` ns	`1126.4` ns	`1.00`
`cuda/synchronization/context/blocking`	`895` ns	`931.6315789473684` ns	`0.96`
`cuda/synchronization/context/nonblocking`	`8038.700000000001` ns	`7346.299999999999` ns	`1.09`
`cuda/synchronization/stream/auto`	`982.125` ns	`1042.6923076923076` ns	`0.94`
`cuda/synchronization/stream/blocking`	`818.2666666666667` ns	`801.5346534653465` ns	`1.02`
`cuda/synchronization/stream/nonblocking`	`7069.6` ns	`7426.700000000001` ns	`0.95`
`integration/byval/reference`	`143589` ns	`143714` ns	`1.00`
`integration/byval/slices=1`	`145463` ns	`145836` ns	`1.00`
`integration/byval/slices=2`	`284181` ns	`284663` ns	`1.00`
`integration/byval/slices=3`	`422818.5` ns	`423042` ns	`1.00`
`integration/cudadevrt`	`102326.5` ns	`102219` ns	`1.00`
`integration/volumerhs`	`23418364` ns	`23416498.5` ns	`1.00`
`kernel/indexing`	`13103` ns	`13008` ns	`1.01`
`kernel/indexing_checked`	`13836` ns	`13732` ns	`1.01`
`kernel/launch`	`2072.6` ns	`2108.4444444444443` ns	`0.98`
`kernel/occupancy`	`697.6981132075472` ns	`665.65` ns	`1.05`
`kernel/rand`	`14208` ns	`17312` ns	`0.82`
`latency/import`	`3812714028.5` ns	`3818709770` ns	`1.00`
`latency/precompile`	`4583803742.5` ns	`4584771622` ns	`1.00`
`latency/ttfp`	`4394446986.5` ns	`4399290991.5` ns	`1.00`

This comment was automatically generated by workflow using github-action-benchmark.

kshyatt · 2026-03-17T11:03:29Z

Thanks very much for putting this together, I'm happy to help with the header issues if needed!

…ensor

…but the C++ code is still in flux)

kmp5VT · 2026-03-19T20:37:44Z

@kshyatt I removed the extra code, made the functions that linked to the library relatively agnostic (i.e. you are not forced to use CuTensorBS but can buy in if you'd like) and added a unit test. If you could help with the Clang.jl issue, that would be amazing!

kshyatt · 2026-03-23T07:50:08Z

I'll try to take a look today!

kshyatt · 2026-03-23T10:04:03Z

Did you use the scripts in res/wrap to do the wrapping of the C headers?

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

kmp5VT · 2026-03-24T02:00:40Z

Did you use the scripts in res/wrap to do the wrapping of the C headers?

Yes, I did use the scripts, but the Ptr{Ptr{Cvoid}} definition they produced in the libcutensor.jl file results in the following error:

ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Nothing}}, ::CuPtr{Nothing})
The function `unsafe_convert` exists, but no method is defined for this combination of argument types.

Closest candidates are:
  unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitBlame)
   @ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
  unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitRevWalker)
   @ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
  unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitDiffStats)
   @ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
  ...

Stacktrace:
  [1] Ref{Ptr{Nothing}}(a::Vector{CuPtr{Nothing}})
    @ Base ./refpointer.jl:166
  [2] cconvert
    @ ./refpointer.jl:178 [inlined]
  [3] macro expansion
    @ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:545 [inlined]
  [4] (::cuTENSOR.var"#cutensorBlockSparseContract##0#cutensorBlockSparseContract##1"{…})()
    @ cuTENSOR ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:34
  [5] retry_reclaim
    @ ~/.julia/packages/CUDA/Il00B/src/memory.jl:434 [inlined]
  [6] check
    @ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:22 [inlined]
  [7] cutensorBlockSparseContract
    @ ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:33 [inlined]
  [8] 
    @ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:50
  [9] contract!(alpha::Number, A::Any, Ainds::Vector{…}, opA::cuTENSOR.cutensorOperator_t, B::Any, Binds::Vector{…}, opB::cuTENSOR.cutensorOperator_t, beta::Number, C::Any, Cinds::Vector{…}, opC::cuTENSOR.cutensorOperator_t, opOut::cuTENSOR.cutensorOperator_t; jit::cuTENSOR.cutensorJitMode_t, workspace::cuTENSOR.cutensorWorksizePreference_t, algo::cuTENSOR.cutensorAlgo_t, compute_type::Nothing, plan::Nothing)
    @ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:25
 [10] mul!(C::CuTensorBS{Float64, 1}, A::CuTensorBS{Float64, 3}, B::CuTensorBS{Float64, 2}, α::Float64, β::Float64)
    @ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/interfaces.jl:21

However, I found that if I modify the code to use Ptr{CuPtr{Cvoid}}, the blocksparse functionality works as expected with no error in either Julia or C. This makes the function look closer to the cutensorContract function. Do you know why Clang.jl doesn't properly write these as Ptr{CuPtr{Cvoid}}?

kshyatt · 2026-03-24T06:21:23Z

Probably you missed some of the weird esoterica in res/wrap, haha. I'll fix it and make a PR to your PR?

lkdvos

Left some remaining comments, but for me I think most of the parts that I would use are there, since I don't really see myself going through the CuTensorBS construction (we also never used the CuTensor in TensorOperations so that is completely fine)

Remove left over code. Will need to make something like this to define mul! in the future Co-authored-by: Lukas Devos <ldevos98@gmail.com>

…ensor

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

…to a contiguous memory block)

kmp5VT · 2026-04-13T18:13:52Z

@kshyatt ~~In the tests, I skip the blocksparse test on CUDA versions that have a bug in the C library.~~
The C side error happens on the CUDA version that I know works. I am wondering if it is related to the cutensor version being picked up. Looking into this now.

kmp5VT and others added 5 commits January 22, 2026 15:27

Working on implementing the wrapper for the new blocksparse cutensor …

c4918d5

…code

Revert to cutensor_jll.libcutensor as this has the blocksparse cutens…

c15fea2

…or support now

Remove redundant convert function

82752ad

Merge branch 'JuliaGPU:master' into kmp5/feature/wrap_blocksparse_cut…

9678ecf

…ensor

Make blocksparse code more generic (generic case). Would it be better…

affc3d4

… to make it a union type of CuTensorBS and AbstractArray?

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

a3a3f07

Merge branch 'kmp5/feature/wrap_blocksparse_cutensor' of github.com:k…

67013c8

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

github-actions Bot reviewed Mar 17, 2026

View reviewed changes

kshyatt self-requested a review March 17, 2026 10:52

kmp5VT and others added 5 commits March 19, 2026 11:41

Merge branch 'JuliaGPU:master' into kmp5/feature/wrap_blocksparse_cut…

f6f5c5f

…ensor

Working on simplifying and making accessors

1ec69cf

Fix problem with stride

8f5ef88

Small comment reminder

9285b07

Add a contraction test for the blocksparse system (not comprehensive …

cda4a4e

…but the C++ code is still in flux)

kmp5VT added 3 commits March 23, 2026 21:13

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

94b8152

Closer to clang.jl construction

138edaf

Merge branch 'kmp5/feature/wrap_blocksparse_cutensor' of github.com:k…

ce2eeec

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

3c11bec

maleadt force-pushed the master branch from f1e7455 to 5a6f767 Compare March 26, 2026 08:13

Update cutensor.toml for block sparse contraction

cc4b826

kshyatt force-pushed the kmp5/feature/wrap_blocksparse_cutensor branch from 17806da to cc4b826 Compare March 26, 2026 10:35

lkdvos reviewed Mar 26, 2026

View reviewed changes

kmp5VT and others added 2 commits March 26, 2026 17:36

Apply suggestion from @lkdvos

c493659

Remove left over code. Will need to make something like this to define mul! in the future Co-authored-by: Lukas Devos <ldevos98@gmail.com>

Merge branch 'JuliaGPU:master' into kmp5/feature/wrap_blocksparse_cut…

f6fb806

…ensor

mtfishman mentioned this pull request Apr 3, 2026

Blocksparse CuTensor contraction backend ITensor/ITensors.jl#1721

Open

kmp5VT and others added 9 commits April 3, 2026 11:42

Document C_NULL cutensorBSDescriptor

4231b4e

Remove comment

f9ca018

Merge branch 'JuliaGPU:master' into kmp5/feature/wrap_blocksparse_cut…

21a5c81

…ensor

Merge branch 'kmp5/feature/wrap_blocksparse_cutensor' of github.com:k…

327f6d7

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor

Fix issues with new CUDA organization

04accbe

Add type restrictions to CuTensorBS type to make downstream easier

3ebd626

I believe this is the "generic" stride (i.e. all blocks are packed in…

26735e0

…to a contiguous memory block)

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

8a74b2b

Skip blocksparse tests for failing versions.

52e58c2

kmp5VT added 4 commits April 13, 2026 23:30

More broken versions. Will send Mathias a message

aaee14e

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

d1be8ae

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

5d6044b

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

04a39d7

kshyatt reviewed Apr 22, 2026

View reviewed changes

Comment thread lib/cutensor/src/blocksparse/operations.jl Outdated

kshyatt reviewed Apr 22, 2026

View reviewed changes

Comment thread lib/cutensor/src/blocksparse/operations.jl Outdated

kshyatt reviewed Apr 22, 2026

View reviewed changes

Comment thread lib/cutensor/test/contractions.jl Outdated

kmp5VT and others added 5 commits April 22, 2026 05:30

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

9a3282e

Remove synchronize

0b16670

Add destroy descriptor

00726d7

Removing skipped versions for now.

f6bd83a

Merge branch 'master' into kmp5/feature/wrap_blocksparse_cutensor

680f7ff

kshyatt approved these changes Apr 22, 2026

View reviewed changes

kshyatt enabled auto-merge (squash) April 22, 2026 18:15

kshyatt merged commit 22b2689 into JuliaGPU:master Apr 22, 2026
1 of 2 checks passed

kmp5VT deleted the kmp5/feature/wrap_blocksparse_cutensor branch April 23, 2026 01:22

Conversation

kmp5VT commented Mar 16, 2026

Uh oh!

github-actions Bot commented Mar 16, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

kmp5VT commented Mar 16, 2026

Uh oh!

codecov Bot commented Mar 17, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Codecov Report

Uh oh!

github-actions Bot left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

CUDA.jl Benchmarks

Uh oh!

kshyatt commented Mar 17, 2026

Uh oh!

kmp5VT commented Mar 19, 2026

Uh oh!

kshyatt commented Mar 23, 2026

Uh oh!

kshyatt commented Mar 23, 2026

Uh oh!

kmp5VT commented Mar 24, 2026

Uh oh!

kshyatt commented Mar 24, 2026

Uh oh!

lkdvos left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

kmp5VT commented Apr 13, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

github-actions Bot commented Mar 16, 2026 •

edited

Loading

codecov Bot commented Mar 17, 2026 •

edited

Loading

github-actions Bot left a comment •

edited

Loading

kmp5VT commented Apr 13, 2026 •

edited

Loading