QuantumKitHub · lkdvos · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/Project.toml b/Project.toml
@@ -26,6 +26,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeTestUtils = "12d8515a-0907-448a-8884-5fe00fdf1c5a"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 
 [extensions]
@@ -35,6 +36,7 @@ TensorKitChainRulesCoreExt = "ChainRulesCore"
 TensorKitEnzymeExt = "Enzyme"
 TensorKitEnzymeTestUtilsExt = "EnzymeTestUtils"
 TensorKitFiniteDifferencesExt = "FiniteDifferences"
+TensorKitGPUArraysExt = "GPUArrays"
 TensorKitMooncakeExt = "Mooncake"
 
 [workspace]
@@ -49,6 +51,7 @@ Dictionaries = "0.4"
 Enzyme = "0.13.157"
 EnzymeTestUtils = "0.2.8"
 FiniteDifferences = "0.12"
+GPUArrays = "11.4.1"
 LRUCache = "1.6"
 LinearAlgebra = "1"
 MatrixAlgebraKit = "0.6.8"

diff --git a/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
@@ -1,15 +1,14 @@
 module TensorKitAMDGPUExt
 
 using AMDGPU, AMDGPU.rocBLAS, AMDGPU.rocSOLVER, LinearAlgebra
-using AMDGPU: @allowscalar
 import AMDGPU: rand as rocrand, rand! as rocrand!, randn as rocrandn, randn! as rocrandn!
 
 using TensorKit
 using TensorKit.Factorizations
 using Strided
 using MatrixAlgebraKit
 using MatrixAlgebraKit: AbstractAlgorithm
-using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype
 import TensorKit: randisometry
 using Base: rand, randn
 

diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -7,16 +7,6 @@ function ROCTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂
     return ROCTensorMap{T, S, N₁, N₂}(ROCArray{T}(t.data), space(t))
 end
 
-# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
-function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: ROCVector{T}}
-    h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V)
-    h_t = TensorKit.project_symmetric!(h_t, Array(data))
-    # verify result
-    isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) ||
-        throw(ArgumentError("Data has non-zero elements at incompatible positions"))
-    return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V)
-end
-
 for (fname, felt) in ((:zeros, :zero), (:ones, :one))
     @eval begin
         function AMDGPU.$fname(
@@ -92,13 +82,6 @@ for randfun in (:rocrand, :rocrandn)
     end
 end
 
-# Scalar implementation
-#-----------------------
-function TensorKit.scalar(t::ROCTensorMap{T, S, 0, 0}) where {T, S}
-    inds = findall(!iszero, t.data)
-    return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
-end
-
 function Base.convert(
         TT::Type{ROCTensorMap{T, S, N₁, N₂}},
         t::AbstractTensorMap{<:Any, S, N₁, N₂}

diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -1,10 +1,8 @@
 module TensorKitCUDAExt
 
 using CUDA, CUDA.cuBLAS, CUDA.cuSOLVER, CUDA.cuRAND, LinearAlgebra
-using CUDA: @allowscalar
 import CUDA.cuRAND: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
 using Strided: StridedViews
-using CUDA.CUDACore.KernelAbstractions: @kernel, @index, get_backend
 
 using Adapt: Adapt
 
@@ -20,20 +18,5 @@ using TensorKit: MatrixAlgebraKit
 using Random
 
 include("cutensormap.jl")
-include("truncation.jl")
-
-function TensorKit.fill_braidingsubblock!(data::TD, val) where {T, TD <: Union{<:CuMatrix{T}, <:StridedViews.StridedView{T, 4, <:CuArray{T}}}}
-    # COV_EXCL_START
-    # kernels are not reachable by coverage
-    @kernel function fill_subblock_kernel!(subblock, val)
-        idx = @index(Global, Cartesian)
-        idx_val = idx[1] == idx[4] && idx[2] == idx[3] ? val : zero(val)
-        @inbounds subblock[idx] = idx_val
-    end
-    # COV_EXCL_STOP
-    kernel = fill_subblock_kernel!(get_backend(data))
-    kernel(data, val; ndrange = size(data))
-    return data
-end
 
 end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -7,16 +7,6 @@ function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂,
     return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
 end
 
-# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
-function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}}
-    h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V)
-    h_t = TensorKit.project_symmetric!(h_t, Array(data))
-    # verify result
-    isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) ||
-        throw(ArgumentError("Data has non-zero elements at incompatible positions"))
-    return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V)
-end
-
 for (fname, felt) in ((:zeros, :zero), (:ones, :one))
     @eval begin
         function CUDA.$fname(
@@ -94,13 +84,6 @@ for randfun in (:curand, :curandn)
     end
 end
 
-# Scalar implementation
-#-----------------------
-function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
-    inds = findall(!iszero, t.data)
-    return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
-end
-
 function LinearAlgebra.isposdef(t::CuTensorMap)
     domain(t) == codomain(t) ||
         throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))

diff --git a/ext/TensorKitCUDAExt/truncation.jl b/ext/TensorKitCUDAExt/truncation.jl
diff --git a/ext/TensorKitGPUArraysExt.jl b/ext/TensorKitGPUArraysExt.jl
@@ -0,0 +1,117 @@
+module TensorKitGPUArraysExt
+
+using GPUArrays
+using GPUArrays: @allowscalar
+using GPUArrays.KernelAbstractions: @kernel, @index, get_backend
+
+using Strided: StridedViews
+using MatrixAlgebraKit, Adapt
+using TensorKit
+using TensorKit.Factorizations
+using TensorKit.Factorizations: AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
+import TensorKit: randisometry, rand, randn, fill_braidingsubblock!
+
+# GPU method of `fill_braidingsubblock!`: launches a KernelAbstractions kernel that
+# overwrites every element of `data` with `val` where `idx[1] == idx[4]` and
+# `idx[2] == idx[3]`, and with `zero(val)` everywhere else. Returns `data`.
+# NOTE(review): the kernel reads four index components, while the signature also
+# accepts `AnyGPUMatrix`; confirm the matrix case is only reached where that is valid.
+function TensorKit.fill_braidingsubblock!(data::TD, val) where {T, TD <: Union{<:AnyGPUMatrix{T}, <:StridedViews.StridedView{T, 4, <:AnyGPUArray{T}}}}
+    # COV_EXCL_START
+    # kernels are not reachable by coverage
+    @kernel function fill_subblock_kernel!(subblock, val)
+        idx = @index(Global, Cartesian)
+        idx_val = idx[1] == idx[4] && idx[2] == idx[3] ? val : zero(val)
+        @inbounds subblock[idx] = idx_val
+    end
+    # COV_EXCL_STOP
+    # build the kernel for the backend reported by `data`, then launch it over an
+    # index range equal to `size(data)`
+    kernel = fill_subblock_kernel!(get_backend(data))
+    kernel(data, val; ndrange = size(data))
+    return data
+end
+
+# Alias for a `TensorKit.SectorVector` whose third type parameter is a GPU vector type
+# (`AnyGPUVector{T}`); the truncation methods below dispatch on it.
+const GPUSectorVector{T, I} = TensorKit.SectorVector{T, I, <:AnyGPUVector{T}}
+
+# `findtruncated` for GPU-backed sector vectors with a `TruncationByOrder` strategy.
+# Each entry is weighted by `dim` of its sector. Entries are ordered with
+# `strategy.by` / `strategy.rev`, and an entry is flagged `true` as long as the running
+# sum of weights (taken in that order) does not exceed `strategy.howmany`.
+# Returns a `Bool` vector created with `similar(values, Bool)`.
+function MatrixAlgebraKit.findtruncated(
+        values::GPUSectorVector, strategy::MatrixAlgebraKit.TruncationByOrder
+    )
+    # per-entry weights: every sector's segment is filled with that sector's dimension
+    weights = similar(values, Base.promote_op(dim, sectortype(values)))
+    for (sector, segment) in pairs(weights)
+        fill!(segment, dim(sector))
+    end
+
+    flat_values = parent(values)
+    if isempty(flat_values)
+        return similar(values, Bool)
+    end
+
+    order = sortperm(flat_values; by = strategy.by, rev = strategy.rev)
+    running_dim = cumsum(Base.permute!(parent(weights), order))
+
+    # scatter the flags back to the original (unsorted) positions
+    flags = similar(values, Bool)
+    parent(flags)[order] .= running_dim .<= strategy.howmany
+    return flags
+end
+
+# `findtruncated` for GPU-backed sector vectors with a `TruncationByError` strategy.
+# Throws an `ArgumentError` unless `strategy.p` is finite and strictly positive.
+# Each entry contributes `abs(v)^p`, multiplied by `dim` of its sector when the fusion
+# style is not `UniqueFusion`. Entries are ordered by increasing `abs`, and an entry is
+# flagged `true` once the running sum of contributions exceeds
+# `max(atol^p, rtol^p * norm(values, p))`.
+# Returns a `Bool` vector created with `similar(values, Bool)`.
+function MatrixAlgebraKit.findtruncated(
+        values::GPUSectorVector, strategy::MatrixAlgebraKit.TruncationByError
+    )
+    p = strategy.p
+    if !(isfinite(p) && p > 0)
+        throw(ArgumentError(lazy"p-norm with p = $(strategy.p) is currently not supported."))
+    end
+    threshold = max(strategy.atol^p, strategy.rtol^p * norm(values, p))
+    contributions = similar(values, typeof(threshold))
+
+    if FusionStyle(sectortype(values)) isa UniqueFusion
+        # dimensions are all 1 so no need to account for weight
+        parent(contributions) .= abs.(parent(values)) .^ p
+    else
+        for (sector, segment) in pairs(values)
+            # fetch the destination segment first, then broadcast into it
+            target = contributions[sector]
+            target .= abs.(segment) .^ p .* dim(sector)
+        end
+    end
+
+    flat_values = parent(values)
+    if isempty(flat_values)
+        return similar(values, Bool)
+    end
+
+    order = sortperm(flat_values; by = abs, rev = false)
+    running_err = cumsum(Base.permute!(parent(contributions), order))
+
+    # scatter the flags back to the original (unsorted) positions
+    flags = similar(values, Bool)
+    parent(flags)[order] .= running_err .> threshold
+    return flags
+end
+
+# Generic `findtruncated_svd` for GPU-backed sector vectors: delegate to `findtruncated`
+# and pass the result through `Adapt.adapt(Vector, ...)`.
+function MatrixAlgebraKit.findtruncated_svd(values::GPUSectorVector, strategy::S) where {S <: MatrixAlgebraKit.TruncationStrategy}
+    selection = MatrixAlgebraKit.findtruncated(values, strategy)
+    # returning a GPUSectorVector wrecks things in truncate_{co}domain
+    # because of scalar indexing
+    return Adapt.adapt(Vector, selection)
+end
+
+# Define the same `findtruncated_svd` method once per concrete strategy type listed here.
+# NOTE(review): presumably these exist to resolve method ambiguities with more specific
+# methods defined elsewhere — confirm.
+for strategy_type in (
+        :(MatrixAlgebraKit.TruncationByOrder),
+        :(MatrixAlgebraKit.TruncationByError),
+        :(MatrixAlgebraKit.TruncationIntersection),
+        :(TensorKit.Factorizations.TruncationSpace),
+    )
+    @eval function MatrixAlgebraKit.findtruncated_svd(values::GPUSectorVector, strategy::$strategy_type)
+        selection = MatrixAlgebraKit.findtruncated(values, strategy)
+        # returning a GPUSectorVector wrecks things in truncate_{co}domain
+        # because of scalar indexing
+        return Adapt.adapt(Vector, selection)
+    end
+end
+
+# `findtruncated_svd` for GPU-backed sector vectors with a `TruncationByValue` strategy.
+# The tolerances are first combined into a single absolute tolerance via
+# `rtol_to_atol`; a `trunctol` strategy built from it is then applied to each sector's
+# values separately. Returns a `SectorDict` mapping each sector to its result, passed
+# through `Adapt.adapt(Vector, ...)`.
+function MatrixAlgebraKit.findtruncated_svd(values::GPUSectorVector, strategy::MatrixAlgebraKit.TruncationByValue)
+    atol = TensorKit.Factorizations.rtol_to_atol(values, strategy.p, strategy.atol, strategy.rtol)
+    blockstrategy = trunctol(; atol = atol, by = strategy.by, keep_below = strategy.keep_below)
+    return SectorDict(
+        sector => Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated_svd(segment, blockstrategy))
+            for (sector, segment) in pairs(values)
+    )
+end
+
+# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
+# Steps: project a host copy of `data` into a `Vector`-backed tensor over `V`, compare
+# the projected tensor against the input within `tol` (throwing an `ArgumentError` on
+# mismatch), and finally rebuild the tensor with storage type `A`.
+function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: AnyGPUVector{T}}
+    host_tensor = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V)
+    host_tensor = TensorKit.project_symmetric!(host_tensor, Array(data))
+    # verify result
+    input_array = Array(reshape(data, dims(host_tensor)))
+    projected_array = convert(Array, host_tensor)
+    if !isapprox(input_array, projected_array; atol = tol)
+        throw(ArgumentError("Data has non-zero elements at incompatible positions"))
+    end
+    return TensorKit.TensorMapWithStorage{T, A}(A(host_tensor.data), V)
+end
+
+# Scalar implementation
+#-----------------------
+# For a GPU-backed tensor map with no codomain and no domain indices: return
+# `zero(scalartype(t))` when the data holds no nonzero entry, otherwise the single
+# nonzero entry (`only` throws if there is more than one).
+function TensorKit.scalar(t::TensorMap{T, S, 0, 0, <:AnyGPUArray}) where {T, S}
+    nonzero_positions = findall(!iszero, t.data)
+    if isempty(nonzero_positions)
+        return zero(scalartype(t))
+    end
+    # reading one element is intended here, so scalar indexing is explicitly allowed
+    return @allowscalar @inbounds t.data[only(nonzero_positions)]
+end
+
+
+end
diff --git a/test/Project.toml b/test/Project.toml
@@ -38,7 +38,6 @@ AllocCheck = "0.2"
 ChainRulesTestUtils = "1"
 Combinatorics = "1"
 cuTENSOR = "6"
-GPUArrays = "11.3.1"
 JET = "0.9, 0.10, 0.11"
 ParallelTestRunner = "2"
 Test = "1"