diff --git a/Project.toml b/Project.toml index 3d6c2266..968ae226 100644 --- a/Project.toml +++ b/Project.toml @@ -46,7 +46,7 @@ Atomix = "0.1, 1" CEnum = "0.4, 0.5" ChainRulesCore = "1" ExprTools = "0.1" -GPUArrays = "11.1" +GPUArrays = "11.2" GPUCompiler = "0.27, 1.0" KernelAbstractions = "0.9.2" LLD_jll = "15, 16, 17" diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 0cf80fab..c3c45d6e 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -140,8 +140,6 @@ include("ROCKernels.jl") import .ROCKernels: ROCBackend export ROCBackend -# include("cache_allocator.jl") - function __init__() # Used to shutdown hostcalls if any is running. atexit(() -> begin Runtime.RT_EXITING[] = true end) diff --git a/src/array.jl b/src/array.jl index d2a9eb43..6780f0b8 100644 --- a/src/array.jl +++ b/src/array.jl @@ -3,31 +3,17 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N} dims::Dims{N} offset::Int # Offset is in number of elements (not bytes). - function ROCArray{T, N, B}( - ::UndefInitializer, dims::Dims{N}, - ) where {T, N, B <: Mem.AbstractAMDBuffer} + function ROCArray{T, N, B}(::UndefInitializer, dims::Dims{N}) where {T, N, B <: Mem.AbstractAMDBuffer} @assert isbitstype(T) "ROCArray only supports bits types" - function _alloc_f() - sz::Int64 = prod(dims) * sizeof(T) + sz::Int64 = prod(dims) * sizeof(T) + return GPUArrays.cached_alloc((ROCArray, AMDGPU.device(), T, B, sz)) do @debug "Allocate `T=$T`, `dims=$dims`: $(Base.format_bytes(sz))" data = DataRef(pool_free, pool_alloc(B, sz)) - finalizer(unsafe_free!, new{T, N, B}(data, dims, 0)) - end - return _alloc_f() - - # name = GPUArrays.CacheAllocatorName[] - # # Do not use caching allocator if it is not set or - # # the buffer is not a device memory. - # return if !(B <: Mem.HIPBuffer) || name == :none - # _alloc_f() - # else - # GPUArrays.alloc!(_alloc_f, ROCBackend(), name, T, dims)::ROCArray{T, N, B} - # end + return finalizer(unsafe_free!, new{T, N, B}(data, dims, 0)) + end::ROCArray{T, N, B} end - function ROCArray{T, N}( - buf::DataRef{Managed{B}}, dims::Dims{N}; offset::Integer = 0, - ) where {T, N, B <: Mem.AbstractAMDBuffer} + function ROCArray{T, N}(buf::DataRef{Managed{B}}, dims::Dims{N}; offset::Integer = 0) where {T, N, B <: Mem.AbstractAMDBuffer} @assert isbitstype(T) "ROCArray only supports bits types" xs = new{T, N, B}(buf, dims, offset) return finalizer(unsafe_free!, xs) diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl deleted file mode 100644 index c0d52401..00000000 --- a/src/cache_allocator.jl +++ /dev/null @@ -1,5 +0,0 @@ -const ROCCacheAllocator = GPUArrays.PerDeviceCacheAllocator(ROCArray; free_immediately=false) - -GPUArrays.cache_allocator(::ROCBackend) = ROCCacheAllocator - -GPUArrays.device(::ROCBackend) = AMDGPU.device() diff --git a/src/exception_handler.jl b/src/exception_handler.jl index 14f24330..bbbea8b4 100644 --- a/src/exception_handler.jl +++ b/src/exception_handler.jl @@ -40,9 +40,9 @@ struct ExceptionHolder n_str_buffers = 100 exception_flag = Mem.HostBuffer(sizeof(Int32), HIP.hipHostAllocDefault) - gate, buffers_counter, str_buffers_counter = (#GPUArrays.@no_cache_scope begin + gate, buffers_counter, str_buffers_counter = GPUArrays.@uncached begin ROCArray(UInt64[0]), ROCArray(Int32[0]), ROCArray(Int32[0]) - ) + end errprintf_buffers = [ Mem.HostBuffer(buf_len, HIP.hipHostAllocDefault) @@ -51,9 +51,9 @@ struct ExceptionHolder Mem.HostBuffer(str_len, HIP.hipHostAllocDefault) for _ in 1:n_str_buffers] - errprintf_buffers_dev, str_buffers_dev = (#GPUArrays.@no_cache_scope begin + errprintf_buffers_dev, str_buffers_dev = GPUArrays.@uncached begin ROCArray(Mem.device_ptr.(errprintf_buffers)), ROCArray(Mem.device_ptr.(str_buffers)) - ) + end new( exception_flag, gate, buffers_counter, str_buffers_counter, diff --git a/test/gpuarrays_tests.jl b/test/gpuarrays_tests.jl index 830d9acf..b07643a0 100644 --- a/test/gpuarrays_tests.jl +++ b/test/gpuarrays_tests.jl @@ -84,3 +84,6 @@ end @testitem "gpuarrays - uniformscaling" setup=[TSGPUArrays] begin gpuarrays_test("uniformscaling") end +@testitem "gpuarrays - alloc cache" setup=[TSGPUArrays] begin + gpuarrays_test("alloc cache") +end