diff --git a/CHANGELOG b/CHANGELOG index 5d880e1..4970fe0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,12 @@ +Version 6.0 (2016-06-29): +- Added the possibility to use Event pointers, adjusted the Kernel::Launch function to do so +- Added a new constructor for Program based on a binary (OpenCL only) +- Fixed a bug when OpenCL 2.0 or newer is installed but the device doesn't support it +- Added new methods to the API: + * Device::VersionNumber (integer version of the string-getter Device::Version) + * Device::IsCPU, Device::IsGPU, Device::IsAMD, Device::IsNVIDIA, Device::IsIntel, Device::IsARM + Version 5.0 (2016-04-21): - Buffers can now also be 'not owned' to disable automatic memory freeing afterwards - Made 'Buffer::Read' and 'Buffer::ReadAsync' constant methods diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e8426b..c68e96c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ # CMake project details cmake_minimum_required(VERSION 2.8.10) project("CLCudaAPI" CXX) -set(CLCudaAPI_VERSION_MAJOR 5) +set(CLCudaAPI_VERSION_MAJOR 6) set(CLCudaAPI_VERSION_MINOR 0) # ================================================================================================== @@ -51,32 +51,32 @@ endif() # ================================================================================================== # Compiler-version check (requires at least CMake 2.8.10) -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) +if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) message(FATAL_ERROR "GCC version must be at least 4.7") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version must be at least 3.3") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION 
VERSION_LESS 5.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "AppleClang version must be at least 5.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) +elseif(MSVC) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if(MSVC) set(FLAGS "/Ox") -else () +else() set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") @@ -127,7 +127,7 @@ endforeach() # ================================================================================================== # Optional: Enable inclusion of the test-suite -if (ENABLE_TESTS) +if(ENABLE_TESTS) enable_testing() include_directories(${CLCudaAPI_SOURCE_DIR}/test) add_executable(unit_tests test/unit_tests.cc) diff --git a/include/clpp11.h b/include/clpp11.h index 1815ca1..f4ecff7 100644 --- a/include/clpp11.h +++ b/include/clpp11.h @@ -12,7 +12,7 @@ // Portability here means that a similar header exists for CUDA with the same classes and // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. // -// This is version 5.0 of CLCudaAPI. +// This is version 6.0 of CLCudaAPI. 
// // ================================================================================================= // @@ -162,6 +162,15 @@ class Device { // Methods to retrieve device information std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } + size_t VersionNumber() const + { + std::string version_string = Version().substr(7); + // Space separates the end of the OpenCL version number from the beginning of the + // vendor-specific information. + size_t next_whitespace = version_string.find(' '); + size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace))); + return version; + } std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } std::string Type() const { @@ -206,6 +215,14 @@ class Device { return true; } + // Query for a specific type of device or brand + bool IsCPU() const { return Type() == "CPU"; } + bool IsGPU() const { return Type() == "GPU"; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } + bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsARM() const { return Vendor() == "ARM"; } + // Accessor to the private data-member const cl_device_id& operator()() const { return device_; } private: @@ -268,10 +285,14 @@ class Context { // Accessor to the private data-member const cl_context& operator()() const { return *context_; } + cl_context* pointer() const { return &(*context_); } private: std::shared_ptr context_; }; +// Pointer to an OpenCL context +using ContextPointer = cl_context*; + // ================================================================================================= // Enumeration of build statuses of the run-time compilation process @@ -282,7 +303,7 @@ class Program { public: // Note that there is no constructor based on the 
regular OpenCL data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &context, std::string source): program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), length_(source.length()), @@ -293,6 +314,22 @@ class Program { CheckError(status); } + // Binary-based constructor with memory management + explicit Program(const Device &device, const Context &context, const std::string& binary): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(binary.length()), + source_(binary), + source_ptr_(&source_[0]) { + auto status1 = CL_SUCCESS; + auto status2 = CL_SUCCESS; + const cl_device_id dev = device(); + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, + reinterpret_cast(&source_ptr_), + &status1, &status2); + CheckError(status1); + CheckError(status2); + } + // Compiles the device program and returns whether or not there where any warnings/errors BuildStatus Build(const Device &device, std::vector &options) { auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); @@ -321,7 +358,7 @@ class Program { return result; } - // Retrieves an intermediate representation of the compiled program + // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { auto bytes = size_t{0}; CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); @@ -337,7 +374,7 @@ class Program { private: std::shared_ptr program_; size_t length_; - std::string source_; + std::string source_; // Note: the source can also be a binary or IR const char* source_ptr_; }; @@ -359,8 +396,16 @@ class Queue { delete s; }) { auto status = CL_SUCCESS; #ifdef CL_VERSION_2_0 - cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; - *queue_ = 
clCreateCommandQueueWithProperties(context(), device(), properties, &status); + size_t ocl_version = device.VersionNumber(); + if (ocl_version >= 200) + { + cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); + } + else + { + *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + } #else *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); #endif @@ -611,15 +656,15 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event) { + const std::vector &local, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - 0, nullptr, &(event()))); + 0, nullptr, event)); } // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event, + const std::vector &local, EventPointer event, std::vector& waitForEvents) { if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } @@ -632,15 +677,16 @@ class Kernel { // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - waitForEventsPlain.size(), waitForEventsPlain.data(), - &(event()))); + static_cast(waitForEventsPlain.size()), + waitForEventsPlain.data(), + event)); } // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector &global, Event &event) { + void Launch(const Queue &queue, const std::vector &global, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), nullptr, - 0, nullptr, &(event()))); + 0, nullptr, 
event)); } // Accessor to the private data-member diff --git a/include/cupp11.h b/include/cupp11.h index 635c47b..76066d7 100644 --- a/include/cupp11.h +++ b/include/cupp11.h @@ -12,7 +12,7 @@ // Portability here means that a similar header exists for OpenCL with the same classes and // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change. // -// This is version 5.0 of CLCudaAPI. +// This is version 6.0 of CLCudaAPI. // // ================================================================================================= // @@ -107,13 +107,14 @@ class Event { // Accessors to the private data-members const CUevent& start() const { return *start_; } const CUevent& end() const { return *end_; } + Event* pointer() { return this; } private: std::shared_ptr start_; std::shared_ptr end_; }; // Pointer to a CUDA event -using EventPointer = CUevent*; +using EventPointer = Event*; // ================================================================================================= @@ -160,6 +161,11 @@ class Device { CheckError(cuDriverGetVersion(&result)); return "CUDA driver "+std::to_string(result); } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast(result); + } std::string Vendor() const { return "NVIDIA Corporation"; } std::string Name() const { auto result = std::string{}; @@ -207,6 +213,14 @@ class Device { return true; } + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + // Accessor to the private data-member const CUdevice& operator()() const { return device_; } private: @@ -240,10 +254,14 @@ class Context { // Accessor to the private data-member const CUcontext& operator()() const { return *context_; } + CUcontext* pointer() 
const { return &(*context_); } private: std::shared_ptr context_; }; +// Pointer to a CUDA context +using ContextPointer = CUcontext*; + // ================================================================================================= // Enumeration of build statuses of the run-time compilation process @@ -254,7 +272,7 @@ class Program { public: // Note that there is no constructor based on the regular CUDA data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &, std::string source): program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p)); delete p; }), @@ -556,7 +574,7 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event) { + const std::vector &local, EventPointer event) { // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) auto grid = std::vector{1, 1, 1}; @@ -572,16 +590,16 @@ } // Launches the kernel, its execution time is recorded by events - CheckError(cuEventRecord(event.start(), queue())); + CheckError(cuEventRecord(event->start(), queue())); CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, queue(), pointers.data(), nullptr)); - CheckError(cuEventRecord(event.end(), queue())); + CheckError(cuEventRecord(event->end(), queue())); } // As above, but with an event waiting list // TODO: Implement this function void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event, + const std::vector &local, EventPointer event, std::vector& waitForEvents) { if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } Error("launching with an event waiting list is not implemented for the CUDA back-end"); @@ -589,7 +607,7 @@ // As above, but with the 
default local workgroup size // TODO: Implement this function - void Launch(const Queue &, const std::vector &, Event &) { + void Launch(const Queue &, const std::vector &, EventPointer) { Error("launching with a default workgroup size is not implemented for the CUDA back-end"); } diff --git a/samples/advanced.cc b/samples/advanced.cc index 69d9607..79e36e6 100644 --- a/samples/advanced.cc +++ b/samples/advanced.cc @@ -179,7 +179,7 @@ int main() { // Enqueues the kernel and waits for the result. Note that launching the kernel is always // a-synchronous and thus requires finishing the queue in order to complete the operation. printf("## Running the kernel...\n"); - kernel.Launch(queue, global, local, event); + kernel.Launch(queue, global, local, event.pointer()); queue.Finish(event); printf(" > Took %.3lf ms\n", event.GetElapsedTime()); diff --git a/samples/simple.cc b/samples/simple.cc index b885862..af44038 100644 --- a/samples/simple.cc +++ b/samples/simple.cc @@ -128,7 +128,7 @@ int main() { // Enqueues the kernel and waits for the result. Note that launching the kernel is always // a-synchronous and thus requires finishing the queue in order to complete the operation. printf("## Running the kernel...\n"); - kernel.Launch(queue, global, local, event); + kernel.Launch(queue, global, local, event.pointer()); queue.Finish(event); printf(" > Took %.3lf ms\n", event.GetElapsedTime()); diff --git a/samples/smallest.cc b/samples/smallest.cc index 1866e6c..80c62cd 100644 --- a/samples/smallest.cc +++ b/samples/smallest.cc @@ -73,7 +73,7 @@ int main() { program.Build(device, compiler_options); auto kernel = CLCudaAPI::Kernel(program, "add"); kernel.SetArguments(a, b, c); - kernel.Launch(queue, {elements}, {128}, event); + kernel.Launch(queue, {elements}, {128}, event.pointer()); queue.Finish(event); // Reads the results back to the host memory