Merge pull request #5 from CNugteren/development
Updated to version 6.0
CNugteren authored Jun 29, 2016
2 parents 4a81c2b + dc02f78 commit 68a3882
Showing 7 changed files with 110 additions and 38 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG
@@ -1,4 +1,12 @@

Version 6.0 (2016-06-29):
- Added the possibility to use Event pointers, adjusted the Kernel::Launch function to do so
- Added a new constructor for Program based on a binary (OpenCL only)
- Fixed a bug when OpenCL 2.0 or newer is installed but the device doesn't support it
- Added new methods to the API:
* Device::VersionNumber (integer version of the string-getter Device::Version)
* Device::IsCPU, Device::IsGPU, Device::IsAMD, Device::IsNVIDIA, Device::IsIntel, Device::IsARM

Version 5.0 (2016-04-21):
- Buffers can now also be 'not owned' to disable automatic memory freeing afterwards
- Made 'Buffer::Read' and 'Buffer::ReadAsync' constant methods
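The 6.0 additions listed above can be exercised together. Below is a minimal sketch, not part of this commit, assuming the OpenCL back-end header from this repository and the same CLCudaAPI::Platform/Device/Context/Queue helpers used by the bundled samples; the no-op kernel source and the platform/device indices are illustrative.

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "clpp11.h"

    int main() {
      // Selects a platform and device, then queries the new 6.0 device getters
      auto platform = CLCudaAPI::Platform(size_t{0});
      auto device = CLCudaAPI::Device(platform, size_t{0});
      printf("Device '%s': version %zu, GPU: %d, NVIDIA: %d\n",
             device.Name().c_str(), device.VersionNumber(),
             static_cast<int>(device.IsGPU()), static_cast<int>(device.IsNVIDIA()));

      // Compiles a trivial kernel (the source string is illustrative)
      auto context = CLCudaAPI::Context(device);
      auto queue = CLCudaAPI::Queue(context, device);
      auto event = CLCudaAPI::Event();
      auto program = CLCudaAPI::Program(context, "__kernel void noop() { }");
      auto options = std::vector<std::string>{};
      program.Build(device, options);
      auto kernel = CLCudaAPI::Kernel(program, "noop");

      // New in 6.0: Kernel::Launch takes an EventPointer instead of an Event reference
      kernel.Launch(queue, {128}, {128}, event.pointer());
      queue.Finish(event);
      printf("Kernel took %.3lf ms\n", event.GetElapsedTime());
      return 0;
    }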
28 changes: 14 additions & 14 deletions CMakeLists.txt
@@ -31,7 +31,7 @@
# CMake project details
cmake_minimum_required(VERSION 2.8.10)
project("CLCudaAPI" CXX)
set(CLCudaAPI_VERSION_MAJOR 5)
set(CLCudaAPI_VERSION_MAJOR 6)
set(CLCudaAPI_VERSION_MINOR 0)

# ==================================================================================================
@@ -51,32 +51,32 @@ endif()
# ==================================================================================================

# Compiler-version check (requires at least CMake 2.8.10)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
message(FATAL_ERROR "GCC version must be at least 4.7")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
message(FATAL_ERROR "Clang version must be at least 3.3")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
message(FATAL_ERROR "AppleClang version must be at least 5.0")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
message(FATAL_ERROR "ICC version must be at least 14.0")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
elseif(MSVC)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
message(FATAL_ERROR "MS Visual Studio version must be at least 18.0")
endif()
endif()

# C++ compiler settings
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if(MSVC)
set(FLAGS "/Ox")
else ()
else()
set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
@@ -127,7 +127,7 @@ endforeach()
# ==================================================================================================

# Optional: Enable inclusion of the test-suite
if (ENABLE_TESTS)
if(ENABLE_TESTS)
enable_testing()
include_directories(${CLCudaAPI_SOURCE_DIR}/test)
add_executable(unit_tests test/unit_tests.cc)
72 changes: 59 additions & 13 deletions include/clpp11.h
@@ -12,7 +12,7 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This is version 5.0 of CLCudaAPI.
// This is version 6.0 of CLCudaAPI.
//
// =================================================================================================
//
@@ -162,6 +162,15 @@ class Device {

// Methods to retrieve device information
std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
size_t VersionNumber() const
{
std::string version_string = Version().substr(7);
// Space separates the end of the OpenCL version number from the beginning of the
// vendor-specific information.
size_t next_whitespace = version_string.find(' ');
size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
return version;
}
std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
std::string Type() const {
@@ -206,6 +215,14 @@ class Device {
return true;
}

// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }

// Accessor to the private data-member
const cl_device_id& operator()() const { return device_; }
private:
@@ -268,10 +285,14 @@ class Context {

// Accessor to the private data-member
const cl_context& operator()() const { return *context_; }
cl_context* pointer() const { return &(*context_); }
private:
std::shared_ptr<cl_context> context_;
};

// Pointer to an OpenCL context
using ContextPointer = cl_context*;

// =================================================================================================

// Enumeration of build statuses of the run-time compilation process
@@ -282,7 +303,7 @@ class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state

// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
@@ -293,6 +314,22 @@ class Program {
CheckError(status);
}

// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
}

// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
@@ -321,7 +358,7 @@ class Program {
return result;
}

// Retrieves an intermediate representation of the compiled program
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
@@ -337,7 +374,7 @@
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};

@@ -359,8 +396,16 @@ class Queue {
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
size_t ocl_version = device.VersionNumber();
if (ocl_version >= 200)
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
@@ -611,15 +656,15 @@ class Kernel {

// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
0, nullptr, &(event())));
0, nullptr, event));
}

// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }

@@ -632,15 +677,16 @@ class Kernel {
// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
waitForEventsPlain.size(), waitForEventsPlain.data(),
&(event())));
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
event));
}

// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
0, nullptr, event));
}

// Accessor to the private data-member
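The new binary-based Program constructor above pairs with the existing Program::GetIR() getter: a program can be compiled from source once, its device binary retrieved, and later reconstructed without recompiling from source (OpenCL back-end only, as noted in the changelog). A hedged sketch, assuming context, device, and a kernel source string already exist, with a kernel name "my_kernel" that is purely illustrative:

    // Builds from source once and caches the device binary
    auto program = CLCudaAPI::Program(context, source);
    auto options = std::vector<std::string>{};
    program.Build(device, options);
    auto binary = program.GetIR();

    // Later: recreates the program from the cached binary; note the extra Device
    // argument taken by the new constructor. A build step is still needed so the
    // OpenCL runtime links the binary before kernels can be created from it.
    auto cached_program = CLCudaAPI::Program(device, context, binary);
    cached_program.Build(device, options);
    auto kernel = CLCudaAPI::Kernel(cached_program, "my_kernel");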
34 changes: 26 additions & 8 deletions include/cupp11.h
@@ -12,7 +12,7 @@
// Portability here means that a similar header exists for OpenCL with the same classes and
// interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
//
// This is version 5.0 of CLCudaAPI.
// This is version 6.0 of CLCudaAPI.
//
// =================================================================================================
//
@@ -107,13 +107,14 @@ class Event {
// Accessors to the private data-members
const CUevent& start() const { return *start_; }
const CUevent& end() const { return *end_; }
Event* pointer() { return this; }
private:
std::shared_ptr<CUevent> start_;
std::shared_ptr<CUevent> end_;
};

// Pointer to a CUDA event
using EventPointer = CUevent*;
using EventPointer = Event*;

// =================================================================================================

@@ -160,6 +161,11 @@ class Device {
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
size_t VersionNumber() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return static_cast<size_t>(result);
}
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Name() const {
auto result = std::string{};
@@ -207,6 +213,14 @@ class Device {
return true;
}

// Query for a specific type of device or brand
bool IsCPU() const { return false; }
bool IsGPU() const { return true; }
bool IsAMD() const { return false; }
bool IsNVIDIA() const { return true; }
bool IsIntel() const { return false; }
bool IsARM() const { return false; }

// Accessor to the private data-member
const CUdevice& operator()() const { return device_; }
private:
@@ -240,10 +254,14 @@ class Context {

// Accessor to the private data-member
const CUcontext& operator()() const { return *context_; }
CUcontext* pointer() const { return &(*context_); }
private:
std::shared_ptr<CUcontext> context_;
};

// Pointer to a CUDA context
using ContextPointer = CUcontext*;

// =================================================================================================

// Enumeration of build statuses of the run-time compilation process
@@ -254,7 +272,7 @@ class Program {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state

// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &, std::string source):
program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p));
delete p; }),
@@ -556,7 +574,7 @@ class Kernel {

// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {

// Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
auto grid = std::vector<size_t>{1, 1, 1};
@@ -572,24 +590,24 @@
}

// Launches the kernel, its execution time is recorded by events
CheckError(cuEventRecord(event.start(), queue()));
CheckError(cuEventRecord(event->start(), queue()));
CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
0, queue(), pointers.data(), nullptr));
CheckError(cuEventRecord(event.end(), queue()));
CheckError(cuEventRecord(event->end(), queue()));
}

// As above, but with an event waiting list
// TODO: Implement this function
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
Error("launching with an event waiting list is not implemented for the CUDA back-end");
}

// As above, but with the default local workgroup size
// TODO: Implement this function
void Launch(const Queue &, const std::vector<size_t> &, Event &) {
void Launch(const Queue &, const std::vector<size_t> &, EventPointer) {
Error("launching with a default workgroup size is not implemented for the CUDA back-end");
}

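Note that the CUDA back-end answers the new queries by construction: IsGPU() and IsNVIDIA() always return true, the other Is* getters always return false, and VersionNumber() reports the driver version from cuDriverGetVersion (for example 7050 for CUDA 7.5), whereas the OpenCL back-end returns 100 times the device's OpenCL version (for example 200 for OpenCL 2.0). Portable code should therefore treat the number as back-end specific. A small sketch, not part of this commit, that compiles against either header:

    #include <cstdio>
    #include "clpp11.h"  // or "cupp11.h": both expose the same Device interface

    void PrintDeviceSummary(const CLCudaAPI::Device &device) {
      // Dispatches on the 6.0 vendor/type queries; the printed wording is illustrative
      if (device.IsNVIDIA()) {
        printf("NVIDIA %s, back-end version number %zu\n",
               device.IsGPU() ? "GPU" : "device", device.VersionNumber());
      } else if (device.IsAMD() || device.IsIntel() || device.IsARM()) {
        printf("%s %s, OpenCL version number %zu\n", device.Vendor().c_str(),
               device.IsCPU() ? "CPU" : "device", device.VersionNumber());
      } else {
        printf("Unrecognised vendor '%s'\n", device.Vendor().c_str());
      }
    }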
2 changes: 1 addition & 1 deletion samples/advanced.cc
@@ -179,7 +179,7 @@ int main() {
// Enqueues the kernel and waits for the result. Note that launching the kernel is always
// asynchronous and thus requires finishing the queue in order to complete the operation.
printf("## Running the kernel...\n");
kernel.Launch(queue, global, local, event);
kernel.Launch(queue, global, local, event.pointer());
queue.Finish(event);
printf(" > Took %.3lf ms\n", event.GetElapsedTime());

2 changes: 1 addition & 1 deletion samples/simple.cc
@@ -128,7 +128,7 @@ int main() {
// Enqueues the kernel and waits for the result. Note that launching the kernel is always
// asynchronous and thus requires finishing the queue in order to complete the operation.
printf("## Running the kernel...\n");
kernel.Launch(queue, global, local, event);
kernel.Launch(queue, global, local, event.pointer());
queue.Finish(event);
printf(" > Took %.3lf ms\n", event.GetElapsedTime());

2 changes: 1 addition & 1 deletion samples/smallest.cc
@@ -73,7 +73,7 @@ int main() {
program.Build(device, compiler_options);
auto kernel = CLCudaAPI::Kernel(program, "add");
kernel.SetArguments(a, b, c);
kernel.Launch(queue, {elements}, {128}, event);
kernel.Launch(queue, {elements}, {128}, event.pointer());
queue.Finish(event);

// Reads the results back to the host memory
