Merge pull request #3 from CNugteren/development

Updated to version 4.0

CNugteren committed Nov 1, 2015
2 parents 793c5b9 + 159c426 commit 0ad67ce
Showing 11 changed files with 506 additions and 117 deletions.
20 changes: 15 additions & 5 deletions CHANGELOG
@@ -1,4 +1,14 @@

Version 4.0 (2015-11-01):
- Made 'CopyTo' and 'CopyToAsync' constant methods
- Added offset support to the Buffer class (credits go to 'ielhelw')
- Added unit tests for {Event, Device, Context, Queue} classes
- Added compact OpenCL example
- Fixed compiler warnings and errors for Windows using MSVC
- Fixed several general compiler warnings
- Added new methods to the API:
* Device::MaxAllocSize

Version 3.0 (2015-09-04):
- Renamed the project from 'Claduc' into 'CLCudaAPI'
- SetArgument now takes both l-value and r-value arguments
@@ -13,13 +23,13 @@ Version 2.0 (2015-07-13):
- Allows device program string to be moved into Program at construction
- Cleaned-up device-information methods
- Added new methods to the API:
* Device::CoreClock,
* Device::ComputeUnits,
* Device::MemorySize,
* Device::MemoryClock,
* Device::CoreClock
* Device::ComputeUnits
* Device::MemorySize
* Device::MemoryClock
* Device::MemoryBusWidth
* Program::GetIR
* Kernel::SetArguments

Version 1.0 (2015-07-09):
- Initial version
- Initial version
10 changes: 7 additions & 3 deletions CMakeLists.txt
@@ -31,7 +31,7 @@
# CMake project details
cmake_minimum_required(VERSION 2.8.10)
project("CLCudaAPI" CXX)
set(CLCudaAPI_VERSION_MAJOR 3)
set(CLCudaAPI_VERSION_MAJOR 4)
set(CLCudaAPI_VERSION_MINOR 0)

# ==================================================================================================
@@ -74,7 +74,11 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()

# C++ compiler settings
set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(FLAGS "/Ox")
else ()
set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")

# ==================================================================================================
@@ -109,7 +113,7 @@ endif()
# ==================================================================================================

# Adds the sample programs
set(SAMPLE_PROGRAMS device_info simple advanced)
set(SAMPLE_PROGRAMS device_info simple advanced smallest)
foreach(SAMPLE ${SAMPLE_PROGRAMS})
add_executable(${SAMPLE} samples/${SAMPLE}.cc)
if(USE_OPENCL)
8 changes: 4 additions & 4 deletions README.md
@@ -13,9 +13,9 @@ What does it look like?
To get started, include either of the two headers:

```c++
#include <clpp11.h>
#include "clpp11.h"
// or:
#include <cupp11.h>
#include "cupp11.h"
```

Here is a simple example of setting up platform 0 and selecting device 2:
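(The example itself is collapsed in this diff view. Below is a minimal sketch of such a setup, based on the classes from `clpp11.h` shown further down; the `CLCudaAPI` namespace and the `Platform(size_t)` constructor are assumptions, not visible in this hunk.)

```c++
#include <cstdio>
#include "clpp11.h"

int main() {
  // Select platform 0 and device 2 on that platform. The CLCudaAPI namespace
  // and the Platform(size_t) constructor are assumed here; the
  // Device(platform, device_id) constructor appears in clpp11.h below.
  auto platform = CLCudaAPI::Platform(size_t{0});
  auto device = CLCudaAPI::Device(platform, size_t{2});
  printf("Found %zu device(s) on platform 0\n", platform.NumDevices());
  return 0;
}
```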
@@ -67,9 +67,9 @@ What are the pre-requisites?

The requirements to use the CLCudaAPI headers are:

* CUDA 7.0 or higher (for run-time compilation)
* CUDA 7.0 or higher
* OpenCL 1.1 or higher
* A C++11 compiler (e.g. GCC 4.7 or newer)
* A C++11 compiler (e.g. GCC 4.7, Clang 3.3, MSVC 2015 or newer)

If you also want to compile the samples and tests using the provided infrastructure, you'll also need:

9 changes: 6 additions & 3 deletions doc/api.md
@@ -68,7 +68,10 @@ Retrieves the device's core clock frequency in MHz.
Retrieves the number of compute units (OpenCL terminology) or multi-processors (CUDA terminology) in the device.

* `size_t MemorySize() const`:
Retrieves the global memory size (CUDA back-end) or the maximum amount of allocatable global memory per allocation (OpenCL back-end).
Retrieves the total global memory size.

* `size_t MaxAllocSize() const`:
Retrieves the maximum amount of allocatable global memory per allocation.

* `size_t MemoryClock() const`:
Retrieves the device's memory clock frequency in MHz (CUDA back-end) or 0 (OpenCL back-end).
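Taken together, these two queries separate the total amount of global memory from the largest single allocation. A minimal sketch, assuming `device` is an already-constructed device object (the `CLCudaAPI` namespace is an assumption):

```c++
// 'device' is assumed to be a constructed CLCudaAPI::Device
const auto total_bytes = device.MemorySize();    // total global memory (bytes)
const auto alloc_bytes = device.MaxAllocSize();  // per-allocation limit (bytes)
const auto clock_mhz   = device.CoreClock();     // core clock frequency (MHz)
```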
@@ -195,10 +198,10 @@ Copies `size` elements from a host buffer to the current device buffer. The devi
`void Write(const Queue &queue, const size_t size, const BufferHost<T> &host)`:
As above, but now completes the operation before returning.

* `void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination)`:
* `void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const`:
Copies `size` elements from the current device buffer to another device buffer given by `destination`. The destination buffer has to be pre-allocated with a size of at least `size` elements. This method is asynchronous: it can return before the copy operation is completed.

* `void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination)`:
* `void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const`:
As above, but now completes the operation before returning.

* `size_t GetSize() const`:
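Combined with the offset support added to the Buffer class in `clpp11.h` below, a minimal usage sketch; the `CLCudaAPI` namespace is an assumption, and `context` and `queue` are assumed to be already-constructed objects:

```c++
// 'context' and 'queue' are assumed to be constructed CLCudaAPI objects
auto host = std::vector<float>(512, 1.0f);
auto source = CLCudaAPI::Buffer<float>(context, CLCudaAPI::BufferAccess::kReadWrite, 1024);
auto target = CLCudaAPI::Buffer<float>(context, CLCudaAPI::BufferAccess::kReadWrite, 1024);

source.Write(queue, 512, host, 256);  // writes 512 elements starting at element offset 256
source.CopyTo(queue, 1024, target);   // device-to-device copy; callable on a const Buffer
```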
85 changes: 49 additions & 36 deletions include/clpp11.h
@@ -12,7 +12,7 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This is version 3.0 of CLCudaAPI.
// This is version 4.0 of CLCudaAPI.
//
// =================================================================================================
//
@@ -75,7 +75,7 @@ class Event {
explicit Event(const cl_event event): event_(event) { }

// Regular constructor
explicit Event() { }
explicit Event(): event_(nullptr) { }

// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
@@ -122,7 +122,7 @@ class Platform {
size_t NumDevices() const {
auto result = cl_uint{0};
CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
return result;
return static_cast<size_t>(result);
}

// Accessor to the private data-member
@@ -145,7 +145,8 @@ class Device {
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr));
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@@ -177,6 +178,7 @@ class Device {
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL

@@ -347,7 +349,12 @@ class Queue {
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#ifdef CL_VERSION_2_0
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
CheckError(status);
}

@@ -447,73 +454,79 @@ class Buffer {
// Constructs a new buffer based on an existing host-container
template <typename Iterator>
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
Buffer(context, BufferAccess::kReadWrite, end - start) {
auto size = end - start;
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
auto size = static_cast<size_t>(end - start);
auto pointer = &*start;
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
nullptr, nullptr));
queue.Finish();
}

// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}

// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host) {
ReadAsync(queue, size, host);
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}

// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host) {
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}

// Copies from host to device: writing the device buffer
void Write(const Queue &queue, const size_t size, const T* host) {
WriteAsync(queue, size, host);
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
WriteAsync(queue, size, host, offset);
queue.Finish();
}
void Write(const Queue &queue, const size_t size, const std::vector<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}

// Copies the contents of this buffer into another device buffer
void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) {
void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
nullptr, nullptr));
}
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) {
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
queue.Finish();
}