
Commit ded6fb0

ssnl authored and facebook-github-bot committed

Add stack & cat support for CPU Half (pytorch#16389)

Summary: Fixes pytorch#6968. Needed for pytorch#14705.
Pull Request resolved: pytorch#16389
Differential Revision: D13861446
Pulled By: gchanan
fbshipit-source-id: 7b8700b95aaf252d9669693dbddccb2302e58409

1 parent d79e45b commit ded6fb0
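
For context, the user-visible effect of this change: torch.cat and torch.stack now work on CPU half-precision tensors (stack lowers to cat of unsqueezed inputs, so enabling the cat kernel covers both). A minimal illustrative check, assuming a build that includes this commit:

    import torch

    a = torch.randn(2, 3).half()   # CPU float16 tensor
    b = torch.randn(2, 3).half()

    # Both of these raised an error on CPU Half before this commit.
    catted = torch.cat([a, b], dim=0)      # shape [4, 3], dtype torch.float16
    stacked = torch.stack([a, b], dim=0)   # shape [2, 2, 3], dtype torch.float16
    print(catted.shape, stacked.shape)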

File tree

7 files changed: +201 -187 lines changed


aten/src/ATen/Declarations.cwrap (+1)

@@ -2872,6 +2872,7 @@
   name: _th_cat
   cname: catArray
   variants: [function]
+  cpu_half: True
   return: self
   arguments:
     - arg: THTensor* self

aten/src/TH/generic/THTensor.cpp (+125)

@@ -668,6 +668,131 @@ scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3)
   return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3));
 }

+
+/* Shape manipulation methods */
+void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension)
+{
+  THTensor* inputs[2];
+  inputs[0] = ta;
+  inputs[1] = tb;
+  THTensor_(catArray)(r_, inputs, 2, dimension);
+}
+
+void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension);
+inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension)
+{
+  int first_dims = first->dim();
+  int second_dims = second->dim();
+  THArgCheck(first_dims == second_dims, 0,
+      "Tensors must have same number of dimensions: got %d and %d",
+      first_dims, second_dims);
+  for (int dim = 0; dim < first_dims; dim++) {
+    if (dim == dimension) {
+      continue;
+    }
+    int64_t first_dim_size = first->size(dim);
+    int64_t second_dim_size = second->size(dim);
+    THArgCheck(first_dim_size == second_dim_size, 0,
+        "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d",
+        dimension, (long long)first_dim_size, (long long)second_dim_size, dim);
+  }
+}
+
+void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension)
+{
+  // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
+  // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
+  // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific
+  // size (i.e. other empty sizes are not skipped).
+  // FIXME: warn if this is the case
+  bool allSkipped = true;
+  int64_t nDims = 0;
+  THTensor *notSkippedTensor;  // non-owning reference
+  auto should_skip = [](THTensor *t) { return t->is_empty() && t->dim() == 1; };
+  for (int i = 0; i < numInputs; i++) {
+    if (should_skip(inputs[i])) {
+      continue;
+    }
+    // We've found a non-empty tensor
+    allSkipped = false;
+    notSkippedTensor = inputs[i];
+    nDims = notSkippedTensor->dim();
+    break;
+  }
+  if (allSkipped) {
+    return;
+  }
+
+  // Compute cat_dimension based on the non-empty tensor
+  THArgCheck(dimension < nDims, 4, "invalid dimension %d", dimension);
+  THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
+
+  // Compute size of the result in the cat dimension
+  int64_t cat_dim_size = 0;
+  for (int i = 0; i < numInputs; i++) {
+    THTensor *tensor = inputs[i];
+    if (should_skip(tensor)) {
+      continue;
+    }
+    THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension);
+    cat_dim_size += tensor->size(dimension);
+  }
+
+  // Compute the size of the result
+  std::vector<int64_t> size(nDims);
+  for (int dim = 0; dim < nDims; dim++) {
+    int64_t result_dim_size = notSkippedTensor->size(dim);
+    if (dim == dimension) {
+      result_dim_size = cat_dim_size;
+    }
+    size[dim] = result_dim_size;
+  }
+  THTensor_(resize)(result, size, {});
+
+  // Check contiguity of all inputs and result
+  bool allContiguous = true;
+  for (int i = 0; i < numInputs; i++) {
+    if (!should_skip(inputs[i])) {
+      allContiguous = allContiguous && THTensor_(isContiguous)(inputs[i]);
+    }
+  }
+  allContiguous = allContiguous && THTensor_(isContiguous)(result);
+
+  // First path is for contiguous inputs along dim 0;
+  // second path is for non-contiguous inputs.
+  int64_t offset;
+  if (dimension == 0 && allContiguous) {
+    scalar_t* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset();
+    offset = 0;
+    for (int j = 0; j < numInputs; j++) {
+      if (!should_skip(inputs[j])) {
+        THTensor* input0 = inputs[j];
+        scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
+        int64_t input0_size = THTensor_(nElement)(input0);
+        // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this.
+        if (input0_size != 0) {
+          memcpy(result_data + offset, input0_data, input0_size*sizeof(scalar_t));
+        }
+        offset += input0_size;
+      }
+    }
+  } else {
+    offset = 0;
+    for (int j = 0; j < numInputs; j++) {
+      if (!should_skip(inputs[j])) {
+        int64_t dimSize = inputs[j]->size(dimension);
+        THTensor *nt = THTensor_(newWithTensor)(result);
+        THTensor_(narrow)(nt, NULL, dimension, offset, dimSize);
+        at::Tensor nt__wrap = THTensor_wrap(nt);
+        at::Tensor inputs_wrap = THTensor_wrap(inputs[j]);
+        at::_copy_same_type_(nt__wrap, inputs_wrap);
+        c10::raw::intrusive_ptr::decref(nt);
+        offset += dimSize;
+      }
+    }
+  }
+}
+
 THDescBuff THTensor_(desc)(const THTensor *tensor) {
   const int L = TH_DESC_BUFF_LEN;
   THDescBuff buf;
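
To make the control flow above easier to follow, here is a rough Python sketch of what catArray does: the legacy skip rule for 1-D empty tensors, the shape check, the result sizing, and the per-input copy. The name cat_array_sketch is hypothetical, and the real C code additionally takes a memcpy fast path when dimension == 0 and all tensors involved are contiguous:

    import torch

    def cat_array_sketch(inputs, dim):
        # Legacy rule: 1-D empty tensors (the old size-[0] "empty") are skipped
        # for backwards compatibility.
        def should_skip(t):
            return t.dim() == 1 and t.numel() == 0

        kept = [t for t in inputs if not should_skip(t)]
        if not kept:
            return None  # all inputs skipped; the C code leaves `result` untouched

        ref = kept[0]  # first non-skipped tensor, like notSkippedTensor
        assert dim < ref.dim(), "invalid dimension"
        # Shapes must match in every dimension except `dim` (check_shape_except_dim).
        for t in kept:
            assert t.dim() == ref.dim(), "tensors must have same number of dimensions"
            for d in range(ref.dim()):
                assert d == dim or t.size(d) == ref.size(d), "size mismatch"

        # Result shape: ref's shape, with the cat dimension summed over inputs.
        size = list(ref.size())
        size[dim] = sum(t.size(dim) for t in kept)
        result = torch.empty(size, dtype=ref.dtype)

        # Generic path: copy each input into the matching narrow of the result.
        offset = 0
        for t in kept:
            result.narrow(dim, offset, t.size(dim)).copy_(t)
            offset += t.size(dim)
        return result

    # Agrees with torch.cat on this example:
    print(cat_array_sketch([torch.ones(2, 2), torch.zeros(1, 2)], 0))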

aten/src/TH/generic/THTensor.h (+4)

@@ -125,6 +125,10 @@ TH_API scalar_t THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1);
 TH_API scalar_t THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2);
 TH_API scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3);

+/* Shape manipulation methods */
+TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
+TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
+
 /* Debug methods */
 TH_API THDescBuff THTensor_(desc)(const THTensor *tensor);
 TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor);

aten/src/TH/generic/THTensorMath.h (-2)

@@ -103,8 +103,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
 TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
 TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
 TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
-TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
-TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);

 TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);
aten/src/TH/generic/THTensorMoreMath.cpp

-123
Original file line numberDiff line numberDiff line change
@@ -1238,129 +1238,6 @@ void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
12381238
}
12391239
}
12401240

1241-
void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension)
1242-
{
1243-
THTensor* inputs[2];
1244-
inputs[0] = ta;
1245-
inputs[1] = tb;
1246-
THTensor_(catArray)(r_, inputs, 2, dimension);
1247-
}
1248-
1249-
void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension);
1250-
inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension)
1251-
{
1252-
int first_dims = first->dim();
1253-
int second_dims = second->dim();
1254-
THArgCheck(first_dims == second_dims, 0,
1255-
"Tensors must have same number of dimensions: got %d and %d",
1256-
first_dims, second_dims);
1257-
for (int dim = 0; dim < first_dims; dim++) {
1258-
if (dim == dimension) {
1259-
continue;
1260-
}
1261-
int64_t first_dim_size = first->size(dim);
1262-
int64_t second_dim_size = second->size(dim);
1263-
THArgCheck(first_dim_size == second_dim_size, 0,
1264-
"Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d",
1265-
dimension, (long long)first_dim_size, (long long)second_dim_size, dim);
1266-
}
1267-
}
1268-
1269-
void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension)
1270-
{
1271-
// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
1272-
// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
1273-
// to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific
1274-
// size (i.e. other empty sizes are not skipped).
1275-
// FIXME: warn if this is the case
1276-
bool allSkipped= true;
1277-
int64_t nDims = 0;
1278-
THTensor *notSkippedTensor; // non-owning reference
1279-
auto should_skip = [](THTensor *t) { return t->is_empty() && t->dim() == 1; };
1280-
for (int i = 0; i < numInputs; i++) {
1281-
if (should_skip(inputs[i])) {
1282-
continue;
1283-
}
1284-
// We've found a non-empty tensor
1285-
allSkipped = false;
1286-
notSkippedTensor = inputs[i];
1287-
nDims = notSkippedTensor->dim();
1288-
break;
1289-
}
1290-
if (allSkipped) {
1291-
return;
1292-
}
1293-
1294-
// Compute cat_dimension based on the non-empty tensor
1295-
THArgCheck(dimension < nDims, 4, "invalid dimension %d", dimension);
1296-
THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
1297-
1298-
// Compute size of the result in the cat dimension
1299-
int64_t cat_dim_size = 0;
1300-
for (int i = 0; i < numInputs; i++) {
1301-
THTensor *tensor = inputs[i];
1302-
if (should_skip(tensor)) {
1303-
continue;
1304-
}
1305-
THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension);
1306-
cat_dim_size += tensor->size(dimension);
1307-
}
1308-
1309-
// Compute the size of the result
1310-
std::vector<int64_t> size(nDims);
1311-
for (int dim = 0; dim < nDims; dim++) {
1312-
int64_t result_dim_size = notSkippedTensor->size(dim);
1313-
if (dim == dimension) {
1314-
result_dim_size = cat_dim_size;
1315-
}
1316-
size[dim] = result_dim_size;
1317-
}
1318-
THTensor_(resize)(result, size, {});
1319-
1320-
// Check contiguity of all inputs and result
1321-
bool allContiguous = true;
1322-
for (int i = 0; i < numInputs; i++) {
1323-
if(!should_skip(inputs[i])) {
1324-
allContiguous = allContiguous && THTensor_(isContiguous)(inputs[i]);
1325-
}
1326-
}
1327-
allContiguous = allContiguous && THTensor_(isContiguous)(result);
1328-
1329-
// First path is for contiguous inputs along dim 0
1330-
// Second path for non-contiguous
1331-
int64_t offset;
1332-
if (dimension == 0 && allContiguous) {
1333-
scalar_t* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset();
1334-
offset = 0;
1335-
for (int j = 0; j < numInputs; j++) {
1336-
if (!should_skip(inputs[j])) {
1337-
THTensor* input0 = inputs[j];
1338-
scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
1339-
int64_t input0_size = THTensor_(nElement)(input0);
1340-
// C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this.
1341-
if (input0_size != 0) {
1342-
memcpy(result_data + offset, input0_data, input0_size*sizeof(scalar_t));
1343-
}
1344-
offset += input0_size;
1345-
}
1346-
}
1347-
} else {
1348-
offset = 0;
1349-
for (int j = 0; j < numInputs; j++) {
1350-
if (!should_skip(inputs[j])) {
1351-
int64_t dimSize = inputs[j]->size(dimension);
1352-
THTensor *nt = THTensor_(newWithTensor)(result);
1353-
THTensor_(narrow)(nt, NULL, dimension, offset, dimSize);
1354-
at::Tensor nt__wrap = THTensor_wrap(nt);
1355-
at::Tensor inputs_wrap = THTensor_wrap(inputs[j]);
1356-
at::_copy_same_type_(nt__wrap, inputs_wrap);
1357-
c10::raw::intrusive_ptr::decref(nt);
1358-
offset += dimSize;
1359-
}
1360-
}
1361-
}
1362-
}
1363-
13641241
int THTensor_(equal)(THTensor *ta, THTensor* tb)
13651242
{
13661243
int equal = 1;

test/common_utils.py (+21 -15)

@@ -392,22 +392,28 @@ def assertEqual(self, x, y, prec=None, message='', allow_inf=False):
         def assertTensorsEqual(a, b):
             super(TestCase, self).assertEqual(a.size(), b.size(), message)
             if a.numel() > 0:
-                b = b.type_as(a)
-                b = b.cuda(device=a.get_device()) if a.is_cuda else b.cpu()
-                # check that NaNs are in the same locations
-                nan_mask = a != a
-                self.assertTrue(torch.equal(nan_mask, b != b), message)
+                if a.device.type == 'cpu' and a.dtype == torch.float16:
+                    # CPU half tensors don't have the methods we need below
+                    a = a.to(torch.float32)
+                if TEST_WITH_ROCM:
+                    # Workaround for bug https://github.com/pytorch/pytorch/issues/16448
+                    # TODO: remove after the bug is resolved.
+                    b = b.to(a.dtype).to(a.device)
+                else:
+                    b = b.to(a)
                 diff = a - b
-                diff[nan_mask] = 0
-                # inf check if allow_inf=True
-                if allow_inf:
-                    inf_mask = (a == float("inf")) | (a == float("-inf"))
-                    self.assertTrue(torch.equal(inf_mask,
-                                                (b == float("inf")) | (b == float("-inf"))),
-                                    message)
-                    diff[inf_mask] = 0
-                # TODO: implement abs on CharTensor
-                if diff.is_signed() and 'CharTensor' not in diff.type():
+                if a.is_floating_point():
+                    # check that NaNs are in the same locations
+                    nan_mask = torch.isnan(a)
+                    self.assertTrue(torch.equal(nan_mask, torch.isnan(b)), message)
+                    diff[nan_mask] = 0
+                    # inf check if allow_inf=True
+                    if allow_inf:
+                        inf_mask = torch.isinf(a)
+                        self.assertTrue(torch.equal(inf_mask, torch.isinf(b)), message)
+                        diff[inf_mask] = 0
+                # TODO: implement abs on CharTensor (int8)
+                if diff.is_signed() and diff.dtype != torch.int8:
                     diff = diff.abs()
                 max_err = diff.max()
                 self.assertLessEqual(max_err, prec, message)
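
Read as a standalone function, the updated comparison amounts to the following sketch; max_abs_err is a hypothetical name (the real logic lives inside assertTensorsEqual and uses unittest assertions), and it assumes a and b already have equal shapes:

    import torch

    def max_abs_err(a, b, allow_inf=False):
        if a.device.type == 'cpu' and a.dtype == torch.float16:
            a = a.to(torch.float32)  # CPU half lacked some of the ops below at this commit
        b = b.to(a)                  # match b to a's dtype and device
        diff = a - b
        if a.is_floating_point():
            # NaNs (and optionally infs) must line up; zero them out of the diff.
            nan_mask = torch.isnan(a)
            assert torch.equal(nan_mask, torch.isnan(b)), "NaNs in different locations"
            diff[nan_mask] = 0
            if allow_inf:
                inf_mask = torch.isinf(a)
                assert torch.equal(inf_mask, torch.isinf(b)), "infs in different locations"
                diff[inf_mask] = 0
        if diff.is_signed() and diff.dtype != torch.int8:
            diff = diff.abs()        # abs skipped for int8, per the TODO above
        return diff.max()

    # Example: NaNs in matching positions are ignored.
    x = torch.tensor([1.0, float('nan'), 3.0])
    y = torch.tensor([1.5, float('nan'), 3.0])
    print(max_abs_err(x, y))  # tensor(0.5000)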
