Remove ActorMsg::user_data
liujuncheng committed Jan 16, 2023
2 parents f939cae + 1e1bddf commit 0559f15
Showing 95 changed files with 2,914 additions and 805 deletions.
2 changes: 2 additions & 0 deletions cmake/cuda.cmake
@@ -40,6 +40,7 @@ if(BUILD_CUDA)
   if(OF_CUDA_LINK_DYNAMIC_LIBRARY)
     list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cublas)
     list(APPEND VENDOR_CUDA_LIBRARIES CUDA::curand)
+    list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cusolver)
     list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cufft)
     if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
       list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cublasLt)
@@ -53,6 +54,7 @@ if(BUILD_CUDA)
     list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cublas_static)
     list(APPEND VENDOR_CUDA_LIBRARIES CUDA::curand_static)
    list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cufft_static)
+    list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cusolver_static)
     if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
       list(APPEND VENDOR_CUDA_LIBRARIES CUDA::cublasLt_static)
     endif()
4 changes: 2 additions & 2 deletions cmake/oneflow.cmake
@@ -190,7 +190,7 @@ generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GE
                                          FUNCTIONAL_PYBIND11_SRCS ${PROJECT_SOURCE_DIR})
 oneflow_add_library(of_functional_obj STATIC ${FUNCTIONAL_GENERATED_SRCS}
                     ${FUNCTIONAL_GENERATED_HRCS})
-target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog)
+target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog fmt)
 add_dependencies(of_functional_obj prepare_oneflow_third_party)

 if(BUILD_PYTHON)
@@ -207,7 +207,7 @@ if(BUILD_PYTHON)
     of_functional_tensor_obj STATIC ${FUNCTIONAL_TENSOR_GENERATED_SRCS}
     ${FUNCTIONAL_TENSOR_GENERATED_HRCS} ${FUNCTIONAL_OPS_GENERATED_SRCS}
     ${FUNCTIONAL_OPS_GENERATED_HRCS})
-  target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog)
+  target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog fmt)
   add_dependencies(of_functional_tensor_obj prepare_oneflow_third_party)
   target_include_directories(of_functional_tensor_obj PRIVATE ${Python_INCLUDE_DIRS}
                                                               ${Python_NumPy_INCLUDE_DIRS})
2 changes: 1 addition & 1 deletion cmake/op_schema.cmake
@@ -87,5 +87,5 @@ set_source_files_properties(${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP}
                             TRUE)

 oneflow_add_library(of_op_schema OBJECT ${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP})
-target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog)
+target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog fmt)
 add_dependencies(of_op_schema prepare_oneflow_third_party)
4 changes: 2 additions & 2 deletions cmake/third_party/trt_flash_attention.cmake
@@ -5,10 +5,10 @@ find_package(Threads)
 set(TRT_FLASH_ATTENTION_PROJECT trt_flash_attention)

 set(TRT_FLASH_ATTENTION_URL
-    https://github.com/Oneflow-Inc/trt_flash_attention/archive/dac29803b711535ed11fcd6ca4a0acfb0f2f8d12.zip
+    https://github.com/Oneflow-Inc/trt_flash_attention/archive/d8b74631eb811c95a0d20f247238db6e91acafe3.zip
 )
 use_mirror(VARIABLE TRT_FLASH_ATTENTION_URL URL ${TRT_FLASH_ATTENTION_URL})
-set(TRT_FLASH_ATTENTION_MD5 85d2bcb87f21a58cd8c4dbfa4ae8f2a8)
+set(TRT_FLASH_ATTENTION_MD5 9e0e822ce1450e11515533fbe32e58a9)

 set(TRT_FLASH_ATTENTION_INSTALL_DIR ${THIRD_PARTY_DIR}/trt_flash_attention)
 set(TRT_FLASH_ATTENTION_INCLUDE_DIR ${TRT_FLASH_ATTENTION_INSTALL_DIR}/include CACHE PATH "" FORCE)
1 change: 1 addition & 0 deletions docs/source/linalg.rst
@@ -20,3 +20,4 @@ Matrix Properties
     diagonal
     inv
     cross
+    det
1 change: 1 addition & 0 deletions docs/source/oneflow.rst
@@ -317,6 +317,7 @@ Reduction Ops
     min
     mean
     median
+    mode
     prod
     nansum
     std
1 change: 1 addition & 0 deletions docs/source/tensor.rst
@@ -285,6 +285,7 @@ Tensor class reference
     Tensor.min
     Tensor.minimum
     Tensor.mish
+    Tensor.mode
     Tensor.mul
     Tensor.mul_
     Tensor.nansum
3 changes: 3 additions & 0 deletions external/fmt/CMakeLists.txt
@@ -6,6 +6,9 @@ FetchContent_Declare(fmt URL ${FMT_URL} URL_HASH MD5=${FMT_MD5})

 FetchContent_MakeAvailable(fmt)

+# Clang doesn't support __float128 when compiling CUDA
+target_compile_definitions(fmt PUBLIC FMT_USE_FLOAT128=0)
+
 install(
   TARGETS fmt
   EXPORT oneflow
2 changes: 2 additions & 0 deletions oneflow/api/python/env/env.cpp
@@ -23,6 +23,7 @@ limitations under the License.
 #include "oneflow/core/vm/virtual_machine.h"
 #include "oneflow/core/framework/shut_down_util.h"
 #include "oneflow/core/device/cuda_util.h"
+#include "oneflow/core/common/mem_util.h"

 #ifdef WITH_CUDA
 #include <cuda.h>
@@ -94,6 +95,7 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
   m.def("SetCudaDeviceIndex", &SetCudaDeviceIndex);
   m.def("CudaSynchronize", &CudaSynchronize);
   m.def("GetCUDAMemoryUsed", &GetCUDAMemoryUsed);
+  m.def("GetCPUMemoryUsed", &GetCPUMemoryUsed);
   m.def(
       "_get_device_properties",
       [](int device) -> cudaDeviceProp* { return GetDeviceProperties(device); },
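A minimal sketch of how the new binding is reached from Python; the oneflow._oneflow_internal module path is an assumption based on OneFlow's usual pybind11 layout, not something stated in this diff:

import oneflow as flow

# GetCPUMemoryUsed() first synchronizes the local rank's VM, then reports the
# process resident set size via ProcessMemUsage (see mem_util.cpp below).
rss = flow._oneflow_internal.GetCPUMemoryUsed()
print(f"CPU memory used (RSS): {rss}")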
25 changes: 25 additions & 0 deletions oneflow/api/python/framework/tensor.cpp
@@ -241,6 +241,28 @@ static PyObject* PyTensorObject_is_pinned(PyObject* self, PyObject* unused) {
   END_HANDLE_ERRORS
 }

+static PyObject* PyTensorObject_offload(PyObject* self, PyObject* unused) {
+  HANDLE_ERRORS
+  const auto& t = PyTensor_Unpack(self);
+  CHECK_JUST(t->offload());
+  Py_RETURN_NONE;
+  END_HANDLE_ERRORS
+}
+
+static PyObject* PyTensorObject_load(PyObject* self, PyObject* unused) {
+  HANDLE_ERRORS
+  const auto& t = PyTensor_Unpack(self);
+  CHECK_JUST(t->load());
+  Py_RETURN_NONE;
+  END_HANDLE_ERRORS
+}
+
+static PyObject* PyTensorObject_is_offloaded(PyObject* self, PyObject* unused) {
+  HANDLE_ERRORS
+  return functional::CastToPyObject(CHECK_JUST(PyTensor_Unpack(self)->is_offloaded()));
+  END_HANDLE_ERRORS
+}
+
 static PyObject* PyTensorObject_is_floating_point(PyObject* self, PyObject* unused) {
   HANDLE_ERRORS
   if (PyTensor_Unpack(self)->dtype()->is_floating_point()) {
@@ -509,6 +531,9 @@ static PyMethodDef PyTensorObject_methods[] = {
     {"contiguous_", PyTensorObject_contiguous_, METH_NOARGS, NULL},
     {"pin_memory", PyTensorObject_pin_memory, METH_NOARGS, NULL},
     {"is_pinned", PyTensorObject_is_pinned, METH_NOARGS, NULL},
+    {"offload", PyTensorObject_offload, METH_NOARGS, NULL},
+    {"load", PyTensorObject_load, METH_NOARGS, NULL},
+    {"is_offloaded", PyTensorObject_is_offloaded, METH_NOARGS, NULL},
     {"is_floating_point", PyTensorObject_is_floating_point, METH_NOARGS, NULL},
     {"requires_grad_", (PyCFunction)PyTensorObject_requires_grad_, METH_VARARGS | METH_KEYWORDS,
      NULL},
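A hypothetical usage sketch for the offload API registered above; the semantics (release device storage, restore it on demand) are inferred from the method names and are not documented in this diff:

import oneflow as flow

t = flow.randn(1024, 1024, device="cuda")
t.offload()       # presumably frees device memory while keeping the data recoverable
assert t.is_offloaded()
t.load()          # presumably materializes the data back on the tensor's device
assert not t.is_offloaded()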
2 changes: 2 additions & 0 deletions oneflow/api/python/framework/tensor_functions.cpp
@@ -318,6 +318,7 @@ DIRECT_PASS_FUNC(PyTensorObject_unsqueeze, functional::unsqueeze)
 DIRECT_PASS_FUNC(PyTensorObject_max, functional::max)
 DIRECT_PASS_FUNC(PyTensorObject_min, functional::min)
 DIRECT_PASS_FUNC(PyTensorObject_median, functional::median)
+DIRECT_PASS_FUNC(PyTensorObject_mode, functional::mode)
 DIRECT_PASS_FUNC(PyTensorObject_pow, functional::pow)
 DIRECT_PASS_FUNC(PyTensorObject_chunk, functional::chunk)
 DIRECT_PASS_FUNC(PyTensorObject_split, functional::split)
@@ -1005,6 +1006,7 @@ PyMethodDef PyTensorObject_extra_methods[] = {
     {"max", (PyCFunction)PyTensorObject_max, METH_VARARGS | METH_KEYWORDS, NULL},
     {"min", (PyCFunction)PyTensorObject_min, METH_VARARGS | METH_KEYWORDS, NULL},
     {"median", (PyCFunction)PyTensorObject_median, METH_VARARGS | METH_KEYWORDS, NULL},
+    {"mode", (PyCFunction)PyTensorObject_mode, METH_VARARGS | METH_KEYWORDS, NULL},
     {"pow", (PyCFunction)PyTensorObject_pow, METH_VARARGS | METH_KEYWORDS, NULL},
     {"chunk", (PyCFunction)PyTensorObject_chunk, METH_VARARGS | METH_KEYWORDS, NULL},
     {"split", (PyCFunction)PyTensorObject_split, METH_VARARGS | METH_KEYWORDS, NULL},
63 changes: 63 additions & 0 deletions oneflow/core/autograd/gradient_funcs/det.cpp
@@ -0,0 +1,63 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/common/container_util.h"
#include "oneflow/core/functional/functional_api.yaml.h"

namespace oneflow {
namespace one {

struct DetCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
size_t input_index = 0;
size_t output_index = 0;
};

class Det : public OpExprGradFunction<DetCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(DetCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs,
const AttrMap& attrs) const override {
ctx->requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
if (ctx->requires_grad) {
ctx->input_index = ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0)));
ctx->output_index = ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 0)));
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const DetCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (ctx->requires_grad) {
const auto& output = JUST(VectorAt(ctx->SavedTensors(), ctx->output_index));
const auto& input = JUST(VectorAt(ctx->SavedTensors(), ctx->input_index));
const auto& dy = JUST(VectorAt(out_grads, 0));
const auto& dy_unsqueeze = JUST(functional::UnsqueezeMultiple(dy, {-2, -1}, dy->ndim() + 2));
const auto& output_unsqueeze =
JUST(functional::UnsqueezeMultiple(output, {-2, -1}, output->ndim() + 2));
JUST(VectorAt(*in_grads, 0)) = JUST(functional::Transpose2dim(
JUST(functional::Mul(
dy_unsqueeze, JUST(functional::Mul(JUST(functional::Inv(input)), output_unsqueeze)))),
-2, -1));
}
return Maybe<void>::Ok();
}
};

REGISTER_OP_EXPR_GRAD_FUNCTION("det", Det);

} // namespace one
} // namespace oneflow
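As a cross-check on the Apply above: the backward implements the standard determinant gradient. With $y = \det(A)$ and incoming gradient $\bar{y}$,

$$\frac{\partial \mathcal{L}}{\partial A} = \bar{y}\,\det(A)\,A^{-\top},$$

which matches the code: functional::Inv(input) is $A^{-1}$, output_unsqueeze broadcasts $\det(A)$ over the trailing two dimensions, dy_unsqueeze does the same for $\bar{y}$, and the final Transpose2dim(..., -2, -1) supplies the transpose.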
62 changes: 62 additions & 0 deletions oneflow/core/autograd/gradient_funcs/mode.cpp
@@ -0,0 +1,62 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/functional/sequence_function.h"
#include "oneflow/core/common/container_util.h"

namespace oneflow {
namespace one {

struct ModeCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
};

class Mode : public OpExprGradFunction<ModeCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(ModeCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs,
const AttrMap& attrs) const override {
ctx->requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
if (ctx->requires_grad) {
ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0)));
ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 1)));
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const ModeCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (ctx->requires_grad) {
in_grads->resize(1);
const auto& input = JUST(VectorAt(ctx->SavedTensors(), 0));
const auto& indices = JUST(functional::Unsqueeze(JUST(VectorAt(ctx->SavedTensors(), 1)), -1));
const auto& dout = JUST(functional::Unsqueeze(JUST(VectorAt(out_grads, 0)), -1));
JUST(VectorAt(*in_grads, 0)) = JUST(functional::DimScatterUpdate(
JUST(functional::Constant(*(input->shape()), Scalar(0), *dout->dtype(),
JUST(dout->device()))),
-1, indices, dout, /*inplace*/ false));
}
return Maybe<void>::Ok();
}
};

REGISTER_OP_EXPR_GRAD_FUNCTION("mode", Mode);

} // namespace one
} // namespace oneflow
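A small Python sketch of what this backward does; it is hypothetical and assumes flow.mode mirrors torch.mode, returning (values, indices) over a dimension, which is consistent with the two outputs captured above:

import oneflow as flow

x = flow.tensor([[1.0, 2.0, 2.0], [3.0, 3.0, 1.0]], requires_grad=True)
values, indices = flow.mode(x, dim=-1)  # values: [2., 3.]; tie-breaking of indices is implementation-defined
values.sum().backward()
# x.grad is zero except for a 1 scattered at each row's mode index --
# exactly the DimScatterUpdate of the unsqueezed out_grad in Apply above.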
14 changes: 14 additions & 0 deletions oneflow/core/common/mem_util.cpp
@@ -14,6 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "oneflow/core/common/mem_util.h"
+#include "oneflow/core/vm/vm_util.h"
+#include "oneflow/core/vm/virtual_machine.h"

 #include <unistd.h>
 #include <sys/sysinfo.h>
@@ -30,6 +32,11 @@ struct ProcStat {
   long rss = 0;
 };

+Maybe<void> CPUSynchronize() {
+  if (Singleton<VirtualMachine>::Get() != nullptr) { return vm::CurrentRankSync(); }
+  return Maybe<void>::Ok();
+}
+
 }  // namespace

 // Reference: https://stackoverflow.com/questions/669438/how-to-get-memory-usage-at-runtime-using-c
@@ -59,4 +66,11 @@ void ProcessMemUsage(double* vm_usage, double* resident_set) {
 #endif  // __linux__
 }

+Maybe<double> GetCPUMemoryUsed() {
+  JUST(CPUSynchronize());
+  double vm_ = 0, rss_ = 0;
+  ProcessMemUsage(&vm_, &rss_);
+  return rss_;
+}
+
 }  // namespace oneflow
2 changes: 2 additions & 0 deletions oneflow/core/common/mem_util.h
@@ -21,9 +21,11 @@ limitations under the License.
 #include <string>

 #include "oneflow/core/common/util.h"
+#include "oneflow/core/common/maybe.h"

 namespace oneflow {
 void ProcessMemUsage(double* vm_usage, double* resident_set);
+Maybe<double> GetCPUMemoryUsed();
 }  // namespace oneflow

 #define LOG_MEM(...) \
1 change: 1 addition & 0 deletions oneflow/core/common/small_vector.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_
 #define ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_

+#include <glog/logging.h>
 #include "llvm/ADT/SmallVector.h"

 namespace oneflow {
33 changes: 21 additions & 12 deletions oneflow/core/common/stride.cpp
@@ -21,20 +21,24 @@ limitations under the License.

 namespace oneflow {

+Stride::Stride(const ShapeView& shape) {
+  const int64_t ndim = shape.NumAxes();
+  resize(ndim);
+  if (ndim > 0 && shape.elem_cnt() > 0) {
+    std::exclusive_scan(shape.rbegin(), shape.rend(), rbegin(), (int64_t)1, std::multiplies<>{});
+  } else if (ndim > 0 && shape.elem_cnt() == 0) {
+    // 0-size shape
+    small_vector<int64_t, kMaxNumDims> tmp_shape(ndim);
+    for (int64_t i = 0; i < ndim; ++i) { tmp_shape[i] = shape.At(i) > 0 ? shape.At(i) : 1; }
+    std::exclusive_scan(tmp_shape.rbegin(), tmp_shape.rend(), rbegin(), (int64_t)1,
+                        std::multiplies<>{});
+  }
+}
+
 Stride::Stride(const Shape& shape) {
   if (shape.is_initialized()) {
-    const int64_t ndim = shape.NumAxes();
-    resize(shape.NumAxes());
-    if (ndim > 0 && shape.elem_cnt() > 0) {
-      std::exclusive_scan(shape.dim_vec().rbegin(), shape.dim_vec().rend(), rbegin(), (int64_t)1,
-                          std::multiplies<>{});
-    } else if (ndim > 0 && shape.elem_cnt() == 0) {
-      // 0-size shape
-      small_vector<int64_t, kMaxNumDims> tmp_shape(ndim);
-      for (int64_t i = 0; i < ndim; ++i) { tmp_shape[i] = shape.At(i) > 0 ? shape.At(i) : 1; }
-      std::exclusive_scan(tmp_shape.rbegin(), tmp_shape.rend(), rbegin(), (int64_t)1,
-                          std::multiplies<>{});
-    }
+    ShapeView shape_view(shape);
+    new (this) Stride(shape_view);
   }
 }

@@ -65,4 +69,9 @@ void Stride::ToProto(Int64ListProto* ret) const {
   *(ret->mutable_dim()) = PbRf<int64_t>(begin(), end());
 }

+std::ostream& operator<<(std::ostream& out, const Stride& stride) {
+  out << stride.ToString();
+  return out;
+}
+
 }  // namespace oneflow
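A worked example (not from the diff) of the exclusive right-to-left product scan that both constructors now share via the new ShapeView overload, including the 0-size branch:

def strides_of(shape):
    # Mirrors Stride::Stride(const ShapeView&): exclusive product scan from the right.
    acc, out = 1, []
    for d in reversed(shape):
        out.append(acc)
        acc *= max(d, 1)  # 0-size dims are clamped to 1, matching the tmp_shape branch
    return tuple(reversed(out))

assert strides_of((2, 3, 4)) == (12, 4, 1)
assert strides_of((2, 0, 4)) == (4, 4, 1)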