Merged PR 11929: Move around code to make later comparison with FP16 code easier

This does not introduce any new functionality; it just moves code around so that future PRs are easier to compare. Old GraphGroup code moves to training/deprecated. Once it is clear there is nothing in there worth saving, it will be deleted.

Replace -Ofast with -O3 and make sure -ffinite-math-only is turned off.
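
For context (an aside, not part of the commit): -Ofast implies -ffast-math and therefore -ffinite-math-only, under which the compiler may assume no NaNs or infinities ever occur. That assumption can optimize away the non-finite checks that FP16 training relies on for overflow detection, which is presumably the motivation for dropping the flag. A minimal, hypothetical C++ sketch of the failure mode (not repository code; all names are made up):

#include <cmath>
#include <cstdio>

// Hypothetical example: compiled with -ffinite-math-only (implied by -Ofast),
// the compiler may assume 'x' is never NaN or Inf and fold this check to
// 'false'; compiled with plain -O3 the check behaves as expected.
static bool hasNonFinite(float x) {
  return std::isnan(x) || std::isinf(x);
}

int main() {
  float x = 1e30f;
  x *= 1e30f;  // exceeds float range, producing +Inf
  std::printf("non-finite detected: %s\n", hasNonFinite(x) ? "yes" : "no");
  return 0;
}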
emjotde authored and ugermann committed May 20, 2020
1 parent 2586af7 commit 66711b5
Showing 17 changed files with 107 additions and 200 deletions.
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -167,19 +167,19 @@ else(MSVC)
endif(CMAKE_COMPILER_IS_GNUCC)

set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")

# these need to be set separately
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
6 changes: 1 addition & 5 deletions src/CMakeLists.txt
@@ -86,11 +86,9 @@ add_library(marian STATIC
translator/scorers.cpp

training/graph_group_async.cpp
training/graph_group_async_drop.cpp
training/graph_group_sync.cpp
training/graph_group.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/graph_group_multinode_sync.cpp
training/validator.cpp
training/communicator.cpp

@@ -145,8 +143,6 @@ cuda_add_library(marian_cuda
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu
translator/helpers.cu
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
STATIC)

target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS})
34 changes: 4 additions & 30 deletions src/command/marian_train.cpp
@@ -3,16 +3,10 @@

#include "common/signal_handling.h"
#include "training/graph_group_async.h"
#include "training/graph_group_multinode_sync.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
#include "training/training.h"

#ifdef CUDA_FOUND
#include "training/graph_group_async_drop.h"
#include "training/graph_group_multinode.h"
#endif

#include "3rd_party/ExceptionWithCallStack.h"

int main(int argc, char** argv) {
@@ -28,26 +22,15 @@ int main(int argc, char** argv) {
// MultiNodeGraphGroupSync.
if(options->get<bool>("multi-node")) {
LOG(warn, "[experimental] Using old multi-node training implementations that are not up-to-date");

if(options->get<bool>("sync-sgd")) {
LOG(info, "[training] Using multi-node synchronous training");
New<Train<MultiNodeGraphGroupSync>>(options)->run();
} else {
#ifdef CUDA_FOUND
LOG(info, "[training] Using multi-node asynchronous training");
New<Train<MultiNodeGraphGroup>>(options)->run();
#else
ABORT("Asynchronous multi-node training requires CUDA");
#endif
}
ABORT("Old multi-node training code disabled");
}
// --sync-sgd always selects SyncGraphGroup
//
// If given, then this implementation is used for all combinations of (single, multiple) MPI
// processes x (single, multiple) GPUs per MPI process. This variant is presently up-to-date and
// best supported.
else if (options->get<bool>("sync-sgd")) {
LOG(info, "[training] Using synchronous training");
LOG(info, "Using synchronous SGD");
New<Train<SyncGraphGroup>>(options)->run();
}
else {
@@ -56,17 +39,8 @@ int main(int argc, char** argv) {
LOG(info, "[training] Using single-device training");
New<Train<SingletonGraph>>(options)->run();
} else {
if(options->get<float>("grad-dropping-rate") > 0.0) {
#ifdef CUDA_FOUND
LOG(info, "[training] Using asynchronous training with gradient dropping");
New<Train<AsyncGraphGroupDrop>>(options)->run();
#else
ABORT("Asynchronous training with gradient dropping requires CUDA");
#endif
} else {
LOG(info, "[training] Using asynchronous training");
New<Train<AsyncGraphGroup>>(options)->run();
}
LOG(info, "Using asynchronous training");
New<Train<AsyncGraphGroup>>(options)->run();
}
}

3 changes: 1 addition & 2 deletions src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -123,7 +123,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
#endif // USE_FBGEMM
}
};
;

// Pack a matrix (int8) into cache utilization efficient way (block format) together with quantization into int8
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the type the input matrix is packed - packed8avx2 or packed8avx512
@@ -132,7 +132,6 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
// int ncol_: the number of columns
// uint64_t packsize_: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)

struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
7 files renamed without changes.
89 changes: 89 additions & 0 deletions src/training/graph_group.cpp
@@ -0,0 +1,89 @@
#include "training/graph_group.h"

namespace marian {

GraphGroup::GraphGroup(Ptr<Options> options) : options_(options), opt_(Optimizer(options)) {}

void GraphGroup::validate() {
ABORT_IF(finalized_, "Training has already finished.");
}

void GraphGroup::finalize() {
finalized_ = true;
}

Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier) {
auto stats = New<data::BatchStats>();

size_t numFiles = options_->get<std::vector<std::string>>("train-sets").size();

// Initialize first batch to step size
size_t first = options_->get<size_t>("mini-batch-fit-step");

// Increase batch size and sentence length by this step size
size_t step = options_->get<size_t>("mini-batch-fit-step");

size_t maxLength = options_->get<size_t>("max-length");
maxLength = (size_t)(std::ceil(maxLength / (float)step) * step);

// this should be only one class label per line on input, hence restricting length to 1
std::vector<size_t> localMaxes(numFiles, maxLength);
auto inputTypes = options_->get<std::vector<std::string>>("input-types", {});
for(int i = 0; i < inputTypes.size(); ++i)
if(inputTypes[i] == "class")
localMaxes[i] = 1;

size_t maxBatch = 512;
bool fits = true;
while(fits) {
std::vector<size_t> lengths(numFiles, first);
for(int j = 0; j < lengths.size(); ++j) // apply length restrictions
lengths[j] = std::min(lengths[j], localMaxes[j]);

auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, maxBatch, options_);
auto cost = model->build(graph, batch);
fits = graph->fits();
if(fits)
maxBatch *= 2;
}

// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
for(size_t i = step; i <= maxLength; i += step) {
size_t start = 1;
size_t end = maxBatch;

std::vector<size_t> lengths(numFiles, i);
for(int j = 0; j < lengths.size(); ++j) // apply length restrictions
lengths[j] = std::min(lengths[j], localMaxes[j]);
fits = true;

do {
size_t current = (start + end) / 2;
auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, current, options_);
auto cost = model->build(graph, batch);
fits = graph->fits();

LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);

if(fits) {
stats->add(batch, multiplier);
start = current + 1;
} else {
end = current - 1;
}
} while(end - start > step);

maxBatch = start;
}
return stats;
}

void GraphGroup::setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // needed for dynamic MB scaling
typicalTrgBatchWords_ = typicalTrgBatchWords;
}

}
