Merged PR 11929: Move around code to make later comparison with FP16 code easier

This does not introduce any new functionality; it just moves code around so that future PRs are easier to compare. Old GraphGroup code moves to training/deprecated. Once it is clear there is nothing in there worth saving, it will be deleted.

Replace -Ofast with -O3 and make sure -ffinite-math-only is turned off.
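
For context (an aside, not part of the commit): -Ofast implies -ffast-math and therefore -ffinite-math-only, under which the compiler may assume no NaNs or infinities ever occur. That assumption can optimize away the non-finite checks that FP16 training relies on for overflow detection, which is presumably the motivation for dropping the flag. A minimal, hypothetical C++ sketch of the failure mode (not repository code; all names are made up):

#include <cmath>
#include <cstdio>

// Hypothetical example: compiled with -ffinite-math-only (implied by -Ofast),
// the compiler may assume 'x' is never NaN or Inf and fold this check to
// 'false'; compiled with plain -O3 the check behaves as expected.
static bool hasNonFinite(float x) {
  return std::isnan(x) || std::isinf(x);
}

int main() {
  float x = 1e30f;
  x *= 1e30f;  // exceeds float range, producing +Inf
  std::printf("non-finite detected: %s\n", hasNonFinite(x) ? "yes" : "no");
  return 0;
}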
emjotde authored and ugermann committed May 20, 2020
1 parent 2586af7 commit 66711b5
Showing 17 changed files with 107 additions and 200 deletions.
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -167,19 +167,19 @@ else(MSVC)
endif(CMAKE_COMPILER_IS_GNUCC)

set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")

# these need to be set separately
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
6 changes: 1 addition & 5 deletions src/CMakeLists.txt
@@ -86,11 +86,9 @@ add_library(marian STATIC
translator/scorers.cpp

training/graph_group_async.cpp
training/graph_group_async_drop.cpp
training/graph_group_sync.cpp
training/graph_group.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/graph_group_multinode_sync.cpp
training/validator.cpp
training/communicator.cpp

@@ -145,8 +143,6 @@ cuda_add_library(marian_cuda
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu
translator/helpers.cu
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
STATIC)

target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS})
34 changes: 4 additions & 30 deletions src/command/marian_train.cpp
@@ -3,16 +3,10 @@

#include "common/signal_handling.h"
#include "training/graph_group_async.h"
#include "training/graph_group_multinode_sync.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
#include "training/training.h"

#ifdef CUDA_FOUND
#include "training/graph_group_async_drop.h"
#include "training/graph_group_multinode.h"
#endif

#include "3rd_party/ExceptionWithCallStack.h"

int main(int argc, char** argv) {
@@ -28,26 +22,15 @@ int main(int argc, char** argv) {
// MultiNodeGraphGroupSync.
if(options->get<bool>("multi-node")) {
LOG(warn, "[experimental] Using old multi-node training implementations that are not up-to-date");

if(options->get<bool>("sync-sgd")) {
LOG(info, "[training] Using multi-node synchronous training");
New<Train<MultiNodeGraphGroupSync>>(options)->run();
} else {
#ifdef CUDA_FOUND
LOG(info, "[training] Using multi-node asynchronous training");
New<Train<MultiNodeGraphGroup>>(options)->run();
#else
ABORT("Asynchronous multi-node training requires CUDA");
#endif
}
ABORT("Old multi-node training code disabled");
}
// --sync-sgd always selects SyncGraphGroup
//
// If given, then this implementation is used for all combinations of (single, multiple) MPI
// processes x (single, multiple) GPUs per MPI process. This variant is presently up-to-date and
// best supported.
else if (options->get<bool>("sync-sgd")) {
LOG(info, "[training] Using synchronous training");
LOG(info, "Using synchronous SGD");
New<Train<SyncGraphGroup>>(options)->run();
}
else {
@@ -56,17 +39,8 @@ int main(int argc, char** argv) {
LOG(info, "[training] Using single-device training");
New<Train<SingletonGraph>>(options)->run();
} else {
if(options->get<float>("grad-dropping-rate") > 0.0) {
#ifdef CUDA_FOUND
LOG(info, "[training] Using asynchronous training with gradient dropping");
New<Train<AsyncGraphGroupDrop>>(options)->run();
#else
ABORT("Asynchronous training with gradient dropping requires CUDA");
#endif
} else {
LOG(info, "[training] Using asynchronous training");
New<Train<AsyncGraphGroup>>(options)->run();
}
LOG(info, "Using asynchronous training");
New<Train<AsyncGraphGroup>>(options)->run();
}
}

3 changes: 1 addition & 2 deletions src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -123,7 +123,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
#endif // USE_FBGEMM
}
};
;

// Pack a matrix (int8) into cache utilization efficient way (block format) together with quantization into int8
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the type the input matrix is packed - packed8avx2 or packed8avx512
@@ -132,7 +132,6 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
// int ncol_: the number of columns
// uint64_t packsize_: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)

struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
7 files renamed without changes.
89 changes: 89 additions & 0 deletions src/training/graph_group.cpp
@@ -0,0 +1,89 @@
#include "training/graph_group.h"

namespace marian {

GraphGroup::GraphGroup(Ptr<Options> options) : options_(options), opt_(Optimizer(options)) {}

void GraphGroup::validate() {
ABORT_IF(finalized_, "Training has already finished.");
}

void GraphGroup::finalize() {
finalized_ = true;
}

Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier) {
auto stats = New<data::BatchStats>();

size_t numFiles = options_->get<std::vector<std::string>>("train-sets").size();

// Initialize first batch to step size
size_t first = options_->get<size_t>("mini-batch-fit-step");

// Increase batch size and sentence length by this step size
size_t step = options_->get<size_t>("mini-batch-fit-step");

size_t maxLength = options_->get<size_t>("max-length");
maxLength = (size_t)(std::ceil(maxLength / (float)step) * step);

// this should be only one class label per line on input, hence restricting length to 1
std::vector<size_t> localMaxes(numFiles, maxLength);
auto inputTypes = options_->get<std::vector<std::string>>("input-types", {});
for(int i = 0; i < inputTypes.size(); ++i)
if(inputTypes[i] == "class")
localMaxes[i] = 1;

size_t maxBatch = 512;
bool fits = true;
while(fits) {
std::vector<size_t> lengths(numFiles, first);
for(int j = 0; j < lengths.size(); ++j) // apply length restrictions
lengths[j] = std::min(lengths[j], localMaxes[j]);

auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, maxBatch, options_);
auto cost = model->build(graph, batch);
fits = graph->fits();
if(fits)
maxBatch *= 2;
}

// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
for(size_t i = step; i <= maxLength; i += step) {
size_t start = 1;
size_t end = maxBatch;

std::vector<size_t> lengths(numFiles, i);
for(int j = 0; j < lengths.size(); ++j) // apply length restrictions
lengths[j] = std::min(lengths[j], localMaxes[j]);
fits = true;

do {
size_t current = (start + end) / 2;
auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, current, options_);
auto cost = model->build(graph, batch);
fits = graph->fits();

LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);

if(fits) {
stats->add(batch, multiplier);
start = current + 1;
} else {
end = current - 1;
}
} while(end - start > step);

maxBatch = start;
}
return stats;
}

void GraphGroup::setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // needed for dynamic MB scaling
typicalTrgBatchWords_ = typicalTrgBatchWords;
}

}
