Optimize CPU LayerNormalization 6x with -ffast-math and lifting branches (#689)

* Optimize LayerNormalization with -ffast-math and lifting branches
* LayerNormalization changelog

gprof:
Now
  1.65     30.11     0.57                             void marian::cpu::LayerNormalizationImpl<1, 1, true>(float*, float const*, float const*, float const*, float, int, int)
Baseline
  9.08     22.31     3.49                             marian::cpu::LayerNormalization(IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, float)

That's down from 3.49 seconds to 0.57 seconds.

* LayerNormalization: longer comments, @frankseide-style if
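The "lifting branches" in the title refers to turning per-element runtime checks into compile-time template parameters, so the inner loop stays branch-free and easy to vectorize. A minimal sketch of the pattern under illustrative names (scaleAdd/scaleAddImpl are made up and are not code from this commit):

#include <cstddef>

// The branch is lifted out of the hot loop: hasBias is a template parameter,
// so each instantiation either contains the add or omits it, with no per-element test.
template <bool hasBias>
void scaleAddImpl(float* out, const float* in, const float* bias, std::size_t n) {
  for(std::size_t i = 0; i < n; ++i) {
    float v = 2.0f * in[i];
    if(hasBias)            // constant at compile time; the dead branch is removed
      v += bias[i];
    out[i] = v;
  }
}

// The runtime check happens once, outside the loop.
void scaleAdd(float* out, const float* in, const float* bias, std::size_t n) {
  if(bias)
    scaleAddImpl<true>(out, in, bias, n);
  else
    scaleAddImpl<false>(out, in, nullptr, n);
}

The diff below applies the same idea to the alpha/beta broadcasting strides and the optional beta of LayerNormalization.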
kpu authored Aug 2, 2020
1 parent e5c9e0b commit c944633
Showing 3 changed files with 107 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
- Decoding multi-source models in marian-server with --tsv
- GitHub workflows on Ubuntu, Windows, and MacOS
- LSH indexing to replace short list
55 changes: 55 additions & 0 deletions src/common/definitions.h
@@ -22,6 +22,61 @@
#define DONT_OPTIMIZE // silently ignore on Visual Studio, where this is less of a problem
#endif

// Use these macros to enable faster floating-point math. Put them around one
// or more functions.
//
// Usage:
// MARIAN_FFAST_MATH_BEGIN
// void LayerNormalization(float *arg) { *arg += 1.0; }
// void SomethingElse() {}
// MARIAN_FFAST_MATH_END
//
// ffast-math allows the compiler to assume associative arithmetic and finite
// values.
//
// Associative arithmetic is particularly important to vectorize i.e. a sum:
// for (const float f : range) sum += f;
// Without ffast-math, the sum will be done one value at a time. On x86 it
// still uses vector math, but only uses the first slot and wastes the rest.
//
// With ffast-math, the compiler can sum in batches of 4, 8, or 16 floats.
// Also, it can run multiple adds in parallel e.g. vaddps has latency 4 and
// throughput 0.5 on Skylake so multiple vector adds can run at once.
//
// On average, a vectorized sum is more numerically stable because it sums in
// batches. Vectorized floats can still produce NaNs and infs (remember even
// scalar operations are implemented with vector instructions).
//
// Allowing the compiler to assume finite values means functions like isnan or
// isinf do not work as expected. Do not enable this for a function that
// depends upon fully standard float behavior.
//
// It can also change the sign of zeros.
//
// Fast math also makes results more architecture dependent because different
// register widths mean different results. They also depend on the compiler
// and compiler version more. For example, clang <= 10 does not support the
// float_control pragma below so it will still be conservative.
//
// There is a more conservative option for just associativity:
// llvm introduced "#pragma clang fp reassociate" that goes inside a function.
// However, llvm <11 considers that pragma an error so we'd need some ugly
// version test (which they don't recommend) or a compilation test. Moreover,
// it has to be in the function to keep scope.
// gcc supports "-fassociative-math" that has to be outside a function.
// I didn't find a MSVC equivalent.
#if defined(_MSC_VER)
#define MARIAN_FFAST_MATH_BEGIN __pragma(float_control(precise, off, push))
#define MARIAN_FFAST_MATH_END __pragma(float_control(pop))
#elif defined(__clang__)
#define MARIAN_FFAST_MATH_BEGIN _Pragma("float_control(precise, off, push)")
#define MARIAN_FFAST_MATH_END _Pragma("float_control(pop)")
#elif defined(__GNUC__)
// Also available as __attribute__((optimize("-ffast-math"))) but done as pragmas for consistency
#define MARIAN_FFAST_MATH_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"-ffast-math\")")
#define MARIAN_FFAST_MATH_END _Pragma("GCC pop_options")
#endif
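
As a concrete illustration of the usage pattern described in the comment above (a sketch only, not part of this diff; sumSquares is a made-up example function), a simple reduction wrapped in the macros looks like this:

MARIAN_FFAST_MATH_BEGIN
// Inside this region the compiler may reassociate the additions and sum several
// vector lanes at once. Do not rely on std::isnan/std::isinf in here.
inline float sumSquares(const float* data, int n) {
  float sum = 0.f;
  for(int i = 0; i < n; ++i)
    sum += data[i] * data[i];
  return sum;
}
MARIAN_FFAST_MATH_END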

namespace marian {

// Type to be used for all index types, e.g. for integer tensors for rows operator.
68 changes: 51 additions & 17 deletions src/tensors/cpu/tensor_operators.cpp
@@ -1032,21 +1032,15 @@ void AttBack(Tensor gVa_,
}
}

void LayerNormalization(Tensor out_,
                        Tensor in_,
                        Tensor gamma_,
                        Tensor beta_,
                        float eps) {
  float* out = out_->data();
  const float* in = in_->data();
  const float* alpha = gamma_->data();
  const float* beta = beta_ ? beta_->data() : nullptr;
  const int alphaStride = gamma_->shape().back() > 1; // broadcasting for alpha and beta
  const int betaStride = beta_ && beta_->shape().back() > 1;

  int rows = in_->shape().elements() / in_->shape().back();
  int cols = in_->shape().back();

MARIAN_FFAST_MATH_BEGIN
template <int alphaStride, int betaStride, bool hasBeta>
void LayerNormalizationImpl(float* out,
                            const float* in,
                            const float* alpha,
                            const float* beta,
                            float eps,
                            int rows,
                            int cols) {
#pragma omp parallel for
  for(int j = 0; j < rows; ++j) {
    float* so = out + j * cols;
@@ -1071,15 +1065,54 @@ void LayerNormalization(Tensor out_,
#pragma omp simd
    for(int i = 0; i < cols; ++i) {
      float t = alpha[alphaStride * i] * ((sp[i] - mean) / sigma);
      if(beta != nullptr) {
      if(hasBeta)
        t += beta[betaStride * i];
      }

      so[i] = t;
    }
  }
}
MARIAN_FFAST_MATH_END
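
The hunk collapsed between the two fragments above holds the per-row mean and standard-deviation computation. For orientation only, here is a generic scalar sketch of what one row of layer normalization computes (layerNormRow is illustrative and is not the collapsed code; the eps placement follows the usual convention of adding it to the variance before the square root):

#include <cmath>

// y[i] = alpha[i * alphaStride] * (x[i] - mean) / sigma (+ beta[i * betaStride] if present)
inline void layerNormRow(float* y, const float* x,
                         const float* alpha, const float* beta,
                         float eps, int cols, int alphaStride, int betaStride) {
  float sum = 0.f;
  for(int i = 0; i < cols; ++i)
    sum += x[i];
  float mean = sum / cols;

  float sqSum = 0.f;
  for(int i = 0; i < cols; ++i) {
    float d = x[i] - mean;
    sqSum += d * d;
  }
  float sigma = std::sqrt(sqSum / cols + eps);

  for(int i = 0; i < cols; ++i) {
    float t = alpha[alphaStride * i] * ((x[i] - mean) / sigma);
    if(beta)
      t += beta[betaStride * i];
    y[i] = t;
  }
}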

template <int alphaStride>
inline void LayerNormalizationDispatchBeta(float* out,
                                           const float* in,
                                           const float* alpha,
                                           Tensor beta,
                                           float eps,
                                           int rows,
                                           int cols) {
  if (beta) {
    if (beta->shape().back() > 1) {
      LayerNormalizationImpl<alphaStride, 1, true>(out, in, alpha, beta->data(), eps, rows, cols);
    } else {
      LayerNormalizationImpl<alphaStride, 0, true>(out, in, alpha, beta->data(), eps, rows, cols);
    }
  } else {
    LayerNormalizationImpl<alphaStride, 0, false>(out, in, alpha, nullptr, eps, rows, cols);
  }
}

void LayerNormalization(Tensor out_,
                        Tensor in_,
                        Tensor gamma_,
                        Tensor beta,
                        float eps) {
  float* out = out_->data();
  const float* in = in_->data();
  const float* alpha = gamma_->data();
  const int alphaStride = gamma_->shape().back() > 1; // broadcasting for alpha and beta

  int rows = in_->shape().elements() / in_->shape().back();
  int cols = in_->shape().back();
  if (alphaStride == 0) {
    LayerNormalizationDispatchBeta<0>(out, in, alpha, beta, eps, rows, cols);
  } else {
    LayerNormalizationDispatchBeta<1>(out, in, alpha, beta, eps, rows, cols);
  }
}
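
For reference (not part of the diff): the two dispatch layers above map the runtime broadcasting flags onto one of six compile-time specializations, so the hot loop never tests them per element.

// (alphaStride, beta present, betaStride) -> instantiation selected
// (1, yes, 1) -> LayerNormalizationImpl<1, 1, true>   // the case shown in the gprof output
// (1, yes, 0) -> LayerNormalizationImpl<1, 0, true>
// (1, no,  -) -> LayerNormalizationImpl<1, 0, false>
// (0, yes, 1) -> LayerNormalizationImpl<0, 1, true>
// (0, yes, 0) -> LayerNormalizationImpl<0, 0, true>
// (0, no,  -) -> LayerNormalizationImpl<0, 0, false>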

MARIAN_FFAST_MATH_BEGIN
void LayerNormalizationGrad(Tensor gradX_,
                            Tensor gradGamma_,
                            Tensor gradBeta_,
@@ -1191,6 +1224,7 @@ void LayerNormalizationGrad(Tensor gradX_,
}
}
}
MARIAN_FFAST_MATH_END

void Shift(Tensor out_,
Tensor in_,
