Optimize CPU LayerNormalization 6x with -ffast-math and lifting branches (#689)

* Optimize LayerNormalization with -ffast-math and lifting branches
* LayerNormalization changelog

gprof:
Now
  1.65     30.11     0.57                             void marian::cpu::LayerNormalizationImpl<1, 1, true>(float*, float const*, float const*, float const*, float, int, int)
Baseline
  9.08     22.31     3.49                             marian::cpu::LayerNormalization(IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, float)

That's down from 3.49 seconds to 0.57 seconds.

* LayerNormalization: longer comments, @frankseide-style if
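The "lifting branches" in the title refers to turning per-element runtime checks into compile-time template parameters, so the inner loop stays branch-free and easy to vectorize. A minimal sketch of the pattern under illustrative names (scaleAdd/scaleAddImpl are made up and are not code from this commit):

#include <cstddef>

// The branch is lifted out of the hot loop: hasBias is a template parameter,
// so each instantiation either contains the add or omits it, with no per-element test.
template <bool hasBias>
void scaleAddImpl(float* out, const float* in, const float* bias, std::size_t n) {
  for(std::size_t i = 0; i < n; ++i) {
    float v = 2.0f * in[i];
    if(hasBias)            // constant at compile time; the dead branch is removed
      v += bias[i];
    out[i] = v;
  }
}

// The runtime check happens once, outside the loop.
void scaleAdd(float* out, const float* in, const float* bias, std::size_t n) {
  if(bias)
    scaleAddImpl<true>(out, in, bias, n);
  else
    scaleAddImpl<false>(out, in, nullptr, n);
}

The diff below applies the same idea to the alpha/beta broadcasting strides and the optional beta of LayerNormalization.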
kpu authored Aug 2, 2020
1 parent e5c9e0b commit c944633
Showing 3 changed files with 107 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
- Decoding multi-source models in marian-server with --tsv
- GitHub workflows on Ubuntu, Windows, and MacOS
- LSH indexing to replace short list
55 changes: 55 additions & 0 deletions src/common/definitions.h
@@ -22,6 +22,61 @@
#define DONT_OPTIMIZE // silently ignore on Visual Studio, where this is less of a problem
#endif

// Use these macros to enable faster floating-point math. Put them around one
// or more functions.
//
// Usage:
// MARIAN_FFAST_MATH_BEGIN
// void LayerNormalization(float *arg) { *arg += 1.0; }
// void SomethingElse() {}
// MARIAN_FFAST_MATH_END
//
// ffast-math allows the compiler to assume associative arithmetic and finite
// values.
//
// Associative arithmetic is particularly important to vectorize i.e. a sum:
// for (const float f : range) sum += f;
// Without ffast-math, the sum will be done one value at a time. On x86 it
// still uses vector math, but only uses the first slot and wastes the rest.
//
// With ffast-math, the compiler can sum in batches of 4, 8, or 16 floats.
// Also, it can run multiple adds in parallel e.g. vaddps has latency 4 and
// throughput 0.5 on Skylake so multiple vector adds can run at once.
//
// On average, a vectorized sum is more numerically stable because it sums in
// batches. Vectorized floats can still produce NaNs and infs (remember even
// scalar operations are implemented with vector instructions).
//
// Allowing the compiler to assume finite values means functions like isnan or
// isinf do not work as expected. Do not enable this for a function that
// depends upon fully standard float behavior.
//
// It can also change the sign of zeros.
//
// Fast math also makes results more architecture dependent because different
// register widths mean different results. They also depend on the compiler
// and compiler version more. For example, clang <= 10 does not support the
// float_control pragma below so it will still be conservative.
//
// There is a more conservative option for just associativity:
// llvm introduced "#pragma clang fp reassociate" that goes inside a function.
// However, llvm <11 considers that pragma an error so we'd need some ugly
// version test (which they don't recommend) or a compilation test. Moreover,
// it has to be in the function to keep scope.
// gcc supports "-fassociative-math" that has to be outside a function.
// I didn't find a MSVC equivalent.
#if defined(_MSC_VER)
#define MARIAN_FFAST_MATH_BEGIN __pragma(float_control(precise, off, push))
#define MARIAN_FFAST_MATH_END __pragma(float_control(pop))
#elif defined(__clang__)
#define MARIAN_FFAST_MATH_BEGIN _Pragma("float_control(precise, off, push)")
#define MARIAN_FFAST_MATH_END _Pragma("float_control(pop)")
#elif defined(__GNUC__)
// Also available as __attribute__((optimize("-ffast-math"))) but done as pragmas for consistency
#define MARIAN_FFAST_MATH_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"-ffast-math\")")
#define MARIAN_FFAST_MATH_END _Pragma("GCC pop_options")
#endif
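
As a concrete illustration of the usage pattern described in the comment above (a sketch only, not part of this diff; sumSquares is a made-up example function), a simple reduction wrapped in the macros looks like this:

MARIAN_FFAST_MATH_BEGIN
// Inside this region the compiler may reassociate the additions and sum several
// vector lanes at once. Do not rely on std::isnan/std::isinf in here.
inline float sumSquares(const float* data, int n) {
  float sum = 0.f;
  for(int i = 0; i < n; ++i)
    sum += data[i] * data[i];
  return sum;
}
MARIAN_FFAST_MATH_END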

namespace marian {

// Type to be used for all index types, e.g. for integer tensors for rows operator.
68 changes: 51 additions & 17 deletions src/tensors/cpu/tensor_operators.cpp
@@ -1032,21 +1032,15 @@ void AttBack(Tensor gVa_,
}
}

void LayerNormalization(Tensor out_,
                        Tensor in_,
                        Tensor gamma_,
                        Tensor beta_,
                        float eps) {
  float* out = out_->data();
  const float* in = in_->data();
  const float* alpha = gamma_->data();
  const float* beta = beta_ ? beta_->data() : nullptr;
  const int alphaStride = gamma_->shape().back() > 1; // broadcasting for alpha and beta
  const int betaStride = beta_ && beta_->shape().back() > 1;

  int rows = in_->shape().elements() / in_->shape().back();
  int cols = in_->shape().back();

MARIAN_FFAST_MATH_BEGIN
template <int alphaStride, int betaStride, bool hasBeta>
void LayerNormalizationImpl(float* out,
                            const float* in,
                            const float* alpha,
                            const float* beta,
                            float eps,
                            int rows,
                            int cols) {
#pragma omp parallel for
  for(int j = 0; j < rows; ++j) {
    float* so = out + j * cols;
@@ -1071,15 +1065,54 @@ void LayerNormalization(Tensor out_,
#pragma omp simd
    for(int i = 0; i < cols; ++i) {
      float t = alpha[alphaStride * i] * ((sp[i] - mean) / sigma);
      if(beta != nullptr) {
      if(hasBeta)
        t += beta[betaStride * i];
      }

      so[i] = t;
    }
  }
}
MARIAN_FFAST_MATH_END
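
The hunk collapsed between the two fragments above holds the per-row mean and standard-deviation computation. For orientation only, here is a generic scalar sketch of what one row of layer normalization computes (layerNormRow is illustrative and is not the collapsed code; the eps placement follows the usual convention of adding it to the variance before the square root):

#include <cmath>

// y[i] = alpha[i * alphaStride] * (x[i] - mean) / sigma (+ beta[i * betaStride] if present)
inline void layerNormRow(float* y, const float* x,
                         const float* alpha, const float* beta,
                         float eps, int cols, int alphaStride, int betaStride) {
  float sum = 0.f;
  for(int i = 0; i < cols; ++i)
    sum += x[i];
  float mean = sum / cols;

  float sqSum = 0.f;
  for(int i = 0; i < cols; ++i) {
    float d = x[i] - mean;
    sqSum += d * d;
  }
  float sigma = std::sqrt(sqSum / cols + eps);

  for(int i = 0; i < cols; ++i) {
    float t = alpha[alphaStride * i] * ((x[i] - mean) / sigma);
    if(beta)
      t += beta[betaStride * i];
    y[i] = t;
  }
}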

template <int alphaStride>
inline void LayerNormalizationDispatchBeta(float* out,
                                           const float* in,
                                           const float* alpha,
                                           Tensor beta,
                                           float eps,
                                           int rows,
                                           int cols) {
  if (beta) {
    if (beta->shape().back() > 1) {
      LayerNormalizationImpl<alphaStride, 1, true>(out, in, alpha, beta->data(), eps, rows, cols);
    } else {
      LayerNormalizationImpl<alphaStride, 0, true>(out, in, alpha, beta->data(), eps, rows, cols);
    }
  } else {
    LayerNormalizationImpl<alphaStride, 0, false>(out, in, alpha, nullptr, eps, rows, cols);
  }
}

void LayerNormalization(Tensor out_,
                        Tensor in_,
                        Tensor gamma_,
                        Tensor beta,
                        float eps) {
  float* out = out_->data();
  const float* in = in_->data();
  const float* alpha = gamma_->data();
  const int alphaStride = gamma_->shape().back() > 1; // broadcasting for alpha and beta

  int rows = in_->shape().elements() / in_->shape().back();
  int cols = in_->shape().back();
  if (alphaStride == 0) {
    LayerNormalizationDispatchBeta<0>(out, in, alpha, beta, eps, rows, cols);
  } else {
    LayerNormalizationDispatchBeta<1>(out, in, alpha, beta, eps, rows, cols);
  }
}
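
For reference (not part of the diff): the two dispatch layers above map the runtime broadcasting flags onto one of six compile-time specializations, so the hot loop never tests them per element.

// (alphaStride, beta present, betaStride) -> instantiation selected
// (1, yes, 1) -> LayerNormalizationImpl<1, 1, true>   // the case shown in the gprof output
// (1, yes, 0) -> LayerNormalizationImpl<1, 0, true>
// (1, no,  -) -> LayerNormalizationImpl<1, 0, false>
// (0, yes, 1) -> LayerNormalizationImpl<0, 1, true>
// (0, yes, 0) -> LayerNormalizationImpl<0, 0, true>
// (0, no,  -) -> LayerNormalizationImpl<0, 0, false>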

MARIAN_FFAST_MATH_BEGIN
void LayerNormalizationGrad(Tensor gradX_,
                            Tensor gradGamma_,
                            Tensor gradBeta_,
@@ -1191,6 +1224,7 @@ void LayerNormalizationGrad(Tensor gradX_,
}
}
}
MARIAN_FFAST_MATH_END

void Shift(Tensor out_,
Tensor in_,
