
Commit acbd9c4

James Reed authored and facebook-github-bot committed
Direct FBGEMM integration into ATen (pytorch#13777)
Summary:
This PR implements infrastructure for post-processing a model to apply int8 quantization to its `nn.Linear` modules. Highlights of the implementation:

1) Inputs and outputs are `float` (quantized and packed internally), but the weight is quantized and packed ahead of time for efficiency. This implementation performs well for small-batch-size GEMM calls; it should not be considered a general-purpose quantized GEMM kernel.
2) Weight packing depends on the machine architecture (e.g. vector register width), so it is done just-in-time: on model load for the weights, and during operator execution for the input values.
3) Biases are unquantized.
4) We fail loudly if we attempt to run this on a machine that does not support FBGEMM, because we do not want a model's numerics to differ based on which machine it is run on. A model containing these FBGEMM ops *must* be run with FBGEMM.

The API can be seen in the added test case. Highlights are:

1) `torch.jit.quantized.quantize_linear_modules` walks the module hierarchy of the passed-in Module and replaces all `nn.Linear` modules with a new `QuantizedLinear` module, which encapsulates the behavior described above.
2) `_pack()` and `_unpack()` script methods are present on `QuantizedLinear` modules. These methods should be called before serialization and after deserialization, respectively. This ensures that the weight matrix is properly packed for the running machine's architecture.

Note that in the long term, we would like to move toward a more Pickle-style serialization technique, rather than having these explicit methods that mutate member values. This is blocked on being able to assign attributes in a ScriptMethod, among other things.

Pull Request resolved: pytorch#13777
Differential Revision: D13383276
Pulled By: jamesr66a
fbshipit-source-id: 00f29c9f34544add2b90107e3cf55a287802c344
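A minimal usage sketch of the API described in this summary. The module path and method names (`torch.jit.quantized.quantize_linear_modules`, `_pack()`, `_unpack()`) come from the summary above; the model definition, the `torch.fbgemm_is_cpu_supported()` guard, and the assumption that the conversion function returns the converted module are illustrative and not taken from this diff:

import torch
import torch.nn as nn
from torch.jit.quantized import quantize_linear_modules  # path per the summary above

# A plain float model; its nn.Linear children get swapped for QuantizedLinear.
model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

if torch.fbgemm_is_cpu_supported():  # binding name assumed to mirror the native op
    qmodel = quantize_linear_modules(model)  # assumed to return the converted module

    # Inputs and outputs stay float32; int8 quantization happens inside the op.
    y = qmodel(torch.randn(4, 128))

    # Per the summary: _pack() before serialization, _unpack() after
    # deserialization, so the packed weight matches the running machine.
    for m in qmodel.modules():
        if hasattr(m, '_pack'):
            m._pack()
    # ... serialize qmodel, later deserialize it on the target machine ...
    for m in qmodel.modules():
        if hasattr(m, '_unpack'):
            m._unpack()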
1 parent 614121c commit acbd9c4

File tree

9 files changed: +460 -7 lines changed


CMakeLists.txt (+4)

@@ -198,6 +198,10 @@ include(ExternalProject)
 # ---[ Dependencies
 include(cmake/Dependencies.cmake)

+if(USE_FBGEMM)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM")
+endif()
+
 # ---[ Whitelist file if whitelist is specified
 include(cmake/Whitelist.cmake)

New file (+308)

#include "ATen/ATen.h"
#include "ATen/NativeFunctions.h"
#include "ATen/WrapDimUtilsMulti.h"

#ifdef USE_FBGEMM
#include "fbgemm/Fbgemm.h"
#include "fbgemm/QuantUtils.h"
#endif // USE_FBGEMM

#include <array>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <sstream>
#include <string>
#include <vector>

#include <chrono>
namespace at {
namespace native {

#ifdef USE_FBGEMM

Tensor fbgemm_linear_int8_weight(
    const Tensor& input,
    const Tensor& weight,
    const Tensor& packed,
    const Tensor& col_offsets,
    Scalar weight_scale,
    Scalar weight_zero_point,
    const Tensor& bias) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");

  // We call contiguous on `input` and `weight` here because these APIs all
  // expect row-major tensor buffers.
  auto* input_ptr = input.contiguous().data<float>();
  auto* weight_ptr = weight.contiguous().data<int8_t>();

  AT_ASSERT(input.dim() >= 2);
  int64_t M = 1;
  for (size_t i = 0; i < input.dim() - 1; ++i) {
    M *= input.size(i);
  }
  int64_t K = input.size(input.dim() - 1);
  AT_ASSERT(weight.dim() == 2);
  AT_ASSERT(K == weight.size(1));
  auto N = weight.size(0);
  AT_ASSERT(bias.dim() == 1);
  AT_ASSERT(bias.size(0) == N);
  AT_ASSERT(weight_scale.isFloatingPoint());
  AT_ASSERT(weight_zero_point.isIntegral());

  // Calculate statistics for quantization of the input Tensor
  float x_min, x_max;
  fbgemm::FindMinMax(
      /*m=*/input_ptr,
      /*min=*/&x_min,
      /*max=*/&x_max,
      /*len=*/input.numel());

  // Input tensor is quantized as 8-bit unsigned values
  static constexpr int precision = 8;
  static constexpr bool is_signed = false;

  // Calculate scale and zero point for quantization of input tensor
  auto q_params = fbgemm::ChooseQuantizationParams(
      /*min=*/x_min,
      /*max=*/x_max,
      /*qmin=*/is_signed ? -(1 << (precision - 1)) : 0,
      /*qmax=*/is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
      /*preserve_sparsity=*/false);

  q_params.precision = precision;

  // This operation does the following:
  // 1) Quantizes the input matrix given the statistics we've calculated above
  // 2) Creates a "row buffer" vector with offset values that must be added
  //    to the integer matrix multiplication operation to ensure correctness
  // 3) Packs the resulting quantized matrix into vector-register and cache
  //    friendly tiles.
  //
  // Note this is not executed eagerly, but rather within the fbgemmPacked call
  // below.
  fbgemm::PackAWithQuantRowOffset<uint8_t> packA(
      /*trans=*/fbgemm::matrix_op_t::NoTranspose,
      /*nRow=*/M,
      /*nCol=*/K,
      /*smat=*/input_ptr,
      /*ld=*/K,
      /*pmat=*/nullptr, // packA manages ownership of `pmat`
      /*scale=*/q_params.scale,
      /*zero_pt=*/q_params.zero_point);

  // ReQuantizeForFloat requires pointers to the scale and zero point values,
  // since in the case of rowwise quantization these will be arrays rather than
  // scalars. But in this case, we're doing whole-tensor quantization so we just
  // pass a pointer to the scale values (and internally ReQuantizeForFloat won't
  // index past 0).
  float weight_scale_float = static_cast<float>(weight_scale.to<double>());
  int32_t weight_zero_point_int32 =
      static_cast<int32_t>(weight_zero_point.to<int64_t>());

  // This is the end of the pipeline, pass the resulting matrix through.
  fbgemm::DoNothing<float, float> doNothingObj{};

  // After the uint8 * int8 matrix multiplication is performed, this operation
  // does:
  // 1) Add in row and column offsets to the rows and columns, respectively
  // 2) Dequantize the results into floating point
  // 3) Add in the bias term
  fbgemm::ReQuantizeForFloat<false /* FUSE_RELU */> outputProcObj(
      /*nextop=*/doNothingObj,
      /*Aq_scale=*/q_params.scale,
      /*Bq_scale=*/&weight_scale_float,
      /*Aq_zero_point=*/q_params.zero_point,
      /*Bq_zero_point=*/&weight_zero_point_int32,
      /*row_offsets=*/packA.getRowOffsetBuffer(),
      /*col_offsets=*/col_offsets.data<int32_t>(),
      /*bias=*/bias.contiguous().data<float>(),
      /*ncol=*/N);

  // Allocate output Tensor and a buffer for fbgemmPacked to use
  auto output = at::zeros_like(bias).to(at::kFloat).expand({M, N}).contiguous();
  auto buffer = at::zeros_like(output).to(at::kInt).contiguous();

  // Pull out the PackBMatrix instance from the owning tensor
  auto* packB = reinterpret_cast<fbgemm::PackBMatrix<int8_t>*>(
      packed.storage().data_ptr().get());

  // Do the GEMM
  fbgemm::fbgemmPacked(
      /*packA=*/packA,
      /*packB=*/*packB,
      /*C=*/output.data<float>(),
      /*C_buffer=*/buffer.data<int32_t>(),
      /*ldc=*/N,
      /*outProcess=*/outputProcObj,
      /*thread_id=*/0,
      /*num_threads=*/1);

  // The resulting matrix here is 2-D, let's view it with the original
  // left hand dimensions of the input.
  std::vector<int64_t> out_sizes = input.sizes().vec();
  out_sizes.back() = N;
  return output.view(out_sizes);
}

namespace {
// Calculate the column offsets
// Note this includes the sum of the columns as well as the scalar term
// B_zero_point * K, whereas the row_offsets created by PackAWithQuantRowOffset
// is only the sum of the A rows.
void calc_col_offsets_transpose(
    int K,
    int N,
    const int8_t* Bint8,
    int32_t B_zero_point,
    int32_t* col_offsets) {
  for (size_t i = 0; i < N; ++i) {
    int32_t sum = 0;
    for (size_t j = 0; j < K; ++j) {
      sum += Bint8[i * K + j];
    }
    col_offsets[i] = sum - B_zero_point * K;
  }
}
} // namespace

std::tuple<Tensor, Tensor, double, int64_t> fbgemm_linear_quantize_weight(
    const Tensor& weight) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  auto weight_contig = weight.contiguous();

  // Calculate weight statistics
  float w_min, w_max;
  fbgemm::FindMinMax(
      /*m=*/weight_contig.data<float>(),
      /*min=*/&w_min,
      /*max=*/&w_max,
      /*len=*/weight_contig.numel());

  // Choose parameters for quantizing the weight as 8-bit signed integer
  static constexpr bool is_signed = true;
  static constexpr int precision = 8;
  auto q_params = fbgemm::ChooseQuantizationParams(
      /*min=*/w_min,
      /*max=*/w_max,
      /*qmin=*/is_signed ? -(1 << (precision - 1)) : 0,
      /*qmax=*/is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
      /*preserve_sparsity=*/false);

  q_params.precision = precision;

  auto quantized = at::zeros_like(weight_contig).to(at::kChar).contiguous();
  fbgemm::Quantize<int8_t>(
      /*src=*/weight_contig.data<float>(),
      /*dst=*/quantized.data<int8_t>(),
      /*len=*/weight_contig.numel(),
      /*qparams=*/q_params);

  // Calculate column offsets of the weight and store them away in a tensor.
  // Similarly to quantization, this can be done once and cached.
  auto col_offsets =
      at::zeros_like(quantized).sum({1}).to(at::kInt).contiguous();
  calc_col_offsets_transpose(
      /*K=*/quantized.size(1),
      /*N=*/quantized.size(0),
      /*Bint8=*/quantized.data<int8_t>(),
      /*B_zero_point=*/q_params.zero_point,
      /*col_offsets=*/col_offsets.data<int32_t>());

  return std::make_tuple(
      quantized, col_offsets, q_params.scale, q_params.zero_point);
}

bool fbgemm_is_cpu_supported() {
  return fbgemm::fbgemmSupportedCPU();
}

Tensor fbgemm_pack_quantized_matrix(
    const Tensor& weight,
    int64_t K,
    int64_t N) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  auto contiguous_ptr = weight.contiguous().data<int8_t>();
  auto* ptr = new fbgemm::PackBMatrix<int8_t>(
      /*trans=*/fbgemm::matrix_op_t::Transpose,
      /*nRow=*/K,
      /*nCol=*/N,
      /*smat=*/contiguous_ptr,
      /*ld=*/K,
      /*pmat=*/nullptr, // PackBMatrix manages ownership of pmat
      /*groups=*/1);

  // We store this instance away in a Tensor and register a deleter function
  // so that we do not leak memory. On the other side, we pull out the storage's
  // data_ptr and get the PackBMatrix's pointer.
  at::DataPtr at_ptr(
      ptr,
      ptr,
      [](void* ptr) {
        fbgemm::PackBMatrix<int8_t>* typed_ptr =
            reinterpret_cast<fbgemm::PackBMatrix<int8_t>*>(ptr);
        delete typed_ptr;
      },
      at::kCPU);

  auto retval = at::empty(
      {sizeof(fbgemm::PackBMatrix<int8_t>)}, weight.options().dtype(at::kByte));

  retval.storage().set_data_ptr(std::move(at_ptr));

  return retval;
}

#else // USE_FBGEMM

Tensor fbgemm_linear_int8_weight(
    const Tensor& /*input*/,
    const Tensor& /*weight*/,
    const Tensor& /*packed*/,
    const Tensor& /*col_offsets*/,
    Scalar /*weight_scale*/,
    Scalar /*weight_zero_point*/,
    const Tensor& /*bias*/) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(
      false, "This PyTorch installation was not built with FBGEMM operators");
}

std::tuple<Tensor, Tensor, double, int64_t> fbgemm_linear_quantize_weight(
    const Tensor& /*weight*/) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(
      false, "This PyTorch installation was not built with FBGEMM operators");
}

Tensor fbgemm_pack_quantized_matrix(
    const Tensor& /*input*/,
    int64_t /*K*/,
    int64_t /*N*/) {
  // We make a strong guarantee that models using these operators will have the
  // same numerics across different machines. Therefore, we do not provide a
  // fallback path and rather fail loudly if we cannot run FBGEMM.
  AT_ASSERTM(
      false, "This PyTorch installation was not built with FBGEMM operators");
}

bool fbgemm_is_cpu_supported() {
  return false;
}

#endif // USE_FBGEMM
}
} // namespace at
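For orientation, here is a small NumPy sketch of the affine quantization math this kernel builds on: scale/zero-point selection in the spirit of `fbgemm::ChooseQuantizationParams` (the real routine handles degenerate ranges, sparsity preservation, and rounding more carefully) and the column-offset term computed by `calc_col_offsets_transpose` above. It is an approximation for intuition, not a reimplementation of FBGEMM:

import numpy as np

def choose_qparams(x_min, x_max, qmin, qmax):
    # Affine scheme: x ~ scale * (q - zero_point), with the range forced to
    # contain 0 so that real zero is exactly representable.
    x_min, x_max = min(x_min, 0.0), max(x_max, 0.0)
    scale = (x_max - x_min) / (qmax - qmin)
    if scale == 0.0:
        scale = 1.0  # degenerate all-zero range
    zero_point = int(np.clip(round(qmin - x_min / scale), qmin, qmax))
    return scale, zero_point

def quantize(x, scale, zero_point, qmin, qmax):
    return np.clip(np.rint(x / scale) + zero_point, qmin, qmax).astype(np.int32)

def col_offsets(B_int8, B_zero_point):
    # Mirrors calc_col_offsets_transpose: per-output-row sum of the int8
    # weight minus B_zero_point * K, used to correct the uint8 * int8 GEMM.
    K = B_int8.shape[1]
    return B_int8.astype(np.int32).sum(axis=1) - B_zero_point * K

# The kernel quantizes activations to uint8 [0, 255] and weights to int8 [-128, 127].
x = np.random.randn(4, 8).astype(np.float32)
x_scale, x_zp = choose_qparams(x.min(), x.max(), 0, 255)
x_q = quantize(x, x_scale, x_zp, 0, 255)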

aten/src/ATen/native/native_functions.yaml (+8)

@@ -956,6 +956,14 @@

 - func: linear(Tensor input, Tensor weight, Tensor? bias={}) -> Tensor

+- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
+
+- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, double, int64_t)
+
+- func: fbgemm_pack_quantized_matrix(Tensor input, int64_t K, int64_t N) -> Tensor
+
+- func: fbgemm_is_cpu_supported() -> bool
+
 - func: linspace(Scalar start, Scalar end, int64_t steps=100, TensorOptions options={}) -> Tensor

 - func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps=100) -> Tensor
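These declarations are what the Python-side `QuantizedLinear` machinery is expected to call into. A hedged sketch of the intended flow, assuming the ops are exposed under `torch.*` with the signatures declared here (tensor shapes and variable names are illustrative): quantize the weight once, pack it for the current machine, then run the float-in/float-out linear op:

import torch

if torch.fbgemm_is_cpu_supported():
    weight = torch.randn(64, 128)   # (out_features, in_features), float32
    bias = torch.zeros(64)

    # One-time, machine-independent step: int8 weight plus column offsets,
    # scale, and zero point.
    w_int8, col_offsets, w_scale, w_zero_point = \
        torch.fbgemm_linear_quantize_weight(weight)

    # Machine-dependent step, redone after every load: pack the int8 weight
    # (K = in_features, N = out_features) for this CPU's vector width.
    packed = torch.fbgemm_pack_quantized_matrix(
        w_int8, w_int8.size(1), w_int8.size(0))

    # Float input and output; the input is quantized to uint8 internally.
    x = torch.randn(4, 128)
    y = torch.fbgemm_linear_int8_weight(
        x, w_int8, packed, col_offsets, w_scale, w_zero_point, bias)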

cmake/Dependencies.cmake (+3)

@@ -350,6 +350,9 @@ endif()
 if(USE_FBGEMM)
   set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
   include_directories(SYSTEM "${CAFFE2_THIRD_PARTY_ROOT}")
+  caffe2_update_option(USE_FBGEMM ON)
+else()
+  caffe2_update_option(USE_FBGEMM OFF)
 endif()