From 446fe6cf8b61b1d553b3b483125bb29ef5c4e8b3 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Thu, 2 May 2024 14:02:56 +0800
Subject: [PATCH 1/2] Fix

---
 paddle/fluid/operators/lookup_table_v2_op.cu | 254 ------------------
 paddle/fluid/operators/lookup_table_v2_op.h  | 285 -------------------
 2 files changed, 539 deletions(-)
 delete mode 100644 paddle/fluid/operators/lookup_table_v2_op.cu
 delete mode 100644 paddle/fluid/operators/lookup_table_v2_op.h

diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
deleted file mode 100644
index 8628965251ee75..00000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lookup_table_v2_op.h"
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/backends/gpu/gpu_primitives.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename IdT, bool PaddingFlag>
-__global__ void LookupTableV2(T *output,
-                              const T *table,
-                              const IdT *ids,
-                              const int64_t N,
-                              const int64_t K,
-                              const int64_t D,
-                              const int64_t padding_idx) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDim.x;
-
-  while (idy < K) {
-    auto id = static_cast<int64_t>(ids[idy]);
-    T *out = output + idy * D;
-    const T *tab = table + id * D;
-    for (int i = idx; i < D; i += blockDim.x) {
-      if (PaddingFlag) {
-        if (id == padding_idx)
-          out[i] = static_cast<T>(0);
-        else
-          out[i] = tab[i];
-      } else {
-        out[i] = tab[i];
-      }
-    }
-    idy += blockDim.y * gridDim.x;
-  }
-}
-
-template <typename T, typename IdT>
-__global__ void LookupTableV2Grad(T *table,
-                                  const T *output,
-                                  const IdT *ids,
-                                  const int64_t N,
-                                  const int64_t K,
-                                  const int64_t D) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDim.x;
-
-  while (idy < K) {
-    auto id = static_cast<int64_t>(ids[idy]);
-    const T *out = output + idy * D;
-    T *tab = table + id * D;
-#ifdef PADDLE_WITH_CUDA
-    phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
-#else
-    for (int i = idx; i < D; i += blockDim.x) {
-      phi::CudaAtomicAdd(&tab[i], out[i]);
-    }
-#endif
-    idy += blockDim.y * gridDim.x;
-  }
-}
-
-template <typename T>
-struct LookupTableV2CUDAFunctor {
-  LookupTableV2CUDAFunctor(const framework::ExecutionContext &context,
-                           const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *table_t = context_.Input<phi::DenseTensor>("W");
-    auto *output_t = context_.Output<phi::DenseTensor>("Out");
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t_->numel();
-
-    const int gridx = 2 * context_.cuda_device_context().GetSMCount();
-    dim3 threads(256, 4);
-    dim3 grids(gridx, 1);
-
-    const auto *table = table_t->template data<T>();
-    const auto *ids = ids_t_->template data<IdT>();
-    auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-    auto stream = context_.cuda_device_context().stream();
-
-    if (padding_idx == -1) {
-      LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
-          output, table, ids, N, K, D, padding_idx);
-    } else {
-      LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
-          output, table, ids, N, K, D, padding_idx);
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2CUDAFunctor<T> functor(context, ids_t);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()),
-                                functor);
-  }
-};
-
-template <typename InT, typename OutT>
-__global__ void InputTypeConvert(const InT *in_ids,
-                                 const int64_t K,
-                                 OutT *out_ids) {
-  for (int i = 0; i < K; i++) {
-    out_ids[i] = static_cast<OutT>(in_ids[i]);
-  }
-}
-
-template <typename T>
-struct LookupTableV2GradCUDAFunctor {
-  LookupTableV2GradCUDAFunctor(const framework::ExecutionContext &context,
-                               const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto &dev_ctx = context_.template device_context<phi::GPUContext>();
-    bool is_sparse = context_.Attr<bool>("is_sparse");
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *table = context_.Input<phi::DenseTensor>("W");
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::SelectedRows>(framework::GradVarName("W"));
-
-      const auto *ids_data = ids_t_->template data<IdT>();
-      int64_t ids_num = ids_t_->numel();
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
-      auto stream = dev_ctx.stream();
-      phi::Vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      auto gpu_place = context_.GetPlace();
-
-      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
-      if (!std::is_same<IdT, int64_t>::value) {
-        InputTypeConvert<<<grids, threads, 0, stream>>>(
-            ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
-      } else {
-        memory::Copy(gpu_place,
-                     mixv_new_rows.CUDAMutableData(gpu_place),
-                     gpu_place,
-                     ids_data,
-                     ids_num * sizeof(int64_t),
-                     stream);
-      }
-
-      mixv_new_rows.CopyToCPU();
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table->dims()[1]});
-      d_table_value->template mutable_data<T>(gpu_place);
-
-      auto *d_table_data = d_table_value->template data<T>();
-      auto *d_output_data = d_output->template data<T>();
-      auto d_output_dims = d_output->dims();
-      auto d_output_dims_2d =
-          common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
-      PADDLE_ENFORCE_EQ(d_table_value->dims(),
-                        d_output_dims_2d,
-                        phi::errors::InvalidArgument(
-                            "ShapeError: The shape of lookup_table@Grad and "
-                            "output@Grad should be same. "
" - "But received lookup_table@Grad's shape = [%s], " - "output@Grad's shape = [%s].", - d_table_value->dims(), - d_output_dims_2d)); - memory::Copy(gpu_place, - d_table_data, - gpu_place, - d_output_data, - d_output->numel() * sizeof(T), - stream); - - } else { - auto d_output_t = - context_.Input(framework::GradVarName("Out")); - auto d_table_t = - context_.Output(framework::GradVarName("W")); - - int N = d_table_t->dims()[0]; - int D = d_table_t->dims()[1]; - int K = ids_t_->numel(); - - const T *d_output = d_output_t->template data(); - const auto *ids = ids_t_->template data(); - T *d_table = d_table_t->mutable_data(context_.GetPlace()); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#endif - - const int gridx = 2 * dev_ctx.GetSMCount(); - dim3 threads(128, 8); - dim3 grids(gridx, 1); - LookupTableV2Grad<<>>( - d_table, d_output, ids, N, K, D); - } - } - - private: - const framework::ExecutionContext &context_; - const phi::DenseTensor *ids_t_; -}; - -template -class LookupTableV2GradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *ids_t = context.Input("Ids"); - LookupTableV2GradCUDAFunctor functor(context, ids_t); - framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()), - functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h deleted file mode 100644 index 8e3ce198e060bf..00000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ /dev/null @@ -1,285 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-
-using SelectedRows = phi::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-template <typename InT, typename OutT>
-static std::vector<OutT> CopyIdsToVector(const phi::DenseTensor &ids) {
-  auto numel = ids.numel();
-  const auto *src = ids.data<InT>();
-  std::vector<OutT> ret(numel);
-  if (std::is_same<InT, OutT>::value) {
-    std::memcpy(ret.data(), src, numel * sizeof(InT));
-  } else {
-    for (decltype(numel) i = 0; i < numel; ++i) {
-      ret[i] = src[i];
-    }
-  }
-  return ret;
-}
-
-template <typename T>
-struct LookupTableV2CPUFunctor {
-  LookupTableV2CPUFunctor(const framework::ExecutionContext &context,
-                          const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *output_t = context_.Output<phi::DenseTensor>("Out");  // float tensor
-    auto *table_var = context_.InputVar("W");
-
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-
-    auto ids = CopyIdsToVector<IdT, int64_t>(*ids_t_);
-    auto ids_numel = static_cast<int64_t>(ids.size());
-
-    if (table_var->template IsType<phi::DenseTensor>()) {
-      const auto &table_t = table_var->template Get<phi::DenseTensor>();
-      int64_t row_number = table_t.dims()[0];
-      int64_t row_width = table_t.dims()[1];
-
-      auto *table = table_t.template data<T>();
-      auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids[i],
-              row_number,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  row_number,
-                  ids[i]));
-          PADDLE_ENFORCE_GE(
-              ids[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  row_number,
-                  ids[i]));
-          memcpy(output + i * row_width,
-                 table + ids[i] * row_width,
-                 row_width * sizeof(T));
-        }
-      }
-    } else if (table_var->template IsType<phi::SelectedRows>()) {
-      const auto &table_t = table_var->template Get<phi::SelectedRows>();
-      int64_t row_width = table_t.value().dims()[1];
-      const auto *table = table_t.value().template data<T>();
-      auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-      auto input_data_type =
-          framework::TransToProtoVarType(table_t.value().dtype());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_GE(
-              ids[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0. But received %ld",
-                  ids[i]));
-          auto id_index = table_t.Index(ids[i]);
-          PADDLE_ENFORCE_GE(
-              id_index,
-              0,
-              phi::errors::InvalidArgument(
-                  "the input key should be exists. But received %d.",
-                  id_index));
-
-          if (input_data_type == framework::proto::VarType::BF16) {
-            memcpy(output + i * row_width,
-                   table + id_index * row_width,
-                   row_width * sizeof(T));
-          } else {
-            auto &dev_ctx =
-                context_.template device_context<phi::CPUContext>();
-            auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
-            blas.VCOPY(row_width,
-                       table + id_index * row_width,
-                       output + i * row_width);
-          }
-        }
-      }
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2CPUFunctor<T> functor(context, ids);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()),
-                                functor);
-  }
-};
-
-template <typename T>
-struct LookupTableV2GradCPUFunctor {
-  LookupTableV2GradCPUFunctor(const framework::ExecutionContext &context,
-                              const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *table_var = context_.InputVar("W");
-    DDim table_dim;
-    if (table_var->template IsType<phi::DenseTensor>()) {
-      table_dim = context_.Input<phi::DenseTensor>("W")->dims();
-    } else if (table_var->template IsType<phi::SelectedRows>()) {
-      auto *table_t = context_.Input<phi::SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(phi::errors::InvalidArgument(
-          "The parameter W of a LookupTableV2 "
-          "must be either phi::DenseTensor or SelectedRows"));
-    }
-
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-    bool is_sparse = context_.Attr<bool>("is_sparse");
-
-    auto ids = CopyIdsToVector<IdT, int64_t>(*ids_t_);
-    auto ids_num = static_cast<int64_t>(ids.size());
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::SelectedRows>(framework::GradVarName("W"));
-
-      d_table->set_rows(ids);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table_dim[1]});
-
-      d_table_value->template mutable_data<T>(context_.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->template data<T>();
-      auto *d_table_data = d_table_value->template data<T>();
-
-      auto d_output_dims = d_output->dims();
-      auto d_output_dims_2d =
-          common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
-      PADDLE_ENFORCE_EQ(d_table_value->dims(),
-                        d_output_dims_2d,
-                        phi::errors::InvalidArgument(
-                            "ShapeError: The shape of lookup_table@Grad and "
-                            "output@Grad should be same. "
-                            "But received lookup_table@Grad's shape = [%s], "
-                            "output@Grad's shape = [%s].",
-                            d_table_value->dims(),
-                            d_output_dims_2d));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
-
-    } else {
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::DenseTensor>(framework::GradVarName("W"));
-      auto *ids_data = ids.data();
-
-      int64_t N = table_dim[0];
-      int64_t D = table_dim[1];
-
-      auto *d_output_data = d_output->template data<T>();
-      auto *d_table_data =
-          d_table->template mutable_data<T>(context_.GetPlace());
-
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      for (int64_t i = 0; i < ids_num; ++i) {
-        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
-          // the gradient of padding_idx should be 0, already done by memset,
-          // so do nothing.
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids_data[i],
-              N,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  N,
-                  ids_data[i]));
-          PADDLE_ENFORCE_GE(
-              ids_data[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  N,
-                  ids_data[i]));
-          for (int j = 0; j < D; ++j) {
-            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
-          }
-        }
-      }
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2GradCPUFunctor<T> functor(context, ids);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()),
-                                functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle

From e1d494a8813fa856dee2a690a1c220579c77285f Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 6 May 2024 16:27:27 +0800
Subject: [PATCH 2/2] ci