From 446fe6cf8b61b1d553b3b483125bb29ef5c4e8b3 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Thu, 2 May 2024 14:02:56 +0800
Subject: [PATCH 1/2] Fix

---
 paddle/fluid/operators/lookup_table_v2_op.cu | 254 ------------------
 paddle/fluid/operators/lookup_table_v2_op.h  | 285 -------------------
 2 files changed, 539 deletions(-)
 delete mode 100644 paddle/fluid/operators/lookup_table_v2_op.cu
 delete mode 100644 paddle/fluid/operators/lookup_table_v2_op.h

diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
deleted file mode 100644
index 8628965251ee75..00000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lookup_table_v2_op.h"
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/backends/gpu/gpu_primitives.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename IdT, bool PaddingFlag>
-__global__ void LookupTableV2(T *output,
-                              const T *table,
-                              const IdT *ids,
-                              const int64_t N,
-                              const int64_t K,
-                              const int64_t D,
-                              const int64_t padding_idx) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDim.x;
-
-  while (idy < K) {
-    auto id = static_cast<int64_t>(ids[idy]);
-    T *out = output + idy * D;
-    const T *tab = table + id * D;
-    for (int i = idx; i < D; i += blockDim.x) {
-      if (PaddingFlag) {
-        if (id == padding_idx)
-          out[i] = static_cast<T>(0);
-        else
-          out[i] = tab[i];
-      } else {
-        out[i] = tab[i];
-      }
-    }
-    idy += blockDim.y * gridDim.x;
-  }
-}
-
-template <typename T, typename IdT>
-__global__ void LookupTableV2Grad(T *table,
-                                  const T *output,
-                                  const IdT *ids,
-                                  const int64_t N,
-                                  const int64_t K,
-                                  const int64_t D) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDim.x;
-
-  while (idy < K) {
-    auto id = static_cast<int64_t>(ids[idy]);
-    const T *out = output + idy * D;
-    T *tab = table + id * D;
-#ifdef PADDLE_WITH_CUDA
-    phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
-#else
-    for (int i = idx; i < D; i += blockDim.x) {
-      phi::CudaAtomicAdd(&tab[i], out[i]);
-    }
-#endif
-    idy += blockDim.y * gridDim.x;
-  }
-}
-
-template <typename T>
-struct LookupTableV2CUDAFunctor {
-  LookupTableV2CUDAFunctor(const framework::ExecutionContext &context,
-                           const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *table_t = context_.Input<phi::DenseTensor>("W");
-    auto *output_t = context_.Output<phi::DenseTensor>("Out");
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t_->numel();
-
-    const int gridx = 2 * context_.cuda_device_context().GetSMCount();
-    dim3 threads(256, 4);
-    dim3 grids(gridx, 1);
-
-    const auto *table = table_t->template data<T>();
-    const auto *ids = ids_t_->template data<IdT>();
-    auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-    auto stream = context_.cuda_device_context().stream();
-
-    if (padding_idx == -1) {
-      LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
-          output, table, ids, N, K, D, padding_idx);
-    } else {
-      LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
-          output, table, ids, N, K, D, padding_idx);
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2CUDAFunctor<T> functor(context, ids_t);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()),
-                                functor);
-  }
-};
-
-template <typename InT, typename OutT>
-__global__ void InputTypeConvert(const InT *in_ids,
-                                 const int64_t K,
-                                 OutT *out_ids) {
-  for (int i = 0; i < K; i++) {
-    out_ids[i] = static_cast<OutT>(in_ids[i]);
-  }
-}
-
-template <typename T>
-struct LookupTableV2GradCUDAFunctor {
-  LookupTableV2GradCUDAFunctor(const framework::ExecutionContext &context,
-                               const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto &dev_ctx = context_.template device_context<phi::GPUContext>();
-    bool is_sparse = context_.Attr<bool>("is_sparse");
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *table = context_.Input<phi::DenseTensor>("W");
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::SelectedRows>(framework::GradVarName("W"));
-
-      const auto *ids_data = ids_t_->template data<IdT>();
-      int64_t ids_num = ids_t_->numel();
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
-      auto stream = dev_ctx.stream();
-      phi::Vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      auto gpu_place = context_.GetPlace();
-
-      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
-      if (!std::is_same<IdT, int64_t>::value) {
-        InputTypeConvert<<<grids, threads, 0, stream>>>(
-            ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
-      } else {
-        memory::Copy(gpu_place,
-                     mixv_new_rows.CUDAMutableData(gpu_place),
-                     gpu_place,
-                     ids_data,
-                     ids_num * sizeof(int64_t),
-                     stream);
-      }
-
-      mixv_new_rows.CopyToCPU();
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table->dims()[1]});
-      d_table_value->template mutable_data<T>(gpu_place);
-
-      auto *d_table_data = d_table_value->template data<T>();
-      auto *d_output_data = d_output->template data<T>();
-      auto d_output_dims = d_output->dims();
-      auto d_output_dims_2d =
-          common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
-      PADDLE_ENFORCE_EQ(d_table_value->dims(),
-                        d_output_dims_2d,
-                        phi::errors::InvalidArgument(
-                            "ShapeError: The shape of lookup_table@Grad and "
-                            "output@Grad should be same. "
" - "But received lookup_table@Grad's shape = [%s], " - "output@Grad's shape = [%s].", - d_table_value->dims(), - d_output_dims_2d)); - memory::Copy(gpu_place, - d_table_data, - gpu_place, - d_output_data, - d_output->numel() * sizeof(T), - stream); - - } else { - auto d_output_t = - context_.Input(framework::GradVarName("Out")); - auto d_table_t = - context_.Output(framework::GradVarName("W")); - - int N = d_table_t->dims()[0]; - int D = d_table_t->dims()[1]; - int K = ids_t_->numel(); - - const T *d_output = d_output_t->template data(); - const auto *ids = ids_t_->template data(); - T *d_table = d_table_t->mutable_data(context_.GetPlace()); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#endif - - const int gridx = 2 * dev_ctx.GetSMCount(); - dim3 threads(128, 8); - dim3 grids(gridx, 1); - LookupTableV2Grad<<>>( - d_table, d_output, ids, N, K, D); - } - } - - private: - const framework::ExecutionContext &context_; - const phi::DenseTensor *ids_t_; -}; - -template -class LookupTableV2GradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *ids_t = context.Input("Ids"); - LookupTableV2GradCUDAFunctor functor(context, ids_t); - framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()), - functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h deleted file mode 100644 index 8e3ce198e060bf..00000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ /dev/null @@ -1,285 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-
-using SelectedRows = phi::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-template <typename InT, typename OutT>
-static std::vector<OutT> CopyIdsToVector(const phi::DenseTensor &ids) {
-  auto numel = ids.numel();
-  const auto *src = ids.data<InT>();
-  std::vector<OutT> ret(numel);
-  if (std::is_same<InT, OutT>::value) {
-    std::memcpy(ret.data(), src, numel * sizeof(InT));
-  } else {
-    for (decltype(numel) i = 0; i < numel; ++i) {
-      ret[i] = src[i];
-    }
-  }
-  return ret;
-}
-
-template <typename T>
-struct LookupTableV2CPUFunctor {
-  LookupTableV2CPUFunctor(const framework::ExecutionContext &context,
-                          const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *output_t = context_.Output<phi::DenseTensor>("Out");  // float tensor
-    auto *table_var = context_.InputVar("W");
-
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-
-    auto ids = CopyIdsToVector<IdT, int64_t>(*ids_t_);
-    auto ids_numel = static_cast<int64_t>(ids.size());
-
-    if (table_var->template IsType<phi::DenseTensor>()) {
-      const auto &table_t = table_var->template Get<phi::DenseTensor>();
-      int64_t row_number = table_t.dims()[0];
-      int64_t row_width = table_t.dims()[1];
-
-      auto *table = table_t.template data<T>();
-      auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids[i],
-              row_number,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  row_number,
-                  ids[i]));
-          PADDLE_ENFORCE_GE(
-              ids[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  row_number,
-                  ids[i]));
-          memcpy(output + i * row_width,
-                 table + ids[i] * row_width,
-                 row_width * sizeof(T));
-        }
-      }
-    } else if (table_var->template IsType<phi::SelectedRows>()) {
-      const auto &table_t = table_var->template Get<phi::SelectedRows>();
-      int64_t row_width = table_t.value().dims()[1];
-      const auto *table = table_t.value().template data<T>();
-      auto *output = output_t->template mutable_data<T>(context_.GetPlace());
-      auto input_data_type =
-          framework::TransToProtoVarType(table_t.value().dtype());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_GE(
-              ids[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0. But received %ld",
-                  ids[i]));
-          auto id_index = table_t.Index(ids[i]);
-          PADDLE_ENFORCE_GE(
-              id_index,
-              0,
-              phi::errors::InvalidArgument(
-                  "the input key should be exists. But received %d.",
-                  id_index));
-
-          if (input_data_type == framework::proto::VarType::BF16) {
-            memcpy(output + i * row_width,
-                   table + id_index * row_width,
-                   row_width * sizeof(T));
-          } else {
-            auto &dev_ctx =
-                context_.template device_context<phi::CPUContext>();
-            auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
-            blas.VCOPY(row_width,
-                       table + id_index * row_width,
-                       output + i * row_width);
-          }
-        }
-      }
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2CPUFunctor<T> functor(context, ids);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()),
-                                functor);
-  }
-};
-
-template <typename T>
-struct LookupTableV2GradCPUFunctor {
-  LookupTableV2GradCPUFunctor(const framework::ExecutionContext &context,
-                              const phi::DenseTensor *ids_t)
-      : context_(context), ids_t_(ids_t) {}
-
-  template <typename IdT>
-  void apply() {
-    auto *table_var = context_.InputVar("W");
-    DDim table_dim;
-    if (table_var->template IsType<phi::DenseTensor>()) {
-      table_dim = context_.Input<phi::DenseTensor>("W")->dims();
-    } else if (table_var->template IsType<phi::SelectedRows>()) {
-      auto *table_t = context_.Input<phi::SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(phi::errors::InvalidArgument(
-          "The parameter W of a LookupTableV2 "
-          "must be either phi::DenseTensor or SelectedRows"));
-    }
-
-    int64_t padding_idx = context_.Attr<int64_t>("padding_idx");
-    bool is_sparse = context_.Attr<bool>("is_sparse");
-
-    auto ids = CopyIdsToVector<IdT, int64_t>(*ids_t_);
-    auto ids_num = static_cast<int64_t>(ids.size());
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::SelectedRows>(framework::GradVarName("W"));
-
-      d_table->set_rows(ids);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table_dim[1]});
-
-      d_table_value->template mutable_data<T>(context_.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->template data<T>();
-      auto *d_table_data = d_table_value->template data<T>();
-
-      auto d_output_dims = d_output->dims();
-      auto d_output_dims_2d =
-          common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
-      PADDLE_ENFORCE_EQ(d_table_value->dims(),
-                        d_output_dims_2d,
-                        phi::errors::InvalidArgument(
-                            "ShapeError: The shape of lookup_table@Grad and "
-                            "output@Grad should be same. "
-                            "But received lookup_table@Grad's shape = [%s], "
-                            "output@Grad's shape = [%s].",
-                            d_table_value->dims(),
-                            d_output_dims_2d));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
-
-    } else {
-      auto *d_output =
-          context_.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto *d_table =
-          context_.Output<phi::DenseTensor>(framework::GradVarName("W"));
-      auto *ids_data = ids.data();
-
-      int64_t N = table_dim[0];
-      int64_t D = table_dim[1];
-
-      auto *d_output_data = d_output->template data<T>();
-      auto *d_table_data =
-          d_table->template mutable_data<T>(context_.GetPlace());
-
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      for (int64_t i = 0; i < ids_num; ++i) {
-        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
-          // the gradient of padding_idx should be 0, already done by memset,
-          // so do nothing.
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids_data[i],
-              N,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  N,
-                  ids_data[i]));
-          PADDLE_ENFORCE_GE(
-              ids_data[i],
-              0,
-              phi::errors::InvalidArgument(
-                  "Variable value (input) of OP(fluid.layers.embedding) "
-                  "expected >= 0 and < %ld, but got %ld. Please check input "
-                  "value.",
-                  N,
-                  ids_data[i]));
-          for (int j = 0; j < D; ++j) {
-            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
-          }
-        }
-      }
-    }
-  }
-
- private:
-  const framework::ExecutionContext &context_;
-  const phi::DenseTensor *ids_t_;
-};
-
-template <typename T>
-class LookupTableV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *ids = context.Input<phi::DenseTensor>("Ids");
-    LookupTableV2GradCPUFunctor<T> functor(context, ids);
-    framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()),
-                                functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle

From e1d494a8813fa856dee2a690a1c220579c77285f Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 6 May 2024 16:27:27 +0800
Subject: [PATCH 2/2] ci