Merge remote-tracking branch 'upstream/develop' into winters009

PaddlePaddle · Oct 28, 2023 · 9f95328 · 9f95328
2 parents ea1e6fc + dbba655
commit 9f95328
Show file tree

Hide file tree

Showing 616 changed files with 17,623 additions and 7,812 deletions.
diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake
@@ -5,8 +5,7 @@ set(JEMALLOC_DOWNLOAD_DIR
 set(JEMALLOC_PROJECT "extern_jemalloc")
 set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc)
 set(JEMALLOC_PREFIX_DIR ${THIRD_PARTY_PATH}/jemalloc)
-set(JEMALLOC_URL
-    ${GIT_URL}/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2)
+set(JEMALLOC_URL https://paddle-ci.gz.bcebos.com/jemalloc-5.1.0.tar.bz2)
 set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc)
 set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include)
 

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
@@ -24,7 +24,7 @@ set(XPU_XFT_LIB_NAME "libxft.so")
 set(XPU_XPTI_LIB_NAME "libxpti.so")
 
 if(NOT DEFINED XPU_BASE_DATE)
-  set(XPU_BASE_DATE "20230926")
+  set(XPU_BASE_DATE "20231023")
 endif()
 set(XPU_XCCL_BASE_VERSION "1.0.53.6")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
@@ -499,12 +499,15 @@ function(cc_test_run TARGET_NAME)
       NAME ${TARGET_NAME}
       COMMAND ${cc_test_COMMAND} ${cc_test_ARGS}
       WORKING_DIRECTORY ${cc_test_DIR})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
-                                              FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
-                                              FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
-                                              FLAGS_cudnn_deterministic=true)
+    set_property(
+      TEST ${TARGET_NAME}
+      PROPERTY
+        ENVIRONMENT
+        FLAGS_cpu_deterministic=true
+        FLAGS_init_allocated_mem=true
+        FLAGS_cudnn_deterministic=true
+        LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_BINARY_DIR}/python/paddle/libs:${PADDLE_BINARY_DIR}/python/paddle/base
+    )
     # No unit test should exceed 2 minutes.
     if(WIN32)
       set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
@@ -726,6 +729,7 @@ function(nv_test TARGET_NAME)
     # 2. cuda_add_executable does not support ccache.
     # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html
     add_executable(${TARGET_NAME} ${nv_test_SRCS})
+    target_compile_definitions(${TARGET_NAME} PUBLIC STATIC_PADDLE)
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}
                           ${os_dependency_modules} paddle_gtest_main phi)

diff --git a/cmake/hip.cmake b/cmake/hip.cmake
@@ -118,6 +118,11 @@ list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
 list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
+list(APPEND HIP_CXX_FLAGS -Wno-deprecated-builtins)
+list(APPEND HIP_CXX_FLAGS -Wno-switch)
+list(APPEND HIP_CXX_FLAGS -Wno-literal-conversion)
+list(APPEND HIP_CXX_FLAGS -Wno-constant-conversion)
+list(APPEND HIP_CXX_FLAGS -Wno-defaulted-function-deleted)
 
 if(WITH_CINN)
   list(APPEND HIP_CXX_FLAGS -std=c++14)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -684,6 +684,9 @@ function(prune_pybind_h)
   list(APPEND op_list "load_combine")
   list(APPEND op_list "tensorrt_engine")
 
+  # TODO(ming1753): conditional_block_infer is temporarily reserved here to avoid link errors in functions of standalone_executor
+  list(APPEND op_list "conditional_block_infer")
+
   # add fused_op in op_list
   list(APPEND op_list "fc")
   list(APPEND op_list "conv2d_fusion")

diff --git a/paddle/cinn/README.md b/paddle/cinn/README.md
@@ -0,0 +1,121 @@
+```
+                        ___                    ___          ___
+                       /\__\                  /\  \        /\  \
+                      /:/  /       ___        \:\  \       \:\  \
+                     /:/  /       /\__\        \:\  \       \:\  \
+                    /:/  /  ___  /:/__/    _____\:\  \  _____\:\  \
+                   /:/__/  /\__\/::\  \   /::::::::\__\/::::::::\__\
+                   \:\  \ /:/  /\/\:\  \__\:\~~\~~\/__/\:\~~\~~\/__/
+                    \:\  /:/  /    \:\/\__\\:\  \       \:\  \
+                     \:\/:/  /      \::/  / \:\  \       \:\  \
+                      \::/  /       /:/  /   \:\__\       \:\__\
+                       \/__/        \/__/     \/__/        \/__/
+
+```
+
+
+# CINN : Compiler Infrastructure for Neural Networks
+
+The project CINN is a machine learning compiler and executor for multiple hardware backends.
+It is designed to provide multiple layers of APIs to make tensor computation easier to define, faster to execute, and more convenient to extend with hardware backends.
+Currently, it targets x86 CPUs and Nvidia GPUs.
+
+This project is under active development.
+
+## How it works
+
+CINN lowers a traditional DNN model into a two-level intermediate representation (IR): the high-level IR (HLIR) and CINN IR.
+
+HLIR helps to define domain-specific computations and to perform overall optimizations on the IR graph;
+CINN IR helps to represent computation semantics and is finally lowered to a hardware backend.
+
+Both levels of IR have similar SSA graph, analysis, and optimization facilities.
+The schedule transform is applied on the CINN IR to do optimizations.
+
+For more details, you can refer to:
+https://github.com/PaddlePaddle/docs/tree/develop/docs/guides/cinn
+
+## Getting Started
+
+### Compile
+
+Clone PaddlePaddle first.
+
+```
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+```
+
+Build paddle with cinn:
+
+```
+cmake .. -DCINN_ONLY=OFF -DWITH_CINN=ON -DWITH_GPU=ON
+```
+
+Build cinn only:
+
+```
+cmake .. -DCINN_ONLY=ON -DWITH_CINN=ON -DWITH_GPU=ON
+```
+
+And then
+
+```
+make -j
+```
+
+### Install
+
+Install paddle with cinn:
+
+```
+pip install python/dist/paddlepaddle_gpu-xxx.whl
+```
+
+Install cinn only:
+
+```
+pip install python/dist/cinn_gpu-xxx.whl
+```
+
+Then you can import paddle in the python environment and check if a paddle version with CINN is installed.
+
+```
+import paddle
+paddle.is_compiled_with_cinn()
+```
+
+### Concepts
+
+There are two levels of APIs in CINN: the higher level is HLIR and the lower level is CINN IR. Each level has its own concepts.
+
+In HLIR
+
+- `frontend::Program`, the program that helps to define a machine learning computation.
+- `hlir::framework::Tensor`, a multi-dimensional array that helps to manage a memory buffer.
+- `hlir::framework::Program`, the final executable program in runtime. It holds many basic executable elements.
+- `hlir::framework::Graph`, the graph that represents the structure of a model. Each node in the graph represents an operator (conv2d, relu, mul, etc.).
+- `hlir::framework::GraphCompiler`, the compiler that transforms the graph representation (`hlir::framework::Graph`) of a model into an executable program (`hlir::framework::Program`).
+
+In CINN IR
+
+- `Compute`, the method to define a computation,
+- `Lower`, the method to lower a computation to the corresponding IR,
+- `LoweredFunc`, the function defined in CINN IR,
+- `Var`, a scalar variable,
+- `Expr`, an expression that represents any CINN IR node (no specified Statement node).
+
+## License
+
+CINN is licensed under the [Apache 2.0 license](LICENSE).
+
+## Acknowledgement
+
+CINN learned a lot from the following projects:
+
+- [Halide](https://github.com/halide/Halide): Referenced the design of most IR nodes,
+- [TVM](https://github.com/apache/tvm): We learned many ideas including the semantics of some schedule primitives, TOPI, NNVM, and so on,
+- [tiramisu](https://github.com/Tiramisu-Compiler): The isl usage, polyhedral compilation, schedule primitive implementation, and so on,
+- [tensorflow/xla](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): Referenced the semantics of the primitive operations.
diff --git a/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt
@@ -1,4 +1,4 @@
-# TODO(Aurelius84): new_ir_compiler depends on pd_op_dialect and could
+# TODO(Aurelius84): pir_compiler depends on pd_op_dialect and could
 # not found under CINN_ONLY mode
 if(NOT CINN_ONLY)
   set(CINN_DIALECT_BINARY_DIR

diff --git a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h
@@ -18,8 +18,8 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include "paddle/cinn/hlir/framework/new_ir/utils.h"
 #include "paddle/cinn/hlir/framework/op.h"
+#include "paddle/cinn/hlir/framework/pir/utils.h"
 #include "paddle/pir/core/attribute_base.h"
 #include "paddle/pir/core/operation.h"
 
@@ -51,7 +51,7 @@ struct GroupInfo {
  private:
   void Initialize() {
     op_pattern_kind = hlir::framework::OpPatternKind::kElementWise;
-    fn_name = hlir::framework::newir::CompatibleInfo::GroupOpsName(ops);
+    fn_name = hlir::framework::pir::CompatibleInfo::GroupOpsName(ops);
   }
 };
 
@@ -78,7 +78,7 @@ struct GroupInfoAttributeStorage : public pir::AttributeStorage {
 };
 
 struct JITInfoAttributeStorage : public pir::AttributeStorage {
-  using ParamKey = cinn::hlir::framework::newir::CUDAJITInfo;
+  using ParamKey = cinn::hlir::framework::pir::CUDAJITInfo;
   explicit JITInfoAttributeStorage(const ParamKey& key) : data_(key) {}
 
   static JITInfoAttributeStorage* Construct(const ParamKey& key) {

diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc
@@ -20,7 +20,7 @@ const GroupInfo &GroupInfoAttribute::data() const {
   return storage()->GetAsKey();
 }
 
-const cinn::hlir::framework::newir::CUDAJITInfo &CUDAJITInfoAttribute::data()
+const cinn::hlir::framework::pir::CUDAJITInfo &CUDAJITInfoAttribute::data()
     const {
   return storage()->GetAsKey();
 }

diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h b/paddle/cinn/hlir/dialect/operator/ir/op_attribute.h
@@ -44,7 +44,7 @@ class CUDAJITInfoAttribute : public pir::Attribute {
     return storage() < right.storage();
   }
 
-  const cinn::hlir::framework::newir::CUDAJITInfo& data() const;
+  const cinn::hlir::framework::pir::CUDAJITInfo& data() const;
 };
 
 }  // namespace dialect

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt
@@ -4,7 +4,19 @@ if(NOT CINN_ONLY)
     SRCS
     group_with_group_merge_pass.cc
     op_with_group_merge_pass.cc
+    cinn_group_lowering_pass.cc
     tensor_node.cc
     DEPS
+    pd_op_dialect
+    pir_compiler
+    cinn_runtime_dialect)
+
+  cinn_cc_library(
+    pd_to_cinn_pass
+    SRCS
+    pd_to_cinn_pass.cc
+    DEPS
+    drr
+    cinn_op_dialect
     pd_op_dialect)
 endif()