Commit a7114e2

Merge remote-tracking branch 'upstream/develop' into gast

0x45f committed Aug 5, 2021
2 parents d6721a0 + 0989211

Showing 555 changed files with 20,758 additions and 5,358 deletions.
27 changes: 17 additions & 10 deletions CMakeLists.txt
@@ -119,17 +119,19 @@ if(WIN32)
   endforeach(flag_var)
 endif()
 
-math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
-
 # windows build turn off warnings, use parallel compiling.
 foreach(flag_var
     CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
     CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
     CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
     CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
     string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-    # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling
-    if(NOT WITH_GPU)
+
+    # NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel;
+    # for Visual Studio generators, /MP should be added, while other generators
+    # such as Ninja have no need for /MP.
+    if("${CMAKE_GENERATOR}" STREQUAL "Visual Studio" AND NOT WITH_GPU)
+        math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
         set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
     endif()
 endforeach(flag_var)
@@ -312,6 +314,17 @@ else()
   endif()
 endif()
 
+if(WITH_DISTRIBUTE)
+  if(LINUX)
+    set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
+  endif()
+  if(WITH_ASCEND_CL)
+    # disable WITH_PSCORE for NPU before including third_party
+    MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
+    set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
+  endif()
+endif()
+
 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
 
 include(flags)  # set paddle compile flags
@@ -322,12 +335,6 @@ if(WITH_PROFILER)
   add_definitions(-DWITH_GPERFTOOLS)
 endif()
 
-if(WITH_DISTRIBUTE)
-  if(LINUX)
-    set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
-  endif()
-endif()
-
 include(ccache)   # set ccache for compilation
 include(util)     # set unittest and link libs
 include(version)  # set PADDLE_VERSION
2 changes: 1 addition & 1 deletion cmake/ccache.cmake
@@ -18,7 +18,7 @@ elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")
 
 if(SCCACHE_PATH)
   execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
-  message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
+  message(STATUS "sccache is found, use [${SCCACHE_PATH}] to speed up compilation on Windows.")
 
   set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
   set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
6 changes: 1 addition & 5 deletions cmake/generic.cmake
@@ -932,12 +932,8 @@ function(generate_dummy_static_lib)
   if(NOT dummy_GENERATOR)
     message(FATAL_ERROR "You must provide a generator file name.")
   endif()
-  # if ${dummy_GENERATOR} contains "/", it may be a file path
-  if(NOT ${dummy_GENERATOR} MATCHES ".*/.*")
-    set(dummy_GENERATOR "${CMAKE_CURRENT_LIST_DIR}/${dummy_GENERATOR}")
-  endif()
   if(NOT dummy_CONTENT)
-    set(dummy_CONTENT "${dummy_FILE_PATH} for lib ${dummy_LIB_NAME}")
+    set(dummy_CONTENT "${dummy_LIB_NAME}_dummy.c for lib ${dummy_LIB_NAME}")
   endif()
 
   configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY)
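For context on the hunk above: generate_dummy_static_lib creates a placeholder static library from a tiny generated source, and the default CONTENT string now names the generated <lib_name>_dummy.c file instead of embedding ${dummy_FILE_PATH}, so the generated bytes no longer depend on where the build tree lives. A sketch of what such a generated translation unit might look like, assuming cmake/dummy.c.in substitutes the content string into a string constant (the variable and library names here are made up):

    /* Hypothetical result of configure_file() on cmake/dummy.c.in for a
       library named my_lib; the only job of this file is to give the
       archive one non-empty object. */
    const char *my_lib_dummy = "my_lib_dummy.c for lib my_lib";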
7 changes: 5 additions & 2 deletions cmake/unity_build.cmake
@@ -77,11 +77,14 @@ function(compose_unity_target_sources TARGET TYPE)
   get_property(unity_group_index_max GLOBAL PROPERTY ${TARGET}_${TYPE}_group_index)
   foreach(src ${ARGN})
     set(unity_file "")
-    # UB use absolute path of source.
+    # Note(zhouwei25): UB uses the path relative to CMAKE_SOURCE_DIR.
+    # If an absolute path were used, the sccache/ccache hit rate would be reduced.
     if(IS_ABSOLUTE ${src})
-      set(src_absolute_path ${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src})
     else()
       set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src_absolute_path})
     endif()
     # If `unity_group_index_max` is empty, there is no combination
     # relationship.
@@ -106,7 +109,7 @@ function(compose_unity_target_sources TARGET TYPE)
         set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CU_BEFORE_CODE})
       endif()
     endif()
-    set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_absolute_path}\"")
+    set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_relative_path}\"")
     set(unity_target_sources ${unity_target_sources} ${unity_file})
     break()
   endif()
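The Note in the hunk above is the whole story: a unity build stitches many sources into one generated file of #include lines, and when those lines carry absolute paths the generated file differs across machines and build trees, so ccache/sccache see different inputs and miss. A sketch of what a generated unity source could contain after this change, with made-up member files, now addressed relative to CMAKE_SOURCE_DIR:

    // Hypothetical generated unity file for some target; the included
    // sources are illustrative, not real Paddle files.
    #include "paddle/fluid/operators/example_a_op.cc"
    #include "paddle/fluid/operators/example_b_op.cc"
    #include "paddle/fluid/operators/example_c_op.cc"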
64 changes: 49 additions & 15 deletions paddle/fluid/distributed/service/communicator.h
@@ -68,31 +68,62 @@ class BlockingQueue {
   }
 
   bool Push(const T &elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.push_back(elem);
+    std::unique_lock<std::mutex> lock(mutex_);
+    WaitForWrite(lock);
+
+    queue_.push_back(elem);
+
+    Notify();
+    return true;
+  }
+  bool WaitForWrite(std::unique_lock<std::mutex> &lock) {  // NOLINT
+    while (FullUnlocked()) {
+      if (empty_waiters_ != 0) {
+        empty_cond_.notify_one();
+      }
+      full_waiters_++;
+      full_cond_.wait(lock);
+      full_waiters_--;
     }
-    cv_.notify_one();
     return true;
   }
 
-  bool Push(T &&elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.emplace_back(std::move(elem));
+  bool WaitForRead(std::unique_lock<std::mutex> &lock) {  // NOLINT
+    while (EmptyUnlocked()) {
+      if (full_waiters_ != 0) {
+        full_cond_.notify_one();
+      }
+      empty_waiters_++;
+      empty_cond_.wait(lock);
+      empty_waiters_--;
     }
-    cv_.notify_one();
     return true;
   }
+  bool EmptyUnlocked() { return queue_.empty(); }
+
+  bool FullUnlocked() { return queue_.size() >= capacity_; }
+  void Notify() {
+    if (empty_waiters_ != 0 && (!EmptyUnlocked())) {
+      empty_cond_.notify_one();
+    }
+    if (full_waiters_ != 0 && (!FullUnlocked())) {
+      full_cond_.notify_one();
+    }
+  }
+
+  bool Push(T &&elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    WaitForWrite(lock);
+    queue_.emplace_back(std::move(elem));
+
+    Notify();
+    return true;
+  }
 
   T Pop() {
     std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [=] { return !queue_.empty(); });
+    WaitForRead(lock);
     T rc(std::move(queue_.front()));
     queue_.pop_front();
-    cv_.notify_one();
+    Notify();
    return rc;
   }
 
@@ -107,11 +138,14 @@ class BlockingQueue {
   }
 
  private:
+  int empty_waiters_ = 0;
+  int full_waiters_ = 0;
+  std::condition_variable empty_cond_;
+  std::condition_variable full_cond_;
   const size_t capacity_;
   std::deque<T> queue_;
 
   mutable std::mutex mutex_;
-  std::condition_variable cv_;
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
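The rewrite above splits the queue's single condition variable (cv_) into separate empty_cond_/full_cond_ variables with waiter counters, so a Push wakes only a waiting consumer and a Pop wakes only a waiting producer instead of kicking an arbitrary waiter. A minimal, self-contained sketch of the core two-condition-variable idea (without the waiter-count bookkeeping); the class and member names are illustrative, not Paddle's:

    #include <condition_variable>
    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>

    template <typename T>
    class BoundedQueue {
     public:
      explicit BoundedQueue(std::size_t capacity) : capacity_(capacity) {}

      void Push(T value) {
        std::unique_lock<std::mutex> lock(mutex_);
        // Producers block only on "queue is full"; consumers never wait here.
        not_full_.wait(lock, [&] { return queue_.size() < capacity_; });
        queue_.push_back(std::move(value));
        not_empty_.notify_one();  // wake exactly one waiting consumer
      }

      T Pop() {
        std::unique_lock<std::mutex> lock(mutex_);
        // Consumers block only on "queue is empty".
        not_empty_.wait(lock, [&] { return !queue_.empty(); });
        T value = std::move(queue_.front());
        queue_.pop_front();
        not_full_.notify_one();  // wake exactly one waiting producer
        return value;
      }

     private:
      const std::size_t capacity_;
      std::deque<T> queue_;
      std::mutex mutex_;
      std::condition_variable not_full_;   // producers wait here
      std::condition_variable not_empty_;  // consumers wait here
    };

    int main() {
      BoundedQueue<int> q(2);
      std::thread producer([&] {
        for (int i = 0; i < 5; ++i) q.Push(i);
      });
      std::thread consumer([&] {
        for (int i = 0; i < 5; ++i) std::cout << q.Pop() << "\n";
      });
      producer.join();
      consumer.join();
      return 0;
    }

The waiter counters in the actual patch go one step further: Notify() skips the notify_one() call entirely when no thread of the relevant kind is waiting, trading two integers for fewer futile wakeups.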
11 changes: 8 additions & 3 deletions paddle/fluid/framework/CMakeLists.txt
@@ -188,23 +188,28 @@ cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 
 cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference)
 
+IF(WITH_XPU)
+cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
 
 cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute shape_inference op_info operator glog version)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 
 cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
 cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
 
-cc_library(program_processing SRCS program_processing.cc DEPS framework_proto)
+cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc)
 cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing)
 
 if(WITH_GPU)
@@ -405,7 +410,7 @@ configure_file(commit.h.in commit.h)
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
 include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include)
 
 if(WITH_ROCM)
   hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
5 changes: 4 additions & 1 deletion paddle/fluid/framework/details/CMakeLists.txt
@@ -134,11 +134,14 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
         modify_op_lock_and_record_event_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
         fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
-        sync_batch_norm_pass runtime_context_cache_pass)
+        sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass
+        fix_op_run_order_pass)
 if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
+cc_test(build_strategy_test SRCS build_strategy_test.cc
+        DEPS build_strategy op_registry op_proto_maker graph)
 
 if (WITH_MKLDNN)
   target_link_libraries(build_strategy mkldnn_placement_pass)
15 changes: 14 additions & 1 deletion paddle/fluid/framework/details/build_strategy.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
 
 DECLARE_bool(use_mkldnn);
+DECLARE_bool(convert_all_blocks);
 
 namespace paddle {
 namespace framework {
@@ -312,6 +313,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                 DeviceType use_device) const {
 #endif
   VLOG(1) << "apply all passes";
+  if (FLAGS_convert_all_blocks) {
+    PADDLE_ENFORCE_EQ(
+        graph->IsMainGraph(), true,
+        platform::errors::InvalidArgument("This graph is not main_graph"));
+  }
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
@@ -432,7 +438,14 @@
       }
     }
     VLOG(1) << "Start Apply Pass " << pass->Type();
-    graph = pass->Apply(graph);
+    if (FLAGS_convert_all_blocks) {
+      for (size_t i = 0; i < graph->SubGraphsSize(); ++i) {
+        VLOG(3) << "Apply Pass " << pass->Type() << " to SubGraph " << i;
+        pass->Apply(graph->GetSubGraph(i));
+      }
+    } else {
+      graph = pass->Apply(graph);
+    }
     VLOG(1) << "Finish Apply Pass " << pass->Type();
   }
   VLOG(1) << "All Passes Applied";
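DECLARE_bool(convert_all_blocks) is the gflags pattern for referring, in this translation unit, to a flag that is defined (DEFINE_bool) elsewhere in the codebase; when the flag is on, every pass is applied to each sub-graph of the main graph rather than to the main graph alone. A tiny sketch of the declare/define pairing, assuming gflags is available and using a made-up flag name:

    // flag_owner.cc -- defines the flag and owns its storage.
    #include <gflags/gflags.h>
    #include <iostream>

    DEFINE_bool(apply_to_all, false, "Apply each pass to every sub-graph.");

    // Any other .cc file would write: DECLARE_bool(apply_to_all);
    // and could then read FLAGS_apply_to_all.

    int main(int argc, char *argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
      std::cout << "apply_to_all = " << std::boolalpha
                << FLAGS_apply_to_all << "\n";
      return 0;
    }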
3 changes: 3 additions & 0 deletions paddle/fluid/framework/details/build_strategy.h
@@ -100,6 +100,9 @@ struct BuildStrategy {
   // while running.
   bool cache_runtime_context_{false};
 
+  // Fix the op run order.
+  bool fix_op_run_order_{false};
+
   // Operator fusion
   // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
   // cycle.

1 comment on commit a7114e2

@paddle-bot-old:

Congratulations! Your pull request passed all required CI checks. You can ask the reviewer(s) to approve and merge. 🎉
