Commit a7114e2

Merge remote-tracking branch 'upstream/develop' into gast

0x45f committed Aug 5, 2021
2 parents d6721a0 + 0989211

Showing 555 changed files with 20,758 additions and 5,358 deletions.
27 changes: 17 additions & 10 deletions CMakeLists.txt
@@ -119,17 +119,19 @@ if(WIN32)
   endforeach(flag_var)
 endif()
 
-math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
-
 # windows build turn off warnings, use parallel compiling.
 foreach(flag_var
     CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
     CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
     CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
     CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
     string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-    # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling
-    if(NOT WITH_GPU)
+
+    # NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel;
+    # for Visual Studio generators, /MP should be added, while other generators
+    # such as Ninja have no need for /MP.
+    if("${CMAKE_GENERATOR}" STREQUAL "Visual Studio" AND NOT WITH_GPU)
+        math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
         set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
     endif()
 endforeach(flag_var)
@@ -312,6 +314,17 @@ else()
   endif()
 endif()
 
+if(WITH_DISTRIBUTE)
+  if(LINUX)
+    set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
+  endif()
+  if(WITH_ASCEND_CL)
+    # disable WITH_PSCORE for NPU before including third_party
+    MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
+    set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
+  endif()
+endif()
+
 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
 
 include(flags)  # set paddle compile flags
@@ -322,12 +335,6 @@ if(WITH_PROFILER)
   add_definitions(-DWITH_GPERFTOOLS)
 endif()
 
-if(WITH_DISTRIBUTE)
-  if(LINUX)
-    set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
-  endif()
-endif()
-
 include(ccache)   # set ccache for compilation
 include(util)     # set unittest and link libs
 include(version)  # set PADDLE_VERSION
2 changes: 1 addition & 1 deletion cmake/ccache.cmake
@@ -18,7 +18,7 @@ elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")
 
 if(SCCACHE_PATH)
   execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
-  message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
+  message(STATUS "sccache is found, use [${SCCACHE_PATH}] to speed up compilation on Windows.")
 
   set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
   set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
6 changes: 1 addition & 5 deletions cmake/generic.cmake
@@ -932,12 +932,8 @@ function(generate_dummy_static_lib)
   if(NOT dummy_GENERATOR)
     message(FATAL_ERROR "You must provide a generator file name.")
   endif()
-  # if ${dummy_GENERATOR} contains "/", it may be a file path
-  if(NOT ${dummy_GENERATOR} MATCHES ".*/.*")
-    set(dummy_GENERATOR "${CMAKE_CURRENT_LIST_DIR}/${dummy_GENERATOR}")
-  endif()
   if(NOT dummy_CONTENT)
-    set(dummy_CONTENT "${dummy_FILE_PATH} for lib ${dummy_LIB_NAME}")
+    set(dummy_CONTENT "${dummy_LIB_NAME}_dummy.c for lib ${dummy_LIB_NAME}")
   endif()
 
   configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY)
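For context on the hunk above: generate_dummy_static_lib creates a placeholder static library from a tiny generated source, and the default CONTENT string now names the generated <lib_name>_dummy.c file instead of embedding ${dummy_FILE_PATH}, so the generated bytes no longer depend on where the build tree lives. A sketch of what such a generated translation unit might look like, assuming cmake/dummy.c.in substitutes the content string into a string constant (the variable and library names here are made up):

    /* Hypothetical result of configure_file() on cmake/dummy.c.in for a
       library named my_lib; the only job of this file is to give the
       archive one non-empty object. */
    const char *my_lib_dummy = "my_lib_dummy.c for lib my_lib";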
7 changes: 5 additions & 2 deletions cmake/unity_build.cmake
@@ -77,11 +77,14 @@ function(compose_unity_target_sources TARGET TYPE)
   get_property(unity_group_index_max GLOBAL PROPERTY ${TARGET}_${TYPE}_group_index)
   foreach(src ${ARGN})
     set(unity_file "")
-    # UB use absolute path of source.
+    # Note(zhouwei25): UB uses the path relative to CMAKE_SOURCE_DIR.
+    # If an absolute path were used, the sccache/ccache hit rate would be reduced.
     if(IS_ABSOLUTE ${src})
-      set(src_absolute_path ${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src})
     else()
       set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src_absolute_path})
     endif()
     # If `unity_group_index_max` is empty, there is no combination
     # relationship.
@@ -106,7 +109,7 @@ function(compose_unity_target_sources TARGET TYPE)
         set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CU_BEFORE_CODE})
       endif()
     endif()
-    set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_absolute_path}\"")
+    set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_relative_path}\"")
     set(unity_target_sources ${unity_target_sources} ${unity_file})
     break()
   endif()
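The Note in the hunk above is the whole story: a unity build stitches many sources into one generated file of #include lines, and when those lines carry absolute paths the generated file differs across machines and build trees, so ccache/sccache see different inputs and miss. A sketch of what a generated unity source could contain after this change, with made-up member files, now addressed relative to CMAKE_SOURCE_DIR:

    // Hypothetical generated unity file for some target; the included
    // sources are illustrative, not real Paddle files.
    #include "paddle/fluid/operators/example_a_op.cc"
    #include "paddle/fluid/operators/example_b_op.cc"
    #include "paddle/fluid/operators/example_c_op.cc"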
64 changes: 49 additions & 15 deletions paddle/fluid/distributed/service/communicator.h
@@ -68,31 +68,62 @@ class BlockingQueue {
   }
 
   bool Push(const T &elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.push_back(elem);
+    std::unique_lock<std::mutex> lock(mutex_);
+    WaitForWrite(lock);
+
+    queue_.push_back(elem);
+
+    Notify();
+    return true;
+  }
+  bool WaitForWrite(std::unique_lock<std::mutex> &lock) {  // NOLINT
+    while (FullUnlocked()) {
+      if (empty_waiters_ != 0) {
+        empty_cond_.notify_one();
+      }
+      full_waiters_++;
+      full_cond_.wait(lock);
+      full_waiters_--;
     }
-    cv_.notify_one();
     return true;
   }
 
-  bool Push(T &&elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.emplace_back(std::move(elem));
+  bool WaitForRead(std::unique_lock<std::mutex> &lock) {  // NOLINT
+    while (EmptyUnlocked()) {
+      if (full_waiters_ != 0) {
+        full_cond_.notify_one();
+      }
+      empty_waiters_++;
+      empty_cond_.wait(lock);
+      empty_waiters_--;
     }
-    cv_.notify_one();
     return true;
   }
+  bool EmptyUnlocked() { return queue_.empty(); }
+
+  bool FullUnlocked() { return queue_.size() >= capacity_; }
+  void Notify() {
+    if (empty_waiters_ != 0 && (!EmptyUnlocked())) {
+      empty_cond_.notify_one();
+    }
+    if (full_waiters_ != 0 && (!FullUnlocked())) {
+      full_cond_.notify_one();
+    }
+  }
+
+  bool Push(T &&elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    WaitForWrite(lock);
+    queue_.emplace_back(std::move(elem));
+
+    Notify();
+    return true;
+  }
 
   T Pop() {
     std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [=] { return !queue_.empty(); });
+    WaitForRead(lock);
     T rc(std::move(queue_.front()));
     queue_.pop_front();
-    cv_.notify_one();
+    Notify();
    return rc;
   }
 
@@ -107,11 +138,14 @@ class BlockingQueue {
   }
 
  private:
+  int empty_waiters_ = 0;
+  int full_waiters_ = 0;
+  std::condition_variable empty_cond_;
+  std::condition_variable full_cond_;
   const size_t capacity_;
   std::deque<T> queue_;
 
   mutable std::mutex mutex_;
-  std::condition_variable cv_;
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
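The rewrite above splits the queue's single condition variable (cv_) into separate empty_cond_/full_cond_ variables with waiter counters, so a Push wakes only a waiting consumer and a Pop wakes only a waiting producer instead of kicking an arbitrary waiter. A minimal, self-contained sketch of the core two-condition-variable idea (without the waiter-count bookkeeping); the class and member names are illustrative, not Paddle's:

    #include <condition_variable>
    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>

    template <typename T>
    class BoundedQueue {
     public:
      explicit BoundedQueue(std::size_t capacity) : capacity_(capacity) {}

      void Push(T value) {
        std::unique_lock<std::mutex> lock(mutex_);
        // Producers block only on "queue is full"; consumers never wait here.
        not_full_.wait(lock, [&] { return queue_.size() < capacity_; });
        queue_.push_back(std::move(value));
        not_empty_.notify_one();  // wake exactly one waiting consumer
      }

      T Pop() {
        std::unique_lock<std::mutex> lock(mutex_);
        // Consumers block only on "queue is empty".
        not_empty_.wait(lock, [&] { return !queue_.empty(); });
        T value = std::move(queue_.front());
        queue_.pop_front();
        not_full_.notify_one();  // wake exactly one waiting producer
        return value;
      }

     private:
      const std::size_t capacity_;
      std::deque<T> queue_;
      std::mutex mutex_;
      std::condition_variable not_full_;   // producers wait here
      std::condition_variable not_empty_;  // consumers wait here
    };

    int main() {
      BoundedQueue<int> q(2);
      std::thread producer([&] {
        for (int i = 0; i < 5; ++i) q.Push(i);
      });
      std::thread consumer([&] {
        for (int i = 0; i < 5; ++i) std::cout << q.Pop() << "\n";
      });
      producer.join();
      consumer.join();
      return 0;
    }

The waiter counters in the actual patch go one step further: Notify() skips the notify_one() call entirely when no thread of the relevant kind is waiting, trading two integers for fewer futile wakeups.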
11 changes: 8 additions & 3 deletions paddle/fluid/framework/CMakeLists.txt
@@ -188,23 +188,28 @@ cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 
 cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference)
 
+IF(WITH_XPU)
+cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
 
 cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute shape_inference op_info operator glog version)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 
 cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
 cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
 
-cc_library(program_processing SRCS program_processing.cc DEPS framework_proto)
+cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc)
 cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing)
 
 if(WITH_GPU)
@@ -405,7 +410,7 @@ configure_file(commit.h.in commit.h)
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
 include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include)
 
 if(WITH_ROCM)
   hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
5 changes: 4 additions & 1 deletion paddle/fluid/framework/details/CMakeLists.txt
@@ -134,11 +134,14 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
         modify_op_lock_and_record_event_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
         fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
-        sync_batch_norm_pass runtime_context_cache_pass)
+        sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass
+        fix_op_run_order_pass)
 if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
+cc_test(build_strategy_test SRCS build_strategy_test.cc
+        DEPS build_strategy op_registry op_proto_maker graph)
 
 if (WITH_MKLDNN)
   target_link_libraries(build_strategy mkldnn_placement_pass)
15 changes: 14 additions & 1 deletion paddle/fluid/framework/details/build_strategy.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
 
 DECLARE_bool(use_mkldnn);
+DECLARE_bool(convert_all_blocks);
 
 namespace paddle {
 namespace framework {
@@ -312,6 +313,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                 DeviceType use_device) const {
 #endif
   VLOG(1) << "apply all passes";
+  if (FLAGS_convert_all_blocks) {
+    PADDLE_ENFORCE_EQ(
+        graph->IsMainGraph(), true,
+        platform::errors::InvalidArgument("This graph is not main_graph"));
+  }
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
@@ -432,7 +438,14 @@
       }
     }
     VLOG(1) << "Start Apply Pass " << pass->Type();
-    graph = pass->Apply(graph);
+    if (FLAGS_convert_all_blocks) {
+      for (size_t i = 0; i < graph->SubGraphsSize(); ++i) {
+        VLOG(3) << "Apply Pass " << pass->Type() << " to SubGraph " << i;
+        pass->Apply(graph->GetSubGraph(i));
+      }
+    } else {
+      graph = pass->Apply(graph);
+    }
     VLOG(1) << "Finish Apply Pass " << pass->Type();
   }
   VLOG(1) << "All Passes Applied";
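DECLARE_bool(convert_all_blocks) is the gflags pattern for referring, in this translation unit, to a flag that is defined (DEFINE_bool) elsewhere in the codebase; when the flag is on, every pass is applied to each sub-graph of the main graph rather than to the main graph alone. A tiny sketch of the declare/define pairing, assuming gflags is available and using a made-up flag name:

    // flag_owner.cc -- defines the flag and owns its storage.
    #include <gflags/gflags.h>
    #include <iostream>

    DEFINE_bool(apply_to_all, false, "Apply each pass to every sub-graph.");

    // Any other .cc file would write: DECLARE_bool(apply_to_all);
    // and could then read FLAGS_apply_to_all.

    int main(int argc, char *argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
      std::cout << "apply_to_all = " << std::boolalpha
                << FLAGS_apply_to_all << "\n";
      return 0;
    }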
3 changes: 3 additions & 0 deletions paddle/fluid/framework/details/build_strategy.h
@@ -100,6 +100,9 @@ struct BuildStrategy {
   // while running.
   bool cache_runtime_context_{false};
 
+  // Fix the op run order.
+  bool fix_op_run_order_{false};
+
   // Operator fusion
   // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
   // cycle.

1 comment on commit a7114e2

@paddle-bot-old:

Congratulations! Your pull request passed all required CI checks. You can ask the reviewer(s) to approve and merge. 🎉
