From 9feb0c9b6ed30764baea16194012d4df295cd0b5 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 19 Sep 2023 23:18:08 +0200 Subject: [PATCH 01/57] Revert "[libc][cmake] Tidy compiler includes (#66783)" (#66822) This reverts commit a35a3b75b219247eb9ff6784d1a0fe562f72d415. This broke libc benchmarks. --- libc/CMakeLists.txt | 5 --- libc/cmake/modules/LLVMLibCObjectRules.cmake | 31 +++++++++++-------- libc/cmake/modules/LLVMLibCTestRules.cmake | 32 +++++++++++++++----- libc/utils/HdrGen/CMakeLists.txt | 2 +- libc/utils/LibcTableGenUtil/CMakeLists.txt | 1 - 5 files changed, 43 insertions(+), 28 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 414be906336bf..0cec6fc07d982 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -7,11 +7,6 @@ endif() include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake NO_POLICY_SCOPE) -# `llvm-project/llvm/CMakeLists.txt` adds the following directive -# `include_directories( ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})` -# We undo it to be able to precisely control what is getting included. 
-set_directory_properties(PROPERTIES INCLUDE_DIRECTORIES "") - # Default to C++17 set(CMAKE_CXX_STANDARD 17) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index c10b81f1af8cb..994437f55d274 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -151,6 +151,7 @@ function(_build_gpu_objects fq_target_name internal_target_name) ${ARGN} ) + set(include_dirs ${LIBC_SOURCE_DIR} ${LIBC_INCLUDE_DIR}) set(common_compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS}) if(NOT ADD_GPU_OBJ_CXX_STANDARD) set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) @@ -188,10 +189,13 @@ function(_build_gpu_objects fq_target_name internal_target_name) ) target_compile_options(${gpu_target_name} PRIVATE ${compile_options}) - target_include_directories(${gpu_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${gpu_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${gpu_target_name} PRIVATE ${include_dirs}) target_compile_definitions(${gpu_target_name} PRIVATE LIBC_COPT_PUBLIC_PACKAGING) - set_target_properties(${gpu_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}) + set_target_properties( + ${gpu_target_name} + PROPERTIES + CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD} + ) if(ADD_GPU_OBJ_DEPENDS) add_dependencies(${gpu_target_name} ${ADD_GPU_OBJ_DEPENDS}) endif() @@ -257,8 +261,7 @@ function(_build_gpu_objects fq_target_name internal_target_name) target_compile_options(${fq_target_name} PRIVATE "SHELL:-Xclang -fembed-offload-object=${packaged_gpu_binary}") endforeach() - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${include_dirs}) add_dependencies(${fq_target_name} ${full_deps_list} ${packaged_gpu_names} ${stub_target_name}) @@ -282,8 +285,7 @@ 
function(_build_gpu_objects fq_target_name internal_target_name) get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE}) target_compile_options(${internal_target_name} PRIVATE ${nvptx_options}) endif() - target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${include_dirs}) if(full_deps_list) add_dependencies(${internal_target_name} ${full_deps_list}) endif() @@ -367,8 +369,12 @@ function(create_object_library fq_target_name) ${ADD_OBJECT_SRCS} ${ADD_OBJECT_HDRS} ) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories( + ${fq_target_name} + PRIVATE + ${LIBC_SOURCE_DIR} + ${LIBC_INCLUDE_DIR} + ) target_compile_options(${fq_target_name} PRIVATE ${compile_options}) endif() @@ -627,6 +633,7 @@ function(create_entrypoint_object fq_target_name) "${ADD_ENTRYPOINT_OBJ_FLAGS}" ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS} ) + set(include_dirs ${LIBC_SOURCE_DIR} ${LIBC_INCLUDE_DIR}) get_fq_deps_list(fq_deps_list ${ADD_ENTRYPOINT_OBJ_DEPENDS}) set(full_deps_list ${fq_deps_list} libc.src.__support.common) @@ -663,8 +670,7 @@ function(create_entrypoint_object fq_target_name) ${ADD_ENTRYPOINT_OBJ_HDRS} ) target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) - target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${include_dirs}) add_dependencies(${internal_target_name} ${full_deps_list}) target_link_libraries(${internal_target_name} ${full_deps_list}) @@ -678,8 +684,7 @@ function(create_entrypoint_object fq_target_name) ${ADD_ENTRYPOINT_OBJ_HDRS} ) 
target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${include_dirs}) add_dependencies(${fq_target_name} ${full_deps_list}) target_link_libraries(${fq_target_name} ${full_deps_list}) endif() diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 3d02a37e96db5..e1286b4ab9631 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -148,8 +148,12 @@ function(create_libc_unittest fq_target_name) ${LIBC_UNITTEST_SRCS} ${LIBC_UNITTEST_HDRS} ) - target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories( + ${fq_build_target_name} + PRIVATE + ${LIBC_SOURCE_DIR} + ${LIBC_INCLUDE_DIR} + ) target_compile_options( ${fq_build_target_name} PRIVATE -fpie ${LIBC_COMPILE_OPTIONS_DEFAULT} @@ -382,8 +386,12 @@ function(add_libc_fuzzer target_name) ${LIBC_FUZZER_SRCS} ${LIBC_FUZZER_HDRS} ) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories( + ${fq_target_name} + PRIVATE + ${LIBC_SOURCE_DIR} + ${LIBC_INCLUDE_DIR} + ) target_link_libraries(${fq_target_name} PRIVATE ${link_object_files} @@ -508,8 +516,12 @@ function(add_integration_test test_name) ) set_target_properties(${fq_build_target_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + 
target_include_directories( + ${fq_build_target_name} + PRIVATE + ${LIBC_SOURCE_DIR} + ${LIBC_INCLUDE_DIR} + ) target_compile_options(${fq_build_target_name} PRIVATE -fpie -ffreestanding -fno-exceptions -fno-rtti ${INTEGRATION_TEST_COMPILE_OPTIONS}) # The GPU build requires overriding the default CMake triple and architecture. @@ -671,8 +683,12 @@ function(add_libc_hermetic_test test_name) RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} #OUTPUT_NAME ${fq_target_name} ) - target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories( + ${fq_build_target_name} + PRIVATE + ${LIBC_SOURCE_DIR} + ${LIBC_INCLUDE_DIR} + ) target_compile_options(${fq_build_target_name} PRIVATE ${LIBC_HERMETIC_TEST_COMPILE_OPTIONS} ${HERMETIC_TEST_COMPILE_OPTIONS}) diff --git a/libc/utils/HdrGen/CMakeLists.txt b/libc/utils/HdrGen/CMakeLists.txt index 0ec1cba542d40..0b1612e8a886a 100644 --- a/libc/utils/HdrGen/CMakeLists.txt +++ b/libc/utils/HdrGen/CMakeLists.txt @@ -14,7 +14,7 @@ add_tablegen(libc-hdrgen LIBC PublicAPICommand.h ) -target_include_directories(libc-hdrgen PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) +target_include_directories(libc-hdrgen PRIVATE ${LIBC_SOURCE_DIR}) target_link_libraries(libc-hdrgen PRIVATE LibcTableGenUtil) add_subdirectory(PrototypeTestGen) diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt index dca6a7bb83065..222f177ee2f77 100644 --- a/libc/utils/LibcTableGenUtil/CMakeLists.txt +++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt @@ -6,4 +6,3 @@ add_llvm_library( LINK_COMPONENTS Support TableGen ) target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) -target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) From 87b8c85bba5298fea657b71eb7c75aeb1afa446d Mon Sep 17 00:00:00 2001 From: Rashmi Mudduluru Date: 
Tue, 19 Sep 2023 14:20:20 -0700 Subject: [PATCH 02/57] [-Wunsafe-bugger-usage] Clean tests: remove nondeterministic ordering Differential Review: https://reviews.llvm.org/D158553 --- ...er-usage-fixits-deref-simple-ptr-arith.cpp | 180 +++++++++--------- ...afe-buffer-usage-fixits-local-var-span.cpp | 40 +--- ...safe-buffer-usage-fixits-pointer-deref.cpp | 40 ++-- ...safe-buffer-usage-fixits-pre-increment.cpp | 14 +- ...uffer-usage-fixits-unevaluated-context.cpp | 28 +-- ...fe-buffer-usage-multi-decl-fixits-test.cpp | 108 +++++------ ...uffer-usage-multi-decl-ptr-init-fixits.cpp | 84 ++++---- ...nsafe-buffer-usage-multi-decl-ptr-init.cpp | 12 +- ...afe-buffer-usage-multi-decl-uuc-fixits.cpp | 48 ++--- ...arn-unsafe-buffer-usage-multi-decl-uuc.cpp | 10 +- ...nsafe-buffer-usage-multi-decl-warnings.cpp | 66 +++---- .../warn-unsafe-buffer-usage-pragma-fixit.cpp | 24 +-- ...warn-unsafe-buffer-usage-source-ranges.cpp | 32 ++-- 13 files changed, 326 insertions(+), 360 deletions(-) diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-deref-simple-ptr-arith.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-deref-simple-ptr-arith.cpp index 90cfa6842fae8..a4a09a0afed59 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-deref-simple-ptr-arith.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-deref-simple-ptr-arith.cpp @@ -10,169 +10,169 @@ void basic() { int *ptr; -// CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span ptr" +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span ptr" *(ptr+5)=1; -// CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:5}:"" -// CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:9}:"[" -// CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:11}:"]" +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:5}:"" +// CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:9}:"[" +// CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:11}:"]" } // The weird preceding semicolon 
ensures that we preserve that range intact. void char_ranges() { int *p; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:9}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:9}:"std::span p" ;* ( p + 5 ) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:8}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:9-[[@LINE-2]]:12}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:15}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:8}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:9-[[@LINE-2]]:12}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:15}:"]" ;* (p+5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" ;*( p+5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" ;*( p+5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" ;*( p +5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:7}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:12}:"[" - // CHECK-DAG: 
fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:14}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:7}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:12}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:14}:"]" ;*(p+ 5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" ;*(p+ 5 )= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:9}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:14}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:9}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:14}:"]" ;*(p+ 5) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:9}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:11}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:9}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:11}:"]" ; *(p+5)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:13}:"]" ;*(p+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" - // 
CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:15}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:15}:"]" ;* (p+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" ;*( p+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" ;*( p+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" ;*(p +123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" ;*(p+ 123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: 
fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:11}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:18}:"]" ;*(p+123456 )= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:18}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:18}:"]" ;*(p+123456) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:15}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:8}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:14-[[@LINE-3]]:15}:"]" int *ptrrrrrr; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:16}:"std::span ptrrrrrr" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:16}:"std::span ptrrrrrr" ;* ( ptrrrrrr + 123456 )= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:8}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:19}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:25-[[@LINE-3]]:27}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:8}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:19}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:25-[[@LINE-3]]:27}:"]" ;* (ptrrrrrr+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" 
+ // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" ;*( ptrrrrrr+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" ;*( ptrrrrrr+123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:9}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:18}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" ;*(ptrrrrrr +123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:18}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:18}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" ;*(ptrrrrrr+ 123456)= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:18}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:18}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:25}:"]" ;*(ptrrrrrr+123456 )= 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:15}:"[" - // CHECK-DAG: 
fix-it:"{{.*}}":{[[@LINE-3]]:21-[[@LINE-3]]:25}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:15}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:21-[[@LINE-3]]:25}:"]" ;*(ptrrrrrr+123456) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:15}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:21-[[@LINE-3]]:22}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:4-[[@LINE-1]]:6}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:15}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:21-[[@LINE-3]]:22}:"]" } void base_on_rhs() { int* ptr; *(10 + ptr) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:5}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:10}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:14}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:5}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:10}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:14}:"]" } void many_parens() { int* ptr; *(( (10 + ptr)) ) = 1; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:8}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:13}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:16-[[@LINE-3]]:20}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:8}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:10-[[@LINE-2]]:13}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:16-[[@LINE-3]]:20}:"]" } void lvaue_to_rvalue() { int * ptr; int tmp = *(ptr + 10); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:15}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:18-[[@LINE-2]]:21}:"[" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:24}:"]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:15}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:18-[[@LINE-2]]:21}:"[" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:24}:"]" } // 
Fixits emitted for the cases below would be incorrect. diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-span.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-span.cpp index b9c8bec77787d..114ceaad56e45 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-span.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-span.cpp @@ -58,46 +58,12 @@ void local_variable_qualifiers_specifiers() { // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:24}:"std::span const q" // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:25-[[@LINE-2]]:25}:"{" // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:26-[[@LINE-3]]:26}:", 10}" - [[deprecated]] const int * x = a; - // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:18-[[@LINE-1]]:33}:"std::span x" - // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:34-[[@LINE-2]]:34}:"{" - // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:35-[[@LINE-3]]:35}:", 10}" - const int * y [[deprecated]]; - // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:16}:"std::span y" - int tmp; - tmp = p[5]; tmp = q[5]; - tmp = x[5]; - tmp = y[5]; } -void local_variable_unsupported_specifiers() { - int a[10]; - const int * p [[deprecated]] = a; // not supported because the attribute overlaps the source range of the declaration - // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: - - static const int * q = a; // storage specifier not supported yet - // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: - - extern int * x; // storage specifier not supported yet - // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: - - constexpr int * y = 0; // `constexpr` specifier not supported yet - // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: - - int tmp; - - tmp = p[5]; - tmp = q[5]; - tmp = x[5]; - tmp = y[5]; -} - - - void local_array_subscript_variable_extent() { int n = 10; int tmp; @@ -282,15 +248,15 @@ void unsupported_subscript_negative(int i, unsigned j, unsigned long k) { // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]] tmp = p[-1]; // If `p` is made a span, this `[]` operation 
is wrong, - // so no fix-it emitted. + // so no fix-it emitted. int * q = new int[10]; // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]] tmp = q[5]; tmp = q[i]; // If `q` is made a span, this `[]` operation may be - // wrong as we do not know if `i` is non-negative, so - // no fix-it emitted. + // wrong as we do not know if `i` is non-negative, so + // no fix-it emitted. int * r = new int[10]; // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span r" diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp index 91ec3fef6760d..f3f24e0a246e1 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp @@ -5,24 +5,24 @@ void basic_dereference() { int tmp; int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" tmp = p[5]; int val = *p; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:14}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:15-[[@LINE-2]]:15}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:14}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:15-[[@LINE-2]]:15}:"[0]" } int return_method() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + 
// CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; return *p; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:11}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:11}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"[0]" } void foo(int v) { @@ -30,28 +30,28 @@ void foo(int v) { void method_invocation() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; foo(*p); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:8}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:9-[[@LINE-2]]:9}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:8}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:9-[[@LINE-2]]:9}:"[0]" } void binary_operation() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; int k = *p + 20; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:12}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:12}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"[0]" } diff --git 
a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pre-increment.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pre-increment.cpp index 57548d1e496a8..737c38f0918a0 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pre-increment.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pre-increment.cpp @@ -7,9 +7,9 @@ void foo(int * , int *); void simple() { int * p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" bool b = ++p; // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:12-[[@LINE-1]]:15}:"(p = p.subspan(1)).data()" unsigned long long n = (unsigned long long) ++p; @@ -18,12 +18,12 @@ void simple() { // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:10}:"(p = p.subspan(1)).data()" } if (++p - ++p) { - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:10}:"(p = p.subspan(1)).data()" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:16}:"(p = p.subspan(1)).data()" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:10}:"(p = p.subspan(1)).data()" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:16}:"(p = p.subspan(1)).data()" } foo(++p, p); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:10}:"(p = p.subspan(1)).data()" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:".data()" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:10}:"(p = p.subspan(1)).data()" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:".data()" // FIXME: Don't know how to fix the following cases: // CHECK-NOT: fix-it:"{{.*}}":{ diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-unevaluated-context.cpp 
b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-unevaluated-context.cpp index 9f9520ee33a6f..a0156f64a8e1f 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-unevaluated-context.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-unevaluated-context.cpp @@ -16,24 +16,24 @@ int bar(int *ptr); void uneval_context_fix_pointer_dereference() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; typeid(foo(*p)); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:14-[[@LINE-1]]:15}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:14-[[@LINE-1]]:15}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:"[0]" _Generic(*p, int: 2, float: 3); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:12-[[@LINE-1]]:13}:"" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:14}:"[0]" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:12-[[@LINE-1]]:13}:"" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:14}:"[0]" } void uneval_context_fix_pointer_array_access() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; typeid(foo(p[5])); @@ -42,13 +42,13 @@ void 
uneval_context_fix_pointer_array_access() { void uneval_context_fix_pointer_reference() { int* p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int tmp = p[5]; typeid(bar(p)); - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:15-[[@LINE-1]]:15}:".data()" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:15-[[@LINE-1]]:15}:".data()" } // The FixableGagdtes are not working in the following scenarios: diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-fixits-test.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-fixits-test.cpp index 73bf2cb7a689a..2ab311295d343 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-fixits-test.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-fixits-test.cpp @@ -2,13 +2,13 @@ void foo1a() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = r; int tmp = p[9]; int *q; @@ -18,28 +18,28 @@ void foo1a() { void foo1b() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = r; int tmp = p[9]; int *q = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" q = r; tmp = q[9]; } void foo1c() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; // CHECK-NOT: 
fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" p = r; @@ -52,17 +52,17 @@ void foo1c() { void foo2a() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[5]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" int *q = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = q; int tmp = p[8]; q = r; @@ -70,17 +70,17 @@ void foo2a() { void foo2b() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[5]; // CHECK-NOT: 
fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" int *q = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = q; int tmp = q[8]; q = r; @@ -88,17 +88,17 @@ void foo2b() { void foo2c() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[5]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" int *q = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = q; int tmp = p[8]; q = r; @@ -109,9 +109,9 @@ void foo3a() { int *r = new int[7]; // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" int *p = new int[5]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 5}" int *q = new int[4]; // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" q = p; @@ -121,17 +121,17 @@ void foo3a() { void foo3b() { int *r = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int *p = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" int *q = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" q = p; int tmp = q[8]; q = r; diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init-fixits.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init-fixits.cpp index e29ac7fa8fcb3..4a40d3159627c 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init-fixits.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init-fixits.cpp @@ -4,57 +4,57 @@ void lhs_span_multi_assign() { int *a = new int[2]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span a" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 2}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span a" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 2}" int *b = a; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span b" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span b" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" int *c = b; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span c" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span c" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# 
placeholder #>}" int *d = c; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span d" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span d" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" int tmp = d[2]; // expected-note{{used in buffer access here}} } void rhs_span1() { int *q = new int[12]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 12}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 12}" int *p = q; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" p[5] = 10; int *r = q; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" - r[10] = 5; // expected-note{{used in buffer access here}} + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# 
placeholder #>}" + r[10] = 5; } void rhs_span2() { int *q = new int[6]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 6}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 6}" int *p = q; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" - p[5] = 10; // expected-note{{used in buffer access here}} + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + p[5] = 10; } void rhs_span3() { int *q = new int[6]; - int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} - p[5] = 10; // expected-note{{used in buffer access here}} + int *p = q; + p[5] = 10; int *r = q; } @@ -62,9 +62,9 @@ void test_grouping() { int *z = new int[8]; int tmp; int *y = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span y" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span y" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" tmp = y[5]; int *x = new int[10]; @@ -75,15 +75,15 @@ void test_grouping() { void test_crash() { int *r = new int[8]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: 
fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 8}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 8}" int *q = r; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span q" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:13-[[@LINE-3]]:13}:", <# placeholder #>}" int *p; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:9}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:9}:"std::span p" p = q; - int tmp = p[9]; // expected-note{{used in buffer access here}} + int tmp = p[9]; } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init.cpp index 2fec417aafa8c..10ef66edce2be 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-ptr-init.cpp @@ -4,21 +4,21 @@ void lhs_span_multi_assign() { int *a = new int[2]; int *b = a; int *c = b; - int *d = c; // expected-warning{{'d' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'd' to 'std::span' to preserve bounds information, and change ('a', 'b', and 'c'|'a', 'c', and 'b'|'b', 'a', and 'c'|'b', 'c', and 'a'|'c', 'a', and 'b'|'c', 'b', and 'a') to 'std::span' to propagate bounds information between them$}}}} + int *d = c; // expected-warning{{'d' is an unsafe pointer used for buffer access}} expected-note{{change type of 'd' to 'std::span' to preserve bounds information, 
and change 'c', 'b', and 'a' to 'std::span' to propagate bounds information between them}} int tmp = d[2]; // expected-note{{used in buffer access here}} } void rhs_span1() { int *q = new int[12]; - int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('q' and 'r'|'r' and 'q') to 'std::span' to propagate bounds information between them$}}}} + int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} p[5] = 10; // expected-note{{used in buffer access here}} - int *r = q; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change ('p' and 'q'|'q' and 'p') to 'std::span' to propagate bounds information between them$}}}} + int *r = q; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'p' and 'q' to 'std::span' to propagate bounds information between them}} r[10] = 5; // expected-note{{used in buffer access here}} } void rhs_span2() { int *q = new int[6]; - int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them$}}}} + int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them}} p[5] = 10; // expected-note{{used in buffer access here}} } @@ -33,7 +33,7 @@ void rhs_span3() { 
void test_grouping() { int *z = new int[8]; int tmp; - int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'y' to 'std::span' to preserve bounds information$}}}} + int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note{{change type of 'y' to 'std::span' to preserve bounds information}} tmp = y[5]; // expected-note{{used in buffer access here}} int *x = new int[10]; @@ -45,7 +45,7 @@ void test_grouping() { void test_crash() { int *r = new int[8]; int *q = r; - int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} p = q; int tmp = p[9]; // expected-note{{used in buffer access here}} } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc-fixits.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc-fixits.cpp index 6c867fe8a3204..fab8d659ba65c 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc-fixits.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc-fixits.cpp @@ -6,13 +6,13 @@ void bar(int * param) {} void foo1a() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" p = r; int tmp = p[9]; int *q; @@ -21,13 +21,13 @@ void foo1a() { void uuc_if_body() { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" if (true) p = r; p[5] = 4; @@ -35,13 +35,13 @@ void uuc_if_body() { void uuc_if_body1(bool flag) { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" int *p = new int[4]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 4}" if (flag) { p = r; } @@ -50,15 +50,15 @@ void uuc_if_body1(bool flag) { void uuc_if_body2_ptr_init(bool flag) { int *r = new int[7]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span r" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:", 7}" if (flag) { } else { int* p = r; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:5-[[@LINE-1]]:13}:"std::span p" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:14}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:15-[[@LINE-3]]:15}:", <# placeholder #>}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:5-[[@LINE-1]]:13}:"std::span p" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:14-[[@LINE-2]]:14}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:15-[[@LINE-3]]:15}:", <# placeholder #>}" p[5] = 4; } } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc.cpp index d5a3f121c19ea..9f6deac6683ba 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-uuc.cpp @@ -3,7 +3,7 @@ void bar(int * param) {} void foo1a() { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an 
unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} p = r; int tmp = p[9]; // expected-note{{used in buffer access here}} int *q; @@ -12,7 +12,7 @@ void foo1a() { void uuc_if_body() { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} if (true) p = r; p[5] = 4; // expected-note{{used in buffer access here}} @@ -20,7 +20,7 @@ void uuc_if_body() { void uuc_if_body1(bool flag) { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} if (flag) { p = r; } @@ -29,7 +29,7 @@ void uuc_if_body1(bool flag) { void uuc_if_body2(bool flag) { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an unsafe 
pointer used for buffer access}} // expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} if (flag) { } else { p = r; @@ -42,7 +42,7 @@ void uuc_if_body2_ptr_init(bool flag) { int *r = new int[7]; if (flag) { } else { - int* p = r; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int* p = r; // expected-warning{{'p' is an unsafe pointer used for buffer access}} // expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} p[5] = 4; // expected-note{{used in buffer access here}} } } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-warnings.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-warnings.cpp index a60e7310c99a6..0f2491888d297 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-warnings.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-multi-decl-warnings.cpp @@ -6,10 +6,10 @@ namespace std { void local_assign_both_span() { int tmp; - int* p = new int[10]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them$}}}} + int* p = new int[10]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds 
information, and change 'q' to 'std::span' to propagate bounds information between them}} tmp = p[4]; // expected-note{{used in buffer access here}} - int* q = new int[10]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change 'p' to 'std::span' to propagate bounds information between them$}}}} + int* q = new int[10]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'p' to 'std::span' to propagate bounds information between them}} tmp = q[4]; // expected-note{{used in buffer access here}} q = p; @@ -18,7 +18,7 @@ void local_assign_both_span() { void local_assign_rhs_span() { int tmp; int* p = new int[10]; - int* q = new int[10]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information$}}}} + int* q = new int[10]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information}} tmp = q[4]; // expected-note{{used in buffer access here}} p = q; } @@ -32,7 +32,7 @@ void local_assign_no_span() { void local_assign_lhs_span() { int tmp; - int* p = new int[10]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them$}}}} + int* p = new int[10]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them}} tmp = p[4]; // expected-note{{used in buffer access here}} int* q = new int[10]; @@ -43,13 +43,13 @@ void 
lhs_span_multi_assign() { int *a = new int[2]; int *b = a; int *c = b; - int *d = c; // expected-warning{{'d' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'd' to 'std::span' to preserve bounds information, and change ('a', 'b', and 'c'|'a', 'c', and 'b'|'b', 'a', and 'c'|'b', 'c', and 'a'|'c', 'a', and 'b'|'c', 'b', and 'a') to 'std::span' to propagate bounds information between them$}}}} + int *d = c; // expected-warning{{'d' is an unsafe pointer used for buffer access}} expected-note{{change type of 'd' to 'std::span' to preserve bounds information, and change 'c', 'b', and 'a' to 'std::span' to propagate bounds information between them}} int tmp = d[2]; // expected-note{{used in buffer access here}} } void rhs_span() { int *x = new int[3]; - int *y; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'y' to 'std::span' to preserve bounds information$}}}} + int *y; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note{{change type of 'y' to 'std::span' to preserve bounds information}} y[5] = 10; // expected-note{{used in buffer access here}} x = y; @@ -57,9 +57,9 @@ void rhs_span() { void rhs_span1() { int *q = new int[12]; - int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('q' and 'r'|'r' and 'q') to 'std::span' to propagate bounds information between them$}}}} + int *p = q; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} p[5] = 10; // expected-note{{used in buffer access here}} - int *r = q; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'r' to 'std::span' to 
preserve bounds information, and change ('p' and 'q'|'q' and 'p') to 'std::span' to propagate bounds information between them$}}}} + int *r = q; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'p' and 'q' to 'std::span' to propagate bounds information between them}} r[10] = 5; // expected-note{{used in buffer access here}} } @@ -73,7 +73,7 @@ void rhs_span2() { void test_grouping() { int *z = new int[8]; int tmp; - int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'y' to 'std::span' to preserve bounds information$}}}} + int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note{{change type of 'y' to 'std::span' to preserve bounds information}} tmp = y[5]; // expected-note{{used in buffer access here}} int *x = new int[10]; @@ -84,12 +84,12 @@ void test_grouping() { void test_grouping1() { int tmp; - int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'y' to 'std::span' to preserve bounds information$}}}} + int *y = new int[10]; // expected-warning{{'y' is an unsafe pointer used for buffer access}} expected-note{{change type of 'y' to 'std::span' to preserve bounds information}} tmp = y[5]; // expected-note{{used in buffer access here}} int *x = new int[10]; x = y; - int *w = new int[10]; // expected-warning{{'w' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'w' to 'std::span' to preserve bounds information$}}}} + int *w = new int[10]; // expected-warning{{'w' is an unsafe pointer used for buffer access}} expected-note{{change type of 'w' to 'std::span' to preserve bounds information}} tmp = w[5]; // expected-note{{used in buffer access here}} int *z = new int[10]; z = w; @@ -97,7 +97,7 @@ void test_grouping1() { 
void foo1a() { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} p = r; int tmp = p[9]; // expected-note{{used in buffer access here}} int *q; @@ -106,27 +106,27 @@ void foo1a() { void foo1b() { int *r = new int[7]; - int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[4]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' and 'q' to 'std::span' to propagate bounds information between them}} p = r; int tmp = p[9]; // expected-note{{used in buffer access here}} - int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('p' and 'r'|'r' and 'p') to 'std::span' to propagate bounds information between them$}}}} + int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'p' and 'r' to 'std::span' to propagate bounds information between them}} q = r; tmp = q[9]; // expected-note{{used in buffer access here}} } void foo1c() { - int *r = new int[7]; // expected-warning{{'r' is an unsafe pointer used for buffer 
access}} expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them$}}}} + int *r = new int[7]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'q' to 'std::span' to propagate bounds information between them}} int *p = new int[4]; p = r; int tmp = r[9]; // expected-note{{used in buffer access here}} - int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} q = r; tmp = q[9]; // expected-note{{used in buffer access here}} } void foo2a() { int *r = new int[7]; - int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} int *q = new int[4]; p = q; int tmp = p[8]; // expected-note{{used in buffer access here}} @@ -136,7 +136,7 @@ void foo2a() { void foo2b() { int *r = new int[7]; int *p = new int[5]; - int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve 
bounds information, and change 'r' to 'std::span' to propagate bounds information between them$}}}} + int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' to 'std::span' to propagate bounds information between them}} p = q; int tmp = q[8]; // expected-note{{used in buffer access here}} q = r; @@ -144,8 +144,8 @@ void foo2b() { void foo2c() { int *r = new int[7]; - int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} - int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('p' and 'r'|'r' and 'p') to 'std::span' to propagate bounds information between them$}}}} + int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} + int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'p' and 'r' to 'std::span' to propagate bounds information between them}} p = q; int tmp = p[8]; // expected-note{{used in buffer access here}} q = r; @@ -154,7 +154,7 @@ void foo2c() { void foo3a() { int *r = new int[7]; - int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information$}}}} + int *p = new int[5]; // expected-warning{{'p' is an unsafe pointer used for 
buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information}} int *q = new int[4]; q = p; int tmp = p[8]; // expected-note{{used in buffer access here}} @@ -164,7 +164,7 @@ void foo3a() { void foo3b() { int *r = new int[7]; int *p = new int[5]; - int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} //expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('r' and 'p'|'p' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *q = new int[4]; // expected-warning{{'q' is an unsafe pointer used for buffer access}} //expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' and 'p' to 'std::span' to propagate bounds information between them}} q = p; int tmp = q[8]; // expected-note{{used in buffer access here}} q = r; @@ -173,7 +173,7 @@ void foo3b() { void test_crash() { int *r = new int[8]; int *q = r; - int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} p = q; int tmp = p[9]; // expected-note{{used in buffer access here}} } @@ -188,7 +188,7 @@ void foo_uuc() { } void check_rhs_fix() { - int *r = new int[8]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} // expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change 'x' to 'std::span' to propagate bounds information between them$}}}} + int *r = new int[8]; // expected-warning{{'r' is an unsafe pointer used for buffer 
access}} // expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'x' to 'std::span' to propagate bounds information between them}} int *x; r[7] = 9; // expected-note{{used in buffer access here}} r = x; @@ -243,7 +243,7 @@ void check_rhs_nofix_order4() { } void no_unhandled_lhs() { - int *r = new int[8]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} // expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change 'x' to 'std::span' to propagate bounds information between them$}}}} + int *r = new int[8]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} // expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'x' to 'std::span' to propagate bounds information between them}} r[7] = 9; // expected-note{{used in buffer access here}} int *x; r = x; @@ -281,7 +281,7 @@ void test_unfixable() { } void test_cyclic_deps() { - int *r = new int[10]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change ('q' and 'p'|'p' and 'q') to 'std::span' to propagate bounds information between them$}}}} + int *r = new int[10]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'p' and 'q' to 'std::span' to propagate bounds information between them}} int *q; q = r; int *p; @@ -305,7 +305,7 @@ void test_cyclic_deps1() { int *r = new int[10]; int *q; q = r; - int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} 
expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information between them}} p = q; p[3] = 9; // expected-note{{used in buffer access here}} r = p; @@ -313,7 +313,7 @@ void test_cyclic_deps1() { void test_cyclic_deps2() { int *r = new int[10]; - int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('r' and 'p'|'p' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' and 'p' to 'std::span' to propagate bounds information between them}} q = r; int *p; p = q; @@ -323,9 +323,9 @@ void test_cyclic_deps2() { void test_cyclic_deps3() { int *r = new int[10]; - int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('r' and 'p'|'p' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' and 'p' to 'std::span' to propagate bounds information between them}} q = r; - int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('q' and 'r'|'r' and 'q') to 'std::span' to propagate bounds information between them$}}}} + int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'q' and 'r' to 'std::span' to propagate bounds information 
between them}} p = q; q[3] = 9; // expected-note{{used in buffer access here}} p[4] = 7; // expected-note{{used in buffer access here}} @@ -333,10 +333,10 @@ void test_cyclic_deps3() { } void test_cyclic_deps4() { - int *r = new int[10]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'r' to 'std::span' to preserve bounds information, and change ('q' and 'p'|'p' and 'q') to 'std::span' to propagate bounds information between them$}}}} - int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'q' to 'std::span' to preserve bounds information, and change ('r' and 'p'|'p' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *r = new int[10]; // expected-warning{{'r' is an unsafe pointer used for buffer access}} expected-note{{change type of 'r' to 'std::span' to preserve bounds information, and change 'p' and 'q' to 'std::span' to propagate bounds information between them}} + int *q; // expected-warning{{'q' is an unsafe pointer used for buffer access}} expected-note{{change type of 'q' to 'std::span' to preserve bounds information, and change 'r' and 'p' to 'std::span' to propagate bounds information between them}} q = r; - int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note-re{{{{^change type of 'p' to 'std::span' to preserve bounds information, and change ('r' and 'q'|'q' and 'r') to 'std::span' to propagate bounds information between them$}}}} + int *p; // expected-warning{{'p' is an unsafe pointer used for buffer access}} expected-note{{change type of 'p' to 'std::span' to preserve bounds information, and change 'r' and 'q' to 'std::span' to propagate bounds information between them}} p = q; q[3] = 9; // expected-note{{used in buffer access here}} p[4] = 7; // expected-note{{used in buffer access here}} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-fixit.cpp 
b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-fixit.cpp index 47ef0b7972951..6bd1065e1e2be 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-fixit.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-fixit.cpp @@ -7,9 +7,9 @@ void basic(int * x) { int *p1 = new int[10]; // no fix // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: int *p2 = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" #pragma clang unsafe_buffer_usage begin tmp = p1[5]; #pragma clang unsafe_buffer_usage end @@ -21,9 +21,9 @@ void withDiagnosticWarning() { int *p1 = new int[10]; // no fix // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: int *p2 = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" // diagnostics in opt-out region #pragma clang unsafe_buffer_usage begin @@ -57,13 +57,13 @@ void withDiagnosticIgnore() { int *p1 = new int[10]; // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]: int *p2 = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p2" + // CHECK: 
fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" int *p3 = new int[10]; - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p3" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" - // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" + // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:12}:"std::span p3" + // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" + // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:24-[[@LINE-3]]:24}:", 10}" #pragma clang unsafe_buffer_usage begin tmp = p1[5]; // not to warn diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-source-ranges.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-source-ranges.cpp index fec3ea2913526..fe3a952696557 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-source-ranges.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-source-ranges.cpp @@ -6,45 +6,45 @@ void foo(int i) { int * ptr; ptr++; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr--; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ++ptr; - // CHECK-DAG: {[[@LINE-1]]:5-[[@LINE-1]]:8} + // CHECK: {[[@LINE-1]]:5-[[@LINE-1]]:8} --ptr; - // CHECK-DAG: {[[@LINE-1]]:5-[[@LINE-1]]:8} + // CHECK: {[[@LINE-1]]:5-[[@LINE-1]]:8} ptr + 1; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} 2 + ptr; - // CHECK-DAG: {[[@LINE-1]]:7-[[@LINE-1]]:10} + // CHECK: {[[@LINE-1]]:7-[[@LINE-1]]:10} ptr + i; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} i + ptr; - // CHECK-DAG: {[[@LINE-1]]:7-[[@LINE-1]]:10} + // CHECK: {[[@LINE-1]]:7-[[@LINE-1]]:10} ptr - 3; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr - i; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr += 4; - // CHECK-DAG: 
{[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr += i; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr -= 5; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr -= i; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} ptr[5]; - // CHECK-DAG: {[[@LINE-1]]:3-[[@LINE-1]]:6} + // CHECK: {[[@LINE-1]]:3-[[@LINE-1]]:6} 5[ptr]; - // CHECK-DAG: {[[@LINE-1]]:5-[[@LINE-1]]:8} + // CHECK: {[[@LINE-1]]:5-[[@LINE-1]]:8} } From 619a888dd8f7c739e62ac899edb4a409e1b7dd25 Mon Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Tue, 19 Sep 2023 14:33:12 -0700 Subject: [PATCH 03/57] [mlir][sparse][gpu] free all buffers allocated for spGEMM (#66813) Yup, a bit of an oversight ;-) --- .../Transforms/SparseGPUCodegen.cpp | 15 +++++++++-- .../SparseTensor/GPU/gpu_spgemm_lib.mlir | 26 ++++++++++++++----- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index efdd3347558b4..91b346c8a9b4c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -795,10 +795,10 @@ rewriteSpGEMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT, Value rowC = e1.getResult(0); token = e1.getAsyncToken(); auto e2 = genAllocBuffer(rewriter, loc, cTp.getCrdType(), zero, token); - Value colC = e2.getResult(0); + Value colC = e2.getResult(0); // no free needed token = e2.getAsyncToken(); auto e3 = genAllocBuffer(rewriter, loc, dnCType, zero, token); - Value valC = e3.getResult(0); + Value valC = e3.getResult(0); // no free needed token = e3.getAsyncToken(); Operation *spGenC = genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szm, szn, zero, @@ -881,6 +881,17 @@ rewriteSpGEMM(PatternRewriter 
&rewriter, linalg::GenericOp op, bool enableRT, token = genCopyMemRef(rewriter, loc, rowH, rowC, token); token = genCopyMemRef(rewriter, loc, colH, colC, token); token = genCopyMemRef(rewriter, loc, valH, valC, token); + token = genDeallocMemRef(rewriter, loc, rowA, token); + token = genDeallocMemRef(rewriter, loc, colA, token); + token = genDeallocMemRef(rewriter, loc, valA, token); + token = genDeallocMemRef(rewriter, loc, rowB, token); + token = genDeallocMemRef(rewriter, loc, colB, token); + token = genDeallocMemRef(rewriter, loc, valB, token); + token = genDeallocMemRef(rewriter, loc, rowC, token); + token = genDeallocMemRef(rewriter, loc, colC, token); + token = genDeallocMemRef(rewriter, loc, valC, token); + token = genDeallocMemRef(rewriter, loc, buffer1, token); + token = genDeallocMemRef(rewriter, loc, buffer2, token); tokens.push_back(token); genBlockingWait(rewriter, loc, tokens); tokens.clear(); diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir index 7b4c48dc34105..1bb51f4fcf518 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir @@ -5,7 +5,7 @@ // CHECK-LABEL: func.func @matmulCSR( // CHECK-SAME: %[[VAL_0:.*0]]: tensor<8x8xf32, #{{.*}}>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}> +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<8x8xf32, #{{.*}}>) -> tensor<8x8xf32, #{{.*}}> { // CHECK: %[[VAL_2:.*]] = arith.constant 8 : index // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = arith.constant 9 : index @@ -72,12 +72,24 @@ // CHECK: %[[VAL_88:.*]] = gpu.memcpy async {{\[}}%[[VAL_87]]] %[[VAL_81]], %[[VAL_49]] : memref, memref // CHECK: %[[VAL_89:.*]] = gpu.memcpy async {{\[}}%[[VAL_88]]] %[[VAL_82]], %[[VAL_75]] : memref, memref // CHECK: %[[VAL_90:.*]] = gpu.memcpy async {{\[}}%[[VAL_89]]] %[[VAL_83]], %[[VAL_77]] : memref, memref -// CHECK: gpu.wait {{\[}}%[[VAL_90]]] -// 
CHECK: %[[VAL_91:.*]] = bufferization.to_tensor %[[VAL_83]] : memref -// CHECK: %[[VAL_92:.*]] = bufferization.to_tensor %[[VAL_81]] : memref -// CHECK: %[[VAL_93:.*]] = bufferization.to_tensor %[[VAL_82]] : memref -// CHECK: %[[VAL_94:.*]] = sparse_tensor.pack %[[VAL_91]], %[[VAL_92]], %[[VAL_93]] : tensor, tensor, tensor to tensor<8x8xf32, #{{.*}}> -// CHECK: return %[[VAL_94]] : tensor<8x8xf32, #{{.*}}> +// CHECK: %[[VAL_91:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_92:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_93:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_94:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_95:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_96:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_97:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_98:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_99:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_a0:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: %[[VAL_a1:.*]] = gpu.dealloc async {{.*}} : memref +// CHECK: gpu.wait [%[[VAL_a1]]] +// CHECK: %[[VAL_a2:.*]] = bufferization.to_tensor %[[VAL_83]] : memref +// CHECK: %[[VAL_a3:.*]] = bufferization.to_tensor %[[VAL_81]] : memref +// CHECK: %[[VAL_a4:.*]] = bufferization.to_tensor %[[VAL_82]] : memref +// CHECK: %[[VAL_a5:.*]] = sparse_tensor.pack %[[VAL_a2]], %[[VAL_a3]], %[[VAL_a4]] : tensor, tensor, tensor to tensor<8x8xf32, #{{.*}}> +// CHECK: return %[[VAL_a5]] : tensor<8x8xf32, #{{.*}}> +// CHECK: } func.func @matmulCSR(%A: tensor<8x8xf32, #CSR>, %B: tensor<8x8xf32, #CSR>) -> tensor<8x8xf32, #CSR> { %init = bufferization.alloc_tensor() : tensor<8x8xf32, #CSR> From 64cffc7996b186a0901ae90e8a5432e0dc7b9c89 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Tue, 19 Sep 2023 14:33:55 -0700 Subject: [PATCH 04/57] [llvm][docs] Update active CoC Commitee members (#66814) --- llvm/docs/CodeOfConduct.rst | 6 ++---- 1 file changed, 2 insertions(+), 
4 deletions(-) diff --git a/llvm/docs/CodeOfConduct.rst b/llvm/docs/CodeOfConduct.rst index 7b3fbd2c2a45b..c88c86369382e 100644 --- a/llvm/docs/CodeOfConduct.rst +++ b/llvm/docs/CodeOfConduct.rst @@ -148,13 +148,11 @@ The current committee members are: * Kit Barton (kbarton\@llvm.org) * Kristof Beyls (kristof.beyls\@llvm.org) +* Stella Stamenova (sstamenova\@llvm.org) +* David Blaikie (dblaikie\@llvm.org) * Mike Edwards (medwards\@llvm.org) -* Hal Finkel (hfinkel\@llvm.org) * Cyndy Ishida (cishida\@llvm.org) -* Anton Korobeynikov (anton.korobeynikov\@llvm.org) * Tanya Lattner (tanyalattner\@llvm.org) -* Chris Lattner (clattner\@llvm.org) -* Tom Stellard (tstellar\@llvm.org) Transparency Reports From b88cffeafd393d54f85feb641c4f8fd4fdb73fe4 Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Tue, 19 Sep 2023 14:33:53 -0700 Subject: [PATCH 05/57] Explicitly set triple on line-numbers.test --- llvm/test/tools/llvm-nm/X86/line-numbers.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/tools/llvm-nm/X86/line-numbers.test b/llvm/test/tools/llvm-nm/X86/line-numbers.test index 4b9817ab5c62d..4acda8afb2a4e 100644 --- a/llvm/test/tools/llvm-nm/X86/line-numbers.test +++ b/llvm/test/tools/llvm-nm/X86/line-numbers.test @@ -18,7 +18,7 @@ ## of main.o. # RUN: rm -rf %t # RUN: split-file %s %t -# RUN: llvm-mc -g --filetype=obj %t/main.s -o %t/main.o +# RUN: llvm-mc -g --filetype=obj -triple=x86_64-pc-linux %t/main.s -o %t/main.o # RUN: llvm-nm -l %t/main.o | FileCheck %s --match-full-lines --implicit-check-not={{.}} # RUN: llvm-nm --line-numbers %t/main.o | FileCheck %s --match-full-lines --implicit-check-not={{.}} @@ -34,7 +34,7 @@ ## Check that in the absence of DWARF in the whole object, no line number ## information is printed. 
-# RUN: llvm-mc --filetype=obj %t/main.s -o %t/no-dwarf.o +# RUN: llvm-mc --filetype=obj %t/main.s -triple=x86_64-pc-linux -o %t/no-dwarf.o # RUN: llvm-nm -l %t/no-dwarf.o | FileCheck %s --check-prefix=NO-DWARF --match-full-lines --implicit-check-not={{.}} # NO-DWARF: 0000000000001234 a absolute_symbol @@ -49,7 +49,7 @@ ## Check that printing line numbers for undefined values is not attempted in ## the absence of any relocation section. -# RUN: llvm-mc --filetype=obj %t/undef-no-reloc-sections.s -o %t/undef-no-reloc-sections.o +# RUN: llvm-mc --filetype=obj %t/undef-no-reloc-sections.s -triple=x86_64-pc-linux -o %t/undef-no-reloc-sections.o # RUN: llvm-nm --line-numbers %t/undef-no-reloc-sections.o | FileCheck %s --check-prefix=UNDEF-NO-RELOC-SECTIONS --match-full-lines --implicit-check-not={{.}} # UNDEF-NO-RELOC-SECTIONS: U undef @@ -57,7 +57,7 @@ ## Check that printing line numbers for undefined values does not include ## relocations for non-text sections. This is broken out of main.s to ensure ## that the data relocation for undef comes first. -# RUN: llvm-mc -g --filetype=obj %t/undef-data-reloc.s -o %t/undef-data-reloc.o +# RUN: llvm-mc -g --filetype=obj %t/undef-data-reloc.s -triple=x86_64-pc-linux -o %t/undef-data-reloc.o # RUN: llvm-nm --line-numbers %t/undef-data-reloc.o | FileCheck %s --check-prefix=UNDEF-DATA-RELOC --match-full-lines --implicit-check-not={{.}} # UNDEF-DATA-RELOC: 0000000000000000 r data_reloc @@ -65,7 +65,7 @@ ## Check that line numbers can be printed for data definitions. These are broken ## out of main.s since their DWARF cannot be generated with llvm-mc -g. 
-# RUN: llvm-mc -g --filetype=obj %t/data-dwarf.s -o %t/data-dwarf.o +# RUN: llvm-mc -g --filetype=obj %t/data-dwarf.s -triple=x86_64-pc-linux -o %t/data-dwarf.o # RUN: llvm-nm --line-numbers %t/data-dwarf.o | FileCheck %s --check-prefix=DATA-DWARF --match-full-lines --implicit-check-not={{.}} # DATA-DWARF: 0000000000000000 D defined_data /tmp/tmp.c:1 From d8873df4dc74cdcbbfd3334657daf9fedfaab951 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 19 Sep 2023 14:37:06 -0700 Subject: [PATCH 06/57] [AsmPrint] Dump raw frequencies in `-mbb-profile-dump` (#66818) We were losing the function entry count, which is useful to check profile quality. For the original cases where we want entrypoint-relative MBB frequencies, the user would just need to divide these values by the entrypoint (first MBB, with ID=0) value. --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- llvm/test/CodeGen/MLRegAlloc/bb-profile-dump.ll | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 2ce08a2ff4395..0c4ea1b3d9f04 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1940,7 +1940,7 @@ void AsmPrinter::emitFunctionBody() { for (const auto &MBB : *MF) { *MBBProfileDumpFileOutput.get() << MF->getName() << "," << MBB.getBBID() << "," - << MBFI.getBlockFreqRelativeToEntryBlock(&MBB) << "\n"; + << MBFI.getBlockFreq(&MBB).getFrequency() << "\n"; } } } diff --git a/llvm/test/CodeGen/MLRegAlloc/bb-profile-dump.ll b/llvm/test/CodeGen/MLRegAlloc/bb-profile-dump.ll index e0ac148456cac..934c281219d4a 100644 --- a/llvm/test/CodeGen/MLRegAlloc/bb-profile-dump.ll +++ b/llvm/test/CodeGen/MLRegAlloc/bb-profile-dump.ll @@ -12,6 +12,8 @@ define i64 @f2(i64 %a, i64 %b) { ret i64 %sum } +; CHECK: f2,0,8 + define i64 @f1() { %sum = call i64 @f2(i64 2, i64 2) %isEqual = icmp eq i64 %sum, 4 @@ -22,10 +24,9 @@ ifNotEqual: ret i64 %sum } -; 
CHECK: f2,0,1.000000e+00 -; CHECK-NEXT: f1,0,1.000000e+00 -; CHECK-NEXT: f1,1,5.000000e-01 -; CHECK-NEXT: f1,2,1.000000e+00 +; CHECK-NEXT: f1,0,16 +; CHECK-NEXT: f1,1,8 +; CHECK-NEXT: f1,2,16 ; Check that if we pass -mbb-profile-dump but don't set -basic-block-sections, ; we get an appropriate error message From 0677d7cd8c1549b04ae02e0a878faabf17f8999f Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 19 Sep 2023 23:41:51 +0200 Subject: [PATCH 07/57] [Clang] Static member initializers are not immediate escalating context. (#66021) Per CWG2760, default members initializers should be consider part the body of constructors, which mean they are evaluated in an immediate escalating context. However, this does not apply to static members. This patch produces some extraneous diagnostics, unfortunately we do not have a good way to report an error back to the initializer and this is a pre existing issue Fixes #65985 Fixes #66562 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/lib/Parse/ParseDeclCXX.cpp | 12 ++++- clang/test/SemaCXX/cxx2a-consteval.cpp | 49 +++++++++++++++++++ .../SemaCXX/cxx2b-consteval-propagate.cpp | 15 ++++++ 4 files changed, 77 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ba91f9481fe98..44a5d5740dabe 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -287,6 +287,9 @@ Bug Fixes to C++ Support a non-template inner-class between the function and the class template. (`#65810 `_) +- Fix a crash when calling a non-constant immediate function + in the initializer of a static data member. + (`#65985 _`). - Clang now properly converts static lambda call operator to function pointers on win32. 
(`#62594 `_) diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 5fe9abb1fdcab..5a6b5efbf6c12 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -3232,13 +3232,21 @@ ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction, assert(Tok.isOneOf(tok::equal, tok::l_brace) && "Data member initializer not starting with '=' or '{'"); + bool IsFieldInitialization = isa_and_present(D); + EnterExpressionEvaluationContext Context( Actions, - isa_and_present(D) + IsFieldInitialization ? Sema::ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed : Sema::ExpressionEvaluationContext::PotentiallyEvaluated, D); - Actions.ExprEvalContexts.back().InImmediateEscalatingFunctionContext = true; + + // CWG2760 + // Default member initializers used to initialize a base or member subobject + // [...] are considered to be part of the function body + Actions.ExprEvalContexts.back().InImmediateEscalatingFunctionContext = + IsFieldInitialization; + if (TryConsumeToken(tok::equal, EqualLoc)) { if (Tok.is(tok::kw_delete)) { // In principle, an initializer of '= delete p;' is legal, but it will diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index d98ec8048c324..a091fadfa3094 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -1126,4 +1126,53 @@ int test2() { return h{nullptr}; } // expected-note@-2 {{subobject 'g' is not initialized}} +} + +namespace GH65985 { + +int consteval operator""_foo(unsigned long long V) { + return 0; +} +int consteval operator""_bar(unsigned long long V); // expected-note 3{{here}} + +int consteval f() { + return 0; +} + +int consteval g(); // expected-note {{here}} + + +struct C { + static const int a = 1_foo; + static constexpr int b = 1_foo; + static const int c = 1_bar; // expected-error {{call to consteval function 'GH65985::operator""_bar' is not a constant expression}} \ + // 
expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} \ + // expected-error {{in-class initializer for static data member is not a constant expression}} + + // FIXME: remove duplicate diagnostics + static constexpr int d = 1_bar; // expected-error {{call to consteval function 'GH65985::operator""_bar' is not a constant expression}} \ + // expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} \ + // expected-error {{constexpr variable 'd' must be initialized by a constant expression}} \ + // expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} + + static const int e = f(); + static const int f = g(); // expected-error {{call to consteval function 'GH65985::g' is not a constant expression}} \ + // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // expected-note {{undefined function 'g' cannot be used in a constant expression}} +}; + +} + +namespace GH66562 { + +namespace ns +{ + consteval int foo(int x) { return 1; } // expected-note {{declared here}} +} + +template +struct T { + static constexpr auto xx = ns::foo(A{}); // expected-error {{cannot take address of consteval function 'foo' outside of an immediate invocation}} +}; + } diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index c0adbbdf9be63..531a626228733 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -331,6 +331,21 @@ S s(0); // expected-note {{in the default initializer of 'j'}} } +namespace GH65985 { +consteval int invalid(); // expected-note 2{{declared here}} +constexpr int escalating(auto) { + return invalid(); + // expected-note@-1 {{'escalating' is an immediate function because its body contains a call to a consteval function 'invalid' and that call is not a constant expression}} + // expected-note@-2 
2{{undefined function 'invalid' cannot be used in a constant expression}} +} +struct S { + static constexpr int a = escalating(0); // expected-note 2{{in call to}} + // expected-error@-1 {{call to immediate function 'GH65985::escalating' is not a constant expression}} + // expected-error@-2 {{constexpr variable 'a' must be initialized by a constant expression}} +}; + +} + namespace GH66324 { consteval int allocate(); // expected-note 2{{declared here}} From ab2c10451809129acdb97b48cd0d7133086eb589 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 19 Sep 2023 17:45:14 -0400 Subject: [PATCH 08/57] [mlir][spirv] Suffix NV cooperative matrix props with `_nv` (#66820) This is in preparation for adding a KHR variant which does not share the same parameters and needs a separate attribute. --- mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td | 2 +- mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td index 259a96651abb3..fe61231919337 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -58,7 +58,7 @@ def SPIRV_LinkageAttributesAttr : SPIRV_Attr<"LinkageAttributes", "linkage_attri // target. Represents `VkCooperativeMatrixPropertiesNV`. 
See // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkCooperativeMatrixPropertiesNV.html def SPIRV_CooperativeMatrixPropertiesNVAttr : - SPIRV_Attr<"CooperativeMatrixPropertiesNV", "coop_matrix_props"> { + SPIRV_Attr<"CooperativeMatrixPropertiesNV", "coop_matrix_props_nv"> { let parameters = (ins "int":$m_size, "int":$n_size, diff --git a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir index 82a7601dbd06e..2a9272568d44b 100644 --- a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir @@ -208,14 +208,14 @@ func.func @target_env_extra_fields() attributes { // ----- -func.func @target_env_cooperative_matrix() attributes{ +func.func @target_env_cooperative_matrix_nv() attributes{ // CHECK: spirv.target_env = #spirv.target_env< // CHECK-SAME: SPV_NV_cooperative_matrix - // CHECK-SAME: #spirv.coop_matrix_props< + // CHECK-SAME: #spirv.coop_matrix_props_nv< // CHECK-SAME: m_size = 8, n_size = 8, k_size = 32, // CHECK-SAME: a_type = i8, b_type = i8, c_type = i32, // CHECK-SAME: result_type = i32, scope = > - // CHECK-SAME: #spirv.coop_matrix_props< + // CHECK-SAME: #spirv.coop_matrix_props_nv< // CHECK-SAME: m_size = 8, n_size = 8, k_size = 16, // CHECK-SAME: a_type = f16, b_type = f16, c_type = f16, // CHECK-SAME: result_type = f16, scope = > @@ -223,7 +223,7 @@ func.func @target_env_cooperative_matrix() attributes{ #spirv.vce, #spirv.resource_limits< - cooperative_matrix_properties_nv = [#spirv.coop_matrix_props< + cooperative_matrix_properties_nv = [#spirv.coop_matrix_props_nv< m_size = 8, n_size = 8, k_size = 32, @@ -232,7 +232,7 @@ func.func @target_env_cooperative_matrix() attributes{ c_type = i32, result_type = i32, scope = #spirv.scope - >, #spirv.coop_matrix_props< + >, #spirv.coop_matrix_props_nv< m_size = 8, n_size = 8, k_size = 16, From d13da154a7c7eff77df8686b2de1cfdfa7cc7029 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 19 Sep 
2023 17:50:22 -0400 Subject: [PATCH 09/57] [mlir][spirv] Define KHR cooperative matrix properties (#66823) --- .../mlir/Dialect/SPIRV/IR/SPIRVAttributes.td | 28 +++++++++++++ mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp | 3 +- .../test/Dialect/SPIRV/IR/target-and-abi.mlir | 41 +++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td index fe61231919337..f2c1ee5cfd56e 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -54,6 +54,29 @@ def SPIRV_LinkageAttributesAttr : SPIRV_Attr<"LinkageAttributes", "linkage_attri let assemblyFormat = "`<` struct(params) `>`"; } +// Description of cooperative matrix operations supported on the +// target. Represents `VkCooperativeMatrixPropertiesKHR`. See +// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkCooperativeMatrixPropertiesKHR.html +def SPIRV_CooperativeMatrixPropertiesKHRAttr : + SPIRV_Attr<"CooperativeMatrixPropertiesKHR", "coop_matrix_props_khr"> { + let parameters = (ins + "uint32_t":$m_size, + "uint32_t":$n_size, + "uint32_t":$k_size, + "mlir::Type":$a_type, + "mlir::Type":$b_type, + "mlir::Type":$c_type, + "mlir::Type":$result_type, + "bool":$acc_sat, + "mlir::spirv::ScopeAttr":$scope + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + +def SPIRV_CooperativeMatrixPropertiesKHRArrayAttr : + TypedArrayAttrBase; + // Description of cooperative matrix operations supported on the // target. Represents `VkCooperativeMatrixPropertiesNV`. See // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkCooperativeMatrixPropertiesNV.html @@ -130,6 +153,11 @@ def SPIRV_ResourceLimitsAttr : SPIRV_Attr<"ResourceLimits", "resource_limits"> { // The configurations of cooperative matrix operations // supported. Default is an empty list. 
+ DefaultValuedParameter< + "ArrayAttr", + "nullptr" + >:$cooperative_matrix_properties_khr, + DefaultValuedParameter< "ArrayAttr", "nullptr" diff --git a/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp b/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp index 051b2cb9f1a88..5b7c0a59ba420 100644 --- a/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp @@ -166,7 +166,8 @@ spirv::getDefaultResourceLimits(MLIRContext *context) { /*subgroup_size=*/32, /*min_subgroup_size=*/std::nullopt, /*max_subgroup_size=*/std::nullopt, - /*cooperative_matrix_properties_nv=*/ArrayAttr()); + /*cooperative_matrix_properties_khr=*/ArrayAttr{}, + /*cooperative_matrix_properties_nv=*/ArrayAttr{}); } StringRef spirv::getTargetEnvAttrName() { return "spirv.target_env"; } diff --git a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir index 2a9272568d44b..10fbcf06eb052 100644 --- a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir @@ -208,6 +208,47 @@ func.func @target_env_extra_fields() attributes { // ----- +func.func @target_env_cooperative_matrix_khr() attributes{ + // CHECK: spirv.target_env = #spirv.target_env< + // CHECK-SAME: SPV_KHR_cooperative_matrix + // CHECK-SAME: #spirv.coop_matrix_props_khr< + // CHECK-SAME: m_size = 8, n_size = 8, k_size = 32, + // CHECK-SAME: a_type = i8, b_type = i8, c_type = i32, + // CHECK-SAME: result_type = i32, acc_sat = true, scope = > + // CHECK-SAME: #spirv.coop_matrix_props_khr< + // CHECK-SAME: m_size = 8, n_size = 8, k_size = 16, + // CHECK-SAME: a_type = f16, b_type = f16, c_type = f16, + // CHECK-SAME: result_type = f16, acc_sat = false, scope = > + spirv.target_env = #spirv.target_env< + #spirv.vce, + #spirv.resource_limits< + cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr< + m_size = 8, + n_size = 8, + k_size = 32, + a_type = i8, + b_type = i8, + c_type = i32, + result_type = i32, + acc_sat = true, + 
scope = #spirv.scope + >, #spirv.coop_matrix_props_khr< + m_size = 8, + n_size = 8, + k_size = 16, + a_type = f16, + b_type = f16, + c_type = f16, + result_type = f16, + acc_sat = false, + scope = #spirv.scope + >] + >> +} { return } + +// ----- + func.func @target_env_cooperative_matrix_nv() attributes{ // CHECK: spirv.target_env = #spirv.target_env< // CHECK-SAME: SPV_NV_cooperative_matrix From aa71680f2948ac177144e1089c58d55b9ac0cef2 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 19 Sep 2023 18:05:13 -0400 Subject: [PATCH 10/57] [lit] Fix a test fail under windows Seen at . Discussed starting at . --- llvm/utils/lit/tests/shtest-run-at-line.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/utils/lit/tests/shtest-run-at-line.py b/llvm/utils/lit/tests/shtest-run-at-line.py index b92aee97628b2..a1bdb039805ad 100644 --- a/llvm/utils/lit/tests/shtest-run-at-line.py +++ b/llvm/utils/lit/tests/shtest-run-at-line.py @@ -1,8 +1,15 @@ # Check that -a/-v/-vv makes the line number of the failing RUN command clear. -# RUN: not %{lit} -a %{inputs}/shtest-run-at-line | FileCheck %s -# RUN: not %{lit} -v %{inputs}/shtest-run-at-line | FileCheck %s -# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line | FileCheck %s + +# This diagnostic sometimes appears in windows when using bash as an external +# shell. Ignore it so we can strictly check the relevant output. +# +# DEFINE: %{filter} = \ +# DEFINE: grep -v 'bash.exe: warning: could not find /tmp, please create!' + +# RUN: not %{lit} -a %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s +# RUN: not %{lit} -v %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s +# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s # END. 
From a50486fd736ab2fe03fcacaf8b98876db77217a7 Mon Sep 17 00:00:00 2001 From: Alan Phipps Date: Mon, 18 Sep 2023 15:18:40 -0500 Subject: [PATCH 11/57] [InstrProf][compiler-rt] Enable MC/DC Support in LLVM Source-based Code Coverage (1/3) Part 1 of 3. This includes the LLVM back-end processing and profile reading/writing components. compiler-rt changes are included. Differential Revision: https://reviews.llvm.org/D138846 --- .../CodeGen/coverage-profile-raw-version.c | 4 +- compiler-rt/include/profile/InstrProfData.inc | 22 +- compiler-rt/lib/profile/InstrProfiling.c | 4 + compiler-rt/lib/profile/InstrProfiling.h | 25 +- .../lib/profile/InstrProfilingBuffer.c | 42 +- compiler-rt/lib/profile/InstrProfilingFile.c | 50 ++- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 33 +- .../profile/InstrProfilingPlatformDarwin.c | 9 + .../lib/profile/InstrProfilingPlatformLinux.c | 10 + .../lib/profile/InstrProfilingPlatformOther.c | 4 + .../profile/InstrProfilingPlatformWindows.c | 7 + .../lib/profile/InstrProfilingWriter.c | 18 +- .../profile/instrprof-write-buffer-internal.c | 21 +- llvm/docs/LangRef.rst | 138 +++++++ llvm/include/llvm/IR/IntrinsicInst.h | 94 ++++- llvm/include/llvm/IR/Intrinsics.td | 15 + .../ProfileData/Coverage/CoverageMapping.h | 4 +- llvm/include/llvm/ProfileData/InstrProf.h | 21 +- .../llvm/ProfileData/InstrProfData.inc | 22 +- .../llvm/ProfileData/InstrProfReader.h | 9 + .../Instrumentation/InstrProfiling.h | 46 ++- .../SelectionDAG/SelectionDAGBuilder.cpp | 6 + llvm/lib/IR/IntrinsicInst.cpp | 4 +- .../Coverage/CoverageMappingReader.cpp | 4 + llvm/lib/ProfileData/InstrProf.cpp | 23 +- llvm/lib/ProfileData/InstrProfCorrelator.cpp | 4 + llvm/lib/ProfileData/InstrProfReader.cpp | 109 ++++- llvm/lib/ProfileData/InstrProfWriter.cpp | 19 + .../Instrumentation/InstrProfiling.cpp | 379 ++++++++++++++---- .../Instrumentation/InstrProfiling/mcdc.ll | 53 +++ .../Transforms/PGOProfile/comdat_internal.ll | 4 +- 
.../tools/llvm-profdata/Inputs/basic.profraw | Bin 152 -> 192 bytes .../llvm-profdata/Inputs/c-general.profraw | Bin 1800 -> 2016 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1768 -> 1968 bytes .../llvm-profdata/binary-ids-padding.test | 13 +- .../llvm-profdata/large-binary-id-size.test | 5 +- ...alformed-not-space-for-another-header.test | 9 +- .../malformed-num-counters-zero.test | 10 +- .../malformed-ptr-to-counter-array.test | 9 +- .../test/tools/llvm-profdata/mcdc-bitmap.test | 201 ++++++++++ .../misaligned-binary-ids-size.test | 2 +- .../mismatched-raw-profile-header.test | 3 + .../tools/llvm-profdata/raw-32-bits-be.test | 28 +- .../tools/llvm-profdata/raw-32-bits-le.test | 28 +- .../tools/llvm-profdata/raw-64-bits-be.test | 24 +- .../tools/llvm-profdata/raw-64-bits-le.test | 24 +- .../tools/llvm-profdata/raw-two-profiles.test | 14 +- 48 files changed, 1410 insertions(+), 171 deletions(-) create mode 100644 llvm/test/Instrumentation/InstrProfiling/mcdc.ll create mode 100644 llvm/test/tools/llvm-profdata/mcdc-bitmap.test diff --git a/clang/test/CodeGen/coverage-profile-raw-version.c b/clang/test/CodeGen/coverage-profile-raw-version.c index 749dce50298f0..bb30fd8c1c70a 100644 --- a/clang/test/CodeGen/coverage-profile-raw-version.c +++ b/clang/test/CodeGen/coverage-profile-raw-version.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -debug-info-kind=standalone -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -debug-info-kind=standalone -mllvm -debug-info-correlate -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -o - %s | FileCheck %s --check-prefix=DEBUG_INFO -// CHECK: @__llvm_profile_raw_version = {{.*}}constant i64 8 -// DEBUG_INFO: @__llvm_profile_raw_version = {{.*}}constant i64 576460752303423496 +// CHECK: @__llvm_profile_raw_version = {{.*}}constant i64 9 +// DEBUG_INFO: @__llvm_profile_raw_version = {{.*}}constant i64 576460752303423497 int main() { return 0; diff --git 
a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 4456bf1ab1763..fad14576c442d 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -76,6 +76,7 @@ INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \ ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ Inc->getHash()->getZExtValue())) INSTR_PROF_DATA(const IntPtrT, IntPtrTy, CounterPtr, RelativeCounterPtr) +INSTR_PROF_DATA(const IntPtrT, IntPtrTy, BitmapPtr, RelativeBitmapPtr) /* This is used to map function pointers for the indirect call targets to * function name hashes during the conversion from raw to merged profile * data. @@ -87,7 +88,9 @@ INSTR_PROF_DATA(IntPtrT, llvm::Type::getInt8PtrTy(Ctx), Values, \ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \ ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters)) INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \ - ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) + ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \ +INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes)) #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. 
*/ @@ -132,9 +135,13 @@ INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) INSTR_PROF_RAW_HEADER(uint64_t, NumCounters, NumCounters) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterCounters, PaddingBytesAfterCounters) +INSTR_PROF_RAW_HEADER(uint64_t, NumBitmapBytes, NumBitmapBytes) +INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterBitmapBytes, PaddingBytesAfterBitmapBytes) INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin - (uintptr_t)DataBegin) +INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, + (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER @@ -267,6 +274,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \ INSTR_PROF_SECT_ENTRY(IPSK_cnts, \ INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \ INSTR_PROF_CNTS_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ + INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \ + INSTR_PROF_BITS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") @@ -646,11 +656,11 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* FIXME: Please remedy the fixme in the header before bumping the version. */ /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 8 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 10 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). */ -#define INSTR_PROF_COVMAP_VERSION 5 +#define INSTR_PROF_COVMAP_VERSION 6 /* Profile version is always of type uint64_t. Reserve the upper 8 bits in the * version for other variants of profile. 
We set the lowest bit of the upper 8 @@ -687,6 +697,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts +#define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds #define INSTR_PROF_COVMAP_COMMON __llvm_covmap @@ -698,6 +709,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" +#define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" @@ -709,6 +721,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF +#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. */ @@ -723,6 +736,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON) #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON) +#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON) /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfiling.c b/compiler-rt/lib/profile/InstrProfiling.c index 0dd5ff5ae6331..da04d8ebdec95 100644 --- a/compiler-rt/lib/profile/InstrProfiling.c +++ b/compiler-rt/lib/profile/InstrProfiling.c @@ -60,6 +60,10 @@ COMPILER_RT_VISIBILITY void __llvm_profile_reset_counters(void) { (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) ? 0xFF : 0; memset(I, ResetValue, E - I); + I = __llvm_profile_begin_bitmap(); + E = __llvm_profile_end_bitmap(); + memset(I, 0x0, E - I); + const __llvm_profile_data *DataBegin = __llvm_profile_begin_data(); const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); const __llvm_profile_data *DI; diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 4433d7bd48871..e143149fca827 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -88,6 +88,8 @@ const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); +char *__llvm_profile_begin_bitmap(void); +char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); uint32_t *__llvm_profile_begin_orderfile(); @@ -101,11 +103,11 @@ void __llvm_profile_reset_counters(void); /*! * \brief Merge profile data from buffer. * - * Read profile data form buffer \p Profile and merge with in-process profile - * counters. The client is expected to have checked or already knows the profile - * data in the buffer matches the in-process counter structure before calling - * it. Returns 0 (success) if the profile data is valid. Upon reading - * invalid/corrupted profile data, returns 1 (failure). + * Read profile data from buffer \p Profile and merge with in-process profile + * counters and bitmaps. 
The client is expected to have checked or already + * know the profile data in the buffer matches the in-process counter + * structure before calling it. Returns 0 (success) if the profile data is + * valid. Upon reading invalid/corrupted profile data, returns 1 (failure). */ int __llvm_profile_merge_from_buffer(const char *Profile, uint64_t Size); @@ -113,8 +115,8 @@ int __llvm_profile_merge_from_buffer(const char *Profile, uint64_t Size); * * Returns 0 (success) if the profile data in buffer \p Profile with size * \p Size was generated by the same binary and therefore matches - * structurally the in-process counters. If the profile data in buffer is - * not compatible, the interface returns 1 (failure). + * structurally the in-process counters and bitmaps. If the profile data in + * buffer is not compatible, the interface returns 1 (failure). */ int __llvm_profile_check_compatibility(const char *Profile, uint64_t Size); @@ -276,6 +278,10 @@ uint64_t __llvm_profile_get_num_counters(const char *Begin, const char *End); /*! \brief Get the size of the profile counters section in bytes. */ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End); +/*! \brief Get the number of bytes in the profile bitmap section. */ +uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, + const char *End); + /* ! \brief Given the sizes of the data and counter information, return the * number of padding bytes before and after the counters, and after the names, * in the raw profile. @@ -286,8 +292,9 @@ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End); * needed to achieve that. 
*/ void __llvm_profile_get_padding_sizes_for_counters( - uint64_t DataSize, uint64_t CountersSize, uint64_t NamesSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames); /*! diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 61ac5d9c02850..c7217b2dfef8a 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -43,11 +43,14 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); const char *CountersBegin = __llvm_profile_begin_counters(); const char *CountersEnd = __llvm_profile_end_counters(); + const char *BitmapBegin = __llvm_profile_begin_bitmap(); + const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); return __llvm_profile_get_size_for_buffer_internal( - DataBegin, DataEnd, CountersBegin, CountersEnd, NamesBegin, NamesEnd); + DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, + NamesBegin, NamesEnd); } COMPILER_RT_VISIBILITY @@ -83,6 +86,12 @@ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End) { __llvm_profile_counter_entry_size(); } +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, + const char *End) { + return (End - Begin); +} + /// Calculate the number of padding bytes needed to add to \p Offset in order /// for (\p Offset + Padding) to be page-aligned. 
static uint64_t calculateBytesNeededToPageAlign(uint64_t Offset) { @@ -102,13 +111,16 @@ static int needsCounterPadding(void) { COMPILER_RT_VISIBILITY void __llvm_profile_get_padding_sizes_for_counters( - uint64_t DataSize, uint64_t CountersSize, uint64_t NamesSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames) { if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = __llvm_profile_get_num_padding_bytes(CountersSize); + *PaddingBytesAfterBitmapBytes = + __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); return; } @@ -118,31 +130,37 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesBeforeCounters = calculateBytesNeededToPageAlign(sizeof(__llvm_profile_header) + DataSize); *PaddingBytesAfterCounters = calculateBytesNeededToPageAlign(CountersSize); + *PaddingBytesAfterBitmapBytes = + calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, - const char *CountersBegin, const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd) { + const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { /* Match logic in __llvm_profile_write_buffer(). 
*/ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); uint64_t CountersSize = __llvm_profile_get_counters_size(CountersBegin, CountersEnd); + const uint64_t NumBitmapBytes = + __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NamesSize, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + - PaddingBytesAfterCounters + NamesSize + PaddingBytesAfterNames; + PaddingBytesAfterCounters + NumBitmapBytes + + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; } COMPILER_RT_VISIBILITY @@ -160,9 +178,11 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer(char *Buffer) { COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( char *Buffer, const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, const char *NamesEnd) { + const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, + const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, 0, NamesBegin, NamesEnd, 0); + CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, + NamesEnd, 0); } diff --git 
a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 5556eccbf5787..36c17b3b4e1b7 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -108,14 +108,18 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); const char *CountersBegin = __llvm_profile_begin_counters(); const char *CountersEnd = __llvm_profile_end_counters(); + const char *BitmapBegin = __llvm_profile_begin_bitmap(); + const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); uint64_t CountersSize = __llvm_profile_get_counters_size(CountersBegin, CountersEnd); + uint64_t NumBitmapBytes = + __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); - /* Check that the counter and data sections in this image are + /* Check that the counter, bitmap, and data sections in this image are * page-aligned. */ unsigned PageSize = getpagesize(); if ((intptr_t)CountersBegin % PageSize != 0) { @@ -123,6 +127,11 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { CountersBegin, PageSize); return 1; } + if ((intptr_t)BitmapBegin % PageSize != 0) { + PROF_ERR("Bitmap section not page-aligned (start = %p, pagesz = %u).\n", + BitmapBegin, PageSize); + return 1; + } if ((intptr_t)DataBegin % PageSize != 0) { PROF_ERR("Data section not page-aligned (start = %p, pagesz = %u).\n", DataBegin, PageSize); @@ -132,10 +141,11 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { /* Determine how much padding is needed before/after the counters and * after the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NamesSize, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); uint64_t PageAlignedCountersLength = CountersSize + PaddingBytesAfterCounters; uint64_t FileOffsetToCounters = CurrentFileOffset + @@ -155,6 +165,31 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { FileOffsetToCounters); return 1; } + + /* Also mmap MCDC bitmap bytes. If there aren't any bitmap bytes, mmap() + * will fail with EINVAL. */ + if (NumBitmapBytes == 0) + return 0; + + uint64_t PageAlignedBitmapLength = + NumBitmapBytes + PaddingBytesAfterBitmapBytes; + uint64_t FileOffsetToBitmap = + CurrentFileOffset + sizeof(__llvm_profile_header) + DataSize + + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters; + void *BitmapMmap = + mmap((void *)BitmapBegin, PageAlignedBitmapLength, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, Fileno, FileOffsetToBitmap); + if (BitmapMmap != BitmapBegin) { + PROF_ERR( + "Continuous counter sync mode is enabled, but mmap() failed (%s).\n" + " - BitmapBegin: %p\n" + " - PageAlignedBitmapLength: %" PRIu64 "\n" + " - Fileno: %d\n" + " - FileOffsetToBitmap: %" PRIu64 "\n", + strerror(errno), BitmapBegin, PageAlignedBitmapLength, Fileno, + FileOffsetToBitmap); + return 1; + } return 0; } #elif defined(__ELF__) || defined(_WIN32) @@ -197,6 +232,8 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); const char *CountersBegin = __llvm_profile_begin_counters(); const char *CountersEnd = __llvm_profile_end_counters(); + 
const char *BitmapBegin = __llvm_profile_begin_bitmap(); + const char *BitmapEnd = __llvm_profile_end_bitmap(); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); /* Get the file size. */ uint64_t FileSize = 0; @@ -218,6 +255,11 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { /* Return the memory allocated for counters to OS. */ lprofReleaseMemoryPagesToOS((uintptr_t)CountersBegin, (uintptr_t)CountersEnd); + + /* BIAS MODE not supported yet for Bitmap (MCDC). */ + + /* Return the memory allocated for counters to OS. */ + lprofReleaseMemoryPagesToOS((uintptr_t)BitmapBegin, (uintptr_t)BitmapEnd); return 0; } #else diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 360165e32ab3f..03ed67fcfa766 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -21,8 +21,8 @@ */ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, - const char *CountersBegin, const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd); + const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -36,7 +36,8 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( int __llvm_profile_write_buffer_internal( char *Buffer, const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, const char *NamesEnd); + const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, + const char *NamesBegin, const char *NamesEnd); /*! 
* The data structure describing the data to be written by the @@ -153,6 +154,7 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, + const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, const char *NamesEnd, int SkipNameDataWrite); diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index 9cf12f251f726..c5f168bf75177 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -66,6 +66,9 @@ int __llvm_profile_check_compatibility(const char *ProfileData, Header->NumCounters != __llvm_profile_get_num_counters(__llvm_profile_begin_counters(), __llvm_profile_end_counters()) || + Header->NumBitmapBytes != + __llvm_profile_get_num_bitmap_bytes(__llvm_profile_begin_bitmap(), + __llvm_profile_end_bitmap()) || Header->NamesSize != (uint64_t)(__llvm_profile_end_names() - __llvm_profile_begin_names()) || Header->ValueKindLast != IPVK_Last) @@ -74,7 +77,8 @@ int __llvm_profile_check_compatibility(const char *ProfileData, if (ProfileSize < sizeof(__llvm_profile_header) + Header->BinaryIdsSize + Header->NumData * sizeof(__llvm_profile_data) + Header->NamesSize + - Header->NumCounters * __llvm_profile_counter_entry_size()) + Header->NumCounters * __llvm_profile_counter_entry_size() + + Header->NumBitmapBytes) return 1; for (SrcData = SrcDataStart, @@ -82,7 +86,8 @@ int __llvm_profile_check_compatibility(const char *ProfileData, SrcData < SrcDataEnd; ++SrcData, ++DstData) { if (SrcData->NameRef != DstData->NameRef || SrcData->FuncHash != DstData->FuncHash || - SrcData->NumCounters != DstData->NumCounters) + SrcData->NumCounters != DstData->NumCounters || + SrcData->NumBitmapBytes != DstData->NumBitmapBytes) return 1; } @@ -112,9 +117,11 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, 
__llvm_profile_header *Header = (__llvm_profile_header *)ProfileData; char *SrcCountersStart, *DstCounter; const char *SrcCountersEnd, *SrcCounter; + const char *SrcBitmapStart; const char *SrcNameStart; const char *SrcValueProfDataStart, *SrcValueProfData; uintptr_t CountersDelta = Header->CountersDelta; + uintptr_t BitmapDelta = Header->BitmapDelta; SrcDataStart = (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header) + @@ -123,11 +130,12 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcCountersStart = (char *)SrcDataEnd; SrcCountersEnd = SrcCountersStart + Header->NumCounters * __llvm_profile_counter_entry_size(); - SrcNameStart = SrcCountersEnd; + SrcBitmapStart = SrcCountersEnd; + SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = SrcNameStart + Header->NamesSize + __llvm_profile_get_num_padding_bytes(Header->NamesSize); - if (SrcNameStart < SrcCountersStart) + if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; // Merge counters by iterating the entire counter section when debug info @@ -157,6 +165,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, // extend CounterPtr to get the original value. char *DstCounters = (char *)((uintptr_t)DstData + signextIfWin64(DstData->CounterPtr)); + char *DstBitmap = + (char *)((uintptr_t)DstData + signextIfWin64(DstData->BitmapPtr)); unsigned NVK = 0; // SrcData is a serialized representation of the memory image. We need to @@ -186,6 +196,21 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, } } + const char *SrcBitmap = + SrcBitmapStart + ((uintptr_t)SrcData->BitmapPtr - BitmapDelta); + // BitmapDelta also needs to be decreased as we advance to the next data + // record. + BitmapDelta -= sizeof(*SrcData); + unsigned NB = SrcData->NumBitmapBytes; + // NumBitmapBytes may legitimately be 0. Just keep going. 
+ if (NB != 0) { + if (SrcBitmap < SrcBitmapStart || (SrcBitmap + NB) > SrcNameStart) + return 1; + // Merge Src and Dst Bitmap bytes by simply ORing them together. + for (unsigned I = 0; I < NB; I++) + DstBitmap[I] |= SrcBitmap[I]; + } + /* Now merge value profile data. */ if (!VPMergeHook) continue; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c index d9f2a113f5b02..2154d242a8174 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c @@ -31,6 +31,11 @@ extern char COMPILER_RT_VISIBILITY extern char CountersEnd __asm("section$end$__DATA$" INSTR_PROF_CNTS_SECT_NAME); COMPILER_RT_VISIBILITY +extern char + BitmapStart __asm("section$start$__DATA$" INSTR_PROF_BITS_SECT_NAME); +COMPILER_RT_VISIBILITY +extern char BitmapEnd __asm("section$end$__DATA$" INSTR_PROF_BITS_SECT_NAME); +COMPILER_RT_VISIBILITY extern uint32_t OrderFileStart __asm("section$start$__DATA$" INSTR_PROF_ORDERFILE_SECT_NAME); @@ -56,6 +61,10 @@ char *__llvm_profile_begin_counters(void) { return &CountersStart; } COMPILER_RT_VISIBILITY char *__llvm_profile_end_counters(void) { return &CountersEnd; } COMPILER_RT_VISIBILITY +char *__llvm_profile_begin_bitmap(void) { return &BitmapStart; } +COMPILER_RT_VISIBILITY +char *__llvm_profile_end_bitmap(void) { return &BitmapEnd; } +COMPILER_RT_VISIBILITY uint32_t *__llvm_profile_begin_orderfile(void) { return &OrderFileStart; } COMPILER_RT_VISIBILITY diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 2cce0a4b2c48d..d0c42462e5e31 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -35,6 +35,8 @@ #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP 
INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) +#define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) #define PROF_VNODES_START INSTR_PROF_SECT_START(INSTR_PROF_VNODES_COMMON) #define PROF_VNODES_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNODES_COMMON) @@ -48,6 +50,8 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_NAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_NAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -74,6 +78,12 @@ COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { COMPILER_RT_VISIBILITY char *__llvm_profile_end_counters(void) { return &PROF_CNTS_STOP; } +COMPILER_RT_VISIBILITY char *__llvm_profile_begin_bitmap(void) { + return &PROF_BITS_START; +} +COMPILER_RT_VISIBILITY char *__llvm_profile_end_bitmap(void) { + return &PROF_BITS_STOP; +} COMPILER_RT_VISIBILITY uint32_t *__llvm_profile_begin_orderfile(void) { return &PROF_ORDERFILE_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c index c7b6e842c9fac..5319ca813b43f 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c @@ -88,6 +88,10 @@ COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return CountersFirst; } COMPILER_RT_VISIBILITY char *__llvm_profile_end_counters(void) { return CountersLast; } +COMPILER_RT_VISIBILITY +char 
*__llvm_profile_begin_bitmap(void) { return BitmapFirst; } +COMPILER_RT_VISIBILITY +char *__llvm_profile_end_bitmap(void) { return BitmapLast; } /* TODO: correctly set up OrderFileFirst. */ COMPILER_RT_VISIBILITY uint32_t *__llvm_profile_begin_orderfile(void) { return OrderFileFirst; } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c index dd576b2f8357d..9dbd702865fd2 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c @@ -14,6 +14,7 @@ #if defined(_MSC_VER) /* Merge read-write sections into .data. */ #pragma comment(linker, "/MERGE:.lprfc=.data") +#pragma comment(linker, "/MERGE:.lprfb=.data") #pragma comment(linker, "/MERGE:.lprfd=.data") #pragma comment(linker, "/MERGE:.lprfv=.data") #pragma comment(linker, "/MERGE:.lprfnd=.data") @@ -30,6 +31,8 @@ #pragma section(".lprfd$Z", read, write) #pragma section(".lprfc$A", read, write) #pragma section(".lprfc$Z", read, write) +#pragma section(".lprfb$A", read, write) +#pragma section(".lprfb$Z", read, write) #pragma section(".lorderfile$A", read, write) #pragma section(".lprfnd$A", read, write) #pragma section(".lprfnd$Z", read, write) @@ -43,6 +46,8 @@ const char COMPILER_RT_SECTION(".lprfn$Z") NamesEnd = '\0'; char COMPILER_RT_SECTION(".lprfc$A") CountersStart; char COMPILER_RT_SECTION(".lprfc$Z") CountersEnd; +char COMPILER_RT_SECTION(".lprfb$A") BitmapStart; +char COMPILER_RT_SECTION(".lprfb$Z") BitmapEnd; uint32_t COMPILER_RT_SECTION(".lorderfile$A") OrderFileStart; ValueProfNode COMPILER_RT_SECTION(".lprfnd$A") VNodesStart; @@ -58,6 +63,8 @@ const char *__llvm_profile_end_names(void) { return &NamesEnd; } char *__llvm_profile_begin_counters(void) { return &CountersStart + 1; } char *__llvm_profile_end_counters(void) { return &CountersEnd; } +char *__llvm_profile_begin_bitmap(void) { return &BitmapStart + 1; } +char *__llvm_profile_end_bitmap(void) { return 
&BitmapEnd; } uint32_t *__llvm_profile_begin_orderfile(void) { return &OrderFileStart; } ValueProfNode *__llvm_profile_begin_vnodes(void) { return &VNodesStart + 1; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 1e22398a4c0f6..3b61f3def9f6e 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -246,17 +246,20 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); const char *CountersBegin = __llvm_profile_begin_counters(); const char *CountersEnd = __llvm_profile_end_counters(); + const char *BitmapBegin = __llvm_profile_begin_bitmap(); + const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, - CountersEnd, VPDataReader, NamesBegin, NamesEnd, - SkipNameDataWrite); + CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, + NamesBegin, NamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, + const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, const char *NamesEnd, int SkipNameDataWrite) { int DebugInfoCorrelate = @@ -271,6 +274,8 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumCounters = __llvm_profile_get_num_counters(CountersBegin, CountersEnd); + const uint64_t NumBitmapBytes = + __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = DebugInfoCorrelate ? 0 : NamesEnd - NamesBegin; /* Create the header. 
*/ @@ -279,11 +284,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; __llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NamesSize, + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterNames); + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); { /* Initialize header structure. */ @@ -295,6 +300,7 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, * CountersDelta to match. */ #ifdef _WIN64 Header.CountersDelta = (uint32_t)Header.CountersDelta; + Header.BitmapDelta = (uint32_t)Header.BitmapDelta; #endif /* The data and names sections are omitted in lightweight mode. */ @@ -319,6 +325,8 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {NULL, sizeof(uint8_t), PaddingBytesBeforeCounters, 1}, {CountersBegin, sizeof(uint8_t), CountersSectionSize, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterCounters, 1}, + {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {(SkipNameDataWrite || DebugInfoCorrelate) ? 
NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index 7b96c6d91c33f..d9670f739ca98 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -25,17 +25,18 @@ const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); +char *__llvm_profile_begin_bitmap(void); +char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, const char *NamesEnd); + const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, + const char *NamesBegin, const char *NamesEnd); -int __llvm_profile_write_buffer_internal(char *Buffer, const void *DataBegin, - const void *DataEnd, - const char *CountersBegin, - const char *CountersEnd, - const char *NamesBegin, - const char *NamesEnd); +int __llvm_profile_write_buffer_internal( + char *Buffer, const void *DataBegin, const void *DataEnd, + const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); void __llvm_profile_set_dumped(void); @@ -43,12 +44,14 @@ int main(int argc, const char *argv[]) { uint64_t bufsize = __llvm_profile_get_size_for_buffer_internal( __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), + __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), __llvm_profile_begin_names(), __llvm_profile_end_names()); char *buf = malloc(bufsize); - int ret = __llvm_profile_write_buffer_internal(buf, - __llvm_profile_begin_data(), 
__llvm_profile_end_data(), + int ret = __llvm_profile_write_buffer_internal( + buf, __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), + __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), __llvm_profile_begin_names(), __llvm_profile_end_names()); if (ret != 0) { diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f542e70bcfee8..d3b0cb0cc50ce 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -13842,6 +13842,144 @@ pass will generate the appropriate data structures and replace the ``llvm.instrprof.value.profile`` intrinsic with the call to the profile runtime library with proper arguments. +'``llvm.instrprof.mcdc.parameters``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.instrprof.mcdc.parameters(ptr , i64 , + i32 ) + +Overview: +""""""""" + +The '``llvm.instrprof.mcdc.parameters``' intrinsic is used to initiate MC/DC +code coverage instrumentation for a function. + +Arguments: +"""""""""" + +The first argument is a pointer to a global variable containing the +name of the entity being instrumented. This should generally be the +(mangled) function name for a set of counters. + +The second argument is a hash value that can be used by the consumer +of the profile data to detect changes to the instrumented source. + +The third argument is the number of bitmap bytes required by the function to +record the number of test vectors executed for each boolean expression. + +Semantics: +"""""""""" + +This intrinsic represents basic MC/DC parameters initiating one or more MC/DC +instrumentation sequences in a function. It will cause the ``-instrprof`` pass +to generate the appropriate data structures and the code to instrument MC/DC +test vectors in a format that can be written out by a compiler runtime and +consumed via the ``llvm-profdata`` tool. 
+ +'``llvm.instrprof.mcdc.condbitmap.update``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.instrprof.mcdc.condbitmap.update(ptr , i64 , + i32 , + ptr , + i1 ) + +Overview: +""""""""" + +The '``llvm.instrprof.mcdc.condbitmap.update``' intrinsic is used to track +MC/DC condition evaluation for each condition in a boolean expression. + +Arguments: +"""""""""" + +The first argument is a pointer to a global variable containing the +name of the entity being instrumented. This should generally be the +(mangled) function name for a set of counters. + +The second argument is a hash value that can be used by the consumer +of the profile data to detect changes to the instrumented source. + +The third argument is an ID of a condition to track. This value is used as a +bit index into the condition bitmap. + +The fourth argument is the address of the condition bitmap. + +The fifth argument is the boolean value representing the evaluation of the +condition (true or false) + +Semantics: +"""""""""" + +This intrinsic represents the update of a condition bitmap that is local to a +function and will cause the ``-instrprof`` pass to generate the code to +instrument the control flow around each condition in a boolean expression. The +ID of each condition corresponds to a bit index in the condition bitmap which +is set based on the evaluation of the condition. + +'``llvm.instrprof.mcdc.tvbitmap.update``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.instrprof.mcdc.tvbitmap.update(ptr , i64 , + i32 ) + i32 , + ptr ) + +Overview: +""""""""" + +The '``llvm.instrprof.mcdc.tvbitmap.update``' intrinsic is used to track MC/DC +test vector execution after each boolean expression has been fully executed. 
+The overall value of the condition bitmap, after it has been successively +updated using the '``llvm.instrprof.mcdc.condbitmap.update``' intrinsic with +the true or false evaluation of each condition, uniquely identifies an executed +MC/DC test vector and is used as a bit index into the global test vector +bitmap. + +Arguments: +"""""""""" + +The first argument is a pointer to a global variable containing the +name of the entity being instrumented. This should generally be the +(mangled) function name for a set of counters. + +The second argument is a hash value that can be used by the consumer +of the profile data to detect changes to the instrumented source. + +The third argument is the number of bitmap bytes required by the function to +record the number of test vectors executed for each boolean expression. + +The fourth argument is the byte index into the global test vector bitmap +corresponding to the function. + +The fifth argument is the address of the condition bitmap, which contains a +value representing an executed MC/DC test vector. It is loaded and used as the +bit index of the test vector bitmap. + +Semantics: +"""""""""" + +This intrinsic represents the final operation of an MC/DC instrumentation +sequence and will cause the ``-instrprof`` pass to generate the code to +instrument an update of a function's global test vector bitmap to indicate that +a test vector has been executed. The global test vector bitmap can be consumed +by the ``llvm-profdata`` and ``llvm-cov`` tools. 
+ '``llvm.thread.pointer``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index c9777c72558be..c26ecef6eaaee 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1424,6 +1424,11 @@ class InstrProfInstBase : public IntrinsicInst { ConstantInt *getHash() const { return cast(const_cast(getArgOperand(1))); } +}; + +/// A base class for all instrprof counter intrinsics. +class InstrProfCntrInstBase : public InstrProfInstBase { +public: // The number of counters for the instrumented function. ConstantInt *getNumCounters() const; // The index of the counter that this instruction acts on. @@ -1431,7 +1436,7 @@ class InstrProfInstBase : public IntrinsicInst { }; /// This represents the llvm.instrprof.cover intrinsic. -class InstrProfCoverInst : public InstrProfInstBase { +class InstrProfCoverInst : public InstrProfCntrInstBase { public: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::instrprof_cover; @@ -1442,7 +1447,7 @@ class InstrProfCoverInst : public InstrProfInstBase { }; /// This represents the llvm.instrprof.increment intrinsic. -class InstrProfIncrementInst : public InstrProfInstBase { +class InstrProfIncrementInst : public InstrProfCntrInstBase { public: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::instrprof_increment || @@ -1466,7 +1471,7 @@ class InstrProfIncrementInstStep : public InstrProfIncrementInst { }; /// This represents the llvm.instrprof.timestamp intrinsic. -class InstrProfTimestampInst : public InstrProfInstBase { +class InstrProfTimestampInst : public InstrProfCntrInstBase { public: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::instrprof_timestamp; @@ -1477,7 +1482,7 @@ class InstrProfTimestampInst : public InstrProfInstBase { }; /// This represents the llvm.instrprof.value.profile intrinsic. 
-class InstrProfValueProfileInst : public InstrProfInstBase { +class InstrProfValueProfileInst : public InstrProfCntrInstBase { public: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::instrprof_value_profile; @@ -1500,6 +1505,87 @@ class InstrProfValueProfileInst : public InstrProfInstBase { } }; +/// A base class for instrprof mcdc intrinsics that require global bitmap bytes. +class InstrProfMCDCBitmapInstBase : public InstrProfInstBase { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::instrprof_mcdc_parameters || + I->getIntrinsicID() == Intrinsic::instrprof_mcdc_tvbitmap_update; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + /// \return The number of bytes used for the MCDC bitmaps for the instrumented + /// function. + ConstantInt *getNumBitmapBytes() const { + return cast(const_cast(getArgOperand(2))); + } +}; + +/// This represents the llvm.instrprof.mcdc.parameters intrinsic. +class InstrProfMCDCBitmapParameters : public InstrProfMCDCBitmapInstBase { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::instrprof_mcdc_parameters; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.instrprof.mcdc.tvbitmap.update intrinsic. +class InstrProfMCDCTVBitmapUpdate : public InstrProfMCDCBitmapInstBase { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::instrprof_mcdc_tvbitmap_update; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + /// \return The index of the TestVector Bitmap upon which this intrinsic + /// acts. 
+ ConstantInt *getBitmapIndex() const { + return cast(const_cast(getArgOperand(3))); + } + + /// \return The address of the corresponding condition bitmap containing + /// the index of the TestVector to update within the TestVector Bitmap. + Value *getMCDCCondBitmapAddr() const { + return cast(const_cast(getArgOperand(4))); + } +}; + +/// This represents the llvm.instrprof.mcdc.condbitmap.update intrinsic. +/// It does not pertain to global bitmap updates or parameters and so doesn't +/// inherit from InstrProfMCDCBitmapInstBase. +class InstrProfMCDCCondBitmapUpdate : public InstrProfInstBase { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::instrprof_mcdc_condbitmap_update; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + /// \return The ID of the condition to update. + ConstantInt *getCondID() const { + return cast(const_cast(getArgOperand(2))); + } + + /// \return The address of the corresponding condition bitmap. + Value *getMCDCCondBitmapAddr() const { + return cast(const_cast(getArgOperand(3))); + } + + /// \return The boolean value to set in the condition bitmap for the + /// corresponding condition ID. This represents how the condition evaluated. + Value *getCondBool() const { + return cast(const_cast(getArgOperand(4))); + } +}; + class PseudoProbeInst : public IntrinsicInst { public: static bool classof(const IntrinsicInst *I) { diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e94b59508de7b..4d661883abb6c 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -917,6 +917,21 @@ def int_instrprof_value_profile : Intrinsic<[], llvm_i64_ty, llvm_i32_ty, llvm_i32_ty]>; +// A parameter configuration for instrumentation based MCDC profiling. 
+def int_instrprof_mcdc_parameters : Intrinsic<[], + [llvm_ptr_ty, llvm_i64_ty, + llvm_i32_ty]>; + +// A test vector bitmap update for instrumentation based MCDC profiling. +def int_instrprof_mcdc_tvbitmap_update : Intrinsic<[], + [llvm_ptr_ty, llvm_i64_ty, + llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>; + +// A condition bitmap value update for instrumentation based MCDC profiling. +def int_instrprof_mcdc_condbitmap_update : Intrinsic<[], + [llvm_ptr_ty, llvm_i64_ty, + llvm_i32_ty, llvm_ptr_ty, llvm_i1_ty]>; + def int_call_preallocated_setup : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty]>; def int_call_preallocated_arg : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty]>; def int_call_preallocated_teardown : DefaultAttrsIntrinsic<[], [llvm_token_ty]>; diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index b407fe277c543..93cd0f060ef57 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -1027,7 +1027,9 @@ enum CovMapVersion { // Compilation directory is stored separately and combined with relative // filenames to produce an absolute file path. Version6 = 5, - // The current version is Version6. + // Branch regions extended and Decision Regions added for MC/DC. + Version7 = 6, + // The current version is Version7. CurrentVersion = INSTR_PROF_COVMAP_VERSION }; diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index f9096b4615720..2a780104b177d 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -96,6 +96,9 @@ inline StringRef getInstrProfDataVarPrefix() { return "__profd_"; } /// Return the name prefix of profile counter variables. inline StringRef getInstrProfCountersVarPrefix() { return "__profc_"; } +/// Return the name prefix of profile bitmap variables. 
+inline StringRef getInstrProfBitmapVarPrefix() { return "__profbm_"; } + /// Return the name prefix of value profile variables. inline StringRef getInstrProfValuesVarPrefix() { return "__profvp_"; } @@ -335,6 +338,7 @@ enum class instrprof_error { invalid_prof, hash_mismatch, count_mismatch, + bitmap_mismatch, counter_overflow, value_site_count_mismatch, compress_failed, @@ -690,18 +694,23 @@ struct InstrProfValueSiteRecord { /// Profiling information for a single function. struct InstrProfRecord { std::vector Counts; + std::vector BitmapBytes; InstrProfRecord() = default; InstrProfRecord(std::vector Counts) : Counts(std::move(Counts)) {} + InstrProfRecord(std::vector Counts, + std::vector BitmapBytes) + : Counts(std::move(Counts)), BitmapBytes(std::move(BitmapBytes)) {} InstrProfRecord(InstrProfRecord &&) = default; InstrProfRecord(const InstrProfRecord &RHS) - : Counts(RHS.Counts), + : Counts(RHS.Counts), BitmapBytes(RHS.BitmapBytes), ValueData(RHS.ValueData ? std::make_unique(*RHS.ValueData) : nullptr) {} InstrProfRecord &operator=(InstrProfRecord &&) = default; InstrProfRecord &operator=(const InstrProfRecord &RHS) { Counts = RHS.Counts; + BitmapBytes = RHS.BitmapBytes; if (!RHS.ValueData) { ValueData = nullptr; return *this; @@ -880,6 +889,11 @@ struct NamedInstrProfRecord : InstrProfRecord { NamedInstrProfRecord(StringRef Name, uint64_t Hash, std::vector Counts) : InstrProfRecord(std::move(Counts)), Name(Name), Hash(Hash) {} + NamedInstrProfRecord(StringRef Name, uint64_t Hash, + std::vector Counts, + std::vector BitmapBytes) + : InstrProfRecord(std::move(Counts), std::move(BitmapBytes)), Name(Name), + Hash(Hash) {} static bool hasCSFlagInHash(uint64_t FuncHash) { return ((FuncHash >> CS_FLAG_IN_FUNC_HASH) & 1); @@ -1015,7 +1029,9 @@ enum ProfVersion { Version9 = 9, // An additional (optional) temporal profile traces section is added. Version10 = 10, - // The current version is 10. + // An additional field is used for bitmap bytes. 
+ Version11 = 11, + // The current version is 11. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1153,6 +1169,7 @@ namespace RawInstrProf { // Version 6: Added binary id. // Version 7: Reorder binary id and include version in signature. // Version 8: Use relative counter pointer. +// Version 9: Added relative bitmap bytes pointer and count used by MC/DC. const uint64_t Version = INSTR_PROF_RAW_VERSION; template inline uint64_t getMagic(); diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 4456bf1ab1763..fad14576c442d 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -76,6 +76,7 @@ INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \ ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ Inc->getHash()->getZExtValue())) INSTR_PROF_DATA(const IntPtrT, IntPtrTy, CounterPtr, RelativeCounterPtr) +INSTR_PROF_DATA(const IntPtrT, IntPtrTy, BitmapPtr, RelativeBitmapPtr) /* This is used to map function pointers for the indirect call targets to * function name hashes during the conversion from raw to merged profile * data. @@ -87,7 +88,9 @@ INSTR_PROF_DATA(IntPtrT, llvm::Type::getInt8PtrTy(Ctx), Values, \ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \ ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters)) INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \ - ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) + ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \ +INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes)) #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. 
*/ @@ -132,9 +135,13 @@ INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) INSTR_PROF_RAW_HEADER(uint64_t, NumCounters, NumCounters) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterCounters, PaddingBytesAfterCounters) +INSTR_PROF_RAW_HEADER(uint64_t, NumBitmapBytes, NumBitmapBytes) +INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterBitmapBytes, PaddingBytesAfterBitmapBytes) INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin - (uintptr_t)DataBegin) +INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, + (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER @@ -267,6 +274,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \ INSTR_PROF_SECT_ENTRY(IPSK_cnts, \ INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \ INSTR_PROF_CNTS_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ + INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \ + INSTR_PROF_BITS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") @@ -646,11 +656,11 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* FIXME: Please remedy the fixme in the header before bumping the version. */ /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 8 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 10 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). */ -#define INSTR_PROF_COVMAP_VERSION 5 +#define INSTR_PROF_COVMAP_VERSION 6 /* Profile version is always of type uint64_t. Reserve the upper 8 bits in the * version for other variants of profile. 
We set the lowest bit of the upper 8 @@ -687,6 +697,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts +#define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds #define INSTR_PROF_COVMAP_COMMON __llvm_covmap @@ -698,6 +709,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" +#define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" @@ -709,6 +721,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF +#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. */ @@ -723,6 +736,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON) #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON) +#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON) /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. 
*/ diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 74e921e10c47b..51d86ee0bce32 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -323,11 +323,14 @@ class RawInstrProfReader : public InstrProfReader { // the variant types of the profile. uint64_t Version; uint64_t CountersDelta; + uint64_t BitmapDelta; uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; const char *CountersStart; const char *CountersEnd; + const char *BitmapStart; + const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; // After value profile is all read, this pointer points to @@ -429,6 +432,7 @@ class RawInstrProfReader : public InstrProfReader { Error readName(NamedInstrProfRecord &Record); Error readFuncHash(NamedInstrProfRecord &Record); Error readRawCounts(InstrProfRecord &Record); + Error readRawBitmapBytes(InstrProfRecord &Record); Error readValueProfilingData(InstrProfRecord &Record); bool atEnd() const { return Data == DataEnd; } @@ -441,6 +445,7 @@ class RawInstrProfReader : public InstrProfReader { // As we advance to the next record, we maintain the correct CountersDelta // with respect to the next record. CountersDelta -= sizeof(*Data); + BitmapDelta -= sizeof(*Data); } Data++; ValueDataStart += CurValueDataSize; @@ -732,6 +737,10 @@ class IndexedInstrProfReader : public InstrProfReader { Error getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts); + /// Fill Bitmap Bytes with the profile data for the given function name. + Error getFunctionBitmapBytes(StringRef FuncName, uint64_t FuncHash, + std::vector &BitmapBytes); + /// Return the maximum of all known function counts. /// \c UseCS indicates whether to use the context-sensitive count. 
uint64_t getMaximumFunctionCount(bool UseCS) { diff --git a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h index cb0c055dcb74a..d8f3e75087ace 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h +++ b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h @@ -50,6 +50,7 @@ class InstrProfiling : public PassInfoMixin { uint32_t NumValueSites[IPVK_Last + 1]; GlobalVariable *RegionCounters = nullptr; GlobalVariable *DataVar = nullptr; + GlobalVariable *RegionBitmaps = nullptr; PerFunctionProfileData() { memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last + 1)); @@ -105,20 +106,59 @@ class InstrProfiling : public PassInfoMixin { /// Force emitting of name vars for unused functions. void lowerCoverageData(GlobalVariable *CoverageNamesVar); + /// Replace instrprof.mcdc.tvbitmap.update with a shift and or instruction + /// using the index represented by a temp value into a bitmap. + void lowerMCDCTestVectorBitmapUpdate(InstrProfMCDCTVBitmapUpdate *Ins); + + /// Replace instrprof.mcdc.condbitmap.update with a shift and or instruction + /// using the corresponding condition ID. + void lowerMCDCCondBitmapUpdate(InstrProfMCDCCondBitmapUpdate *Ins); + /// Compute the address of the counter value that this profiling instruction /// acts on. - Value *getCounterAddress(InstrProfInstBase *I); + Value *getCounterAddress(InstrProfCntrInstBase *I); /// Get the region counters for an increment, creating them if necessary. /// /// If the counter array doesn't yet exist, the profile data variables /// referring to them will also be created. - GlobalVariable *getOrCreateRegionCounters(InstrProfInstBase *Inc); + GlobalVariable *getOrCreateRegionCounters(InstrProfCntrInstBase *Inc); /// Create the region counters. 
- GlobalVariable *createRegionCounters(InstrProfInstBase *Inc, StringRef Name, + GlobalVariable *createRegionCounters(InstrProfCntrInstBase *Inc, + StringRef Name, GlobalValue::LinkageTypes Linkage); + /// Compute the address of the test vector bitmap that this profiling + /// instruction acts on. + Value *getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I); + + /// Get the region bitmaps for an increment, creating them if necessary. + /// + /// If the bitmap array doesn't yet exist, the profile data variables + /// referring to them will also be created. + GlobalVariable *getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc); + + /// Create the MC/DC bitmap as a byte-aligned array of bytes associated with + /// an MC/DC Decision region. The number of bytes required is indicated by + /// the intrinsic used (type InstrProfMCDCBitmapInstBase). This is called + /// as part of setupProfileSection() and is conceptually very similar to + /// what is done for profile data counters in createRegionCounters(). + GlobalVariable *createRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc, + StringRef Name, + GlobalValue::LinkageTypes Linkage); + + /// Set Comdat property of GV, if required. + void maybeSetComdat(GlobalVariable *GV, Function *Fn, StringRef VarName); + + /// Setup the sections into which counters and bitmaps are allocated. + GlobalVariable *setupProfileSection(InstrProfInstBase *Inc, + InstrProfSectKind IPSK); + + /// Create INSTR_PROF_DATA variable for counters and bitmaps. + void createDataVariable(InstrProfCntrInstBase *Inc, + InstrProfMCDCBitmapParameters *Update); + /// Emit the section with compressed function names. 
void emitNameData(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f39b62abdd877..435e525966afe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7196,6 +7196,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, llvm_unreachable("instrprof failed to lower a timestamp"); case Intrinsic::instrprof_value_profile: llvm_unreachable("instrprof failed to lower a value profiling call"); + case Intrinsic::instrprof_mcdc_parameters: + llvm_unreachable("instrprof failed to lower mcdc parameters"); + case Intrinsic::instrprof_mcdc_tvbitmap_update: + llvm_unreachable("instrprof failed to lower an mcdc tvbitmap update"); + case Intrinsic::instrprof_mcdc_condbitmap_update: + llvm_unreachable("instrprof failed to lower an mcdc condbitmap update"); case Intrinsic::localescape: { MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index e4ddd57575355..20ae08dd12830 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -270,13 +270,13 @@ int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef NameTable, return -1; } -ConstantInt *InstrProfInstBase::getNumCounters() const { +ConstantInt *InstrProfCntrInstBase::getNumCounters() const { if (InstrProfValueProfileInst::classof(this)) llvm_unreachable("InstrProfValueProfileInst does not have counters!"); return cast(const_cast(getArgOperand(2))); } -ConstantInt *InstrProfInstBase::getIndex() const { +ConstantInt *InstrProfCntrInstBase::getIndex() const { if (InstrProfValueProfileInst::classof(this)) llvm_unreachable("Please use InstrProfValueProfileInst::getIndex()"); return cast(const_cast(getArgOperand(3))); diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp 
b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index b17caaf998073..5c888cb388d16 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -784,6 +784,7 @@ Expected> CovMapFuncRecordReader::get( case CovMapVersion::Version4: case CovMapVersion::Version5: case CovMapVersion::Version6: + case CovMapVersion::Version7: // Decompress the name data. if (Error E = P.create(P.getNameData())) return std::move(E); @@ -802,6 +803,9 @@ Expected> CovMapFuncRecordReader::get( else if (Version == CovMapVersion::Version6) return std::make_unique>(P, R, D, F); + else if (Version == CovMapVersion::Version7) + return std::make_unique>(P, R, D, F); } llvm_unreachable("Unsupported version"); } diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 835dd697bc7b6..7e2ee660978a4 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -136,6 +136,9 @@ static std::string getInstrProfErrString(instrprof_error Err, case instrprof_error::count_mismatch: OS << "function basic block count change detected (counter mismatch)"; break; + case instrprof_error::bitmap_mismatch: + OS << "function bitmap size change detected (bitmap size mismatch)"; + break; case instrprof_error::counter_overflow: OS << "counter overflow"; break; @@ -804,6 +807,18 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight, Warn(instrprof_error::counter_overflow); } + // If the number of bitmap bytes doesn't match we either have bad data + // or a hash collision. + if (BitmapBytes.size() != Other.BitmapBytes.size()) { + Warn(instrprof_error::bitmap_mismatch); + return; + } + + // Bitmap bytes are merged by simply ORing them together. 
+ for (size_t I = 0, E = Other.BitmapBytes.size(); I < E; ++I) { + BitmapBytes[I] = Other.BitmapBytes[I] | BitmapBytes[I]; + } + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) mergeValueProfData(Kind, Other, Weight, Warn); } @@ -1476,9 +1491,11 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version10, + IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); + case 11ull: + [[fallthrough]]; case 10ull: H.TemporalProfTracesOffset = read(Buffer, offsetOf(&Header::TemporalProfTracesOffset)); @@ -1502,10 +1519,12 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version10, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); + case 11ull: + [[fallthrough]]; case 10ull: return offsetOf(&Header::TemporalProfTracesOffset) + sizeof(Header::TemporalProfTracesOffset); diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp index 5ccfbd2ed9749..76203f28759cd 100644 --- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -212,11 +212,15 @@ void InstrProfCorrelatorImpl::addProbe(StringRef FunctionName, // In this mode, CounterPtr actually stores the section relative address // of the counter. maybeSwap(CounterOffset), + // TODO: MC/DC is not yet supported. + /*BitmapOffset=*/maybeSwap(0), maybeSwap(FunctionPtr), // TODO: Value profiling is not yet supported. 
/*ValuesPtr=*/maybeSwap(0), maybeSwap(NumCounters), /*NumValueSites=*/{maybeSwap(0), maybeSwap(0)}, + // TODO: MC/DC is not yet supported. + /*NumBitmapBytes=*/maybeSwap(0), }); NamesVec.push_back(FunctionName.str()); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index db20441b712cd..e4ef1b52d4156 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -433,6 +433,29 @@ Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) { Record.Counts.push_back(Count); } + // Bitmap byte information is indicated with special character. + if (Line->startswith("$")) { + Record.BitmapBytes.clear(); + // Read the number of bitmap bytes. + uint64_t NumBitmapBytes; + if ((Line++)->drop_front(1).trim().getAsInteger(0, NumBitmapBytes)) + return error(instrprof_error::malformed, + "number of bitmap bytes is not a valid integer"); + if (NumBitmapBytes != 0) { + // Read each bitmap and fill our internal storage with the values. + Record.BitmapBytes.reserve(NumBitmapBytes); + for (uint8_t I = 0; I < NumBitmapBytes; ++I) { + if (Line.is_at_end()) + return error(instrprof_error::truncated); + uint8_t BitmapByte; + if ((Line++)->getAsInteger(0, BitmapByte)) + return error(instrprof_error::malformed, + "bitmap byte is not a valid integer"); + Record.BitmapBytes.push_back(BitmapByte); + } + } + } + // Check if value profile data exists and read it if so. 
if (Error E = readValueProfileData(Record)) return error(std::move(E)); @@ -549,11 +572,14 @@ Error RawInstrProfReader::readHeader( return error(instrprof_error::bad_header); CountersDelta = swap(Header.CountersDelta); + BitmapDelta = swap(Header.BitmapDelta); NamesDelta = swap(Header.NamesDelta); auto NumData = swap(Header.NumData); auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters); auto CountersSize = swap(Header.NumCounters) * getCounterTypeSize(); auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters); + auto NumBitmapBytes = swap(Header.NumBitmapBytes); + auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); ValueKindLast = swap(Header.ValueKindLast); @@ -563,8 +589,10 @@ Error RawInstrProfReader::readHeader( // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdsSize; ptrdiff_t CountersOffset = DataOffset + DataSize + PaddingBytesBeforeCounters; - ptrdiff_t NamesOffset = + ptrdiff_t BitmapOffset = CountersOffset + CountersSize + PaddingBytesAfterCounters; + ptrdiff_t NamesOffset = + BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; auto *Start = reinterpret_cast(&Header); @@ -593,6 +621,8 @@ Error RawInstrProfReader::readHeader( reinterpret_cast(&Header) + sizeof(RawInstrProf::Header); CountersStart = Start + CountersOffset; CountersEnd = CountersStart + CountersSize; + BitmapStart = Start + BitmapOffset; + BitmapEnd = BitmapStart + NumBitmapBytes; ValueDataStart = reinterpret_cast(Start + ValueDataOffset); const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd(); @@ -683,6 +713,49 @@ Error RawInstrProfReader::readRawCounts( return success(); } +template +Error RawInstrProfReader::readRawBitmapBytes(InstrProfRecord &Record) { + uint32_t NumBitmapBytes = swap(Data->NumBitmapBytes); + + 
Record.BitmapBytes.clear(); + Record.BitmapBytes.reserve(NumBitmapBytes); + + // It's possible MCDC is either not enabled or only used for some functions + // and not others. So if we record 0 bytes, just move on. + if (NumBitmapBytes == 0) + return success(); + + // BitmapDelta decreases as we advance to the next data record. + ptrdiff_t BitmapOffset = swap(Data->BitmapPtr) - BitmapDelta; + if (BitmapOffset < 0) + return error( + instrprof_error::malformed, + ("bitmap offset " + Twine(BitmapOffset) + " is negative").str()); + + if (BitmapOffset >= BitmapEnd - BitmapStart) + return error(instrprof_error::malformed, + ("bitmap offset " + Twine(BitmapOffset) + + " is greater than the maximum bitmap offset " + + Twine(BitmapEnd - BitmapStart - 1)) + .str()); + + uint64_t MaxNumBitmapBytes = + (BitmapEnd - (BitmapStart + BitmapOffset)) / sizeof(uint8_t); + if (NumBitmapBytes > MaxNumBitmapBytes) + return error(instrprof_error::malformed, + ("number of bitmap bytes " + Twine(NumBitmapBytes) + + " is greater than the maximum number of bitmap bytes " + + Twine(MaxNumBitmapBytes)) + .str()); + + for (uint32_t I = 0; I < NumBitmapBytes; I++) { + const char *Ptr = BitmapStart + BitmapOffset + I; + Record.BitmapBytes.push_back(swap(*Ptr)); + } + + return success(); +} + template Error RawInstrProfReader::readValueProfilingData( InstrProfRecord &Record) { @@ -733,6 +806,10 @@ Error RawInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) if (Error E = readRawCounts(Record)) return error(std::move(E)); + // Read raw bitmap bytes and set Record. + if (Error E = readRawBitmapBytes(Record)) + return error(std::move(E)); + // Read value data and set Record. 
if (Error E = readValueProfilingData(Record)) return error(std::move(E)); @@ -794,6 +871,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, DataBuffer.clear(); std::vector CounterBuffer; + std::vector BitmapByteBuffer; const unsigned char *End = D + N; while (D < End) { @@ -819,7 +897,24 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, for (uint64_t J = 0; J < CountsSize; ++J) CounterBuffer.push_back(endian::readNext(D)); - DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer)); + // Read bitmap bytes for GET_VERSION(FormatVersion) > 8. + if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version8) { + uint64_t BitmapBytes = 0; + if (D + sizeof(uint64_t) > End) + return data_type(); + BitmapBytes = endian::readNext(D); + // Read bitmap byte values. + if (D + BitmapBytes * sizeof(uint8_t) > End) + return data_type(); + BitmapByteBuffer.clear(); + BitmapByteBuffer.reserve(BitmapBytes); + for (uint64_t J = 0; J < BitmapBytes; ++J) + BitmapByteBuffer.push_back(static_cast( + endian::readNext(D))); + } + + DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer), + std::move(BitmapByteBuffer)); // Read value profiling data. 
if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version2 && @@ -1319,6 +1414,16 @@ Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, return success(); } +Error IndexedInstrProfReader::getFunctionBitmapBytes( + StringRef FuncName, uint64_t FuncHash, std::vector &BitmapBytes) { + Expected Record = getInstrProfRecord(FuncName, FuncHash); + if (Error E = Record.takeError()) + return error(std::move(E)); + + BitmapBytes = Record.get().BitmapBytes; + return success(); +} + Error IndexedInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) { ArrayRef Data; diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index b74d5c3862d80..d8b8571bd5c95 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -131,6 +131,8 @@ class InstrProfRecordWriterTrait { M += sizeof(uint64_t); // The function hash M += sizeof(uint64_t); // The size of the Counts vector M += ProfRecord.Counts.size() * sizeof(uint64_t); + M += sizeof(uint64_t); // The size of the Bitmap vector + M += ProfRecord.BitmapBytes.size() * sizeof(uint64_t); // Value data M += ValueProfData::getSize(ProfileData.second); @@ -160,6 +162,10 @@ class InstrProfRecordWriterTrait { for (uint64_t I : ProfRecord.Counts) LE.write(I); + LE.write(ProfRecord.BitmapBytes.size()); + for (uint64_t I : ProfRecord.BitmapBytes) + LE.write(I); + // Write value data std::unique_ptr VDataPtr = ValueProfData::serializeFrom(ProfileData.second); @@ -380,6 +386,8 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) { const InstrProfRecord &IPR = Func.second; if (llvm::any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; })) return true; + if (llvm::any_of(IPR.BitmapBytes, [](uint8_t Byte) { return Byte > 0; })) + return true; } return false; } @@ -703,6 +711,17 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, for (uint64_t Count : Func.Counts) OS << Count << "\n"; + if 
(Func.BitmapBytes.size() > 0) { + OS << "# Num Bitmap Bytes:\n$" << Func.BitmapBytes.size() << "\n"; + OS << "# Bitmap Byte Values:\n"; + for (uint8_t Byte : Func.BitmapBytes) { + OS << "0x"; + OS.write_hex(Byte); + OS << "\n"; + } + OS << "\n"; + } + uint32_t NumValueKinds = Func.getNumValueKinds(); if (!NumValueKinds) { OS << "\n"; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index c90692980d86a..f81a52edead31 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -430,6 +430,15 @@ bool InstrProfiling::lowerIntrinsics(Function *F) { } else if (auto *IPVP = dyn_cast(&Instr)) { lowerValueProfileInst(IPVP); MadeChange = true; + } else if (auto *IPMP = dyn_cast(&Instr)) { + IPMP->eraseFromParent(); + MadeChange = true; + } else if (auto *IPBU = dyn_cast(&Instr)) { + lowerMCDCTestVectorBitmapUpdate(IPBU); + MadeChange = true; + } else if (auto *IPTU = dyn_cast(&Instr)) { + lowerMCDCCondBitmapUpdate(IPTU); + MadeChange = true; } } } @@ -544,19 +553,33 @@ bool InstrProfiling::run( // the instrumented function. This is counting the number of instrumented // target value sites to enter it as field in the profile data variable. 
for (Function &F : M) { - InstrProfInstBase *FirstProfInst = nullptr; - for (BasicBlock &BB : F) - for (auto I = BB.begin(), E = BB.end(); I != E; I++) + InstrProfCntrInstBase *FirstProfInst = nullptr; + InstrProfMCDCBitmapParameters *FirstProfMCDCParams = nullptr; + for (BasicBlock &BB : F) { + for (auto I = BB.begin(), E = BB.end(); I != E; I++) { if (auto *Ind = dyn_cast(I)) computeNumValueSiteCounts(Ind); - else if (FirstProfInst == nullptr && - (isa(I) || isa(I))) - FirstProfInst = dyn_cast(I); + else { + if (FirstProfInst == nullptr && + (isa(I) || isa(I))) + FirstProfInst = dyn_cast(I); + if (FirstProfMCDCParams == nullptr) + FirstProfMCDCParams = dyn_cast(I); + } + } + } - // Value profiling intrinsic lowering requires per-function profile data - // variable to be created first. - if (FirstProfInst != nullptr) + // If the MCDCBitmapParameters intrinsic was seen, create the bitmaps. + if (FirstProfMCDCParams != nullptr) { + static_cast(getOrCreateRegionBitmaps(FirstProfMCDCParams)); + } + + // Use a profile intrinsic to create the region counters and data variable. + // Also create the data variable based on the MCDCParams. 
+ if (FirstProfInst != nullptr) { static_cast(getOrCreateRegionCounters(FirstProfInst)); + createDataVariable(FirstProfInst, FirstProfMCDCParams); + } } for (Function &F : M) @@ -670,7 +693,7 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { Ind->eraseFromParent(); } -Value *InstrProfiling::getCounterAddress(InstrProfInstBase *I) { +Value *InstrProfiling::getCounterAddress(InstrProfCntrInstBase *I) { auto *Counters = getOrCreateRegionCounters(I); IRBuilder<> Builder(I); @@ -710,6 +733,25 @@ Value *InstrProfiling::getCounterAddress(InstrProfInstBase *I) { return Builder.CreateIntToPtr(Add, Addr->getType()); } +Value *InstrProfiling::getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I) { + auto *Bitmaps = getOrCreateRegionBitmaps(I); + IRBuilder<> Builder(I); + + auto *Addr = Builder.CreateConstInBoundsGEP2_32( + Bitmaps->getValueType(), Bitmaps, 0, I->getBitmapIndex()->getZExtValue()); + + if (isRuntimeCounterRelocationEnabled()) { + LLVMContext &Ctx = M->getContext(); + Ctx.diagnose(DiagnosticInfoPGOProfile( + M->getName().data(), + Twine("Runtime counter relocation is presently not supported for MC/DC " + "bitmaps."), + DS_Warning)); + } + + return Addr; +} + void InstrProfiling::lowerCover(InstrProfCoverInst *CoverInstruction) { auto *Addr = getCounterAddress(CoverInstruction); IRBuilder<> Builder(CoverInstruction); @@ -797,6 +839,86 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { CoverageNamesVar->eraseFromParent(); } +void InstrProfiling::lowerMCDCTestVectorBitmapUpdate( + InstrProfMCDCTVBitmapUpdate *Update) { + IRBuilder<> Builder(Update); + auto *Int8Ty = Type::getInt8Ty(M->getContext()); + auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); + auto *Int32Ty = Type::getInt32Ty(M->getContext()); + auto *Int64Ty = Type::getInt64Ty(M->getContext()); + auto *MCDCCondBitmapAddr = Update->getMCDCCondBitmapAddr(); + auto *BitmapAddr = getBitmapAddress(Update); + + // Load Temp Val. 
+ // %mcdc.temp = load i32, ptr %mcdc.addr, align 4 + auto *Temp = Builder.CreateLoad(Int32Ty, MCDCCondBitmapAddr, "mcdc.temp"); + + // Calculate byte offset using div8. + // %1 = lshr i32 %mcdc.temp, 3 + auto *BitmapByteOffset = Builder.CreateLShr(Temp, 0x3); + + // Add byte offset to section base byte address. + // %2 = zext i32 %1 to i64 + // %3 = add i64 ptrtoint (ptr @__profbm_test to i64), %2 + auto *BitmapByteAddr = + Builder.CreateAdd(Builder.CreatePtrToInt(BitmapAddr, Int64Ty), + Builder.CreateZExtOrBitCast(BitmapByteOffset, Int64Ty)); + + // Convert to a pointer. + // %4 = inttoptr i32 %3 to ptr + BitmapByteAddr = Builder.CreateIntToPtr(BitmapByteAddr, Int8PtrTy); + + // Calculate bit offset into bitmap byte by using div8 remainder (AND ~8) + // %5 = and i32 %mcdc.temp, 7 + // %6 = trunc i32 %5 to i8 + auto *BitToSet = Builder.CreateTrunc(Builder.CreateAnd(Temp, 0x7), Int8Ty); + + // Shift bit offset left to form a bitmap. + // %7 = shl i8 1, %6 + auto *ShiftedVal = Builder.CreateShl(Builder.getInt8(0x1), BitToSet); + + // Load profile bitmap byte. + // %mcdc.bits = load i8, ptr %4, align 1 + auto *Bitmap = Builder.CreateLoad(Int8Ty, BitmapByteAddr, "mcdc.bits"); + + // Perform logical OR of profile bitmap byte and shifted bit offset. + // %8 = or i8 %mcdc.bits, %7 + auto *Result = Builder.CreateOr(Bitmap, ShiftedVal); + + // Store the updated profile bitmap byte. + // store i8 %8, ptr %3, align 1 + Builder.CreateStore(Result, BitmapByteAddr); + Update->eraseFromParent(); +} + +void InstrProfiling::lowerMCDCCondBitmapUpdate( + InstrProfMCDCCondBitmapUpdate *Update) { + IRBuilder<> Builder(Update); + auto *Int32Ty = Type::getInt32Ty(M->getContext()); + auto *MCDCCondBitmapAddr = Update->getMCDCCondBitmapAddr(); + + // Load the MCDC temporary value from the stack. 
+ // %mcdc.temp = load i32, ptr %mcdc.addr, align 4 + auto *Temp = Builder.CreateLoad(Int32Ty, MCDCCondBitmapAddr, "mcdc.temp"); + + // Zero-extend the evaluated condition boolean value (0 or 1) by 32bits. + // %1 = zext i1 %tobool to i32 + auto *CondV_32 = Builder.CreateZExt(Update->getCondBool(), Int32Ty); + + // Shift the boolean value left (by the condition's ID) to form a bitmap. + // %2 = shl i32 %1, getCondID()> + auto *ShiftedVal = Builder.CreateShl(CondV_32, Update->getCondID()); + + // Perform logical OR of the bitmap against the loaded MCDC temporary value. + // %3 = or i32 %mcdc.temp, %2 + auto *Result = Builder.CreateOr(Temp, ShiftedVal); + + // Store the updated temporary value back to the stack. + // store i32 %3, ptr %mcdc.addr, align 4 + Builder.CreateStore(Result, MCDCCondBitmapAddr); + Update->eraseFromParent(); +} + /// Get the name of a profiling variable for a particular function. static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix, bool &Renamed) { @@ -952,37 +1074,31 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { return true; } -GlobalVariable * -InstrProfiling::createRegionCounters(InstrProfInstBase *Inc, StringRef Name, - GlobalValue::LinkageTypes Linkage) { - uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); - auto &Ctx = M->getContext(); - GlobalVariable *GV; - if (isa(Inc)) { - auto *CounterTy = Type::getInt8Ty(Ctx); - auto *CounterArrTy = ArrayType::get(CounterTy, NumCounters); - // TODO: `Constant::getAllOnesValue()` does not yet accept an array type. 
- std::vector InitialValues(NumCounters, - Constant::getAllOnesValue(CounterTy)); - GV = new GlobalVariable(*M, CounterArrTy, false, Linkage, - ConstantArray::get(CounterArrTy, InitialValues), - Name); - GV->setAlignment(Align(1)); - } else { - auto *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); - GV = new GlobalVariable(*M, CounterTy, false, Linkage, - Constant::getNullValue(CounterTy), Name); - GV->setAlignment(Align(8)); - } - return GV; +void InstrProfiling::maybeSetComdat(GlobalVariable *GV, Function *Fn, + StringRef VarName) { + bool DataReferencedByCode = profDataReferencedByCode(*M); + bool NeedComdat = needsComdatForCounter(*Fn, *M); + bool UseComdat = (NeedComdat || TT.isOSBinFormatELF()); + + if (!UseComdat) + return; + + StringRef GroupName = + TT.isOSBinFormatCOFF() && DataReferencedByCode ? GV->getName() : VarName; + Comdat *C = M->getOrInsertComdat(GroupName); + if (!NeedComdat) + C->setSelectionKind(Comdat::NoDeduplicate); + GV->setComdat(C); + // COFF doesn't allow the comdat group leader to have private linkage, so + // upgrade private linkage to internal linkage to produce a symbol table + // entry. + if (TT.isOSBinFormatCOFF() && GV->hasPrivateLinkage()) + GV->setLinkage(GlobalValue::InternalLinkage); } -GlobalVariable * -InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { +GlobalVariable *InstrProfiling::setupProfileSection(InstrProfInstBase *Inc, + InstrProfSectKind IPSK) { GlobalVariable *NamePtr = Inc->getName(); - auto &PD = ProfileDataMap[NamePtr]; - if (PD.RegionCounters) - return PD.RegionCounters; // Match the linkage and visibility of the name global. Function *Fn = Inc->getParent()->getParent(); @@ -1021,42 +1137,100 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { // nodeduplicate COMDAT which is lowered to a zero-flag section group. This // allows -z start-stop-gc to discard the entire group when the function is // discarded. 
- bool DataReferencedByCode = profDataReferencedByCode(*M); - bool NeedComdat = needsComdatForCounter(*Fn, *M); bool Renamed; - std::string CntsVarName = - getVarName(Inc, getInstrProfCountersVarPrefix(), Renamed); - std::string DataVarName = - getVarName(Inc, getInstrProfDataVarPrefix(), Renamed); - auto MaybeSetComdat = [&](GlobalVariable *GV) { - bool UseComdat = (NeedComdat || TT.isOSBinFormatELF()); - if (UseComdat) { - StringRef GroupName = TT.isOSBinFormatCOFF() && DataReferencedByCode - ? GV->getName() - : CntsVarName; - Comdat *C = M->getOrInsertComdat(GroupName); - if (!NeedComdat) - C->setSelectionKind(Comdat::NoDeduplicate); - GV->setComdat(C); - // COFF doesn't allow the comdat group leader to have private linkage, so - // upgrade private linkage to internal linkage to produce a symbol table - // entry. - if (TT.isOSBinFormatCOFF() && GV->hasPrivateLinkage()) - GV->setLinkage(GlobalValue::InternalLinkage); - } - }; + GlobalVariable *Ptr; + StringRef VarPrefix; + std::string VarName; + if (IPSK == IPSK_cnts) { + VarPrefix = getInstrProfCountersVarPrefix(); + VarName = getVarName(Inc, VarPrefix, Renamed); + InstrProfCntrInstBase *CntrIncrement = dyn_cast(Inc); + Ptr = createRegionCounters(CntrIncrement, VarName, Linkage); + } else if (IPSK == IPSK_bitmap) { + VarPrefix = getInstrProfBitmapVarPrefix(); + VarName = getVarName(Inc, VarPrefix, Renamed); + InstrProfMCDCBitmapInstBase *BitmapUpdate = + dyn_cast(Inc); + Ptr = createRegionBitmaps(BitmapUpdate, VarName, Linkage); + } else { + llvm_unreachable("Profile Section must be for Counters or Bitmaps"); + } + Ptr->setVisibility(Visibility); + // Put the counters and bitmaps in their own sections so linkers can + // remove unneeded sections. 
+ Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat())); + Ptr->setLinkage(Linkage); + maybeSetComdat(Ptr, Fn, VarName); + return Ptr; +} + +GlobalVariable * +InstrProfiling::createRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc, + StringRef Name, + GlobalValue::LinkageTypes Linkage) { + uint64_t NumBytes = Inc->getNumBitmapBytes()->getZExtValue(); + auto *BitmapTy = ArrayType::get(Type::getInt8Ty(M->getContext()), NumBytes); + auto GV = new GlobalVariable(*M, BitmapTy, false, Linkage, + Constant::getNullValue(BitmapTy), Name); + GV->setAlignment(Align(1)); + return GV; +} + +GlobalVariable * +InstrProfiling::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) { + GlobalVariable *NamePtr = Inc->getName(); + auto &PD = ProfileDataMap[NamePtr]; + if (PD.RegionBitmaps) + return PD.RegionBitmaps; + + // If RegionBitmaps doesn't already exist, create it by first setting up + // the corresponding profile section. + auto *BitmapPtr = setupProfileSection(Inc, IPSK_bitmap); + PD.RegionBitmaps = BitmapPtr; + return PD.RegionBitmaps; +} + +GlobalVariable * +InstrProfiling::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name, + GlobalValue::LinkageTypes Linkage) { uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); - LLVMContext &Ctx = M->getContext(); + auto &Ctx = M->getContext(); + GlobalVariable *GV; + if (isa(Inc)) { + auto *CounterTy = Type::getInt8Ty(Ctx); + auto *CounterArrTy = ArrayType::get(CounterTy, NumCounters); + // TODO: `Constant::getAllOnesValue()` does not yet accept an array type. 
+ std::vector InitialValues(NumCounters, + Constant::getAllOnesValue(CounterTy)); + GV = new GlobalVariable(*M, CounterArrTy, false, Linkage, + ConstantArray::get(CounterArrTy, InitialValues), + Name); + GV->setAlignment(Align(1)); + } else { + auto *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); + GV = new GlobalVariable(*M, CounterTy, false, Linkage, + Constant::getNullValue(CounterTy), Name); + GV->setAlignment(Align(8)); + } + return GV; +} + +GlobalVariable * +InstrProfiling::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { + GlobalVariable *NamePtr = Inc->getName(); + auto &PD = ProfileDataMap[NamePtr]; + if (PD.RegionCounters) + return PD.RegionCounters; - auto *CounterPtr = createRegionCounters(Inc, CntsVarName, Linkage); - CounterPtr->setVisibility(Visibility); - CounterPtr->setSection( - getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat())); - CounterPtr->setLinkage(Linkage); - MaybeSetComdat(CounterPtr); + // If RegionCounters doesn't already exist, create it by first setting up + // the corresponding profile section. + auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts); PD.RegionCounters = CounterPtr; + if (DebugInfoCorrelate) { + LLVMContext &Ctx = M->getContext(); + Function *Fn = Inc->getParent()->getParent(); if (auto *SP = Fn->getSubprogram()) { DIBuilder DB(*M, true, SP->getUnit()); Metadata *FunctionNameAnnotation[] = { @@ -1085,8 +1259,50 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { CounterPtr->addDebugInfo(DICounter); DB.finalize(); } + + // Mark the counter variable as used so that it isn't optimized out. + CompilerUsedVars.push_back(PD.RegionCounters); + } + + return PD.RegionCounters; +} + +void InstrProfiling::createDataVariable(InstrProfCntrInstBase *Inc, + InstrProfMCDCBitmapParameters *Params) { + // When debug information is correlated to profile data, a data variable + // is not needed. 
+ if (DebugInfoCorrelate) + return; + + GlobalVariable *NamePtr = Inc->getName(); + auto &PD = ProfileDataMap[NamePtr]; + + LLVMContext &Ctx = M->getContext(); + + Function *Fn = Inc->getParent()->getParent(); + GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage(); + GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility(); + + // Due to the limitation of binder as of 2021/09/28, the duplicate weak + // symbols in the same csect won't be discarded. When there are duplicate weak + // symbols, we can NOT guarantee that the relocations get resolved to the + // intended weak symbol, so we can not ensure the correctness of the relative + // CounterPtr, so we have to use private linkage for counter and data symbols. + if (TT.isOSBinFormatXCOFF()) { + Linkage = GlobalValue::PrivateLinkage; + Visibility = GlobalValue::DefaultVisibility; } + bool DataReferencedByCode = profDataReferencedByCode(*M); + bool NeedComdat = needsComdatForCounter(*Fn, *M); + bool Renamed; + + // The Data Variable section is anchored to profile counters. + std::string CntsVarName = + getVarName(Inc, getInstrProfCountersVarPrefix(), Renamed); + std::string DataVarName = + getVarName(Inc, getInstrProfDataVarPrefix(), Renamed); + auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for // the current function. @@ -1104,16 +1320,17 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { ValuesVar->setSection( getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); ValuesVar->setAlignment(Align(8)); - MaybeSetComdat(ValuesVar); + maybeSetComdat(ValuesVar, Fn, CntsVarName); ValuesPtrExpr = ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } - if (DebugInfoCorrelate) { - // Mark the counter variable as used so that it isn't optimized out. 
- CompilerUsedVars.push_back(PD.RegionCounters); - return PD.RegionCounters; - } + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); + auto *CounterPtr = PD.RegionCounters; + + uint64_t NumBitmapBytes = 0; + if (Params != nullptr) + NumBitmapBytes = Params->getNumBitmapBytes()->getZExtValue(); // Create data variable. auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); @@ -1156,6 +1373,16 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy), ConstantExpr::getPtrToInt(Data, IntPtrTy)); + // Bitmaps are relative to the same data variable as profile counters. + GlobalVariable *BitmapPtr = PD.RegionBitmaps; + Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0); + + if (BitmapPtr != nullptr) { + RelativeBitmapPtr = + ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy), + ConstantExpr::getPtrToInt(Data, IntPtrTy)); + } + Constant *DataVals[] = { #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, #include "llvm/ProfileData/InstrProfData.inc" @@ -1165,7 +1392,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { Data->setVisibility(Visibility); Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); - MaybeSetComdat(Data); + maybeSetComdat(Data, Fn, CntsVarName); PD.DataVar = Data; @@ -1177,8 +1404,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { NamePtr->setLinkage(GlobalValue::PrivateLinkage); // Collect the referenced names to be used by emitNameData. 
ReferencedNames.push_back(NamePtr); - - return PD.RegionCounters; } void InstrProfiling::emitVNodes() { diff --git a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll new file mode 100644 index 0000000000000..fccb026c25bf2 --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll @@ -0,0 +1,53 @@ +; Check that MC/DC intrinsics are properly lowered +; RUN: opt < %s -passes=instrprof -S | FileCheck %s +; RUN: opt < %s -passes=instrprof -runtime-counter-relocation -S 2>&1 | FileCheck %s --check-prefix RELOC + +; RELOC: Runtime counter relocation is presently not supported for MC/DC bitmaps + +target triple = "x86_64-unknown-linux-gnu" + +@__profn_test = private constant [4 x i8] c"test" + +; CHECK: @__profbm_test = private global [1 x i8] zeroinitializer, section "__llvm_prf_bits", comdat, align 1 + +define dso_local void @test(i32 noundef %A) { +entry: + %A.addr = alloca i32, align 4 + %mcdc.addr = alloca i32, align 4 + call void @llvm.instrprof.cover(ptr @__profn_test, i64 99278, i32 5, i32 0) + ; CHECK: store i8 0, ptr @__profc_test, align 1 + + call void @llvm.instrprof.mcdc.parameters(ptr @__profn_test, i64 99278, i32 1) + store i32 0, ptr %mcdc.addr, align 4 + %0 = load i32, ptr %A.addr, align 4 + %tobool = icmp ne i32 %0, 0 + + call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 99278, i32 0, ptr %mcdc.addr, i1 %tobool) + ; CHECK: %mcdc.temp = load i32, ptr %mcdc.addr, align 4 + ; CHECK-NEXT: %1 = zext i1 %tobool to i32 + ; CHECK-NEXT: %2 = shl i32 %1, 0 + ; CHECK-NEXT: %3 = or i32 %mcdc.temp, %2 + ; CHECK-NEXT: store i32 %3, ptr %mcdc.addr, align 4 + + call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 99278, i32 1, i32 0, ptr %mcdc.addr) + ; CHECK: %mcdc.temp1 = load i32, ptr %mcdc.addr, align 4 + ; CHECK-NEXT: %4 = lshr i32 %mcdc.temp1, 3 + ; CHECK-NEXT: %5 = zext i32 %4 to i64 + ; CHECK-NEXT: %6 = add i64 ptrtoint (ptr @__profbm_test to i64), %5 + ; 
CHECK-NEXT: %7 = inttoptr i64 %6 to ptr + ; CHECK-NEXT: %8 = and i32 %mcdc.temp1, 7 + ; CHECK-NEXT: %9 = trunc i32 %8 to i8 + ; CHECK-NEXT: %10 = shl i8 1, %9 + ; CHECK-NEXT: %mcdc.bits = load i8, ptr %7, align 1 + ; CHECK-NEXT: %11 = or i8 %mcdc.bits, %10 + ; CHECK-NEXT: store i8 %11, ptr %7, align 1 + ret void +} + +declare void @llvm.instrprof.cover(ptr, i64, i32, i32) + +declare void @llvm.instrprof.mcdc.parameters(ptr, i64, i32) + +declare void @llvm.instrprof.mcdc.condbitmap.update(ptr, i64, i32, ptr, i1) + +declare void @llvm.instrprof.mcdc.tvbitmap.update(ptr, i64, i32, i32, ptr) diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 1c44a274f3c04..8c6942c0f527b 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, ptr, ptr, i32, [2 x i16] } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section 
"__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/basic.profraw b/llvm/test/tools/llvm-profdata/Inputs/basic.profraw index ad88759398c6020f4ab8a5606258e69d98e36687..1b284b84fad6dd7f9407b1c3b99cb178af0e09c6 100644 GIT binary patch delta 63 zcmbQicz}_!u_!ISs37M*=R{6@Muv%+@_Y__aks4+{{I)d(5P~>exjd}00R^}C!MPiq_)7#Jp|DQPSa2C)!;5z2(oEDX7cnRyHh E0CJEFxc~qF diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index bc8fc5db1cb154d98ca962e84313463e3298cb92..9cd225587c92511e99f3497ce1d5f47c6fc5f0af 100644 GIT binary patch delta 308 zcmeC+d%(}xSd^AuRFLzZb0Vky_8T_ z2Map_g$4d1%$=MM7A^z|M<5H&2MaF*3U?q0gIzKCB2aV#lIY~~%z~3Sm;_Lj)^mdV hG1&qro`I~m9;i42C}@Bz*w4f{If6-G^8%&}764*Dk$V6D delta 330 zcmaFB-@(V(Sd^AuRFLzZV$QVgXRX-d83sq200i+W)W}ZBd!q5+)yH6rcP6 zC=1puPjh&2EJ diff --git a/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw b/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw index 134b78f7af5b760dc3af7422c1bf7661f4bae14a..9966729d92ddc33bf89eeb3fee87215bbabbbef1 100644 GIT binary patch delta 326 zcmaFCyMbS@u_!ISs37M*Cj%5r6jYzQfSCh?Ct3(dSahA6xxL~4|GoghG>>`)21b~& z3=Oc72^^Cruy9PAA;2N==RX)sp2#ddaX%*`!^DH)91h5W??Hke#W_5X1SiWg0!1e) zGKzETKo<4~3kL#)1^yz;om>wVZUhQPAPcVt3vUDpcOVIaT`~C~P;>*5=;Zs%f|Dhf o1W=XsbAtRa*#jt^fvmV6sJH?sXn-s@pNVsF29v<%4NMj+0Qs7eE&u=k delta 364 zcmdnM|AJSru_!ISs37M*2Ll8MOcYd~=pmqxDy0AxV1$ap532 zg3W7Lmwt5a%Luw`8X;yQ=%O*PQGDV94nY>r<189Si!*Iwboso6f2t<{#V0E;atMY_ zsnBGduNo&k$1~O>`Qe2MkVFDdqEPdcjat*B{dO76%_kSkWcUG+SOAo;_m#;@Xm_l> z_CIV#TU6$@gvkdP#V0=i%0w>dTG2Ucam2*OHvOM-SQvJIWfYh=1iwzTW$_M(6^_4X z6sG@NZp{yn4A5N?3QrGb?)?~3dEpwwUk3mFgTdqpOcD~cFPdW;(Y diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index 67db5c98ef323..eda63203a304a 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test 
+++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -5,13 +5,15 @@ // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumBitmaskBytes, NumBitmaskBytes) // INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -23,8 +25,11 @@ RUN: printf '\2\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\3\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\20\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -51,14 +56,18 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\xd8\xff\3\0\1\0\0\0' >> %t.profraw +RUN: printf 
'\xc8\xff\3\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\023\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\067\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 2394431e94de4..38b838e0d100a 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -9,6 +9,9 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. 
RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index 06f418d0235d2..c967e850dbe35 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -5,20 +5,25 @@ // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumBitmaskBytes, NumBitmaskBytes) // INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -35,7 +40,9 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> 
%t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\023\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\3\0foo\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index b718cf0fd8e97..e1e33824bf2f8 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -5,20 +5,26 @@ // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumBitmaskBytes, NumBitmaskBytes) // INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -35,8 +41,10 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf 
'\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Make NumCounters = 0 so that we get "number of counters is zero" error message RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\023\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\3\0foo\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index 38e40334a6a69..3c23bc7dd0f7f 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -5,20 +5,25 @@ // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumBitmaskBytes, NumBitmaskBytes) // INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\2\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -38,10 +43,12 @@ 
RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw // Octal '\11' is 9 in decimal: this should push CounterOffset to 1. As there are two counters, // the profile reader should error out. RUN: printf '\11\0\6\0\1\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Counter Section diff --git a/llvm/test/tools/llvm-profdata/mcdc-bitmap.test b/llvm/test/tools/llvm-profdata/mcdc-bitmap.test new file mode 100644 index 0000000000000..a7b1b5df8c306 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/mcdc-bitmap.test @@ -0,0 +1,201 @@ +# Test MC/DC bitmap reading and merging. + +# Merge as profdata. +RUN: split-file %s %t +RUN: llvm-profdata merge %t/mcdc-1.proftext %t/mcdc-2.proftext -o %t.profdata +RUN: llvm-profdata show %t.profdata --text -all-functions | FileCheck %s --check-prefix=MCDC +# Merge as proftext. +RUN: llvm-profdata merge %t/mcdc-1.proftext %t/mcdc-2.proftext -o %t.proftext +RUN: llvm-profdata show %t.proftext --text -all-functions | FileCheck %s --check-prefix=MCDC + +MCDC: # Num Bitmap Bytes: +MCDC-NEXT: $1 +MCDC-NEXT: # Bitmap Byte Values: +MCDC-NEXT: a +MCDC: # Num Bitmap Bytes: +MCDC-NEXT: $2 +MCDC-NEXT: # Bitmap Byte Values: +MCDC-NEXT: 0x29 +MCDC-NEXT: 0x0 + +# Merge as profdata. +RUN: llvm-profdata merge %t/mcdc-3.proftext %t/mcdc-4.proftext -o %t.profdata +RUN: llvm-profdata show %t.profdata --text -all-functions | FileCheck %s --check-prefix=MCDC2 +# Merge as proftext. 
+RUN: llvm-profdata merge %t/mcdc-3.proftext %t/mcdc-4.proftext -o %t.proftext +RUN: llvm-profdata show %t.proftext --text -all-functions | FileCheck %s --check-prefix=MCDC2 + +MCDC2: # Num Bitmap Bytes: +MCDC2-NEXT: $8 +MCDC2-NEXT: # Bitmap Byte Values: +MCDC2-NEXT: 0x1 +MCDC2-NEXT: 0x2 +MCDC2-NEXT: 0x3 +MCDC2-NEXT: 0xf +MCDC2-NEXT: 0xf +MCDC2-NEXT: 0xe +MCDC2-NEXT: 0xf +MCDC2-NEXT: 0xa + +# Incompatible size mismatch. +RUN: llvm-profdata merge %t/mcdc-2.proftext %t/mcdc-4.proftext -o %t.profdata 2>&1 | FileCheck %s --check-prefix=MCDC3 +# Merge as proftext +RUN: llvm-profdata merge %t/mcdc-2.proftext %t/mcdc-4.proftext -o %t.proftext 2>&1 | FileCheck %s --check-prefix=MCDC3 + +MCDC3: function bitmap size change detected (bitmap size mismatch) + +# Invalid number of bitmap bytes. +RUN: not llvm-profdata merge %t/mcdc-3.proftext %t/mcdc-err0.proftext -o %t.proftext 2>&1 | FileCheck %s --check-prefix=MCDC4 + +MCDC4: malformed instrumentation profile data: number of bitmap bytes is not a valid integer + +# Invalid bitmap byte. 
+RUN: not llvm-profdata merge %t/mcdc-3.proftext %t/mcdc-err1.proftext -o %t.proftext 2>&1 | FileCheck %s --check-prefix=MCDC5 + +MCDC5: malformed instrumentation profile data: bitmap byte is not a valid integer + +;--- mcdc-1.proftext +main +# Func Hash: +702755447896 +# Num Counters: +4 +# Counter Values: +1 +0 +1 +0 +# Num Bitmask Bytes: +$1 +# Bitmask Byte Values: +2 +;--- mcdc-2.proftext +main +# Func Hash: +702755447896 +# Num Counters: +4 +# Counter Values: +1 +1 +1 +1 +# Num Bitmask Bytes: +$1 +# Bitmask Byte Values: +8 + + +test3 +# Func Hash: +15288018065 +# Num Counters: +6 +# Counter Values: +4 +2 +1 +0 +0 +2 +# Num Bitmask Bytes: +$0x2 +# Bitmask Byte Values: +0x29 +0x0 +;--- mcdc-3.proftext +test3 +# Func Hash: +15288018065 +# Num Counters: +6 +# Counter Values: +4 +2 +1 +0 +0 +2 +# Num Bitmask Bytes: +$8 +# Bitmask Byte Values: +0x0 +0x2 +0x3 +0xf +0xf +0xa +0xc +0x2 +;--- mcdc-4.proftext +test3 +# Func Hash: +15288018065 +# Num Counters: +6 +# Counter Values: +4 +2 +1 +0 +0 +2 +# Num Bitmask Bytes: +$ 8 +# Bitmask Byte Values: +1 +2 +3 +4 +5 +6 +7 +8 +;--- mcdc-err0.proftext +test3 +# Func Hash: +15288018065 +# Num Counters: +6 +# Counter Values: +4 +2 +1 +0 +0 +2 +# Num Bitmask Bytes: +$8.9 +# Bitmask Byte Values: +1 +2 +3 +4 +5 +6 +7 +8 +;--- mcdc-err1.proftext +test3 +# Func Hash: +15288018065 +# Num Counters: +6 +# Counter Values: +4 +2 +1 +0 +0 +2 +# Num Bitmask Bytes: +$8 +# Bitmask Byte Values: +1 +2 +3 +4 +5.4 +6 +7 +8 diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index 171b5cc60878f..4a5c42843ff4d 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a 
multiple of 8 bytes. RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index 24f3f563e9689..389646d64b1cd 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -8,6 +8,9 @@ RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index c8e862009ef02..fbd31d044382a 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,37 +1,46 @@ RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\10' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\4' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\0\1\0\0\0' >> %t +RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t RUN: printf '\1\0\0\0' >> %t +RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\3' >> %t +RUN: 
printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t -RUN: printf '\0\xff\xff\xe0' >> %t +RUN: printf '\0\xff\xff\xd8' >> %t +RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t RUN: printf '\0\0\0\0\0\0\0\101' >> %t +RUN: printf '\125\125\125\052' >> %t RUN: printf '\7\0foo\1bar\0\0\0\0\0\0\0' >> %t RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s +RUN: llvm-profdata show %t -all-functions -text | FileCheck %s -check-prefix=MCDC CHECK: Counters: CHECK: foo: @@ -48,3 +57,14 @@ CHECK: Functions shown: 2 CHECK: Total functions: 2 CHECK: Maximum function count: 55 CHECK: Maximum internal block count: 65 + +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $3 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $1 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 0x2a diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index 523ff1ceb4807..bb899c5fdb555 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,37 +1,46 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\10\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\4\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\20\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1\0\0\0\0' >> %t +RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: 
printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t +RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t -RUN: printf '\xe0\xff\xff\0' >> %t +RUN: printf '\xd8\xff\xff\0' >> %t +RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\2\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t RUN: printf '\101\0\0\0\0\0\0\0' >> %t +RUN: printf '\125\125\125\052' >> %t RUN: printf '\7\0foo\1bar\0\0\0\0\0\0\0' >> %t RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s +RUN: llvm-profdata show %t -all-functions -text | FileCheck %s -check-prefix=MCDC CHECK: Counters: CHECK: foo: @@ -48,3 +57,14 @@ CHECK: Functions shown: 2 CHECK: Total functions: 2 CHECK: Maximum function count: 55 CHECK: Maximum internal block count: 65 + +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $3 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $1 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 0x2a diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index b2b8b31dafbf5..8fcadb6a0dd28 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,35 +1,44 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\10' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf 
'\0\0\0\0\0\0\0\3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\4' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t +RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t +RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1\0\0\0\0' >> %t +RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t -RUN: printf '\0\0\0\1\0\3\xff\xd8' >> %t +RUN: printf '\0\0\0\1\0\3\xff\xc8' >> %t +RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02\0\0\0\0' >> %t +RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t RUN: printf '\0\0\0\0\0\0\0\101' >> %t +RUN: printf '\125\125\125\052' >> %t RUN: printf '\7\0foo\1bar\0\0\0\0\0\0\0' >> %t RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s +RUN: llvm-profdata show %t -all-functions -text | FileCheck %s -check-prefix=MCDC CHECK: Counters: CHECK: foo: @@ -46,3 +55,14 @@ CHECK: Functions shown: 2 CHECK: Total functions: 2 CHECK: Maximum function count: 55 CHECK: Maximum internal block count: 65 + +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $3 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $1 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 0x2a diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index 4e95798bc0afb..0aa8b38f69267 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ 
b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,35 +1,44 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\10\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\4\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\20\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\4\0\1\0\0\0' >> %t +RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\4\0\1\0\0\0' >> %t +RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t +RUN: printf '\3\0\0\0\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t -RUN: printf '\xd8\xff\3\0\1\0\0\0' >> %t +RUN: printf '\xc8\xff\3\0\1\0\0\0' >> %t +RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t +RUN: printf '\1\0\0\0\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t RUN: printf '\101\0\0\0\0\0\0\0' >> %t +RUN: printf '\125\125\125\052' >> %t RUN: printf '\7\0foo\1bar\0\0\0\0\0\0\0' >> %t RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s +RUN: llvm-profdata show %t -all-functions -text | FileCheck %s -check-prefix=MCDC CHECK: Counters: CHECK: foo: @@ -46,3 +55,14 @@ CHECK: Functions shown: 2 CHECK: Total functions: 2 CHECK: Maximum function count: 55 CHECK: Maximum internal block count: 65 + +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $3 +MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC-NEXT: 55 +MCDC: Num Bitmap Bytes: +MCDC-NEXT: $1 
+MCDC-NEXT: Bitmap Byte Values: +MCDC-NEXT: 0x2a diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index 8d46c91e2732c..f4a9aa8e1bbc3 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,12 +1,15 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\10\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -15,20 +18,25 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\10\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\2\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: 
printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\10\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -37,7 +45,9 @@ RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\101\0\0\0\0\0\0\0' >> %t-bar.profraw From 60a227c464a19a00a76d19b5bc75e0e4d5c89873 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 1 Aug 2023 17:51:59 -0700 Subject: [PATCH 12/57] [AMDGPU] Use inreg for hint to preload kernel arguments This patch is the first in a series that adds support for pre-loading kernel arguments into SGPRs. The command-line argument 'amdgpu-kernarg-preload-count' is used to specify the number of arguments sequentially from the first that we should attempt to preload, the default is 0. 
Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D156852 --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 24 +- .../AMDGPU/preload-kernargs-inreg-hints.ll | 263 ++++++++++++++++++ 2 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 57c873f00a4a1..d7dc37066b1fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -28,6 +28,10 @@ void initializeCycleInfoWrapperPassPass(PassRegistry &); using namespace llvm; +static cl::opt KernargPreloadCount( + "amdgpu-kernarg-preload-count", + cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); + #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS, enum ImplicitArgumentPositions { @@ -914,6 +918,21 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } +static void addPreloadKernArgHint(Function &F, TargetMachine &TM) { + const GCNSubtarget &ST = TM.getSubtarget(F); + for (unsigned I = 0; + I < F.arg_size() && + I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs()); + ++I) { + Argument &Arg = *F.getArg(I); + // Check for incompatible attributes. 
+ if (Arg.hasByRefAttr() || Arg.hasNestAttr()) + break; + + Arg.addAttr(Attribute::InReg); + } +} + class AMDGPUAttributor : public ModulePass { public: AMDGPUAttributor() : ModulePass(ID) {} @@ -960,9 +979,12 @@ class AMDGPUAttributor : public ModulePass { if (!F.isIntrinsic()) { A.getOrCreateAAFor(IRPosition::function(F)); A.getOrCreateAAFor(IRPosition::function(F)); - if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) { + CallingConv::ID CC = F.getCallingConv(); + if (!AMDGPU::isEntryFunctionCC(CC)) { A.getOrCreateAAFor(IRPosition::function(F)); A.getOrCreateAAFor(IRPosition::function(F)); + } else if (CC == CallingConv::AMDGPU_KERNEL) { + addPreloadKernArgHint(F, *TM); } } } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll new file mode 100644 index 0000000000000..1238fa8c49a92 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s + +define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 +; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; 
NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 +; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 +; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 +; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 +; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 +; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 +; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 +; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 +; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 +; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 +; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_4 +; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 +; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 +; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 +; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 +; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 +; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 
[[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 +; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 +; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 +; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 +; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define 
{{[^@]+}}@test_preload_hint_non_kernel_2 +; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 +; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 +; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 +; PRELOAD-20-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func +; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]]) +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func +; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]]) +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func +; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]]) +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func +; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]]) +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func +; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]]) +; PRELOAD-20-NEXT: ret void +; + call void @func(ptr %0) + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { +; 
NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic +; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR2]] { +; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic +; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic +; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic +; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic +; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) +; PRELOAD-20-NEXT: ret void +; + call void @llvm.amdgcn.set.prio(i16 %0) + ret void +} + +define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc +; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc +; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc +; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc +; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc +; 
PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting +; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting +; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting +; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting +; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting +; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes +; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes +; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes +; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-16-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_incompatible_attributes +; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-16-NEXT: ret void +; +; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes +; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-20-NEXT: ret void +; + ret void +} + +declare void @func(ptr) #0 +declare void @llvm.amdgcn.set.prio(i16) + +attributes #0 = { nounwind } From 2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf Mon Sep 17 00:00:00 2001 From: DianQK Date: Wed, 20 Sep 2023 06:14:45 +0800 Subject: [PATCH 13/57] [EarlyCSE] Compare GEP instructions based on offset (#65875) Closes #65763. This will provide more opportunities for constant propagation for subsequent optimizations. --- llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 172 ++++++++++++++---- llvm/test/Transforms/EarlyCSE/gep.ll | 44 +++++ .../PhaseOrdering/X86/unroll-vectorizer.ll | 44 +++++ 3 files changed, 229 insertions(+), 31 deletions(-) create mode 100644 llvm/test/Transforms/EarlyCSE/gep.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 439235f47471e..4990fa9f8b5ea 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -67,6 +67,7 @@ STATISTIC(NumCSE, "Number of instructions CSE'd"); STATISTIC(NumCSECVP, "Number of compare instructions CVP'd"); STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); +STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); DEBUG_COUNTER(CSECounter, "early-cse", @@ -143,11 +144,11 @@ struct SimpleValue { !CI->getFunction()->isPresplitCoroutine(); } return isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - 
isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst); + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst); } }; @@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) { IVI->getOperand(1), hash_combine_range(IVI->idx_begin(), IVI->idx_end())); - assert((isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst)) && + assert((isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst)) && "Invalid/unknown instruction"); // Handle intrinsics with commutative operands. @@ -548,11 +548,81 @@ bool DenseMapInfo::isEqual(CallValue LHS, CallValue RHS) { // currently executing, so conservatively return false if they are in // different basic blocks. if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent()) - return false; + return false; return LHSI->isIdenticalTo(RHSI); } +//===----------------------------------------------------------------------===// +// GEPValue +//===----------------------------------------------------------------------===// + +namespace { + +struct GEPValue { + Instruction *Inst; + std::optional ConstantOffset; + + GEPValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + GEPValue(Instruction *I, std::optional ConstantOffset) + : Inst(I), ConstantOffset(ConstantOffset) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + bool isSentinel() const { + return Inst == DenseMapInfo::getEmptyKey() || + Inst == DenseMapInfo::getTombstoneKey(); + } + + static bool canHandle(Instruction *Inst) { + return isa(Inst); + } +}; + +} // namespace + +namespace llvm { + +template <> struct DenseMapInfo { + static inline GEPValue getEmptyKey() { + return DenseMapInfo::getEmptyKey(); + } + + static inline GEPValue getTombstoneKey() { + return DenseMapInfo::getTombstoneKey(); + } + + 
static unsigned getHashValue(const GEPValue &Val); + static bool isEqual(const GEPValue &LHS, const GEPValue &RHS); +}; + +} // end namespace llvm + +unsigned DenseMapInfo::getHashValue(const GEPValue &Val) { + auto *GEP = cast(Val.Inst); + if (Val.ConstantOffset.has_value()) + return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(), + Val.ConstantOffset.value()); + return hash_combine( + GEP->getOpcode(), + hash_combine_range(GEP->value_op_begin(), GEP->value_op_end())); +} + +bool DenseMapInfo::isEqual(const GEPValue &LHS, const GEPValue &RHS) { + if (LHS.isSentinel() || RHS.isSentinel()) + return LHS.Inst == RHS.Inst; + auto *LGEP = cast(LHS.Inst); + auto *RGEP = cast(RHS.Inst); + if (LGEP->getPointerOperand() != RGEP->getPointerOperand()) + return false; + if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value()) + return LHS.ConstantOffset.value() == RHS.ConstantOffset.value(); + return LGEP->isIdenticalToWhenDefined(RGEP); +} + //===----------------------------------------------------------------------===// // EarlyCSE implementation //===----------------------------------------------------------------------===// @@ -647,6 +717,13 @@ class EarlyCSE { ScopedHashTable>; CallHTType AvailableCalls; + using GEPMapAllocatorTy = + RecyclingAllocator>; + using GEPHTType = ScopedHashTable, + GEPMapAllocatorTy>; + GEPHTType AvailableGEPs; + /// This is the current generation of the memory value. 
unsigned CurrentGeneration = 0; @@ -667,9 +744,11 @@ class EarlyCSE { class NodeScope { public: NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, - InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls) - : Scope(AvailableValues), LoadScope(AvailableLoads), - InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {} + InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls, + GEPHTType &AvailableGEPs) + : Scope(AvailableValues), LoadScope(AvailableLoads), + InvariantScope(AvailableInvariants), CallScope(AvailableCalls), + GEPScope(AvailableGEPs) {} NodeScope(const NodeScope &) = delete; NodeScope &operator=(const NodeScope &) = delete; @@ -678,6 +757,7 @@ class EarlyCSE { LoadHTType::ScopeTy LoadScope; InvariantHTType::ScopeTy InvariantScope; CallHTType::ScopeTy CallScope; + GEPHTType::ScopeTy GEPScope; }; // Contains all the needed information to create a stack for doing a depth @@ -688,13 +768,13 @@ class EarlyCSE { public: StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls, - unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child, + GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n, + DomTreeNode::const_iterator child, DomTreeNode::const_iterator end) : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableInvariants, - AvailableCalls) - {} + AvailableCalls, AvailableGEPs) {} StackNode(const StackNode &) = delete; StackNode &operator=(const StackNode &) = delete; @@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst, return Result; } +static void combineIRFlags(Instruction &From, Value *To) { + if (auto *I = dyn_cast(To)) { + // If I being poison triggers UB, there is no need to drop those + // flags. Otherwise, only retain flags present on both I and Inst. 
+ // TODO: Currently some fast-math flags are not treated as + // poison-generating even though they should. Until this is fixed, + // always retain flags present on both I and Inst for floating point + // instructions. + if (isa(I) || + (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))) + I->andIRFlags(&From); + } +} + bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier, const ParseMemoryInst &Later) { // Can we remove Earlier store because of Later store? @@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (auto *I = dyn_cast(V)) { - // If I being poison triggers UB, there is no need to drop those - // flags. Otherwise, only retain flags present on both I and Inst. - // TODO: Currently some fast-math flags are not treated as - // poison-generating even though they should. Until this is fixed, - // always retain flags present on both I and Inst for floating point - // instructions. - if (isa(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))) - I->andIRFlags(&Inst); - } + combineIRFlags(Inst, V); Inst.replaceAllUsesWith(V); salvageKnowledge(&Inst, &AC); removeMSSA(Inst); @@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // Compare GEP instructions based on offset. + if (GEPValue::canHandle(&Inst)) { + auto *GEP = cast(&Inst); + APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0); + GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset) + ? Offset.trySExtValue() + : std::nullopt); + if (Value *V = AvailableGEPs.lookup(GEPVal)) { + LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V + << '\n'); + combineIRFlags(Inst, V); + Inst.replaceAllUsesWith(V); + salvageKnowledge(&Inst, &AC); + removeMSSA(Inst); + Inst.eraseFromParent(); + Changed = true; + ++NumCSEGEP; + continue; + } + + // Otherwise, just remember that we have this GEP. 
+ AvailableGEPs.insert(GEPVal, &Inst); + continue; + } + // A release fence requires that all stores complete before it, but does // not prevent the reordering of following loads 'before' the fence. As a // result, we don't need to consider it as writing to memory and don't need @@ -1675,7 +1785,7 @@ bool EarlyCSE::run() { // Process the root node. nodesToProcess.push_back(new StackNode( AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls, - CurrentGeneration, DT.getRootNode(), + AvailableGEPs, CurrentGeneration, DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end())); assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it."); @@ -1698,10 +1808,10 @@ bool EarlyCSE::run() { } else if (NodeToProcess->childIter() != NodeToProcess->end()) { // Push the next child onto the stack. DomTreeNode *child = NodeToProcess->nextChild(); - nodesToProcess.push_back( - new StackNode(AvailableValues, AvailableLoads, AvailableInvariants, - AvailableCalls, NodeToProcess->childGeneration(), - child, child->begin(), child->end())); + nodesToProcess.push_back(new StackNode( + AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls, + AvailableGEPs, NodeToProcess->childGeneration(), child, + child->begin(), child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. 
diff --git a/llvm/test/Transforms/EarlyCSE/gep.ll b/llvm/test/Transforms/EarlyCSE/gep.ll new file mode 100644 index 0000000000000..499b5ac8de0af --- /dev/null +++ b/llvm/test/Transforms/EarlyCSE/gep.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s +; RUN: opt < %s -S -passes='early-cse' | FileCheck %s + +%T1 = type { i64, i64, i64 } + +declare void @use_vec(<4 x ptr>); + +define void @foo(ptr %a, <4 x i64> %b, i64 %i) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) { +; CHECK-NEXT: [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7 +; CHECK-NEXT: [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1 +; CHECK-NEXT: [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> +; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V]]) +; CHECK-NEXT: [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> +; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V2]]) +; CHECK-NEXT: ret void +; + %s1a = getelementptr i8, ptr %a, i64 8 + %s1av = load i64, ptr %s1a + %s1b = getelementptr inbounds i8, ptr %a, i64 8 + %s1bv = load i64, ptr %s1b + %s1c = getelementptr %T1, ptr %a, i64 0, i32 1 + %s1cv = load i64, ptr %s1c + %n1d = getelementptr i8, ptr %a, i64 7 + %n1dv = load i64, ptr %n1d + %s1e = getelementptr i64, ptr %a, i64 1 + %s1ev = load i64, ptr %s1e + %s1f = getelementptr i32, ptr %a, i64 2 + %s1fv = load i64, ptr %s1f + %n1g = getelementptr i32, ptr %a, i64 1 + %n1gv = load i64, ptr %n1g + %n1h = getelementptr i8, ptr %a, i64 %i + %n1hv = load i64, ptr %n1h + + %v = getelementptr i64, ptr %a, <4 x i64> + call void @use_vec(<4 x ptr> %v) + %v2 = getelementptr i64, ptr %a, <4 x i64> + call void @use_vec(<4 x ptr> %v2) + ret void +} diff --git 
a/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll new file mode 100644 index 0000000000000..1c9e7a771ca19 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt < %s -O3 -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } } + +define void @foo(ptr %a, <32 x i8> %_0) #0 { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: start: +; CHECK-NEXT: store <32 x i8> [[_0]], ptr [[A]], align 1 +; CHECK-NEXT: ret void +; +start: + %z = alloca %Zip, align 8 + %sroa_1 = getelementptr i8, ptr %z, i64 16 + store <32 x i8> %_0, ptr %sroa_1, align 8 + %len_ = getelementptr i8, ptr %z, i64 56 + store i64 32, ptr %len_, align 8 + %_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1 + %_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1 + %len = load i64, ptr %_2, align 8 + %_10 = getelementptr %Zip, ptr %z, i64 0, i32 1 + br label %body + +body: ; preds = %body, %start + %_34 = phi ptr [ %_34i, %body ], [ %a, %start ] + %idx = phi i64 [ %idx_, %body ], [ 0, %start ] + %_34i = getelementptr i8, ptr %_34, i64 1 + %idx_ = add i64 %idx, 1 + store i64 0, ptr %_1, align 8 + %_24 = getelementptr i8, ptr %_10, i64 %idx + %_18 = load i8, ptr %_24, align 1 + store i8 %_18, ptr %_34, align 1 + %_6 = icmp eq i64 %len, %idx_ + br i1 %_6, label %exit, label %body + +exit: ; preds = %body + ret void +} + +attributes #0 = { "target-cpu"="znver3" } From e7a7a1690139e1959ee70a45150ea62620d54e9c Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 19 Sep 2023 18:15:26 -0400 Subject: [PATCH 14/57] 
[libc++] Fix __threading_support when used with C11 threading (#66780) Since we are defining these typedefs inside namespace std, we need to refer to ::once_flag (the C Standard Library version). Otherwise 'once_flag' refers to 'std::once_flag', and that's not something we can pass to the C Standard Library '::call_once()' function later on. --- libcxx/include/__threading_support | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support index eaa402abae351..ccbcc652d9e50 100644 --- a/libcxx/include/__threading_support +++ b/libcxx/include/__threading_support @@ -102,7 +102,7 @@ typedef cnd_t __libcpp_condvar_t; #define _LIBCPP_CONDVAR_INITIALIZER {} // Execute once -typedef once_flag __libcpp_exec_once_flag; +typedef ::once_flag __libcpp_exec_once_flag; #define _LIBCPP_EXEC_ONCE_INITIALIZER ONCE_FLAG_INIT // Thread id From 6af26745974e9d14765d7e961be6af45b9ccd978 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 19 Sep 2023 18:17:03 -0400 Subject: [PATCH 15/57] [clang] Improve CI output when trailing whitespace is found (#66649) Fixes #66468 --- clang/utils/ci/run-buildbot | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/utils/ci/run-buildbot b/clang/utils/ci/run-buildbot index d117fccc7e3fb..f47ffb5cbd38d 100755 --- a/clang/utils/ci/run-buildbot +++ b/clang/utils/ci/run-buildbot @@ -70,7 +70,11 @@ ninja --version case "${BUILDER}" in check-format) - ! 
grep -rnI '[[:blank:]]$' clang/lib clang/include clang/docs + echo "*** Checking for trailing whitespace left in Clang source files ***" + if grep -rnI '[[:blank:]]$' clang/lib clang/include clang/docs; then + echo "*** Trailing whitespace has been found in Clang source files as described above ***" + exit 1 + fi ;; build-clang) mkdir install From d37496e75a137fa3711d4cdd488c45f39ef9be91 Mon Sep 17 00:00:00 2001 From: michaelrj-google <71531609+michaelrj-google@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:36:14 -0700 Subject: [PATCH 16/57] [libc] Fix printf config not working (#66834) The list of printf copts available in config.json wasn't working because the printf_core subdirectory was included before the printf_copts variable was defined, making it effectively nothing for the printf internals. Additionally, the tests weren't respecting the flags so they would cause the tests to fail. This patch reorders the cmake in src and adds flag handling in test. --- libc/src/stdio/CMakeLists.txt | 6 +++--- libc/test/src/stdio/CMakeLists.txt | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index a13321d137229..f3a75fb965c6e 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -26,9 +26,6 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic) endif() -add_subdirectory(printf_core) -add_subdirectory(scanf_core) - add_entrypoint_object( fflush SRCS @@ -286,6 +283,9 @@ add_entrypoint_object( ${printf_copts} ) +add_subdirectory(printf_core) +add_subdirectory(scanf_core) + add_entrypoint_object( ftell SRCS diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index e042a8bd8be68..98fa2deb8b0e2 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -112,6 +112,16 @@ add_libc_unittest( LibcMemoryHelpers ) +if(LIBC_CONF_PRINTF_DISABLE_FLOAT) + 
list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_FLOAT") +endif() +if(LIBC_CONF_PRINTF_DISABLE_INDEX_MODE) + list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_INDEX_MODE") +endif() +if(LIBC_CONF_PRINTF_DISABLE_WRITE_INT) + list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_WRITE_INT") +endif() + add_fp_unittest( sprintf_test UNIT_TEST_ONLY @@ -123,6 +133,8 @@ add_fp_unittest( libc.src.stdio.sprintf libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.platform_defs + COMPILE_OPTIONS + ${sprintf_test_copts} ) add_libc_unittest( From 30d77fb80857e645b300c8f59cad9414d090e083 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 19 Sep 2023 18:40:50 -0400 Subject: [PATCH 17/57] [lit] Apply aa71680f2948's fix to an additional test Seen at . --- llvm/utils/lit/tests/lit.cfg | 10 ++++++++++ llvm/utils/lit/tests/shtest-external-shell-kill.py | 2 +- llvm/utils/lit/tests/shtest-run-at-line.py | 12 +++--------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/utils/lit/tests/lit.cfg b/llvm/utils/lit/tests/lit.cfg index 438fdbd93c398..1382f7ef4ab00 100644 --- a/llvm/utils/lit/tests/lit.cfg +++ b/llvm/utils/lit/tests/lit.cfg @@ -73,6 +73,16 @@ config.substitutions.append( ) config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) +# This diagnostic sometimes appears in windows when using bash as an external +# shell. Ignore it in lit's output where we need to strictly check only the +# relevant output. +config.substitutions.append( + ( + "%{filter-lit}", + "grep -v 'bash.exe: warning: could not find /tmp, please create!'", + ) +) + # Enable coverage.py reporting, assuming the coverage module has been installed # and sitecustomize.py in the virtualenv has been modified appropriately. 
if lit_config.params.get("check-coverage", None): diff --git a/llvm/utils/lit/tests/shtest-external-shell-kill.py b/llvm/utils/lit/tests/shtest-external-shell-kill.py index 73f8d8601af62..2de9766aa2fa2 100644 --- a/llvm/utils/lit/tests/shtest-external-shell-kill.py +++ b/llvm/utils/lit/tests/shtest-external-shell-kill.py @@ -20,7 +20,7 @@ # The last FileCheck directive below checks that the debugging commands for the # above RUN line are not killed and do execute at the right time. -# RUN: %{lit} -a %{inputs}/shtest-external-shell-kill | FileCheck %s +# RUN: %{lit} -a %{inputs}/shtest-external-shell-kill | %{filter-lit} | FileCheck %s # END. # CHECK: Command Output (stdout): diff --git a/llvm/utils/lit/tests/shtest-run-at-line.py b/llvm/utils/lit/tests/shtest-run-at-line.py index a1bdb039805ad..18086f6fa10d6 100644 --- a/llvm/utils/lit/tests/shtest-run-at-line.py +++ b/llvm/utils/lit/tests/shtest-run-at-line.py @@ -1,15 +1,9 @@ # Check that -a/-v/-vv makes the line number of the failing RUN command clear. -# This diagnostic sometimes appears in windows when using bash as an external -# shell. Ignore it so we can strictly check the relevant output. -# -# DEFINE: %{filter} = \ -# DEFINE: grep -v 'bash.exe: warning: could not find /tmp, please create!' - -# RUN: not %{lit} -a %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s -# RUN: not %{lit} -v %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s -# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line | %{filter} | FileCheck %s +# RUN: not %{lit} -a %{inputs}/shtest-run-at-line | %{filter-lit} | FileCheck %s +# RUN: not %{lit} -v %{inputs}/shtest-run-at-line | %{filter-lit} | FileCheck %s +# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line | %{filter-lit} | FileCheck %s # END. 
From 69447d6afe702dee6bc1dbe17f4915789ad6889d Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 1 Aug 2023 17:51:59 -0700 Subject: [PATCH 18/57] [AMDGPU] Add ASM and MC updates for preloading kernargs Add assembler directives for preloading kernel arguments that correspond to new fields in the kernel descriptor for the length and offset of arguments that will be placed in SGPRs prior to kernel launch. Alignment of the arguments in SGPRs is equivalent to the kernarg segment when accessed via the kernarg_segment_ptr. Kernarg SGPRs are allocated directly after other user SGPRs. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D159459 --- .../llvm/Support/AMDHSAKernelDescriptor.h | 22 +++++++-- llvm/lib/Target/AMDGPU/AMDGPU.td | 12 ++++- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 35 ++++++++++++++ .../Disassembler/AMDGPUDisassembler.cpp | 26 +++++++++-- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 12 ++++- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 9 +++- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 + llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s | 13 ++++-- llvm/test/MC/AMDGPU/hsa-gfx940-v3.s | 13 ++++-- llvm/test/MC/AMDGPU/user-sgpr-count-diag.s | 26 ++++++++++- llvm/test/MC/AMDGPU/user-sgpr-count.s | 46 +++++++++++++++++-- .../tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s | 7 ++- 14 files changed, 205 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index f56f23150ad72..1bd65471d3b7c 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -173,6 +173,15 @@ enum : int32_t { }; #undef KERNEL_CODE_PROPERTY +// Kernarg preload specification. 
+#define KERNARG_PRELOAD_SPEC(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(KERNARG_PRELOAD_SPEC_##NAME, SHIFT, WIDTH) +enum : int32_t { + KERNARG_PRELOAD_SPEC(LENGTH, 0, 7), + KERNARG_PRELOAD_SPEC(OFFSET, 7, 9), +}; +#undef KERNARG_PRELOAD_SPEC + // Kernel descriptor. Must be kept backwards compatible. struct kernel_descriptor_t { uint32_t group_segment_fixed_size; @@ -185,7 +194,8 @@ struct kernel_descriptor_t { uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; uint16_t kernel_code_properties; - uint8_t reserved2[6]; + uint16_t kernarg_preload; + uint8_t reserved3[4]; }; enum : uint32_t { @@ -199,7 +209,8 @@ enum : uint32_t { COMPUTE_PGM_RSRC1_OFFSET = 48, COMPUTE_PGM_RSRC2_OFFSET = 52, KERNEL_CODE_PROPERTIES_OFFSET = 56, - RESERVED2_OFFSET = 58, + KERNARG_PRELOAD_OFFSET = 58, + RESERVED3_OFFSET = 60 }; static_assert( @@ -233,8 +244,11 @@ static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == KERNEL_CODE_PROPERTIES_OFFSET, "invalid offset for kernel_code_properties"); -static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET, - "invalid offset for reserved2"); +static_assert(offsetof(kernel_descriptor_t, kernarg_preload) == + KERNARG_PRELOAD_OFFSET, + "invalid offset for kernarg_preload"); +static_assert(offsetof(kernel_descriptor_t, reserved3) == RESERVED3_OFFSET, + "invalid offset for reserved3"); } // end namespace amdhsa } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 856def93b6047..c9d3b00caa873 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -872,6 +872,12 @@ def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", "Requires use of fract on arguments to trig instructions" >; +def FeatureKernargPreload : SubtargetFeature <"kernarg-preload", + "KernargPreload", + "true", + "Hardware supports preloading of kernel arguments in user SGPRs." 
+>; + // Alignment enforcement is controlled by a configuration register: // SH_MEM_CONFIG.alignment_mode def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", @@ -1185,7 +1191,8 @@ def FeatureISAVersion9_0_A : FeatureSet< FeatureAtomicBufferGlobalPkAddF16Insts, FeaturePackedTID, FullRate64Ops, - FeatureBackOffBarrier])>; + FeatureBackOffBarrier, + FeatureKernargPreload])>; def FeatureISAVersion9_0_C : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, @@ -1227,7 +1234,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeaturePackedTID, FeatureArchitectedFlatScratch, FullRate64Ops, - FeatureBackOffBarrier]>; + FeatureBackOffBarrier, + FeatureKernargPreload]>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b2b25eccbfabc..4c0602507d26d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1462,6 +1462,12 @@ class AMDGPUAsmParser : public MCTargetAsmParser { return AMDGPU::getNSAMaxSize(getSTI()); } + unsigned getMaxNumUserSGPRs() const { + return AMDGPU::getMaxNumUserSGPRs(getSTI()); + } + + bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast(TS); @@ -4931,6 +4937,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { uint64_t NextFreeVGPR = 0; uint64_t AccumOffset = 0; uint64_t SharedVGPRCount = 0; + uint64_t PreloadLength = 0; + uint64_t PreloadOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; @@ -4999,6 +5007,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { Val, ValRange); if (Val) ImpliedUserSGPRCount += 4; + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") { + if (!hasKernargPreload()) + return 
Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val > getMaxNumUserSGPRs()) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val, + ValRange); + if (Val) { + ImpliedUserSGPRCount += Val; + PreloadLength = Val; + } + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") { + if (!hasKernargPreload()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val >= 1024) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val, + ValRange); + if (Val) + PreloadOffset = Val; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, @@ -5244,6 +5274,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, UserSGPRCount); + if (PreloadLength && KD.kernarg_size && + (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size)) + return TokError("Kernarg preload length + offset is larger than the " + "kernarg segment size"); + if (isGFX90A()) { if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 1b05acd5c90a7..561ed697df6ba 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1621,6 +1621,10 @@ bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } +bool AMDGPUDisassembler::hasKernargPreload() const { + return AMDGPU::hasKernargPreload(STI); +} + //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling 
//===----------------------------------------------------------------------===// @@ -1945,10 +1949,24 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( return MCDisassembler::Success; - case amdhsa::RESERVED2_OFFSET: - // 6 bytes from here are reserved, must be 0. - ReservedBytes = DE.getBytes(Cursor, 6); - for (int I = 0; I < 6; ++I) { + case amdhsa::KERNARG_PRELOAD_OFFSET: + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length", + KERNARG_PRELOAD_SPEC_LENGTH); + } + + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset", + KERNARG_PRELOAD_SPEC_OFFSET); + } + return MCDisassembler::Success; + + case amdhsa::RESERVED3_OFFSET: + // 4 bytes from here are reserved, must be 0. + ReservedBytes = DE.getBytes(Cursor, 4); + for (int I = 0; I < 4; ++I) { if (ReservedBytes[I] != 0) return MCDisassembler::Fail; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 444312473a5ff..4e14219ffc80d 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -249,6 +249,7 @@ class AMDGPUDisassembler : public MCDisassembler { bool isGFX11Plus() const; bool hasArchitectedFlatScratch() const; + bool hasKernargPreload() const; bool isMacDPP(MCInst &MI) const; }; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index b595682c488d8..bf46dd381048c 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -78,6 +78,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool UnalignedAccessMode = false; bool HasApertureRegs = false; bool SupportsXNACK = false; + bool KernargPreload = false; // This should not be used directly. 
'TargetID' tracks the dynamic settings // for XNACK. @@ -856,7 +857,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, unsigned NumRegionInstrs) const override; unsigned getMaxNumUserSGPRs() const { - return 16; + return AMDGPU::getMaxNumUserSGPRs(*this); } bool hasSMemRealTime() const { @@ -1178,6 +1179,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target supports the pre-NGG legacy geometry path. bool hasLegacyGeometry() const { return getGeneration() < GFX11; } + // \returns true if preloading kernel arguments is supported. + bool hasKernargPreload() const { return KernargPreload; } + + // \returns true if we need to generate backwards compatible code when + // preloading kernel arguments. + bool needsKernargPreloadBackwardsCompatibility() const { + return hasKernargPreload() && !hasGFX940Insts(); + } + // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. bool hasCvtFP8VOP1Bug() const { return true; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index b7bbdac58a5c1..f7ab6e56ae89a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -368,6 +368,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (hasKernargPreload(STI)) { + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length ", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset ", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET); + } PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -906,6 +912,7 @@ void 
AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1); Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2); Streamer.emitInt16(KernelDescriptor.kernel_code_properties); - for (uint8_t Res : KernelDescriptor.reserved2) + Streamer.emitInt16(KernelDescriptor.kernarg_preload); + for (uint8_t Res : KernelDescriptor.reserved3) Streamer.emitInt8(Res); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 63da86391e5c6..efb1737fc5985 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2047,6 +2047,8 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI) { return 0; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } + bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); } @@ -2143,6 +2145,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureVOPD); } +unsigned hasKernargPreload(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureKernargPreload); +} + int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR) { if (has90AInsts && ArgNumAGPR) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2273ba935f5d6..5b0a72be43b9f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1148,6 +1148,7 @@ bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); bool hasGDS(const MCSubtargetInfo &STI); unsigned getNSAMaxSize(const MCSubtargetInfo &STI); +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); @@ -1174,6 +1175,7 @@ bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); bool hasMAIInsts(const MCSubtargetInfo &STI); bool 
hasVOPD(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); +unsigned hasKernargPreload(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s index 6ab70e5c72b2f..fd84fab8af816 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s @@ -25,10 +25,10 @@ // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 // complete -// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f000000 00000000 +// OBJDUMP-NEXT: 0070 c1500104 210f007f 7f008100 00000000 .text // ASM: .text @@ -76,6 +76,9 @@ complete: .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -108,14 +111,16 @@ complete: // ASM: .amdhsa_kernel complete // ASM-NEXT: .amdhsa_group_segment_fixed_size 1 // ASM-NEXT: .amdhsa_private_segment_fixed_size 1 -// ASM-NEXT: .amdhsa_kernarg_size 0 -// ASM-NEXT: .amdhsa_user_sgpr_count 15 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 16 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// 
ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s index bf2e6081a8050..9624515ecd6fb 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s @@ -25,10 +25,10 @@ // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 // complete -// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 01510104 130f007f 5e000000 00000000 +// OBJDUMP-NEXT: 0070 01510104 150f007f 5e008100 00000000 .text // ASM: .text @@ -74,6 +74,9 @@ complete: .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -105,12 +108,14 @@ complete: // ASM: .amdhsa_kernel complete // ASM-NEXT: .amdhsa_group_segment_fixed_size 1 // ASM-NEXT: .amdhsa_private_segment_fixed_size 1 -// ASM-NEXT: .amdhsa_kernarg_size 0 -// ASM-NEXT: .amdhsa_user_sgpr_count 9 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 10 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // 
ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s index d4ada0e2765f5..63e532e0ffa37 100644 --- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s @@ -1,8 +1,9 @@ -// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx810 %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s .amdhsa_kernel implied_count_too_low_0 .amdhsa_user_sgpr_count 0 .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 // ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs @@ -11,7 +12,30 @@ .amdhsa_kernel implied_count_too_low_1 .amdhsa_user_sgpr_count 1 .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 // ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs .end_amdhsa_kernel + +.amdhsa_kernel implied_count_too_low_2 + .amdhsa_user_sgpr_count 2 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs +.end_amdhsa_kernel + +.amdhsa_kernel preload_out_of_bounds_0 + .amdhsa_user_sgpr_count 4 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_kernarg_size 4 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +// ERR: :[[@LINE+1]]:19: error: Kernarg preload length + offset is 
larger than the kernarg segment size +.end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s index ab363f91d334b..aa8970185eb04 100644 --- a/llvm/test/MC/AMDGPU/user-sgpr-count.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s @@ -1,10 +1,10 @@ -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s .text // ASM: .text -.amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" -// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" // ASM-LABEL: .amdhsa_kernel user_sgprs_implied_count @@ -17,6 +17,7 @@ .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -28,6 +29,7 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_flat_scratch_init 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -39,6 +41,7 @@ .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -48,6 +51,7 @@ // ASM: .amdhsa_user_sgpr_count 4 .amdhsa_kernel user_sgprs_implied_count_private_segment_buffer .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -56,6 +60,7 @@ // ASM-LABEL: .amdhsa_kernel explicit_user_sgpr_count_16 .amdhsa_kernel explicit_user_sgpr_count_16 .amdhsa_user_sgpr_count 16 + .amdhsa_accum_offset 4 
.amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -65,6 +70,7 @@ // ASM: .amdhsa_user_sgpr_count 0 .amdhsa_kernel explicit_user_sgpr_count_0 .amdhsa_user_sgpr_count 0 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -73,6 +79,7 @@ // ASM: .amdhsa_user_sgpr_count 1 .amdhsa_kernel explicit_user_sgpr_count_1 .amdhsa_user_sgpr_count 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -82,6 +89,39 @@ .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_0 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_kernarg_size 8 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_1 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 0 + .amdhsa_user_sgpr_kernarg_preload_offset 10 + .amdhsa_kernarg_size 0 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_2 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 0 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s index 42d87dee734d3..d26189451829f 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s @@ -110,7 +110,7 @@ ; CHECK: .amdhsa_kernel kernel ; CHECK-NEXT: 
.amdhsa_group_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 -; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 32 ; CHECK-NEXT: .amdhsa_accum_offset 12 ; CHECK-NEXT: .amdhsa_tg_split 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 @@ -145,9 +145,14 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 0 .amdhsa_accum_offset 12 + .amdhsa_kernarg_size 32 + .amdhsa_user_sgpr_kernarg_preload_length 2 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .end_amdhsa_kernel From a45502cdb27f3df1ab745e4a9d93e57e0c32891b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Sep 2023 15:59:05 -0700 Subject: [PATCH 19/57] [bazel] Port c649f29c24c9fc1502d8d53e0c96c3d24b31de1a (llvm-nm --line-numbers) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 850688a7970ab..7848a82b61287 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4074,6 +4074,7 @@ cc_binary( ":Object", ":Option", ":Support", + ":Symbolize", ":TargetParser", ], ) From 2baf4a06ef06c51c2ef09f981f204983b0f8082c Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Tue, 19 Sep 2023 16:18:34 -0700 Subject: [PATCH 20/57] Fix test added in D150987 to account for different path separators which was causing the test to fail on Windows. 
Should fix https://lab.llvm.org/buildbot/#/builders/216/builds/27535 --- llvm/test/tools/llvm-nm/X86/line-numbers.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/llvm-nm/X86/line-numbers.test b/llvm/test/tools/llvm-nm/X86/line-numbers.test index 4acda8afb2a4e..e254df67f955a 100644 --- a/llvm/test/tools/llvm-nm/X86/line-numbers.test +++ b/llvm/test/tools/llvm-nm/X86/line-numbers.test @@ -68,7 +68,7 @@ # RUN: llvm-mc -g --filetype=obj %t/data-dwarf.s -triple=x86_64-pc-linux -o %t/data-dwarf.o # RUN: llvm-nm --line-numbers %t/data-dwarf.o | FileCheck %s --check-prefix=DATA-DWARF --match-full-lines --implicit-check-not={{.}} -# DATA-DWARF: 0000000000000000 D defined_data /tmp/tmp.c:1 +# DATA-DWARF: 0000000000000000 D defined_data /tmp{{\\|/}}tmp.c:1 #--- main.s .text From 40b0ab287f9f5c58fa35a9a97cc49f104bef45ed Mon Sep 17 00:00:00 2001 From: DianQK Date: Sun, 13 Aug 2023 21:08:29 +0800 Subject: [PATCH 21/57] [SimplifyCFG] Pre-commit test for extending HoistThenElseCodeToIf. Pre-commit test for D155711. 
Differential Revision: https://reviews.llvm.org/D156617 --- llvm/test/Transforms/SimplifyCFG/HoistCode.ll | 68 +++++ .../hoist-common-code-with-unreachable.ll | 99 +++++++ .../SimplifyCFG/hoist-common-code.ll | 130 ++++++++++ .../SimplifyCFG/hoist-with-metadata.ll | 243 +++++++++++++++++- 4 files changed, 528 insertions(+), 12 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll diff --git a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll index 10c6f81c91ac5..0b634fac8c640 100644 --- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll +++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll @@ -17,6 +17,39 @@ F: ; preds = %0 ret void } +define void @foo_switch(i64 %C, ptr %P) { +; CHECK-LABEL: @foo_switch( +; CHECK-NEXT: switch i64 [[C:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: bb0: +; CHECK-NEXT: store i32 7, ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 7, ptr [[P]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: bb2: +; CHECK-NEXT: store i32 7, ptr [[P]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; + switch i64 %C, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: ; preds = %0 + store i32 7, ptr %P + ret void +bb1: ; preds = %0 + store i32 7, ptr %P + ret void +bb2: ; preds = %0 + store i32 7, ptr %P + ret void +} + define float @PR39535min(float %x) { ; CHECK-LABEL: @PR39535min( ; CHECK-NEXT: entry: @@ -38,3 +71,38 @@ cond.end: %cond = phi fast float [ 0.0, %cond.true ], [ %x, %cond.false ] ret float %cond } + +define float @PR39535min_switch(i64 %i, float %x) { +; CHECK-LABEL: @PR39535min_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i64 [[I:%.*]], label [[END:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] 
+; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[COND:%.*]] = phi fast float [ [[X:%.*]], [[BB1]] ], [ [[X]], [[BB2]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret float [[COND]] +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + br label %end + +bb1: + br label %end + +bb2: + br label %end + +end: + %cond = phi fast float [ 0.0, %bb0 ], [ %x, %bb1 ], [ %x, %bb2 ] + ret float %cond +} diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll new file mode 100644 index 0000000000000..8cb6339713d6f --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s + +define i1 @common_instr_with_unreachable(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: @common_instr_with_unreachable( +; CHECK-NEXT: start: +; CHECK-NEXT: switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [ +; CHECK-NEXT: i64 0, label [[BB0:%.*]] +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: unreachable: +; CHECK-NEXT: unreachable +; CHECK: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + switch i64 %a, label %unreachable [ + i64 0, label %bb0 + i64 1, label %bb1 + i64 2, 
label %bb2 + ] + +unreachable: + unreachable + +bb0: ; preds = %start + %0 = icmp eq i64 %b, %c + br label %exit + +bb1: ; preds = %start + %1 = icmp eq i64 %b, %c + br label %exit + +bb2: ; preds = %start + %2 = icmp eq i64 %b, %c + br label %exit + +exit: ; preds = %bb2, %bb1, %bb0 + %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] + ret i1 %result +} + +define i1 @common_instr_with_unreachable_2(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: @common_instr_with_unreachable_2( +; CHECK-NEXT: start: +; CHECK-NEXT: switch i64 [[A:%.*]], label [[BB1:%.*]] [ +; CHECK-NEXT: i64 0, label [[BB0:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + switch i64 %a, label %bb1 [ + i64 0, label %bb0 + i64 1, label %unreachable + i64 2, label %bb2 + ] + +unreachable: + unreachable + +bb0: ; preds = %start + %0 = icmp eq i64 %b, %c + br label %exit + +bb1: ; preds = %start + %1 = icmp eq i64 %b, %c + br label %exit + +bb2: ; preds = %start + %2 = icmp eq i64 %b, %c + br label %exit + +exit: ; preds = %bb2, %bb1, %bb0 + %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] + ret i1 %result +} diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll index 556e1b9ca0662..43fb8faad7cfd 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll @@ -24,3 +24,133 @@ F: ; preds = %0 ret void } +define void @test_switch(i64 %i, ptr %Q) { +; CHECK-LABEL: 
@test_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: bb0: +; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4 +; CHECK-NEXT: call void @bar(i32 [[A]]) +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, ptr [[Q]], align 4 +; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4 +; CHECK-NEXT: call void @bar(i32 [[B]]) +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, ptr [[Q]], align 4 +; CHECK-NEXT: [[C:%.*]] = load i32, ptr [[Q]], align 4 +; CHECK-NEXT: call void @bar(i32 [[C]]) +; CHECK-NEXT: br label [[COMMON_RET]] +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: ; preds = %0 + store i32 1, ptr %Q + %A = load i32, ptr %Q ; [#uses=1] + call void @bar( i32 %A ) + ret void +bb1: ; preds = %0 + store i32 1, ptr %Q + %B = load i32, ptr %Q ; [#uses=1] + call void @bar( i32 %B ) + ret void +bb2: ; preds = %0 + store i32 1, ptr %Q + %C = load i32, ptr %Q ; [#uses=1] + call void @bar( i32 %C ) + ret void +} + +define i1 @common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr { +; CHECK-LABEL: @common_instr_on_switch( +; CHECK-NEXT: start: +; CHECK-NEXT: switch i64 [[A:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: ret i1 
[[RESULT]] +; +start: + switch i64 %a, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: ; preds = %start + %0 = icmp eq i64 %b, %c + br label %exit + +bb1: ; preds = %start + %1 = icmp eq i64 %b, %c + br label %exit + +bb2: ; preds = %start + %2 = icmp eq i64 %b, %c + br label %exit + +exit: ; preds = %bb2, %bb1, %bb0 + %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] + ret i1 %result +} + +define i1 @partial_common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr { +; CHECK-LABEL: @partial_common_instr_on_switch( +; CHECK-NEXT: start: +; CHECK-NEXT: switch i64 [[A:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + switch i64 %a, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: ; preds = %start + %0 = icmp eq i64 %b, %c + br label %exit + +bb1: ; preds = %start + %1 = icmp ne i64 %b, %c + br label %exit + +bb2: ; preds = %start + %2 = icmp eq i64 %b, %c + br label %exit + +exit: ; preds = %bb2, %bb1, %bb0 + %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] + ret i1 %result +} diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll index b999b07fc24b4..b53224c944f11 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll @@ -19,10 +19,45 @@ out: ret void } +define void @hoist_range_switch(i64 %i, ptr %p) { +; 
CHECK-LABEL: @hoist_range_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: br label [[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = load i8, ptr %p, !range !0 + br label %out +bb1: + %e = load i8, ptr %p, !range !1 + br label %out +bb2: + %f = load i8, ptr %p, !range !3 + br label %out +out: + ret void +} + define void @hoist_both_noundef(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_both_noundef( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !1 +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 ; CHECK-NEXT: ret void ; if: @@ -40,6 +75,42 @@ out: ret void } + +define void @hoist_both_noundef_switch(i64 %i, ptr %p) { +; CHECK-LABEL: @hoist_both_noundef_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 +; CHECK-NEXT: br label [[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = load i8, ptr %p, !noundef !2 + br label %out +bb1: + %e = load 
i8, ptr %p, !noundef !2 + br label %out +bb2: + %f = load i8, ptr %p, !noundef !2 + br label %out +out: + ret void +} + define void @hoist_one_noundef(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_one_noundef( ; CHECK-NEXT: if: @@ -61,10 +132,45 @@ out: ret void } +define void @hoist_one_noundef_switch(i64 %i, ptr %p) { +; CHECK-LABEL: @hoist_one_noundef_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: br label [[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = load i8, ptr %p, !noundef !2 + br label %out +bb1: + %e = load i8, ptr %p + br label %out +bb2: + %f = load i8, ptr %p, !noundef !2 + br label %out +out: + ret void +} + define void @hoist_dereferenceable(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !2 +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5 ; CHECK-NEXT: ret void ; if: @@ -79,10 +185,45 @@ out: ret void } +define void @hoist_dereferenceable_switch(i64 %i, ptr %p) { +; CHECK-LABEL: @hoist_dereferenceable_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5 +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !6 +; CHECK-NEXT: br label 
[[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !7 +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = load ptr, ptr %p, !dereferenceable !{i64 10} + br label %out +bb1: + %e = load ptr, ptr %p, !dereferenceable !{i64 20} + br label %out +bb2: + %f = load ptr, ptr %p, !dereferenceable !{i64 30} + br label %out +out: + ret void +} + define void @hoist_dereferenceable_or_null(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable_or_null( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !2 +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !5 ; CHECK-NEXT: ret void ; if: @@ -97,11 +238,46 @@ out: ret void } +define void @hoist_dereferenceable_or_null_switch(i64 %i, ptr %p) { +; CHECK-LABEL: @hoist_dereferenceable_or_null_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !6 +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !5 +; CHECK-NEXT: br label [[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !7 +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = load ptr, ptr %p, !dereferenceable_or_null !{i64 20} + br label %out +bb1: + %e = load ptr, ptr %p, !dereferenceable_or_null !{i64 10} + br label %out +bb2: + %f = load ptr, ptr %p, !dereferenceable_or_null !{i64 30} + br label %out +out: + ret void +} + ; !range violation only returns poison, and is thus safe to speculate. 
define i32 @speculate_range(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_range( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG8:![0-9]+]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], i32 [[V]], i32 0 ; CHECK-NEXT: ret i32 [[SPEC_SELECT]] ; @@ -122,7 +298,7 @@ join: define ptr @speculate_nonnull(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_nonnull( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !1 +; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !4 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null ; CHECK-NEXT: ret ptr [[SPEC_SELECT]] ; @@ -143,7 +319,7 @@ join: define ptr @speculate_align(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_align( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !4 +; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !9 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null ; CHECK-NEXT: ret ptr [[SPEC_SELECT]] ; @@ -162,7 +338,7 @@ join: define void @hoist_fpmath(i1 %c, double %x) { ; CHECK-LABEL: @hoist_fpmath( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !5 +; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10 ; CHECK-NEXT: ret void ; if: @@ -177,14 +353,57 @@ out: ret void } +define void @hoist_fpmath_switch(i64 %i, double %x) { +; CHECK-LABEL: @hoist_fpmath_switch( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10 +; CHECK-NEXT: br label [[OUT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[E:%.*]] = fadd 
double [[X]], 1.000000e+00, !fpmath !11 +; CHECK-NEXT: br label [[OUT]] +; CHECK: bb2: +; CHECK-NEXT: [[F:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !12 +; CHECK-NEXT: br label [[OUT]] +; CHECK: out: +; CHECK-NEXT: ret void +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: + %t = fadd double %x, 1.0, !fpmath !{ float 2.5 } + br label %out +bb1: + %e = fadd double %x, 1.0, !fpmath !{ float 5.0 } + br label %out +bb2: + %f = fadd double %x, 1.0, !fpmath !{ float 7.5 } + br label %out +out: + ret void +} + !0 = !{ i8 0, i8 1 } !1 = !{ i8 3, i8 5 } !2 = !{} +!3 = !{ i8 7, i8 9 } ;. ; CHECK: [[RNG0]] = !{i8 0, i8 1, i8 3, i8 5} -; CHECK: [[META1:![0-9]+]] = !{} -; CHECK: [[META2:![0-9]+]] = !{i64 10} -; CHECK: [[RNG3]] = !{i32 0, i32 10} -; CHECK: [[META4:![0-9]+]] = !{i64 4} -; CHECK: [[META5:![0-9]+]] = !{float 2.500000e+00} +; CHECK: [[RNG1]] = !{i8 0, i8 1} +; CHECK: [[RNG2]] = !{i8 3, i8 5} +; CHECK: [[RNG3]] = !{i8 7, i8 9} +; CHECK: [[META4:![0-9]+]] = !{} +; CHECK: [[META5:![0-9]+]] = !{i64 10} +; CHECK: [[META6:![0-9]+]] = !{i64 20} +; CHECK: [[META7:![0-9]+]] = !{i64 30} +; CHECK: [[RNG8]] = !{i32 0, i32 10} +; CHECK: [[META9:![0-9]+]] = !{i64 4} +; CHECK: [[META10:![0-9]+]] = !{float 2.500000e+00} +; CHECK: [[META11:![0-9]+]] = !{float 5.000000e+00} +; CHECK: [[META12:![0-9]+]] = !{float 7.500000e+00} ;. From 96ea48ff5dcba46af350f5300eafd7f7394ba606 Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 19 Sep 2023 17:57:30 +0800 Subject: [PATCH 22/57] [SimplifyCFG] Hoist common instructions on Switch. Sink common instructions are not always performance friendly. 
We need to implement hoist common instructions on switch instruction to solve the following problem: ``` define i1 @foo(i64 %a, i64 %b, i64 %c, i64 %d) { start: %test = icmp eq i64 %a, %d br i1 %test, label %switch_bb, label %exit switch_bb: ; preds = %start switch i64 %a, label %bb0 [ i64 1, label %bb1 i64 2, label %bb2 ] bb0: ; preds = %switch_bb %0 = icmp eq i64 %b, %c br label %exit bb1: ; preds = %switch_bb %1 = icmp eq i64 %b, %c br label %exit bb2: ; preds = %switch_bb %2 = icmp eq i64 %b, %c br label %exit exit: ; preds = %bb2, %bb1, %bb0, %start %result = phi i1 [ false, %start ], [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] ret i1 %result } ``` The pre-commit test is D156617. Reviewed By: XChy, nikic Differential Revision: https://reviews.llvm.org/D155711 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 370 ++++++----- .../AArch64/patchable-function-entry-bti.ll | 2 +- llvm/test/Transforms/SimplifyCFG/HoistCode.ll | 16 +- .../hoist-common-code-with-unreachable.ll | 72 ++- .../SimplifyCFG/hoist-common-code.ll | 66 +- .../SimplifyCFG/hoist-common-skip.ll | 592 ++++++++++++++++++ .../SimplifyCFG/hoist-with-metadata.ll | 126 +--- 7 files changed, 927 insertions(+), 317 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 14cabd275d5b1..897058ed53518 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -271,7 +271,10 @@ class SimplifyCFGOpt { bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); - bool HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly); + bool hoistCommonCodeFromSuccessors(BasicBlock *BB, bool EqTermsOnly); + bool hoistSuccIdenticalTerminatorToSwitchOrIf( + Instruction *TI, Instruction *I1, + SmallVectorImpl &OtherSuccTIs); bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB); bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, BasicBlock *TrueBB, BasicBlock *FalseBB, @@ 
-1408,8 +1411,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI, } // If we would need to insert a select that uses the value of this invoke -// (comments in HoistThenElseCodeToIf explain why we would need to do this), we -// can't hoist the invoke, as there is nowhere to put the select in this case. +// (comments in hoistSuccIdenticalTerminatorToSwitchOrIf explain why we would +// need to do this), we can't hoist the invoke, as there is nowhere to put the +// select in this case. static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2, Instruction *I1, Instruction *I2) { for (BasicBlock *Succ : successors(BB1)) { @@ -1424,9 +1428,9 @@ static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2, return true; } -// Get interesting characteristics of instructions that `HoistThenElseCodeToIf` -// didn't hoist. They restrict what kind of instructions can be reordered -// across. +// Get interesting characteristics of instructions that +// `hoistCommonCodeFromSuccessors` didn't hoist. They restrict what kind of +// instructions can be reordered across. enum SkipFlags { SkipReadMem = 1, SkipSideEffect = 2, @@ -1484,7 +1488,7 @@ static bool isSafeToHoistInstr(Instruction *I, unsigned Flags) { static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false); -/// Helper function for HoistThenElseCodeToIf. Return true if identical +/// Helper function for hoistCommonCodeFromSuccessors. Return true if identical /// instructions \p I1 and \p I2 can and should be hoisted. static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2, const TargetTransformInfo &TTI) { @@ -1515,62 +1519,51 @@ static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2, return true; } -/// Given a conditional branch that goes to BB1 and BB2, hoist any common code -/// in the two blocks up into the branch block. 
The caller of this function -/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given, -/// only perform hoisting in case both blocks only contain a terminator. In that -/// case, only the original BI will be replaced and selects for PHIs are added. -bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly) { +/// Hoist any common code in the successor blocks up into the block. This +/// function guarantees that BB dominates all successors. If EqTermsOnly is +/// given, only perform hoisting in case both blocks only contain a terminator. +/// In that case, only the original BI will be replaced and selects for PHIs are +/// added. +bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB, + bool EqTermsOnly) { // This does very trivial matching, with limited scanning, to find identical - // instructions in the two blocks. In particular, we don't want to get into - // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As + // instructions in the two blocks. In particular, we don't want to get into + // O(N1*N2*...) situations here where Ni are the sizes of these successors. As // such, we currently just scan for obviously identical instructions in an // identical order, possibly separated by the same number of non-identical // instructions. - BasicBlock *BB1 = BI->getSuccessor(0); // The true destination. - BasicBlock *BB2 = BI->getSuccessor(1); // The false destination + unsigned int SuccSize = succ_size(BB); + if (SuccSize < 2) + return false; // If either of the blocks has it's address taken, then we can't do this fold, // because the code we'd hoist would no longer run when we jump into the block // by it's address. 
- if (BB1->hasAddressTaken() || BB2->hasAddressTaken()) - return false; + for (auto *Succ : successors(BB)) + if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor()) + return false; - BasicBlock::iterator BB1_Itr = BB1->begin(); - BasicBlock::iterator BB2_Itr = BB2->begin(); + auto *TI = BB->getTerminator(); - Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++; - // Skip debug info if it is not identical. - DbgInfoIntrinsic *DBI1 = dyn_cast(I1); - DbgInfoIntrinsic *DBI2 = dyn_cast(I2); - if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { - while (isa(I1)) - I1 = &*BB1_Itr++; - while (isa(I2)) - I2 = &*BB2_Itr++; + // The second of pair is a SkipFlags bitmask. + using SuccIterPair = std::pair; + SmallVector SuccIterPairs; + for (auto *Succ : successors(BB)) { + BasicBlock::iterator SuccItr = Succ->begin(); + if (isa(*SuccItr)) + return false; + SuccIterPairs.push_back(SuccIterPair(SuccItr, 0)); } - if (isa(I1)) - return false; - - BasicBlock *BIParent = BI->getParent(); - - bool Changed = false; - - auto _ = make_scope_exit([&]() { - if (Changed) - ++NumHoistCommonCode; - }); // Check if only hoisting terminators is allowed. This does not add new // instructions to the hoist location. if (EqTermsOnly) { // Skip any debug intrinsics, as they are free to hoist. - auto *I1NonDbg = &*skipDebugIntrinsics(I1->getIterator()); - auto *I2NonDbg = &*skipDebugIntrinsics(I2->getIterator()); - if (!I1NonDbg->isIdenticalToWhenDefined(I2NonDbg)) - return false; - if (!I1NonDbg->isTerminator()) - return false; + for (auto &SuccIter : make_first_range(SuccIterPairs)) { + auto *INonDbg = &*skipDebugIntrinsics(SuccIter); + if (!INonDbg->isTerminator()) + return false; + } // Now we know that we only need to hoist debug intrinsics and the // terminator. Let the loop below handle those 2 cases. 
} @@ -1579,154 +1572,234 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly) { // many instructions we skip, serving as a compilation time control as well as // preventing excessive increase of life ranges. unsigned NumSkipped = 0; + // If we find an unreachable instruction at the beginning of a basic block, we + // can still hoist instructions from the rest of the basic blocks. + if (SuccIterPairs.size() > 2) { + erase_if(SuccIterPairs, + [](const auto &Pair) { return isa(Pair.first); }); + if (SuccIterPairs.size() < 2) + return false; + } - // Record any skipped instuctions that may read memory, write memory or have - // side effects, or have implicit control flow. - unsigned SkipFlagsBB1 = 0; - unsigned SkipFlagsBB2 = 0; + bool Changed = false; for (;;) { + auto *SuccIterPairBegin = SuccIterPairs.begin(); + auto &BB1ItrPair = *SuccIterPairBegin++; + auto OtherSuccIterPairRange = + iterator_range(SuccIterPairBegin, SuccIterPairs.end()); + auto OtherSuccIterRange = make_first_range(OtherSuccIterPairRange); + + Instruction *I1 = &*BB1ItrPair.first; + auto *BB1 = I1->getParent(); + + // Skip debug info if it is not identical. + bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) { + Instruction *I2 = &*Iter; + return I1->isIdenticalToWhenDefined(I2); + }); + if (!AllDbgInstsAreIdentical) { + while (isa(I1)) + I1 = &*++BB1ItrPair.first; + for (auto &SuccIter : OtherSuccIterRange) { + Instruction *I2 = &*SuccIter; + while (isa(I2)) + I2 = &*++SuccIter; + } + } + + bool AllInstsAreIdentical = true; + bool HasTerminator = I1->isTerminator(); + for (auto &SuccIter : OtherSuccIterRange) { + Instruction *I2 = &*SuccIter; + HasTerminator |= I2->isTerminator(); + if (AllInstsAreIdentical && !I1->isIdenticalToWhenDefined(I2)) + AllInstsAreIdentical = false; + } + // If we are hoisting the terminator instruction, don't move one (making a // broken BB), instead clone it, and remove BI. 
- if (I1->isTerminator() || I2->isTerminator()) { + if (HasTerminator) { // If any instructions remain in the block, we cannot hoist terminators. - if (NumSkipped || !I1->isIdenticalToWhenDefined(I2)) + if (NumSkipped || SuccSize != SuccIterPairs.size() || + !AllInstsAreIdentical) return Changed; - goto HoistTerminator; + SmallVector Insts; + for (auto &SuccIter : OtherSuccIterRange) + Insts.push_back(&*SuccIter); + return hoistSuccIdenticalTerminatorToSwitchOrIf(TI, I1, Insts) || Changed; } - if (I1->isIdenticalToWhenDefined(I2) && - // Even if the instructions are identical, it may not be safe to hoist - // them if we have skipped over instructions with side effects or their - // operands weren't hoisted. - isSafeToHoistInstr(I1, SkipFlagsBB1) && - isSafeToHoistInstr(I2, SkipFlagsBB2) && - shouldHoistCommonInstructions(I1, I2, TTI)) { - if (isa(I1) || isa(I2)) { - assert(isa(I1) && isa(I2)); + if (AllInstsAreIdentical) { + unsigned SkipFlagsBB1 = BB1ItrPair.second; + AllInstsAreIdentical = + isSafeToHoistInstr(I1, SkipFlagsBB1) && + all_of(OtherSuccIterPairRange, [=](const auto &Pair) { + Instruction *I2 = &*Pair.first; + unsigned SkipFlagsBB2 = Pair.second; + // Even if the instructions are identical, it may not + // be safe to hoist them if we have skipped over + // instructions with side effects or their operands + // weren't hoisted. + return isSafeToHoistInstr(I2, SkipFlagsBB2) && + shouldHoistCommonInstructions(I1, I2, TTI); + }); + } + + if (AllInstsAreIdentical) { + BB1ItrPair.first++; + if (isa(I1)) { // The debug location is an integral part of a debug info intrinsic // and can't be separated from it or replaced. Instead of attempting // to merge locations, simply hoist both copies of the intrinsic. 
- I1->moveBeforePreserving(BI); - I2->moveBeforePreserving(BI); - Changed = true; + I1->moveBeforePreserving(TI); + for (auto &SuccIter : OtherSuccIterRange) { + auto *I2 = &*SuccIter++; + assert(isa(I2)); + I2->moveBeforePreserving(TI); + } } else { // For a normal instruction, we just move one to right before the // branch, then replace all uses of the other with the first. Finally, // we remove the now redundant second instruction. - I1->moveBeforePreserving(BI); - if (!I2->use_empty()) - I2->replaceAllUsesWith(I1); - I1->andIRFlags(I2); - combineMetadataForCSE(I1, I2, true); - - // I1 and I2 are being combined into a single instruction. Its debug - // location is the merged locations of the original instructions. - I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); - - I2->eraseFromParent(); + I1->moveBeforePreserving(TI); + BB->splice(TI->getIterator(), BB1, I1->getIterator()); + for (auto &SuccIter : OtherSuccIterRange) { + Instruction *I2 = &*SuccIter++; + assert(I2 != I1); + if (!I2->use_empty()) + I2->replaceAllUsesWith(I1); + I1->andIRFlags(I2); + combineMetadataForCSE(I1, I2, true); + // I1 and I2 are being combined into a single instruction. Its debug + // location is the merged locations of the original instructions. + I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); + I2->eraseFromParent(); + } } + if (!Changed) + NumHoistCommonCode += SuccIterPairs.size(); Changed = true; - ++NumHoistCommonInstrs; + NumHoistCommonInstrs += SuccIterPairs.size(); } else { if (NumSkipped >= HoistCommonSkipLimit) return Changed; // We are about to skip over a pair of non-identical instructions. Record // if any have characteristics that would prevent reordering instructions // across them. 
- SkipFlagsBB1 |= skippedInstrFlags(I1); - SkipFlagsBB2 |= skippedInstrFlags(I2); + for (auto &SuccIterPair : SuccIterPairs) { + Instruction *I = &*SuccIterPair.first++; + SuccIterPair.second |= skippedInstrFlags(I); + } ++NumSkipped; } - - I1 = &*BB1_Itr++; - I2 = &*BB2_Itr++; - // Skip debug info if it is not identical. - DbgInfoIntrinsic *DBI1 = dyn_cast(I1); - DbgInfoIntrinsic *DBI2 = dyn_cast(I2); - if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { - while (isa(I1)) - I1 = &*BB1_Itr++; - while (isa(I2)) - I2 = &*BB2_Itr++; - } } +} - return Changed; +bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf( + Instruction *TI, Instruction *I1, + SmallVectorImpl &OtherSuccTIs) { + + auto *BI = dyn_cast(TI); + + bool Changed = false; + BasicBlock *TIParent = TI->getParent(); + BasicBlock *BB1 = I1->getParent(); -HoistTerminator: - // It may not be possible to hoist an invoke. + // Use only for an if statement. + auto *I2 = *OtherSuccTIs.begin(); + auto *BB2 = I2->getParent(); + if (BI) { + assert(OtherSuccTIs.size() == 1); + assert(BI->getSuccessor(0) == I1->getParent()); + assert(BI->getSuccessor(1) == I2->getParent()); + } + + // In the case of an if statement, we try to hoist an invoke. // FIXME: Can we define a safety predicate for CallBr? - if (isa(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) - return Changed; + // FIXME: Test case llvm/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll + // removed in 4c923b3b3fd0ac1edebf0603265ca3ba51724937 commit? + if (isa(I1) && (!BI || !isSafeToHoistInvoke(BB1, BB2, I1, I2))) + return false; // TODO: callbr hoisting currently disabled pending further study. 
if (isa(I1)) - return Changed; + return false; for (BasicBlock *Succ : successors(BB1)) { for (PHINode &PN : Succ->phis()) { Value *BB1V = PN.getIncomingValueForBlock(BB1); - Value *BB2V = PN.getIncomingValueForBlock(BB2); - if (BB1V == BB2V) - continue; + for (Instruction *OtherSuccTI : OtherSuccTIs) { + Value *BB2V = PN.getIncomingValueForBlock(OtherSuccTI->getParent()); + if (BB1V == BB2V) + continue; - // Check for passingValueIsAlwaysUndefined here because we would rather - // eliminate undefined control flow then converting it to a select. - if (passingValueIsAlwaysUndefined(BB1V, &PN) || - passingValueIsAlwaysUndefined(BB2V, &PN)) - return Changed; + // In the case of an if statement, check for + // passingValueIsAlwaysUndefined here because we would rather eliminate + // undefined control flow then converting it to a select. + if (!BI || passingValueIsAlwaysUndefined(BB1V, &PN) || + passingValueIsAlwaysUndefined(BB2V, &PN)) + return false; + } } } // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); - NT->insertInto(BIParent, BI->getIterator()); + NT->insertInto(TIParent, TI->getIterator()); if (!NT->getType()->isVoidTy()) { I1->replaceAllUsesWith(NT); - I2->replaceAllUsesWith(NT); + for (Instruction *OtherSuccTI : OtherSuccTIs) + OtherSuccTI->replaceAllUsesWith(NT); NT->takeName(I1); } Changed = true; - ++NumHoistCommonInstrs; + NumHoistCommonInstrs += OtherSuccTIs.size() + 1; // Ensure terminator gets a debug location, even an unknown one, in case // it involves inlinable calls. - NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); + SmallVector Locs; + Locs.push_back(I1->getDebugLoc()); + for (auto *OtherSuccTI : OtherSuccTIs) + Locs.push_back(OtherSuccTI->getDebugLoc()); + NT->setDebugLoc(DILocation::getMergedLocations(Locs)); // PHIs created below will adopt NT's merged DebugLoc. IRBuilder Builder(NT); - // Hoisting one of the terminators from our successor is a great thing. 
- // Unfortunately, the successors of the if/else blocks may have PHI nodes in - // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI - // nodes, so we insert select instruction to compute the final result. - std::map, SelectInst *> InsertedSelects; - for (BasicBlock *Succ : successors(BB1)) { - for (PHINode &PN : Succ->phis()) { - Value *BB1V = PN.getIncomingValueForBlock(BB1); - Value *BB2V = PN.getIncomingValueForBlock(BB2); - if (BB1V == BB2V) - continue; + // In the case of an if statement, hoisting one of the terminators from our + // successor is a great thing. Unfortunately, the successors of the if/else + // blocks may have PHI nodes in them. If they do, all PHI entries for BB1/BB2 + // must agree for all PHI nodes, so we insert select instruction to compute + // the final result. + if (BI) { + std::map, SelectInst *> InsertedSelects; + for (BasicBlock *Succ : successors(BB1)) { + for (PHINode &PN : Succ->phis()) { + Value *BB1V = PN.getIncomingValueForBlock(BB1); + Value *BB2V = PN.getIncomingValueForBlock(BB2); + if (BB1V == BB2V) + continue; - // These values do not agree. Insert a select instruction before NT - // that determines the right value. - SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (!SI) { - // Propagate fast-math-flags from phi node to its replacement select. - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - if (isa(PN)) - Builder.setFastMathFlags(PN.getFastMathFlags()); - - SI = cast( - Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, - BB1V->getName() + "." + BB2V->getName(), BI)); - } + // These values do not agree. Insert a select instruction before NT + // that determines the right value. + SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; + if (!SI) { + // Propagate fast-math-flags from phi node to its replacement select. 
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + if (isa(PN)) + Builder.setFastMathFlags(PN.getFastMathFlags()); + + SI = cast(Builder.CreateSelect( + BI->getCondition(), BB1V, BB2V, + BB1V->getName() + "." + BB2V->getName(), BI)); + } - // Make the PHI node use the select for all incoming values for BB1/BB2 - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) - if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2) - PN.setIncomingValue(i, SI); + // Make the PHI node use the select for all incoming values for BB1/BB2 + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2) + PN.setIncomingValue(i, SI); + } } } @@ -1734,16 +1807,16 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly) { // Update any PHI nodes in our new successors. for (BasicBlock *Succ : successors(BB1)) { - AddPredecessorToBlock(Succ, BIParent, BB1); + AddPredecessorToBlock(Succ, TIParent, BB1); if (DTU) - Updates.push_back({DominatorTree::Insert, BIParent, Succ}); + Updates.push_back({DominatorTree::Insert, TIParent, Succ}); } if (DTU) - for (BasicBlock *Succ : successors(BI)) - Updates.push_back({DominatorTree::Delete, BIParent, Succ}); + for (BasicBlock *Succ : successors(TI)) + Updates.push_back({DominatorTree::Delete, TIParent, Succ}); - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(TI); if (DTU) DTU->applyUpdates(Updates); return Changed; @@ -2777,8 +2850,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB, Value *OrigV = PN.getIncomingValueForBlock(BB); Value *ThenV = PN.getIncomingValueForBlock(ThenBB); - // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf. - // Skip PHIs which are trivial. + // FIXME: Try to remove some of the duplication with + // hoistCommonCodeFromSuccessors. Skip PHIs which are trivial. 
if (ThenV == OrigV) continue; @@ -6815,6 +6888,10 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (ReduceSwitchRange(SI, Builder, DL, TTI)) return requestResimplify(); + if (HoistCommon && + hoistCommonCodeFromSuccessors(SI->getParent(), !Options.HoistCommonInsts)) + return requestResimplify(); + return false; } @@ -7081,7 +7158,8 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // can hoist it up to the branching block. if (BI->getSuccessor(0)->getSinglePredecessor()) { if (BI->getSuccessor(1)->getSinglePredecessor()) { - if (HoistCommon && HoistThenElseCodeToIf(BI, !Options.HoistCommonInsts)) + if (HoistCommon && hoistCommonCodeFromSuccessors( + BI->getParent(), !Options.HoistCommonInsts)) return requestResimplify(); } else { // If Successor #1 has multiple preds, we may be able to conditionally diff --git a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll index 0dfb5764f55b8..15657730c2cdc 100644 --- a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll +++ b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll @@ -70,7 +70,7 @@ entry: i64 4, label %sw.bb4 ] sw.bb0: - call void asm sideeffect "", ""() + call void asm sideeffect "nop", ""() ret void sw.bb1: call void asm sideeffect "", ""() diff --git a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll index 0b634fac8c640..4088ecfc81898 100644 --- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll +++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll @@ -19,21 +19,9 @@ F: ; preds = %0 define void @foo_switch(i64 %C, ptr %P) { ; CHECK-LABEL: @foo_switch( -; CHECK-NEXT: switch i64 [[C:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: common.ret: -; CHECK-NEXT: ret void -; CHECK: bb0: +; CHECK-NEXT: common.ret: ; CHECK-NEXT: store i32 7, ptr 
[[P:%.*]], align 4 -; CHECK-NEXT: br label [[COMMON_RET:%.*]] -; CHECK: bb1: -; CHECK-NEXT: store i32 7, ptr [[P]], align 4 -; CHECK-NEXT: br label [[COMMON_RET]] -; CHECK: bb2: -; CHECK-NEXT: store i32 7, ptr [[P]], align 4 -; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK-NEXT: ret void ; switch i64 %C, label %bb0 [ i64 1, label %bb1 diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll index 8cb6339713d6f..d948737c16c92 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll @@ -4,25 +4,8 @@ define i1 @common_instr_with_unreachable(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: @common_instr_with_unreachable( ; CHECK-NEXT: start: -; CHECK-NEXT: switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [ -; CHECK-NEXT: i64 0, label [[BB0:%.*]] -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: unreachable: -; CHECK-NEXT: unreachable -; CHECK: bb0: ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] -; CHECK-NEXT: ret i1 [[RESULT]] +; CHECK-NEXT: ret i1 [[TMP0]] ; start: switch i64 %a, label %unreachable [ @@ -54,43 +37,90 @@ exit: ; preds = %bb2, %bb1, %bb0 define i1 @common_instr_with_unreachable_2(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: @common_instr_with_unreachable_2( ; CHECK-NEXT: start: -; CHECK-NEXT: switch i64 [[A:%.*]], label [[BB1:%.*]] [ +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: ret i1 [[TMP0]] +; +start: 
+ switch i64 %a, label %bb1 [ + i64 0, label %bb0 + i64 1, label %unreachable + i64 2, label %bb2 + ] + +unreachable: + unreachable + +bb0: ; preds = %start + %0 = icmp eq i64 %b, %c + br label %exit + +bb1: ; preds = %start + %1 = icmp eq i64 %b, %c + br label %exit + +bb2: ; preds = %start + %2 = icmp eq i64 %b, %c + br label %exit + +exit: ; preds = %bb2, %bb1, %bb0 + %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ] + ret i1 %result +} + +declare void @no_return() +declare void @foo() + +define i1 @not_only_unreachable(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: @not_only_unreachable( +; CHECK-NEXT: start: +; CHECK-NEXT: switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [ ; CHECK-NEXT: i64 0, label [[BB0:%.*]] +; CHECK-NEXT: i64 1, label [[BB1:%.*]] ; CHECK-NEXT: i64 2, label [[BB2:%.*]] ; CHECK-NEXT: ] +; CHECK: unreachable: +; CHECK-NEXT: call void @no_return() +; CHECK-NEXT: unreachable ; CHECK: bb0: ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[EXIT]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] +; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: ret i1 [[RESULT]] ; start: - switch i64 %a, label %bb1 [ + switch i64 %a, label %unreachable [ i64 0, label %bb0 - i64 1, label %unreachable + i64 1, label %bb1 i64 2, label %bb2 ] unreachable: + call void @no_return() unreachable bb0: ; preds = %start %0 = icmp eq i64 %b, %c + call void @foo() br label %exit bb1: ; preds = %start %1 = icmp eq i64 %b, %c + call void @foo() br label %exit bb2: ; preds = %start %2 = icmp eq i64 %b, %c + call void @foo() br label %exit exit: ; preds = %bb2, %bb1, %bb0 diff --git 
a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll index 43fb8faad7cfd..bfe31d8345d50 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll @@ -26,27 +26,11 @@ F: ; preds = %0 define void @test_switch(i64 %i, ptr %Q) { ; CHECK-LABEL: @test_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: common.ret: -; CHECK-NEXT: ret void -; CHECK: bb0: +; CHECK-NEXT: common.ret: ; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 ; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NEXT: call void @bar(i32 [[A]]) -; CHECK-NEXT: br label [[COMMON_RET:%.*]] -; CHECK: bb1: -; CHECK-NEXT: store i32 1, ptr [[Q]], align 4 -; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4 -; CHECK-NEXT: call void @bar(i32 [[B]]) -; CHECK-NEXT: br label [[COMMON_RET]] -; CHECK: bb2: -; CHECK-NEXT: store i32 1, ptr [[Q]], align 4 -; CHECK-NEXT: [[C:%.*]] = load i32, ptr [[Q]], align 4 -; CHECK-NEXT: call void @bar(i32 [[C]]) -; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ i64 1, label %bb1 @@ -69,25 +53,41 @@ bb2: ; preds = %0 ret void } -define i1 @common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr { -; CHECK-LABEL: @common_instr_on_switch( -; CHECK-NEXT: start: -; CHECK-NEXT: switch i64 [[A:%.*]], label [[BB0:%.*]] [ +; We ensure that we examine all instructions during each iteration to confirm the presence of a terminating one. 
+define void @test_switch_reach_terminator(i64 %i, ptr %p) { +; CHECK-LABEL: @test_switch_reach_terminator( +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ ; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: i64 2, label [[COMMON_RET:%.*]] ; CHECK-NEXT: ] +; CHECK: common.ret: +; CHECK-NEXT: ret void ; CHECK: bb0: -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ] -; CHECK-NEXT: ret i1 [[RESULT]] +; CHECK-NEXT: store i32 2, ptr [[P]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] +bb0: ; preds = %0 + store i32 1, ptr %p + ret void +bb1: ; preds = %0 + store i32 2, ptr %p + ret void +bb2: ; preds = %0 + ret void +} + +define i1 @common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr { +; CHECK-LABEL: @common_instr_on_switch( +; CHECK-NEXT: start: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: ret i1 [[TMP0]] ; start: switch i64 %a, label %bb0 [ diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll index 93e822b589b14..1e06abf62c9fe 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll @@ -48,6 +48,68 @@ if.end: ret void } +define void @f0_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f0_switch( +; CHECK-NEXT: entry: +; 
CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2 +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i16 [[TMP0]], 1 +; CHECK-NEXT: [[U:%.*]] = add i16 [[ADD]], [[TMP1]] +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i16 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[SUB]], 3 +; CHECK-NEXT: [[V:%.*]] = add i16 [[SUB]], [[TMP2]] +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i16 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[SUB2]], 3 +; CHECK-NEXT: [[W:%.*]] = add i16 [[SUB2]], [[TMP3]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + %add = add nsw i16 %0, 1 + %1 = load i16, ptr %m, align 2 + %u = add i16 %add, %1 + br label %end + +bb1: + %2 = load i16, ptr %b, align 2 + %sub = sub nsw i16 %2, 1 + %3 = load i16, ptr %m, align 2 + %4 = add i16 %sub, 3 + %v = add i16 %sub, %4 + br label %end + +bb2: + %5 = load i16, ptr %b, align 2 + %sub2 = sub nsw i16 %5, 1 + %6 = load i16, ptr %m, align 2 + %7 = add i16 %sub2, 3 + %w = add i16 %sub2, %7 + br label %end + +end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, ptr %d, align 2 + ret void +} ;; Check some instructions (e.g. add) can be reordered across instructions with side ;; effects, while others (e.g. load) can't. 
@@ -97,6 +159,70 @@ if.end: ret void } +define void @f2_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f2_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1 +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: call void @side_effects0() +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2 +; CHECK-NEXT: [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]] +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @no_side_effects0() +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[M]], align 2 +; CHECK-NEXT: [[V:%.*]] = add i16 [[ADD_0]], [[TMP2]] +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: call void @no_side_effects0() +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[M]], align 2 +; CHECK-NEXT: [[W:%.*]] = add i16 [[ADD_0]], [[TMP3]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + call void @side_effects0() + %add.0 = add nsw i16 %0, 1 + %1 = load i16, ptr %m, align 2 + %u = add i16 %add.0, %1 + br label %end + +bb1: + %2 = load i16, ptr %b, align 2 + call void @no_side_effects0() + %add.1 = add nsw i16 %2, 1 + %3 = load i16, ptr %m, align 2 + %v = add i16 %add.1, %3 + br label %end + +bb2: + %4 = load i16, ptr %b, align 2 + call void @no_side_effects0() + %add.2 = add nsw i16 %4, 1 + %5 = load i16, ptr %m, align 2 + %w = add i16 %add.2, %5 + br label %end + +end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, 
ptr %d, align 2 + ret void +} ;; Check indeed it was the side effects that prevented hoisting the load ;; in the previous test. @@ -143,6 +269,67 @@ if.end: ret void } +define void @f3_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f3_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2 +; CHECK-NEXT: [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]] +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: call void @no_side_effects0() +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @no_side_effects1() +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: call void @no_side_effects1() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + call void @no_side_effects0() + %add.0 = add nsw i16 %0, 1 + %1 = load i16, ptr %m, align 2 + %u = add i16 %add.0, %1 + br label %end + +bb1: + %2 = load i16, ptr %b, align 2 + call void @no_side_effects1() + %add.1 = add nsw i16 %2, 1 + %3 = load i16, ptr %m, align 2 + %v = add i16 %add.1, %3 + br label %end + +bb2: + %4 = load i16, ptr %b, align 2 + call void @no_side_effects1() + %add.2 = add nsw i16 %4, 1 + %5 = load i16, ptr %m, align 2 + %w = add i16 %add.2, %5 + br label %end + +end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, ptr %d, align 2 + ret void +} + ;; Check some instructions (e.g. 
sdiv) are not speculatively executed. ;; Division by non-zero constant OK to speculate ... @@ -186,6 +373,63 @@ if.end: ret void } +define void @f4_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f4_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[DIV_0:%.*]] = sdiv i16 [[TMP0]], 2 +; CHECK-NEXT: [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]] +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: call void @side_effects0() +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @side_effects1() +; CHECK-NEXT: br label [[IF_END]] +; CHECK: bb2: +; CHECK-NEXT: call void @side_effects1() +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + call void @side_effects0() + %div.0 = sdiv i16 %0, 2 + %u = add i16 %div.0, %0 + br label %if.end + +bb1: + %1 = load i16, ptr %b, align 2 + call void @side_effects1() + %div.1 = sdiv i16 %1, 2 + %v = add i16 %div.1, %1 + br label %if.end + +bb2: + %2 = load i16, ptr %b, align 2 + call void @side_effects1() + %div.2 = sdiv i16 %2, 2 + %w = add i16 %div.2, %2 + br label %if.end + +if.end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, ptr %d, align 2 + ret void +} + ;; ... but not a general division ... 
define void @f5(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: @f5( @@ -230,6 +474,67 @@ if.end: ret void } +define void @f5_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f5_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: call void @side_effects0() +; CHECK-NEXT: [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]] +; CHECK-NEXT: [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]] +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @side_effects1() +; CHECK-NEXT: [[DIV_1:%.*]] = sdiv i16 211, [[TMP0]] +; CHECK-NEXT: [[V:%.*]] = add i16 [[DIV_1]], [[TMP0]] +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: call void @side_effects1() +; CHECK-NEXT: [[DIV_2:%.*]] = sdiv i16 211, [[TMP0]] +; CHECK-NEXT: [[W:%.*]] = add i16 [[DIV_2]], [[TMP0]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + call void @side_effects0() + %div.0 = sdiv i16 211, %0 + %u = add i16 %div.0, %0 + br label %end + +bb1: + %1 = load i16, ptr %b, align 2 + call void @side_effects1() + %div.1 = sdiv i16 211, %1 + %v = add i16 %div.1, %1 + br label %end + +bb2: + %2 = load i16, ptr %b, align 2 + call void @side_effects1() + %div.2 = sdiv i16 211, %2 + %w = add i16 %div.2, %2 + br label %end + +end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, ptr %d, align 2 + ret void +} + ;; ... 
and it's also OK to hoist the division when there's no speculation happening. define void @f6(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: @f6( @@ -271,6 +576,63 @@ if.end: ret void } +define void @f6_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) { +; CHECK-LABEL: @f6_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]] +; CHECK-NEXT: [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]] +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: call void @no_side_effects0() +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @no_side_effects1() +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: call void @no_side_effects1() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ] +; CHECK-NEXT: store i16 [[UV]], ptr [[D:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %0 = load i16, ptr %b, align 2 + call void @no_side_effects0() + %div.0 = sdiv i16 211, %0 + %u = add i16 %div.0, %0 + br label %end + +bb1: + %1 = load i16, ptr %b, align 2 + call void @no_side_effects1() + %div.1 = sdiv i16 211, %1 + %v = add i16 %div.1, %1 + br label %end + +bb2: + %2 = load i16, ptr %b, align 2 + call void @no_side_effects1() + %div.2 = sdiv i16 211, %2 + %w = add i16 %div.2, %2 + br label %end + +end: + %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ] + store i16 %uv, ptr %d, align 2 + ret void +} + ;; No reorder of store over a load. 
define i16 @f7(i1 %c, ptr %a, ptr %b) { ; CHECK-LABEL: @f7( @@ -306,6 +668,55 @@ if.end: ret i16 %v } +define i16 @f7_switch(i64 %i, ptr %a, ptr %b) { +; CHECK-LABEL: @f7_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2 +; CHECK-NEXT: store i16 0, ptr [[B:%.*]], align 2 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[VB:%.*]] = load i16, ptr [[B]], align 2 +; CHECK-NEXT: store i16 0, ptr [[B]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: [[VC:%.*]] = load i16, ptr [[B]], align 2 +; CHECK-NEXT: store i16 0, ptr [[B]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ] +; CHECK-NEXT: ret i16 [[V]] +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %va = load i16, ptr %a, align 2 + store i16 0, ptr %b, align 2 + br label %end + +bb1: + %vb = load i16, ptr %b, align 2 + store i16 0, ptr %b, align 2 + br label %end + +bb2: + %vc = load i16, ptr %b, align 2 + store i16 0, ptr %b, align 2 + br label %end + +end: + %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ] + ret i16 %v +} + ;; Can reorder load over another load define i16 @f8(i1 %cond, ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @f8( @@ -346,6 +757,59 @@ if.end: ret i16 %w } +define i16 @f8_switch(i64 %i, ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: @f8_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2 +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: 
+; CHECK-NEXT: [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: [[VC:%.*]] = load i16, ptr [[B]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ] +; CHECK-NEXT: [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_0]], [[BB1]] ], [ [[C_0]], [[BB2]] ] +; CHECK-NEXT: [[W:%.*]] = add i16 [[V]], [[U]] +; CHECK-NEXT: ret i16 [[W]] +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %va = load i16, ptr %a, align 2 + %c.0 = load i16, ptr %c + br label %end + +bb1: + %vb = load i16, ptr %b, align 2 + %c.1 = load i16, ptr %c + br label %end + +bb2: + %vc = load i16, ptr %b, align 2 + %c.2 = load i16, ptr %c + br label %end + +end: + %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ] + %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ] + + %w = add i16 %v, %u + + ret i16 %w +} + ;; Currently won't reorder volatile and non-volatile loads. 
define i16 @f9(i1 %cond, ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @f9( @@ -387,6 +851,61 @@ if.end: ret i16 %w } +define i16 @f9_switch(i64 %i, ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: @f9_switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[VA:%.*]] = load volatile i16, ptr [[A:%.*]], align 2 +; CHECK-NEXT: [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2 +; CHECK-NEXT: [[C_1:%.*]] = load i16, ptr [[C]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: [[VC:%.*]] = load i16, ptr [[B]], align 2 +; CHECK-NEXT: [[C_2:%.*]] = load i16, ptr [[C]], align 2 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ] +; CHECK-NEXT: [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_1]], [[BB1]] ], [ [[C_2]], [[BB2]] ] +; CHECK-NEXT: [[W:%.*]] = add i16 [[V]], [[U]] +; CHECK-NEXT: ret i16 [[W]] +; +entry: + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %va = load volatile i16, ptr %a, align 2 + %c.0 = load i16, ptr %c + br label %end + +bb1: + %vb = load i16, ptr %b, align 2 + %c.1 = load i16, ptr %c + br label %end + +bb2: + %vc = load i16, ptr %b, align 2 + %c.2 = load i16, ptr %c + br label %end + +end: + %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ] + %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ] + + %w = add i16 %v, %u + + ret i16 %w +} + ;; Don't hoist stacksaves across inalloca allocas define void @f10(i1 %cond) { ; CHECK-LABEL: @f10( @@ -438,6 +957,79 @@ end: ret void } +define void @f10_switch(i64 %i) { +; CHECK-LABEL: @f10_switch( +; CHECK-NEXT: [[SS:%.*]] = call ptr @llvm.stacksave.p0() +; CHECK-NEXT: switch i64 [[I:%.*]], label 
[[BB0:%.*]] [ +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[I1:%.*]] = alloca inalloca i32, align 4 +; CHECK-NEXT: [[SS2:%.*]] = call ptr @llvm.stacksave.p0() +; CHECK-NEXT: [[I2:%.*]] = alloca inalloca i64, align 8 +; CHECK-NEXT: call void @inalloca_i64(ptr inalloca(i64) [[I2]]) +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[SS2]]) +; CHECK-NEXT: call void @inalloca_i32(ptr inalloca(i32) [[I1]]) +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[I3:%.*]] = alloca inalloca i64, align 8 +; CHECK-NEXT: [[SS3:%.*]] = call ptr @llvm.stacksave.p0() +; CHECK-NEXT: [[I4:%.*]] = alloca inalloca i64, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I4]]) +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[SS3]]) +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I3]]) +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: [[I5:%.*]] = alloca inalloca i64, align 8 +; CHECK-NEXT: [[SS4:%.*]] = call ptr @llvm.stacksave.p0() +; CHECK-NEXT: [[I6:%.*]] = alloca inalloca i64, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I6]]) +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[SS4]]) +; CHECK-NEXT: [[TMP4:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I5]]) +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[SS]]) +; CHECK-NEXT: ret void +; + %ss = call ptr @llvm.stacksave() + switch i64 %i, label %bb0 [ + i64 1, label %bb1 + i64 2, label %bb2 + ] + +bb0: + %i1 = alloca inalloca i32 + %ss2 = call ptr @llvm.stacksave() + %i2 = alloca inalloca i64 + call void @inalloca_i64(ptr inalloca(i64) %i2) + call void @llvm.stackrestore(ptr %ss2) + call void @inalloca_i32(ptr inalloca(i32) %i1) + br label %end + +bb1: + %i3 = alloca inalloca i64 + %ss3 = call ptr @llvm.stacksave() + %i4 = alloca inalloca i64 + call ptr @inalloca_i64(ptr 
inalloca(i64) %i4) + call void @llvm.stackrestore(ptr %ss3) + call ptr @inalloca_i64(ptr inalloca(i64) %i3) + br label %end + +bb2: + %i5 = alloca inalloca i64 + %ss4 = call ptr @llvm.stacksave() + %i6 = alloca inalloca i64 + call ptr @inalloca_i64(ptr inalloca(i64) %i6) + call void @llvm.stackrestore(ptr %ss4) + call ptr @inalloca_i64(ptr inalloca(i64) %i5) + br label %end + +end: + call void @llvm.stackrestore(ptr %ss) + ret void +} + declare void @side_effects0() declare void @side_effects1() declare void @no_side_effects0() readonly nounwind willreturn diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll index b53224c944f11..90daf38e39d52 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll @@ -21,20 +21,8 @@ out: define void @hoist_range_switch(i64 %i, ptr %p) { ; CHECK-LABEL: @hoist_range_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: +; CHECK-NEXT: out: ; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !range [[RNG1:![0-9]+]] -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG2:![0-9]+]] -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG3:![0-9]+]] -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -57,7 +45,7 @@ out: define void @hoist_both_noundef(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_both_noundef( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2 ; CHECK-NEXT: ret void ; if: @@ -78,20 +66,8 @@ out: define void @hoist_both_noundef_switch(i64 %i, ptr %p) { ; CHECK-LABEL: 
@hoist_both_noundef_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: -; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: +; CHECK-NEXT: out: +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2 ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -134,20 +110,8 @@ out: define void @hoist_one_noundef_switch(i64 %i, ptr %p) { ; CHECK-LABEL: @hoist_one_noundef_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: -; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4 -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = load i8, ptr [[P]], align 1 -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4 -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: +; CHECK-NEXT: out: +; CHECK-NEXT: [[T:%.*]] = load i8, ptr [[P:%.*]], align 1 ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -170,7 +134,7 @@ out: define void @hoist_dereferenceable(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5 +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3 ; CHECK-NEXT: ret void ; if: @@ -187,20 +151,8 @@ out: define void @hoist_dereferenceable_switch(i64 %i, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, 
label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5 -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !6 -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !7 -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: +; CHECK-NEXT: out: +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3 ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -223,7 +175,7 @@ out: define void @hoist_dereferenceable_or_null(i1 %c, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable_or_null( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !5 +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3 ; CHECK-NEXT: ret void ; if: @@ -240,20 +192,8 @@ out: define void @hoist_dereferenceable_or_null_switch(i64 %i, ptr %p) { ; CHECK-LABEL: @hoist_dereferenceable_or_null_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: -; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !6 -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !5 -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !7 -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: +; CHECK-NEXT: out: +; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3 ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -277,7 +217,7 @@ out: define i32 @speculate_range(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_range( ; 
CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG8:![0-9]+]] +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG4:![0-9]+]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], i32 [[V]], i32 0 ; CHECK-NEXT: ret i32 [[SPEC_SELECT]] ; @@ -298,7 +238,7 @@ join: define ptr @speculate_nonnull(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_nonnull( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !4 +; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !2 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null ; CHECK-NEXT: ret ptr [[SPEC_SELECT]] ; @@ -319,7 +259,7 @@ join: define ptr @speculate_align(i1 %c, ptr dereferenceable(8) align 8 %p) { ; CHECK-LABEL: @speculate_align( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !9 +; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !5 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null ; CHECK-NEXT: ret ptr [[SPEC_SELECT]] ; @@ -338,7 +278,7 @@ join: define void @hoist_fpmath(i1 %c, double %x) { ; CHECK-LABEL: @hoist_fpmath( ; CHECK-NEXT: if: -; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10 +; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6 ; CHECK-NEXT: ret void ; if: @@ -355,20 +295,8 @@ out: define void @hoist_fpmath_switch(i64 %i, double %x) { ; CHECK-LABEL: @hoist_fpmath_switch( -; CHECK-NEXT: switch i64 [[I:%.*]], label [[BB0:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb0: -; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10 -; CHECK-NEXT: br label [[OUT:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[E:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !11 -; CHECK-NEXT: br label [[OUT]] -; CHECK: bb2: -; CHECK-NEXT: [[F:%.*]] = fadd 
double [[X]], 1.000000e+00, !fpmath !12 -; CHECK-NEXT: br label [[OUT]] -; CHECK: out: +; CHECK-NEXT: out: +; CHECK-NEXT: [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6 ; CHECK-NEXT: ret void ; switch i64 %i, label %bb0 [ @@ -394,16 +322,10 @@ out: !3 = !{ i8 7, i8 9 } ;. ; CHECK: [[RNG0]] = !{i8 0, i8 1, i8 3, i8 5} -; CHECK: [[RNG1]] = !{i8 0, i8 1} -; CHECK: [[RNG2]] = !{i8 3, i8 5} -; CHECK: [[RNG3]] = !{i8 7, i8 9} -; CHECK: [[META4:![0-9]+]] = !{} -; CHECK: [[META5:![0-9]+]] = !{i64 10} -; CHECK: [[META6:![0-9]+]] = !{i64 20} -; CHECK: [[META7:![0-9]+]] = !{i64 30} -; CHECK: [[RNG8]] = !{i32 0, i32 10} -; CHECK: [[META9:![0-9]+]] = !{i64 4} -; CHECK: [[META10:![0-9]+]] = !{float 2.500000e+00} -; CHECK: [[META11:![0-9]+]] = !{float 5.000000e+00} -; CHECK: [[META12:![0-9]+]] = !{float 7.500000e+00} +; CHECK: [[RNG1]] = !{i8 0, i8 1, i8 3, i8 5, i8 7, i8 9} +; CHECK: [[META2:![0-9]+]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i64 10} +; CHECK: [[RNG4]] = !{i32 0, i32 10} +; CHECK: [[META5:![0-9]+]] = !{i64 4} +; CHECK: [[META6:![0-9]+]] = !{float 2.500000e+00} ;. From 64573da4bf0bc7883e6d4d3debc60daf665d78b8 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 19 Sep 2023 16:36:30 -0700 Subject: [PATCH 23/57] [IR] Add "Large Data Threshold" module metadata (#66797) This allows us to not have to pass -mllvm flags to set the large data threshold for (in-LLD/not-distributed) ThinLTO. Follows https://reviews.llvm.org/D52322, which did the same for the code model. Since the large data threshold is tied to the code model and we disallow mixing different code models, do the same for the large data threshold. 
--- llvm/include/llvm/IR/Module.h | 11 ++++++++++ llvm/lib/IR/Module.cpp | 17 +++++++++++++++ llvm/lib/LTO/LTOBackend.cpp | 5 +++++ .../test/LTO/X86/Inputs/largedatathreshold.ll | 10 +++++++++ llvm/test/LTO/X86/largedatathreshold-1.ll | 21 +++++++++++++++++++ llvm/test/LTO/X86/largedatathreshold-2.ll | 20 ++++++++++++++++++ llvm/test/LTO/X86/largedatathreshold-3.ll | 20 ++++++++++++++++++ 7 files changed, 104 insertions(+) create mode 100644 llvm/test/LTO/X86/Inputs/largedatathreshold.ll create mode 100644 llvm/test/LTO/X86/largedatathreshold-1.ll create mode 100644 llvm/test/LTO/X86/largedatathreshold-2.ll create mode 100644 llvm/test/LTO/X86/largedatathreshold-3.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 61b19409a96d0..70beddddc1c16 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -922,6 +922,17 @@ class LLVM_EXTERNAL_VISIBILITY Module { void setCodeModel(CodeModel::Model CL); /// @} + /// @} + /// @name Utility function for querying and setting the large data threshold + /// @{ + + /// Returns the code model (tiny, small, kernel, medium or large model) + std::optional getLargeDataThreshold() const; + + /// Set the code model (tiny, small, kernel, medium or large) + void setLargeDataThreshold(uint64_t Threshold); + /// @} + /// @name Utility functions for querying and setting PGO summary /// @{ diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 5861bbd1f293e..dba660bbe5baf 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -631,6 +631,23 @@ void Module::setCodeModel(CodeModel::Model CL) { addModuleFlag(ModFlagBehavior::Error, "Code Model", CL); } +std::optional Module::getLargeDataThreshold() const { + auto *Val = + cast_or_null(getModuleFlag("Large Data Threshold")); + + if (!Val) + return std::nullopt; + + return cast(Val->getValue())->getZExtValue(); +} + +void Module::setLargeDataThreshold(uint64_t Threshold) { + // Since the large data threshold goes 
along with the code model, the merge + // behavior is the same. + addModuleFlag(ModFlagBehavior::Error, "Large Data Threshold", + ConstantInt::get(Type::getInt64Ty(Context), Threshold)); +} + void Module::setProfileSummary(Metadata *M, ProfileSummary::Kind Kind) { if (Kind == ProfileSummary::PSK_CSInstr) setModuleFlag(ModFlagBehavior::Error, "CSProfileSummary", M); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 29e2887676081..ccc4276e36dac 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -225,7 +225,12 @@ createTargetMachine(const Config &Conf, const Target *TheTarget, Module &M) { std::unique_ptr TM(TheTarget->createTargetMachine( TheTriple, Conf.CPU, Features.getString(), Conf.Options, RelocModel, CodeModel, Conf.CGOptLevel)); + assert(TM && "Failed to create target machine"); + + if (std::optional LargeDataThreshold = M.getLargeDataThreshold()) + TM->setLargeDataThreshold(*LargeDataThreshold); + return TM; } diff --git a/llvm/test/LTO/X86/Inputs/largedatathreshold.ll b/llvm/test/LTO/X86/Inputs/largedatathreshold.ll new file mode 100644 index 0000000000000..34174deb78c0e --- /dev/null +++ b/llvm/test/LTO/X86/Inputs/largedatathreshold.ll @@ -0,0 +1,10 @@ +target triple = "x86_64-unknown-linux-gnu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +define void @bar() { + ret void +} +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"Code Model", i32 3} +!1 = !{i32 1, !"Large Data Threshold", i32 101} diff --git a/llvm/test/LTO/X86/largedatathreshold-1.ll b/llvm/test/LTO/X86/largedatathreshold-1.ll new file mode 100644 index 0000000000000..e3be5c11baaac --- /dev/null +++ b/llvm/test/LTO/X86/largedatathreshold-1.ll @@ -0,0 +1,21 @@ +; RUN: llvm-as %s -o %t.o +; RUN: llvm-lto2 run -r %t.o,_start,px %t.o -o %t.s +; RUN: llvm-objdump -d %t.s.0 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" +target datalayout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@data = internal constant [20 x i8] zeroinitializer + +define ptr @_start() { +entry: +; CHECK-LABEL: <_start>: +; CHECK: leaq (%rip), %rax +; CHECK-NOT: movabsq + ret ptr @data +} + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"Code Model", i32 3} +!1 = !{i32 1, !"Large Data Threshold", i32 100} diff --git a/llvm/test/LTO/X86/largedatathreshold-2.ll b/llvm/test/LTO/X86/largedatathreshold-2.ll new file mode 100644 index 0000000000000..103c066b744d0 --- /dev/null +++ b/llvm/test/LTO/X86/largedatathreshold-2.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as %s -o %t.o +; RUN: llvm-lto2 run -r %t.o,_start,px %t.o -o %t.s +; RUN: llvm-objdump -d %t.s.0 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@data = internal constant [20 x i8] zeroinitializer + +define ptr @_start() { +entry: +; CHECK-LABEL: <_start>: +; CHECK: movabsq $0x0, %rax + ret ptr @data +} + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"Code Model", i32 3} +!1 = !{i32 1, !"Large Data Threshold", i32 10} diff --git a/llvm/test/LTO/X86/largedatathreshold-3.ll b/llvm/test/LTO/X86/largedatathreshold-3.ll new file mode 100644 index 0000000000000..3c0653db334d8 --- /dev/null +++ b/llvm/test/LTO/X86/largedatathreshold-3.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as %s -o %t0.o +; RUN: llvm-as < %p/Inputs/largedatathreshold.ll > %t1.o +; RUN: not llvm-lto2 run -r %t0.o,_start,px -r %t1.o,bar,px %t0.o %t1.o -o %t2.s 2>&1 | FileCheck %s + +; CHECK: 'Large Data Threshold': IDs have conflicting values + +target triple = "x86_64-unknown-linux-gnu" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@data = internal constant [20 x i8] zeroinitializer + +define ptr @_start() { +entry: + ret ptr @data +} + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"Code Model", i32 3} +!1 = !{i32 1, !"Large Data 
Threshold", i32 100} From 74338bfe0cfec8b8db24af131cdeb664e346a1b5 Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Tue, 19 Sep 2023 16:44:48 -0700 Subject: [PATCH 24/57] A test was changing directory and then incorrectly restoring the directory to the "testdir" which is the build directory for that test, not the original source directory. That caused subsequent tests to fail. --- lldb/test/API/commands/process/attach/TestProcessAttach.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/commands/process/attach/TestProcessAttach.py b/lldb/test/API/commands/process/attach/TestProcessAttach.py index 67bb134ec0148..99ff3a4ec4db8 100644 --- a/lldb/test/API/commands/process/attach/TestProcessAttach.py +++ b/lldb/test/API/commands/process/attach/TestProcessAttach.py @@ -74,7 +74,8 @@ def test_attach_to_process_from_different_dir_by_id(self): popen = self.spawnSubprocess(exe) os.chdir(newdir) - self.addTearDownHook(lambda: os.chdir(testdir)) + sourcedir = self.getSourceDir() + self.addTearDownHook(lambda: os.chdir(sourcedir)) self.runCmd("process attach -p " + str(popen.pid)) target = self.dbg.GetSelectedTarget() From bfa3bc437809e0496a79a5ff690212e05b053ab3 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 19 Sep 2023 17:02:32 -0700 Subject: [PATCH 25/57] [mlir][sparse] unifies sparse_tensor.sort_coo/sort into one operation. (#66722) The use cases of the two operations are largely overlapped, let's simplify it and only use one of them. 
--- .../SparseTensor/IR/SparseTensorOps.td | 75 +---- .../SparseTensor/IR/SparseTensorDialect.cpp | 45 +-- .../Transforms/SparseBufferRewriting.cpp | 314 ++++++++---------- .../Transforms/SparseTensorCodegen.cpp | 12 +- .../Transforms/SparseTensorPasses.cpp | 1 - .../Transforms/SparseTensorRewriting.cpp | 38 +-- .../SparseTensor/buffer_rewriting.mlir | 105 ++---- mlir/test/Dialect/SparseTensor/codegen.mlir | 6 +- .../SparseTensor/convert_sparse2sparse.mlir | 2 +- mlir/test/Dialect/SparseTensor/invalid.mlir | 47 ++- mlir/test/Dialect/SparseTensor/roundtrip.mlir | 64 +--- .../SparseTensor/sparse_matmul_codegen.mlir | 2 +- .../SparseTensor/CPU/sparse_rewrite_sort.mlir | 187 ----------- .../CPU/sparse_rewrite_sort_coo.mlir | 51 +-- 14 files changed, 269 insertions(+), 680 deletions(-) delete mode 100644 mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort.mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index 94301dbcd9f7b..59815fc755ee5 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -762,81 +762,32 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>, // Sparse Tensor Sorting Operations. //===----------------------------------------------------------------------===// -def SparseTensor_SortOp : SparseTensor_Op<"sort", [AttrSizedOperandSegments]>, - Arguments<(ins Index:$n, - Variadic>:$xs, - Variadic>:$ys, - SparseTensorSortKindAttr:$algorithm)> { - string summary = "Sorts the arrays in xs and ys lexicographically on the " - "integral values found in the xs list"; - string description = [{ - Lexicographically sort the first `n` values in `xs` along with the values in - `ys`. Conceptually, the values being sorted are tuples produced by - `zip(zip(xs), zip(ys))`. 
In particular, values in `ys` needed to be sorted - along with values in `xs`, but values in `ys` don't affect the - lexicographical order. The order in which arrays appear in `xs` affects the - sorting result. The operator updates `xs` and `ys` in place with the result - of the sorting. - - For example, assume x1=[4, 3], x2=[1, 2], y1=[10, 5], then the output of - "sort 2, x1, x2 jointly y1" are x1=[3, 4], x2=[2, 1], y1=[5, 10] while the - output of "sort 2, x2, x1, jointly y1" are x2=[1, 2], x1=[4, 3], y1=[10, 5]. - - Buffers in `xs` needs to have the same integral element type while buffers - in `ys` can have different numeric element types. All buffers in `xs` and - `ys` should have a dimension not less than `n`. The behavior of the operator - is undefined if this condition is not met. The operator requires at least - one buffer in `xs` while `ys` can be empty. - - The enum attribute `algorithm` indicates the sorting algorithm used to - implement the operator: hybrid_quick_sort, insertion_sort_stable, - quick_sort, or heap_sort. - - Note that this operation is "impure" in the sense that its behavior is - solely defined by side-effects and not SSA values. - - Example: - - ```mlir - sparse_tensor.sort insertion_sort_stable %n, %x1, %x2 jointly y1, %y2 - : memref, memref jointly memref, memref - ``` - - ```mlir - sparse_tensor.sort hybrid_quick_sort %n, %x1, %x2 jointly y1, %y2 - { alg=1 : index} - : memref, memref jointly memref, memref - ``` - }]; - let assemblyFormat = "$algorithm $n `,` $xs (`jointly` $ys^)? 
attr-dict" - "`:` type($xs) (`jointly` type($ys)^)?"; - let hasVerifier = 1; -} - def SparseTensor_SortCooOp : SparseTensor_Op<"sort_coo">, Arguments<(ins Index:$n, StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy, Variadic>:$ys, - OptionalAttr:$nx, OptionalAttr:$ny, + AffineMapAttr:$perm_map, OptionalAttr:$ny, SparseTensorSortKindAttr:$algorithm)> { let summary = "Sorts the arrays in xs and ys lexicographically on the " "integral values found in the xs list"; let description = [{ - Sparse_tensor.sort_coo is similar to sparse_tensor.sort, except that all the - `xs` values and some `ys` values are put in the linear buffer `xy`. The - optional index attribute `nx` provides the number of `xs` values in `xy`. - When `nx` is not explicitly specified, its value is 1. The optional index - attribute `ny` provides the number of `ys` values in `xy`. When `ny` is not - explicitly specified, its value is 0. This instruction supports a more - efficient way to store the COO definition in sparse tensor type. - - The buffer xy should have a dimension not less than n * (nx + ny) while the + Sparse_tensor.sort_coo sort the `xs` values along with some `ys` values + that are put in a single linear buffer `xy`. + The affine map attribute `perm_map` specifies the permutation to be applied on + the `xs` before comparison, the rank of the permutation map + also specifies the number of `xs` values in `xy`. + The optional index attribute `ny` provides the number of `ys` values in `xy`. + When `ny` is not explicitly specified, its value is 0. + This instruction supports a more efficient way to store the COO definition + in sparse tensor type. + + The buffer xy should have a dimension not less than n * (rank(perm_map) + ny) while the buffers in `ys` should have a dimension not less than `n`. The behavior of the operator is undefined if this condition is not met. 
Example: ```mlir - sparse_tensor.sort_coo insertion_sort_stable %n, %x { nx = 2 : index} + sparse_tensor.sort_coo insertion_sort_stable %n, %x { perm_map = affine_map<(i,j) -> (j,i)> } : memref ``` diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index e71d2a8dd623a..9675a61109477 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -1353,35 +1353,15 @@ LogicalResult SelectOp::verify() { return success(); } -LogicalResult SortOp::verify() { - if (getXs().empty()) - return emitError("need at least one xs buffer."); - - std::optional n = getConstantIntValue(getN()); - - Type xtp = getMemRefType(getXs().front()).getElementType(); - auto checkTypes = [&](ValueRange operands, - bool checkEleType = true) -> LogicalResult { - for (Value opnd : operands) { - auto mtp = getMemRefType(opnd); - const DynSize sh = mtp.getShape()[0]; - // We can't check the size of dynamic dimension at compile-time, but all - // xs and ys should have a dimension not less than n at runtime. - if (n && !ShapedType::isDynamic(sh) && sh < n.value()) - return emitError(llvm::formatv("xs and ys need to have a dimension >= n" - ": {0} < {1}", - sh, n.value())); - - if (checkEleType && xtp != mtp.getElementType()) - return emitError("mismatch xs element types"); - } - return success(); - }; - RETURN_FAILURE_IF_FAILED(checkTypes(getXs())) - return n ? checkTypes(getYs(), false) : success(); -} - LogicalResult SortCooOp::verify() { + AffineMap xPerm = getPermMap(); + uint64_t nx = xPerm.getNumDims(); + if (nx < 1) + emitError(llvm::formatv("Expected rank(perm_map) > 1, got {0}", nx)); + + if (!xPerm.isPermutation()) + emitError(llvm::formatv("Expected a permutation map, got {0}", xPerm)); + std::optional cn = getConstantIntValue(getN()); // We can't check the size of the buffers when n or buffer dimensions aren't // compile-time constants. 
@@ -1389,12 +1369,6 @@ LogicalResult SortCooOp::verify() { return success(); uint64_t n = cn.value(); - uint64_t nx = 1; - if (auto nxAttr = getNxAttr()) { - nx = nxAttr.getInt(); - if (nx < 1) - emitError(llvm::formatv("Expected nx > 1, got {0}", nx)); - } uint64_t ny = 0; if (auto nyAttr = getNyAttr()) { ny = nyAttr.getInt(); @@ -1409,7 +1383,8 @@ LogicalResult SortCooOp::verify() { emitError(llvm::formatv("{0} got {1} < {2}", message, sh, minSize)); }; - checkDim(getXy(), n * (nx + ny), "Expected dimension(xy) >= n * (nx + ny)"); + checkDim(getXy(), n * (nx + ny), + "Expected dimension(xy) >= n * (rank(perm_map) + ny)"); for (Value opnd : getYs()) { checkDim(opnd, n, "Expected dimension(y) >= n"); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp index 029ecb0708941..3181395a474cf 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp @@ -45,46 +45,43 @@ static constexpr const char kShiftDownFuncNamePrefix[] = "_sparse_shift_down_"; static constexpr const char kHeapSortFuncNamePrefix[] = "_sparse_heap_sort_"; static constexpr const char kQuickSortFuncNamePrefix[] = "_sparse_qsort_"; -using FuncGeneratorType = function_ref; +using FuncGeneratorType = function_ref; /// Constructs a function name with this format to facilitate quick sort: -/// __..._ for sort -/// __coo__..._ for sort_coo +/// __..._ for sort +/// __coo__..._ for sort_coo static void getMangledSortHelperFuncName(llvm::raw_svector_ostream &nameOstream, - StringRef namePrefix, uint64_t nx, - uint64_t ny, bool isCoo, - ValueRange operands) { - nameOstream << namePrefix << nx << "_" - << getMemRefType(operands[xStartIdx]).getElementType(); + StringRef namePrefix, AffineMap xPerm, + uint64_t ny, ValueRange operands) { + nameOstream << namePrefix; + for (auto res : xPerm.getResults()) + nameOstream << 
res.cast().getPosition() << "_"; - if (isCoo) - nameOstream << "_coo_" << ny; + nameOstream << getMemRefType(operands[xStartIdx]).getElementType(); + nameOstream << "_coo_" << ny; - uint64_t yBufferOffset = isCoo ? 1 : nx; + constexpr uint64_t yBufferOffset = 1; for (Value v : operands.drop_front(xStartIdx + yBufferOffset)) nameOstream << "_" << getMemRefType(v).getElementType(); } /// Looks up a function that is appropriate for the given operands being /// sorted, and creates such a function if it doesn't exist yet. The -/// parameters `nx` and `ny` tell the number of x and y values provided -/// by the buffer in xStartIdx, and `isCoo` indicates whether the instruction -/// being processed is a sparse_tensor.sort or sparse_tensor.sort_coo. +/// parameters `xPerm` and `ny` tell the number of x and y values provided +/// by the buffer in xStartIdx. // // All sorting function generators take (lo, hi, xs, ys) in `operands` as // parameters for the sorting functions. Other parameters, such as the recursive // call depth, are appended to the end of the parameter list as // "trailing parameters". 
-static FlatSymbolRefAttr -getMangledSortHelperFunc(OpBuilder &builder, func::FuncOp insertPoint, - TypeRange resultTypes, StringRef namePrefix, - uint64_t nx, uint64_t ny, bool isCoo, - ValueRange operands, FuncGeneratorType createFunc, - uint32_t nTrailingP = 0) { +static FlatSymbolRefAttr getMangledSortHelperFunc( + OpBuilder &builder, func::FuncOp insertPoint, TypeRange resultTypes, + StringRef namePrefix, AffineMap xPerm, uint64_t ny, ValueRange operands, + FuncGeneratorType createFunc, uint32_t nTrailingP = 0) { SmallString<32> nameBuffer; llvm::raw_svector_ostream nameOstream(nameBuffer); - getMangledSortHelperFuncName(nameOstream, namePrefix, nx, ny, isCoo, + getMangledSortHelperFuncName(nameOstream, namePrefix, xPerm, ny, operands.drop_back(nTrailingP)); ModuleOp module = insertPoint->getParentOfType(); @@ -101,7 +98,7 @@ getMangledSortHelperFunc(OpBuilder &builder, func::FuncOp insertPoint, loc, nameOstream.str(), FunctionType::get(context, operands.getTypes(), resultTypes)); func.setPrivate(); - createFunc(builder, module, func, nx, ny, isCoo, nTrailingP); + createFunc(builder, module, func, xPerm, ny, nTrailingP); } return result; @@ -110,27 +107,19 @@ getMangledSortHelperFunc(OpBuilder &builder, func::FuncOp insertPoint, /// Creates a code block to process each pair of (xs[i], xs[j]) for sorting. /// The code to process the value pairs is generated by `bodyBuilder`. 
static void forEachIJPairInXs( - OpBuilder &builder, Location loc, ValueRange args, uint64_t nx, uint64_t ny, - bool isCoo, function_ref bodyBuilder) { - Value iOffset, jOffset; - if (isCoo) { - Value cstep = constantIndex(builder, loc, nx + ny); - iOffset = builder.create(loc, args[0], cstep); - jOffset = builder.create(loc, args[1], cstep); - } - for (uint64_t k = 0; k < nx; k++) { - scf::IfOp ifOp; - Value i, j, buffer; - if (isCoo) { - Value ck = constantIndex(builder, loc, k); - i = builder.create(loc, ck, iOffset); - j = builder.create(loc, ck, jOffset); - buffer = args[xStartIdx]; - } else { - i = args[0]; - j = args[1]; - buffer = args[xStartIdx + k]; - } + OpBuilder &builder, Location loc, ValueRange args, AffineMap xPerm, + uint64_t ny, + function_ref bodyBuilder) { + Value cstep = constantIndex(builder, loc, xPerm.getNumResults() + ny); + Value iOffset = builder.create(loc, args[0], cstep); + Value jOffset = builder.create(loc, args[1], cstep); + for (unsigned k = 0, e = xPerm.getNumResults(); k < e; k++) { + unsigned actualK = xPerm.getResult(k).cast().getPosition(); + Value ak = constantIndex(builder, loc, actualK); + Value i = builder.create(loc, ak, iOffset); + Value j = builder.create(loc, ak, jOffset); + Value buffer = args[xStartIdx]; + bodyBuilder(k, i, j, buffer); } } @@ -138,21 +127,28 @@ static void forEachIJPairInXs( /// Creates a code block to process each pair of (xys[i], xys[j]) for sorting. /// The code to process the value pairs is generated by `bodyBuilder`. static void forEachIJPairInAllBuffers( - OpBuilder &builder, Location loc, ValueRange args, uint64_t nx, uint64_t ny, - bool isCoo, function_ref bodyBuilder) { - - // Create code for the first (nx + ny) buffers. When isCoo==true, these - // logical buffers are all from the xy buffer of the sort_coo operator. 
- forEachIJPairInXs(builder, loc, args, nx + ny, 0, isCoo, bodyBuilder); + OpBuilder &builder, Location loc, ValueRange args, AffineMap xPerm, + uint64_t ny, + function_ref bodyBuilder) { + + // Create code for the first (xPerm + ny) buffers. + SmallVector exps(xPerm.getResults().begin(), + xPerm.getResults().end()); + for (unsigned y = 0; y < ny; y++) { + exps.push_back(builder.getAffineDimExpr(y + xPerm.getNumResults())); + } + AffineMap xyPerm = AffineMap::get(exps.size(), 0, exps, builder.getContext()); + assert(xyPerm.isPermutation()); - uint64_t numHandledBuffers = isCoo ? 1 : nx + ny; + forEachIJPairInXs(builder, loc, args, xyPerm, 0, bodyBuilder); + constexpr uint64_t numHandledBuffers = 1; // Create code for the remaining buffers. Value i = args[0]; Value j = args[1]; for (const auto &arg : llvm::enumerate(args.drop_front(xStartIdx + numHandledBuffers))) { - bodyBuilder(arg.index() + nx + ny, i, j, arg.value()); + bodyBuilder(arg.index() + xPerm.getNumResults() + ny, i, j, arg.value()); } } @@ -168,7 +164,7 @@ static void forEachIJPairInAllBuffers( // ... // swap(yn[i], yn[j]); static void createSwap(OpBuilder &builder, Location loc, ValueRange args, - uint64_t nx, uint64_t ny, bool isCoo) { + AffineMap xPerm, uint64_t ny) { auto swapOnePair = [&](uint64_t unused, Value i, Value j, Value buffer) { Value vi = builder.create(loc, buffer, i); Value vj = builder.create(loc, buffer, j); @@ -176,20 +172,20 @@ static void createSwap(OpBuilder &builder, Location loc, ValueRange args, builder.create(loc, vi, buffer, j); }; - forEachIJPairInAllBuffers(builder, loc, args, nx, ny, isCoo, swapOnePair); + forEachIJPairInAllBuffers(builder, loc, args, xPerm, ny, swapOnePair); } /// Creates code to compare all the (xs[i], xs[j]) pairs. The method to compare /// each pair is create via `compareBuilder`. 
static Value createInlinedCompareImplementation( - OpBuilder &builder, Location loc, ValueRange args, uint64_t nx, uint64_t ny, - bool isCoo, + OpBuilder &builder, Location loc, ValueRange args, AffineMap xPerm, + uint64_t ny, function_ref compareBuilder) { Value result; auto bodyBuilder = [&](uint64_t k, Value i, Value j, Value buffer) { bool isFirstDim = (k == 0); - bool isLastDim = (k == nx - 1); + bool isLastDim = (k == xPerm.getNumResults() - 1); Value val = compareBuilder(builder, loc, i, j, buffer, isFirstDim, isLastDim); if (isFirstDim) { @@ -202,7 +198,7 @@ static Value createInlinedCompareImplementation( } }; - forEachIJPairInXs(builder, loc, args, nx, ny, isCoo, bodyBuilder); + forEachIJPairInXs(builder, loc, args, xPerm, ny, bodyBuilder); builder.setInsertionPointAfterValue(result); return result; @@ -252,12 +248,12 @@ static Value createEqCompare(OpBuilder &builder, Location loc, Value i, Value j, // else if (x2[2] != x2[j])) // and so on ... static Value createInlinedEqCompare(OpBuilder &builder, Location loc, - ValueRange args, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP = 0) { + ValueRange args, AffineMap xPerm, + uint64_t ny, uint32_t nTrailingP = 0) { // Compare functions don't use trailing parameters. (void)nTrailingP; assert(nTrailingP == 0); - return createInlinedCompareImplementation(builder, loc, args, nx, ny, isCoo, + return createInlinedCompareImplementation(builder, loc, args, xPerm, ny, createEqCompare); } @@ -306,12 +302,12 @@ static Value createLessThanCompare(OpBuilder &builder, Location loc, Value i, // else // and so on ... static Value createInlinedLessThan(OpBuilder &builder, Location loc, - ValueRange args, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP = 0) { + ValueRange args, AffineMap xPerm, + uint64_t ny, uint32_t nTrailingP = 0) { // Compare functions don't use trailing parameters. 
(void)nTrailingP; assert(nTrailingP == 0); - return createInlinedCompareImplementation(builder, loc, args, nx, ny, isCoo, + return createInlinedCompareImplementation(builder, loc, args, xPerm, ny, createLessThanCompare); } @@ -329,8 +325,8 @@ static Value createInlinedLessThan(OpBuilder &builder, Location loc, // return lo; // static void createBinarySearchFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP = 0) { + func::FuncOp func, AffineMap xPerm, + uint64_t ny, uint32_t nTrailingP = 0) { // Binary search doesn't use trailing parameters. (void)nTrailingP; assert(nTrailingP == 0); @@ -368,11 +364,10 @@ static void createBinarySearchFunc(OpBuilder &builder, ModuleOp module, // Compare xs[p] < xs[mid]. SmallVector compareOperands{p, mid}; - uint64_t numXBuffers = isCoo ? 1 : nx; + constexpr uint64_t numXBuffers = 1; compareOperands.append(args.begin() + xStartIdx, args.begin() + xStartIdx + numXBuffers); - Value cond2 = - createInlinedLessThan(builder, loc, compareOperands, nx, ny, isCoo); + Value cond2 = createInlinedLessThan(builder, loc, compareOperands, xPerm, ny); // Update lo and hi for the WhileOp as follows: // if (xs[p] < xs[mid])) // hi = mid; @@ -392,10 +387,11 @@ static void createBinarySearchFunc(OpBuilder &builder, ModuleOp module, /// while (xs[i] > xs[p]) i += step (step < 0) /// The routine returns i as well as a boolean value to indicate whether /// xs[i] == xs[p]. 
-static std::pair -createScanLoop(OpBuilder &builder, ModuleOp module, func::FuncOp func, - ValueRange xs, Value i, Value p, uint64_t nx, uint64_t ny, - bool isCoo, int step) { +static std::pair createScanLoop(OpBuilder &builder, + ModuleOp module, + func::FuncOp func, ValueRange xs, + Value i, Value p, AffineMap xPerm, + uint64_t ny, int step) { Location loc = func.getLoc(); scf::WhileOp whileOp = builder.create(loc, TypeRange{i.getType()}, ValueRange{i}); @@ -413,8 +409,7 @@ createScanLoop(OpBuilder &builder, ModuleOp module, func::FuncOp func, compareOperands.push_back(before->getArgument(0)); } compareOperands.append(xs.begin(), xs.end()); - Value cond = - createInlinedLessThan(builder, loc, compareOperands, nx, ny, isCoo); + Value cond = createInlinedLessThan(builder, loc, compareOperands, xPerm, ny); builder.create(loc, cond, before->getArguments()); Block *after = @@ -429,7 +424,7 @@ createScanLoop(OpBuilder &builder, ModuleOp module, func::FuncOp func, compareOperands[0] = i; compareOperands[1] = p; Value compareEq = - createInlinedEqCompare(builder, loc, compareOperands, nx, ny, isCoo); + createInlinedEqCompare(builder, loc, compareOperands, xPerm, ny); return std::make_pair(whileOp.getResult(0), compareEq); } @@ -438,67 +433,63 @@ createScanLoop(OpBuilder &builder, ModuleOp module, func::FuncOp func, /// if compareFunc(data[b], data[a]) returns true. The new insertion point is /// right after the swap instructions. static scf::IfOp createCompareThenSwap(OpBuilder &builder, Location loc, - uint64_t nx, uint64_t ny, bool isCoo, + AffineMap xPerm, uint64_t ny, SmallVectorImpl &swapOperands, SmallVectorImpl &compareOperands, Value a, Value b) { // Compare(data[b], data[a]). 
compareOperands[0] = b; compareOperands[1] = a; - Value cond = - createInlinedLessThan(builder, loc, compareOperands, nx, ny, isCoo); + Value cond = createInlinedLessThan(builder, loc, compareOperands, xPerm, ny); scf::IfOp ifOp = builder.create(loc, cond, /*else=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); swapOperands[0] = b; swapOperands[1] = a; - createSwap(builder, loc, swapOperands, nx, ny, isCoo); + createSwap(builder, loc, swapOperands, xPerm, ny); return ifOp; } /// Creates code to insert the 3rd element to a list of two sorted elements. -static void createInsert3rd(OpBuilder &builder, Location loc, uint64_t nx, - uint64_t ny, bool isCoo, - SmallVectorImpl &swapOperands, +static void createInsert3rd(OpBuilder &builder, Location loc, AffineMap xPerm, + uint64_t ny, SmallVectorImpl &swapOperands, SmallVectorImpl &compareOperands, Value v0, Value v1, Value v2) { - scf::IfOp ifOp = createCompareThenSwap(builder, loc, nx, ny, isCoo, - swapOperands, compareOperands, v1, v2); - createCompareThenSwap(builder, loc, nx, ny, isCoo, swapOperands, - compareOperands, v0, v1); + scf::IfOp ifOp = createCompareThenSwap(builder, loc, xPerm, ny, swapOperands, + compareOperands, v1, v2); + createCompareThenSwap(builder, loc, xPerm, ny, swapOperands, compareOperands, + v0, v1); builder.setInsertionPointAfter(ifOp); } /// Creates code to sort 3 elements. -static void createSort3(OpBuilder &builder, Location loc, uint64_t nx, - uint64_t ny, bool isCoo, - SmallVectorImpl &swapOperands, +static void createSort3(OpBuilder &builder, Location loc, AffineMap xPerm, + uint64_t ny, SmallVectorImpl &swapOperands, SmallVectorImpl &compareOperands, Value v0, Value v1, Value v2) { // Sort the first 2 elements. 
- scf::IfOp ifOp1 = createCompareThenSwap( - builder, loc, nx, ny, isCoo, swapOperands, compareOperands, v0, v1); + scf::IfOp ifOp1 = createCompareThenSwap(builder, loc, xPerm, ny, swapOperands, + compareOperands, v0, v1); builder.setInsertionPointAfter(ifOp1); // Insert the 3th element. - createInsert3rd(builder, loc, nx, ny, isCoo, swapOperands, compareOperands, - v0, v1, v2); + createInsert3rd(builder, loc, xPerm, ny, swapOperands, compareOperands, v0, + v1, v2); } /// Creates code to sort 5 elements. -static void createSort5(OpBuilder &builder, Location loc, uint64_t nx, - uint64_t ny, bool isCoo, - SmallVectorImpl &swapOperands, +static void createSort5(OpBuilder &builder, Location loc, AffineMap xPerm, + uint64_t ny, SmallVectorImpl &swapOperands, SmallVectorImpl &compareOperands, Value v0, Value v1, Value v2, Value v3, Value v4) { // Sort the first 3 elements. - createSort3(builder, loc, nx, ny, isCoo, swapOperands, compareOperands, v0, - v1, v2); + createSort3(builder, loc, xPerm, ny, swapOperands, compareOperands, v0, v1, + v2); auto insert4th = [&]() { scf::IfOp ifOp = createCompareThenSwap( - builder, loc, nx, ny, isCoo, swapOperands, compareOperands, v2, v3); - createInsert3rd(builder, loc, nx, ny, isCoo, swapOperands, compareOperands, - v0, v1, v2); + builder, loc, xPerm, ny, swapOperands, compareOperands, v2, v3); + createInsert3rd(builder, loc, xPerm, ny, swapOperands, compareOperands, v0, + v1, v2); builder.setInsertionPointAfter(ifOp); }; @@ -506,8 +497,8 @@ static void createSort5(OpBuilder &builder, Location loc, uint64_t nx, insert4th(); // Insert the 5th element. 
- scf::IfOp ifOp = createCompareThenSwap(builder, loc, nx, ny, isCoo, - swapOperands, compareOperands, v3, v4); + scf::IfOp ifOp = createCompareThenSwap(builder, loc, xPerm, ny, swapOperands, + compareOperands, v3, v4); insert4th(); builder.setInsertionPointAfter(ifOp); } @@ -517,11 +508,10 @@ static void createSort5(OpBuilder &builder, Location loc, uint64_t nx, /// the number of values in range [lo, hi) is more than a threshold, we also /// include the middle of [lo, mi) and [mi, hi) and sort a total of five values. static void createChoosePivot(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, Value lo, Value hi, Value mi, - ValueRange args) { + func::FuncOp func, AffineMap xPerm, uint64_t ny, + Value lo, Value hi, Value mi, ValueRange args) { SmallVector compareOperands{mi, lo}; - uint64_t numXBuffers = isCoo ? 1 : nx; + constexpr uint64_t numXBuffers = 1; compareOperands.append(args.begin() + xStartIdx, args.begin() + xStartIdx + numXBuffers); SmallVector swapOperands{mi, lo}; @@ -537,8 +527,8 @@ static void createChoosePivot(OpBuilder &builder, ModuleOp module, // When len < 1000, choose pivot from median of 3 values. builder.setInsertionPointToStart(&lenIf.getThenRegion().front()); - createSort3(builder, loc, nx, ny, isCoo, swapOperands, compareOperands, lo, - mi, hi); + createSort3(builder, loc, xPerm, ny, swapOperands, compareOperands, lo, mi, + hi); // When len >= 1000, choose pivot from median of 5 values. builder.setInsertionPointToStart(&lenIf.getElseRegion().front()); @@ -549,8 +539,8 @@ static void createChoosePivot(OpBuilder &builder, ModuleOp module, Value b = builder.create(loc, mi, hiP1); // Value b is the middle between [mi, hi]. 
b = builder.create(loc, b, c1); - createSort5(builder, loc, nx, ny, isCoo, swapOperands, compareOperands, lo, a, - mi, b, hi); + createSort5(builder, loc, xPerm, ny, swapOperands, compareOperands, lo, a, mi, + b, hi); builder.setInsertionPointAfter(lenIf); } @@ -586,8 +576,8 @@ static void createChoosePivot(OpBuilder &builder, ModuleOp module, // } // } static void createPartitionFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP = 0) { + func::FuncOp func, AffineMap xPerm, uint64_t ny, + uint32_t nTrailingP = 0) { // Quick sort partition doesn't use trailing parameters. (void)nTrailingP; assert(nTrailingP == 0); @@ -606,7 +596,7 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, Value i = lo; Value j = builder.create(loc, hi, c1); - createChoosePivot(builder, module, func, nx, ny, isCoo, i, j, p, args); + createChoosePivot(builder, module, func, xPerm, ny, i, j, p, args); Value trueVal = constantI1(builder, loc, true); // The value for while (true) SmallVector operands{i, j, p, trueVal}; // Exactly four values. SmallVector types{i.getType(), j.getType(), p.getType(), @@ -628,14 +618,14 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, j = after->getArgument(1); p = after->getArgument(2); - uint64_t numXBuffers = isCoo ? 
1 : nx; + constexpr uint64_t numXBuffers = 1; auto [iresult, iCompareEq] = createScanLoop(builder, module, func, args.slice(xStartIdx, numXBuffers), - i, p, nx, ny, isCoo, 1); + i, p, xPerm, ny, 1); i = iresult; auto [jresult, jCompareEq] = createScanLoop(builder, module, func, args.slice(xStartIdx, numXBuffers), - j, p, nx, ny, isCoo, -1); + j, p, xPerm, ny, -1); j = jresult; // If i < j: @@ -645,7 +635,7 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); SmallVector swapOperands{i, j}; swapOperands.append(args.begin() + xStartIdx, args.end()); - createSwap(builder, loc, swapOperands, nx, ny, isCoo); + createSwap(builder, loc, swapOperands, xPerm, ny); // If the pivot is moved, update p with the new pivot. Value icond = builder.create(loc, arith::CmpIPredicate::eq, i, p); @@ -737,8 +727,8 @@ static Value createSubTwoDividedByTwo(OpBuilder &builder, Location loc, // } // static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP) { + func::FuncOp func, AffineMap xPerm, uint64_t ny, + uint32_t nTrailingP) { // The value n is passed in as a trailing parameter. assert(nTrailingP == 1); OpBuilder::InsertionGuard insertionGuard(builder); @@ -768,7 +758,7 @@ static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, builder.setInsertionPointToStart(&ifNc.getThenRegion().front()); Value c1 = constantIndex(builder, loc, 1); SmallVector compareOperands{start, start}; - uint64_t numXBuffers = isCoo ? 
1 : nx; + constexpr uint64_t numXBuffers = 1; compareOperands.append(args.begin() + xStartIdx, args.begin() + xStartIdx + numXBuffers); @@ -794,7 +784,7 @@ static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, compareOperands[0] = lChildIdx; compareOperands[1] = rChildIdx; Value cond2 = - createInlinedLessThan(builder, loc, compareOperands, nx, ny, isCoo); + createInlinedLessThan(builder, loc, compareOperands, xPerm, ny); scf::IfOp if2 = builder.create(loc, ifTypes, cond2, /*else=*/true); builder.setInsertionPointToStart(&if2.getThenRegion().front()); @@ -825,8 +815,7 @@ static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, childIdx = before->getArgument(2); compareOperands[0] = start; compareOperands[1] = childIdx; - Value cond = - createInlinedLessThan(builder, loc, compareOperands, nx, ny, isCoo); + Value cond = createInlinedLessThan(builder, loc, compareOperands, xPerm, ny); builder.create(loc, cond, before->getArguments()); // The after-region of the WhileOp. @@ -836,7 +825,7 @@ static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, childIdx = after->getArgument(2); SmallVector swapOperands{start, childIdx}; swapOperands.append(args.begin() + xStartIdx, args.end()); - createSwap(builder, loc, swapOperands, nx, ny, isCoo); + createSwap(builder, loc, swapOperands, xPerm, ny); start = childIdx; Value cond2 = builder.create(loc, arith::CmpIPredicate::uge, t, child); @@ -869,8 +858,8 @@ static void createShiftDownFunc(OpBuilder &builder, ModuleOp module, // shiftdown(lo, lo, l-1) // } static void createHeapSortFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP) { + func::FuncOp func, AffineMap xPerm, uint64_t ny, + uint32_t nTrailingP) { // Heap sort function doesn't have trailing parameters. 
(void)nTrailingP; assert(nTrailingP == 0); @@ -897,7 +886,7 @@ static void createHeapSortFunc(OpBuilder &builder, ModuleOp module, shiftDownOperands.append(args.begin() + xStartIdx, args.end()); shiftDownOperands.push_back(n); FlatSymbolRefAttr shiftDownFunc = getMangledSortHelperFunc( - builder, func, TypeRange(), kShiftDownFuncNamePrefix, nx, ny, isCoo, + builder, func, TypeRange(), kShiftDownFuncNamePrefix, xPerm, ny, shiftDownOperands, createShiftDownFunc, /*nTrailingP=*/1); builder.create(loc, shiftDownFunc, TypeRange(), shiftDownOperands); @@ -912,7 +901,7 @@ static void createHeapSortFunc(OpBuilder &builder, ModuleOp module, loplm1 = builder.create(loc, loplm1, c1); SmallVector swapOperands{lo, loplm1}; swapOperands.append(args.begin() + xStartIdx, args.end()); - createSwap(builder, loc, swapOperands, nx, ny, isCoo); + createSwap(builder, loc, swapOperands, xPerm, ny); shiftDownOperands[1] = lo; shiftDownOperands[shiftDownOperands.size() - 1] = builder.create(loc, l, c1); @@ -928,7 +917,7 @@ static void createHeapSortFunc(OpBuilder &builder, ModuleOp module, /// the bigger partition to be processed by the enclosed while-loop. static std::pair createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, - ValueRange args, uint64_t nx, uint64_t ny, bool isCoo, + ValueRange args, AffineMap xPerm, uint64_t ny, uint32_t nTrailingP) { MLIRContext *context = module.getContext(); Location loc = func.getLoc(); @@ -937,8 +926,8 @@ createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, SmallVector types(2, lo.getType()); // Only two types. 
FlatSymbolRefAttr partitionFunc = getMangledSortHelperFunc( - builder, func, {IndexType::get(context)}, kPartitionFuncNamePrefix, nx, - ny, isCoo, args.drop_back(nTrailingP), createPartitionFunc); + builder, func, {IndexType::get(context)}, kPartitionFuncNamePrefix, xPerm, + ny, args.drop_back(nTrailingP), createPartitionFunc); Value p = builder .create(loc, partitionFunc, TypeRange{IndexType::get(context)}, @@ -1008,8 +997,8 @@ createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, // } // } static void createSortStableFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP) { + func::FuncOp func, AffineMap xPerm, + uint64_t ny, uint32_t nTrailingP) { // Stable sort function doesn't use trailing parameters. (void)nTrailingP; assert(nTrailingP == 0); @@ -1034,8 +1023,8 @@ static void createSortStableFunc(OpBuilder &builder, ModuleOp module, SmallVector operands{lo, i}; operands.append(args.begin() + xStartIdx, args.end()); FlatSymbolRefAttr searchFunc = getMangledSortHelperFunc( - builder, func, {IndexType::get(context)}, kBinarySearchFuncNamePrefix, nx, - ny, isCoo, operands, createBinarySearchFunc); + builder, func, {IndexType::get(context)}, kBinarySearchFuncNamePrefix, + xPerm, ny, operands, createBinarySearchFunc); Value p = builder .create(loc, searchFunc, TypeRange{c1.getType()}, operands) @@ -1045,7 +1034,7 @@ static void createSortStableFunc(OpBuilder &builder, ModuleOp module, operands[0] = operands[1] = i; SmallVector d; forEachIJPairInAllBuffers( - builder, loc, operands, nx, ny, isCoo, + builder, loc, operands, xPerm, ny, [&](uint64_t unused, Value i, Value unused2, Value buffer) { d.push_back(builder.create(loc, buffer, i)); }); @@ -1061,7 +1050,7 @@ static void createSortStableFunc(OpBuilder &builder, ModuleOp module, operands[1] = imj; operands[0] = builder.create(loc, imj, c1); forEachIJPairInAllBuffers( - builder, loc, operands, nx, ny, isCoo, + builder, loc, 
operands, xPerm, ny, [&](uint64_t unused, Value imjm1, Value imj, Value buffer) { Value t = builder.create(loc, buffer, imjm1); builder.create(loc, t, buffer, imj); @@ -1071,7 +1060,7 @@ static void createSortStableFunc(OpBuilder &builder, ModuleOp module, builder.setInsertionPointAfter(forOpJ); operands[0] = operands[1] = p; forEachIJPairInAllBuffers( - builder, loc, operands, nx, ny, isCoo, + builder, loc, operands, xPerm, ny, [&](uint64_t k, Value p, Value usused, Value buffer) { builder.create(loc, d[k], buffer, p); }); @@ -1123,8 +1112,8 @@ static void createSortStableFunc(OpBuilder &builder, ModuleOp module, // } // static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, - func::FuncOp func, uint64_t nx, uint64_t ny, - bool isCoo, uint32_t nTrailingP) { + func::FuncOp func, AffineMap xPerm, uint64_t ny, + uint32_t nTrailingP) { assert(nTrailingP == 1 || nTrailingP == 0); bool isHybrid = (nTrailingP == 1); OpBuilder::InsertionGuard insertionGuard(builder); @@ -1173,7 +1162,7 @@ static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, // When len <= limit. builder.setInsertionPointToStart(&lenIf.getThenRegion().front()); FlatSymbolRefAttr insertionSortFunc = getMangledSortHelperFunc( - builder, func, TypeRange(), kSortStableFuncNamePrefix, nx, ny, isCoo, + builder, func, TypeRange(), kSortStableFuncNamePrefix, xPerm, ny, ValueRange(args).drop_back(nTrailingP), createSortStableFunc); builder.create(loc, insertionSortFunc, TypeRange(), ValueRange(args).drop_back(nTrailingP)); @@ -1193,7 +1182,7 @@ static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, // When depth exceeds limit. 
builder.setInsertionPointToStart(&depthIf.getThenRegion().front()); FlatSymbolRefAttr heapSortFunc = getMangledSortHelperFunc( - builder, func, TypeRange(), kHeapSortFuncNamePrefix, nx, ny, isCoo, + builder, func, TypeRange(), kHeapSortFuncNamePrefix, xPerm, ny, ValueRange(args).drop_back(nTrailingP), createHeapSortFunc); builder.create(loc, heapSortFunc, TypeRange(), ValueRange(args).drop_back(nTrailingP)); @@ -1203,7 +1192,7 @@ static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, builder.setInsertionPointToStart(&depthIf.getElseRegion().front()); args.back() = depthLimit; std::tie(lo, hi) = - createQuickSort(builder, module, func, args, nx, ny, isCoo, nTrailingP); + createQuickSort(builder, module, func, args, xPerm, ny, nTrailingP); builder.create(loc, ValueRange{lo, hi}); builder.setInsertionPointAfter(depthIf); @@ -1216,7 +1205,7 @@ static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, hi = lenIf.getResult(1); } else { std::tie(lo, hi) = - createQuickSort(builder, module, func, args, nx, ny, isCoo, nTrailingP); + createQuickSort(builder, module, func, args, xPerm, ny, nTrailingP); } // New [lo, hi) for the next while-loop iteration. @@ -1229,9 +1218,8 @@ static void createQuickSortFunc(OpBuilder &builder, ModuleOp module, /// Implements the rewriting for operator sort and sort_coo. 
template -LogicalResult matchAndRewriteSortOp(OpTy op, ValueRange xys, uint64_t nx, - uint64_t ny, bool isCoo, - PatternRewriter &rewriter) { +LogicalResult matchAndRewriteSortOp(OpTy op, ValueRange xys, AffineMap xPerm, + uint64_t ny, PatternRewriter &rewriter) { Location loc = op.getLoc(); SmallVector operands{constantIndex(rewriter, loc, 0), op.getN()}; @@ -1285,8 +1273,8 @@ LogicalResult matchAndRewriteSortOp(OpTy op, ValueRange xys, uint64_t nx, } FlatSymbolRefAttr func = - getMangledSortHelperFunc(rewriter, insertPoint, TypeRange(), funcName, nx, - ny, isCoo, operands, funcGenerator, nTrailingP); + getMangledSortHelperFunc(rewriter, insertPoint, TypeRange(), funcName, + xPerm, ny, operands, funcGenerator, nTrailingP); rewriter.replaceOpWithNewOp(op, func, TypeRange(), operands); return success(); } @@ -1296,7 +1284,6 @@ LogicalResult matchAndRewriteSortOp(OpTy op, ValueRange xys, uint64_t nx, //===---------------------------------------------------------------------===// namespace { - /// Sparse rewriting rule for the push_back operator. struct PushBackRewriter : OpRewritePattern { public: @@ -1410,20 +1397,6 @@ struct PushBackRewriter : OpRewritePattern { bool enableBufferInitialization; }; -/// Sparse rewriting rule for the sort operator. -struct SortRewriter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(SortOp op, - PatternRewriter &rewriter) const override { - SmallVector xys(op.getXs()); - xys.append(op.getYs().begin(), op.getYs().end()); - return matchAndRewriteSortOp(op, xys, op.getXs().size(), /*ny=*/0, - /*isCoo=*/false, rewriter); - } -}; - /// Sparse rewriting rule for the sort_coo operator. 
struct SortCooRewriter : public OpRewritePattern { public: @@ -1434,16 +1407,13 @@ struct SortCooRewriter : public OpRewritePattern { SmallVector xys; xys.push_back(op.getXy()); xys.append(op.getYs().begin(), op.getYs().end()); - uint64_t nx = 1; - if (auto nxAttr = op.getNxAttr()) - nx = nxAttr.getInt(); + auto xPerm = op.getPermMap(); uint64_t ny = 0; if (auto nyAttr = op.getNyAttr()) ny = nyAttr.getInt(); - return matchAndRewriteSortOp(op, xys, nx, ny, - /*isCoo=*/true, rewriter); + return matchAndRewriteSortOp(op, xys, xPerm, ny, rewriter); } }; @@ -1457,5 +1427,5 @@ void mlir::populateSparseBufferRewriting(RewritePatternSet &patterns, bool enableBufferInitialization) { patterns.add(patterns.getContext(), enableBufferInitialization); - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index 557c5c471c4a7..4419c39c69927 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -890,8 +890,9 @@ class SparseCompressConverter : public OpConversionPattern { // If the innermost level is ordered, we need to sort the coordinates // in the "added" array prior to applying the compression. 
if (dstType.isOrderedLvl(dstType.getLvlRank() - 1)) - rewriter.create(loc, count, ValueRange{added}, ValueRange{}, - SparseTensorSortKind::HybridQuickSort); + rewriter.create( + loc, count, added, ValueRange{}, rewriter.getMultiDimIdentityMap(1), + rewriter.getIndexAttr(0), SparseTensorSortKind::HybridQuickSort); // While performing the insertions, we also need to reset the elements // of the values/filled-switch by only iterating over the set elements, // to ensure that the runtime complexity remains proportional to the @@ -1486,9 +1487,10 @@ struct SparseNewOpConverter : public OpConversionPattern { scf::IfOp ifOp = rewriter.create(loc, notSorted, /*else*/ false); rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); - rewriter.create( - loc, nse, xs, ValueRange{ys}, rewriter.getIndexAttr(lvlRank), - rewriter.getIndexAttr(0), SparseTensorSortKind::HybridQuickSort); + auto xPerm = rewriter.getMultiDimIdentityMap(lvlRank); + rewriter.create(loc, nse, xs, ValueRange{ys}, xPerm, + rewriter.getIndexAttr(0), + SparseTensorSortKind::HybridQuickSort); rewriter.setInsertionPointAfter(ifOp); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp index ca7d8a7850b0b..7d2f0c7f139cd 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp @@ -207,7 +207,6 @@ struct SparseTensorCodegenPass ConversionTarget target(*ctx); // Most ops in the sparse dialect must go! target.addIllegalDialect(); - target.addLegalOp(); target.addLegalOp(); target.addLegalOp(); // Storage specifier outlives sparse tensor pipeline. 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index 47f7dad08c8c9..277903dc55b74 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -1206,29 +1206,23 @@ struct ConvertRewriter : public OpRewritePattern { // Retrieve the values-array. Value y = genToValues(rewriter, loc, src); const auto encSrc = srcTp.getEncoding(); - // Sort the COO tensor so that its elements are ordered via increasing - // coordinates for the storage ordering of the dst tensor. Use SortCoo - // if the COO tensor has the same ordering as the dst tensor. - if (dimRank > 1 && srcTp.hasSameDimToLvl(dstTp)) { - Value xs = genToCoordinatesBuffer(rewriter, loc, src); - rewriter.create( - loc, nnz, xs, ValueRange{y}, rewriter.getIndexAttr(dimRank), - rewriter.getIndexAttr(0), SparseTensorSortKind::HybridQuickSort); - } else { - // Gather the coordinates-arrays in the dst tensor storage order. - SmallVector xs(dstLvlRank); - const Level srcLvlRank = srcTp.getLvlRank(); - for (Level srcLvl = 0; srcLvl < srcLvlRank; srcLvl++) { - // FIXME: `toOrigDim` is deprecated - Dimension dim = toOrigDim(encSrc, srcLvl); - // FIXME: `toStoredDim` is deprecated - Level dstLvl = toStoredDim(encDst, dim); - xs[dstLvl] = - genToCoordinates(rewriter, loc, src, srcLvl, /*cooStart=*/0); - } - rewriter.create(loc, nnz, xs, ValueRange{y}, - SparseTensorSortKind::HybridQuickSort); + // Builds the dstLvl -> srcLvl permutation maps. 
+ SmallVector es(dstLvlRank); + const Level srcLvlRank = srcTp.getLvlRank(); + for (Level srcLvl = 0; srcLvl < srcLvlRank; srcLvl++) { + // FIXME: `toOrigDim` is deprecated + Dimension dim = toOrigDim(encSrc, srcLvl); + // FIXME: `toStoredDim` is deprecated + Level dstLvl = toStoredDim(encDst, dim); + es[dstLvl] = rewriter.getAffineDimExpr(srcLvl); } + auto xPerm = AffineMap::get(dstLvlRank, 0, es, rewriter.getContext()); + assert(xPerm.isPermutation()); // must be a permutation. + + Value xs = genToCoordinatesBuffer(rewriter, loc, src); + rewriter.create(loc, nnz, xs, ValueRange{y}, xPerm, + rewriter.getIndexAttr(0), + SparseTensorSortKind::HybridQuickSort); } // For each element in the COO tensor, insert the element to the dst tensor. diff --git a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir index 0036bd5c3310b..c96a55aa1e8b2 100644 --- a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir +++ b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir @@ -75,123 +75,64 @@ func.func @sparse_push_back_inbound(%arg0: index, %arg1: memref, %arg2: f // ----- -// CHECK-LABEL: func.func private @_sparse_partition_1_i8_f32_index -// CHECK-LABEL: func.func private @_sparse_qsort_1_i8_f32_index -// CHECK-LABEL: func.func @sparse_sort_1d2v_quick -func.func @sparse_sort_1d2v_quick(%arg0: index, %arg1: memref<10xi8>, %arg2: memref, %arg3: memref<10xindex>) - -> (memref<10xi8>, memref, memref<10xindex>) { - sparse_tensor.sort quick_sort %arg0, %arg1 jointly %arg2, %arg3 : memref<10xi8> jointly memref, memref<10xindex> - return %arg1, %arg2, %arg3 : memref<10xi8>, memref, memref<10xindex> -} - -// ----- - -// Only check the generated supporting function now. We have integration test -// to verify correctness of the generated code. 
-// -// CHECK-DAG: func.func private @_sparse_partition_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_qsort_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-LABEL: func.func @sparse_sort_3d_quick -func.func @sparse_sort_3d_quick(%arg0: index, %arg1: memref<10xindex>, %arg2: memref, %arg3: memref<10xindex>) -> (memref<10xindex>, memref, memref<10xindex>) { - sparse_tensor.sort quick_sort %arg0, %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> - return %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> -} - -// ----- - -// Only check the generated supporting function now. We have integration test -// to verify correctness of the generated code. -// -// CHECK-DAG: func.func private @_sparse_binary_search_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_sort_stable_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-DAG: func.func private @_sparse_shift_down_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { -// CHECK-DAG: func.func private @_sparse_heap_sort_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-DAG: func.func private @_sparse_partition_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_hybrid_qsort_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: i64) { -// CHECK-LABEL: func.func @sparse_sort_3d_hybrid -func.func @sparse_sort_3d_hybrid(%arg0: index, %arg1: memref<10xindex>, %arg2: memref, %arg3: memref<10xindex>) -> (memref<10xindex>, memref, memref<10xindex>) { - sparse_tensor.sort hybrid_quick_sort %arg0, %arg1, %arg2, %arg3 : memref<10xindex>, memref, 
memref<10xindex> - return %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> -} - -// ----- - -// Only check the generated supporting functions. We have integration test to -// verify correctness of the generated code. -// -// CHECK-DAG: func.func private @_sparse_binary_search_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_sort_stable_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-LABEL: func.func @sparse_sort_3d_stable -func.func @sparse_sort_3d_stable(%arg0: index, %arg1: memref<10xindex>, %arg2: memref, %arg3: memref<10xindex>) -> (memref<10xindex>, memref, memref<10xindex>) { - sparse_tensor.sort insertion_sort_stable %arg0, %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> - return %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> -} - -// ----- +#ID_MAP=affine_map<(d0, d1) -> (d0, d1)> // Only check the generated supporting functions. We have integration test to // verify correctness of the generated code. // -// CHECK-DAG: func.func private @_sparse_shift_down_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { -// CHECK-DAG: func.func private @_sparse_heap_sort_3_index(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-LABEL: func.func @sparse_sort_3d_heap -func.func @sparse_sort_3d_heap(%arg0: index, %arg1: memref<10xindex>, %arg2: memref, %arg3: memref<10xindex>) -> (memref<10xindex>, memref, memref<10xindex>) { - sparse_tensor.sort heap_sort %arg0, %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> - return %arg1, %arg2, %arg3 : memref<10xindex>, memref, memref<10xindex> -} - -// ----- - -// Only check the generated supporting functions. We have integration test to -// verify correctness of the generated code. 
-// -// CHECK-DAG: func.func private @_sparse_partition_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_qsort_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK-DAG: func.func private @_sparse_partition_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { +// CHECK-DAG: func.func private @_sparse_qsort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { // CHECK-LABEL: func.func @sparse_sort_coo_quick func.func @sparse_sort_coo_quick(%arg0: index, %arg1: memref<100xindex>, %arg2: memref, %arg3: memref<10xi32>) -> (memref<100xindex>, memref, memref<10xi32>) { - sparse_tensor.sort_coo quick_sort %arg0, %arg1 jointly %arg2, %arg3 {nx = 2 : index, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> + sparse_tensor.sort_coo quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> return %arg1, %arg2, %arg3 : memref<100xindex>, memref, memref<10xi32> } // ----- +#ID_MAP=affine_map<(d0, d1) -> (d0, d1)> + // Only check the generated supporting functions. We have integration test to // verify correctness of the generated code. 
// -// CHECK-DAG: func.func private @_sparse_binary_search_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_sort_stable_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-DAG: func.func private @_sparse_shift_down_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { -// CHECK-DAG: func.func private @_sparse_heap_sort_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { -// CHECK-DAG: func.func private @_sparse_partition_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_hybrid_qsort_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: i64) { +// CHECK-DAG: func.func private @_sparse_binary_search_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { +// CHECK-DAG: func.func private @_sparse_sort_stable_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK-DAG: func.func private @_sparse_shift_down_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { +// CHECK-DAG: func.func private @_sparse_heap_sort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK-DAG: func.func private @_sparse_partition_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { +// CHECK-DAG: func.func private @_sparse_hybrid_qsort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: i64) { // CHECK-LABEL: func.func @sparse_sort_coo_hybrid func.func 
@sparse_sort_coo_hybrid(%arg0: index, %arg1: memref<100xindex>, %arg2: memref, %arg3: memref<10xi32>) -> (memref<100xindex>, memref, memref<10xi32>) { - sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 jointly %arg2, %arg3 {nx = 2 : index, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> + sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> return %arg1, %arg2, %arg3 : memref<100xindex>, memref, memref<10xi32> } // ----- +#ID_MAP=affine_map<(d0, d1) -> (d0, d1)> + // Only check the generated supporting functions. We have integration test to // verify correctness of the generated code. // -// CHECK-DAG: func.func private @_sparse_binary_search_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { -// CHECK-DAG: func.func private @_sparse_sort_stable_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK-DAG: func.func private @_sparse_binary_search_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) -> index { +// CHECK-DAG: func.func private @_sparse_sort_stable_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { // CHECK-LABEL: func.func @sparse_sort_coo_stable func.func @sparse_sort_coo_stable(%arg0: index, %arg1: memref<100xindex>, %arg2: memref, %arg3: memref<10xi32>) -> (memref<100xindex>, memref, memref<10xi32>) { - sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2, %arg3 {nx = 2 : index, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> + sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> return %arg1, %arg2, %arg3 : memref<100xindex>, memref, memref<10xi32> } // ----- 
+#ID_MAP=affine_map<(d0, d1) -> (d0, d1)> + // Only check the generated supporting functions. We have integration test to // verify correctness of the generated code. // -// CHECK-DAG: func.func private @_sparse_shift_down_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { -// CHECK-DAG: func.func private @_sparse_heap_sort_2_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK-DAG: func.func private @_sparse_shift_down_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: index) { +// CHECK-DAG: func.func private @_sparse_heap_sort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref, %arg3: memref, %arg4: memref) { // CHECK-LABEL: func.func @sparse_sort_coo_heap func.func @sparse_sort_coo_heap(%arg0: index, %arg1: memref<100xindex>, %arg2: memref, %arg3: memref<10xi32>) -> (memref<100xindex>, memref, memref<10xi32>) { - sparse_tensor.sort_coo heap_sort %arg0, %arg1 jointly %arg2, %arg3 {nx = 2 : index, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> + sparse_tensor.sort_coo heap_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref, memref<10xi32> return %arg1, %arg2, %arg3 : memref<100xindex>, memref, memref<10xi32> } diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index f1317f23d6568..ea11a98b76ec6 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -436,7 +436,7 @@ func.func @sparse_expansion3(%arg0: index, %arg1: index) -> memref { // CHECK-DAG: %[[A9:.*]] = arith.constant 0.000000e+00 : f64 // CHECK-DAG: %[[A10:.*]] = arith.constant 1 : index // CHECK-DAG: %[[A11:.*]] = arith.constant 0 : index -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[A7]], %[[A6]] : memref +// CHECK: sparse_tensor.sort_coo 
hybrid_quick_sort %[[A7]], %[[A6]] // CHECK: %[[A12:.*]]:4 = scf.for %[[A13:.*]] = %[[A11]] to %[[A7]] step %[[A10]] iter_args(%[[A14:.*]] = %[[A0]], %[[A15:.*]] = %[[A1]], %[[A16:.*]] = %[[A2]], %[[A17:.*]] = %[[A3]]) // CHECK: %[[A18:.*]] = memref.load %[[A6]]{{\[}}%[[A13]]] : memref // CHECK: %[[A19:.*]] = memref.load %[[A4]]{{\[}}%[[A18]]] : memref @@ -484,7 +484,7 @@ func.func @sparse_compression_1d(%tensor: tensor<100xf64, #SV>, // CHECK: %[[A11:.*]] = arith.constant 0.000000e+00 : f64 // CHECK: %[[A12:.*]] = arith.constant 1 : index // CHECK: %[[A13:.*]] = arith.constant 0 : index -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[A7]], %[[A6]] : memref +// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A7]], %[[A6]] // CHECK: %[[A14:.*]]:4 = scf.for %[[A15:.*]] = %[[A13]] to %[[A7]] step %[[A12]] iter_args(%[[A16:.*]] = %[[A0]], %[[A17:.*]] = %[[A1]], %[[A18:.*]] = %[[A2]], %[[A19:.*]] = %[[A3]]) -> (memref, memref, memref, !sparse_tensor.storage_specifier // CHECK: %[[A20:.*]] = memref.load %[[A6]]{{\[}}%[[A15]]] : memref // CHECK: %[[A21:.*]] = memref.load %[[A4]]{{\[}}%[[A20]]] : memref @@ -712,7 +712,7 @@ func.func @sparse_convert_element_type(%arg0: tensor<32xf32, #SparseVector>) -> // CHECK: %[[A33:.*]] = call @getSparseTensorReaderReadToBuffers0F32(%[[A5]], %[[A32]], %[[A14]], %[[A15]]) // CHECK: %[[A34:.*]] = arith.cmpi eq, %[[A33]], %[[A1]] : i1 // CHECK: scf.if %[[A34]] { -// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A10]], %[[A14]] jointly %[[A15]] {nx = 2 : index, ny = 0 : index} : memref jointly memref +// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A10]], %[[A14]] jointly %[[A15]] {ny = 0 : index, perm_map = #{{.*}}} : memref jointly memref // CHECK: } // CHECK: memref.store %[[A10]], %[[A27]]{{\[}}%[[A2]]] : memref // CHECK: %[[A36:.*]] = sparse_tensor.storage_specifier.set %[[A30]] crd_mem_sz at 0 with %[[A11]] diff --git a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir 
b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir index b3eb50f1755da..54cdfc690952d 100644 --- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir +++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir @@ -178,7 +178,7 @@ func.func @sparse_convert_singleton(%arg0: tensor) -> // CHECK-RWT: %[[VAL_16:.*]] = sparse_tensor.load %[[VAL_17:.*]] hasInserts : tensor> // CHECK-RWT: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_16]] : tensor> to memref // CHECK-RWT: %[[VAL_19:.*]] = sparse_tensor.coordinates_buffer %[[VAL_16]] : tensor> to memref -// CHECK-RWT: sparse_tensor.sort_coo hybrid_quick_sort %[[VAL_7]], %[[VAL_19]] jointly %[[VAL_18]] {nx = 3 : index, ny = 0 : index} +// CHECK-RWT: sparse_tensor.sort_coo hybrid_quick_sort %[[VAL_7]], %[[VAL_19]] jointly %[[VAL_18]] {ny = 0 : index, perm_map = #map} // CHECK-RWT: %[[VAL_20:.*]] = bufferization.alloc_tensor(%[[VAL_4]], %[[VAL_5]], %[[VAL_6]]) size_hint=%[[VAL_7]] // CHECK-RWT: %[[VAL_21:.*]] = sparse_tensor.foreach in %[[VAL_16]] init(%[[VAL_20]]) // CHECK-RWT: ^bb0(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index, %[[VAL_25:.*]]: f32, %[[VAL_26:.*]]: tensor>): diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index 71e6eebb30261..c0e813dcde7c5 100644 --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -790,60 +790,51 @@ func.func @sparse_tensor_foreach(%arg0: tensor<2x4xf64, #DCSR>, %arg1: f32) -> ( return } -// ----- - -// TODO: a test case with empty xs doesn't work due to some parser issues. 
- -func.func @sparse_sort_x_type( %arg0: index, %arg1: memref) { - // expected-error@+1 {{operand #1 must be 1D memref of integer or index values}} - sparse_tensor.sort hybrid_quick_sort %arg0, %arg1: memref -} - -// ----- - -func.func @sparse_sort_dim_too_small(%arg0: memref<10xindex>) { - %i20 = arith.constant 20 : index - // expected-error@+1 {{xs and ys need to have a dimension >= n: 10 < 20}} - sparse_tensor.sort insertion_sort_stable %i20, %arg0 : memref<10xindex> - return -} // ----- -func.func @sparse_sort_mismatch_x_type(%arg0: index, %arg1: memref<10xindex>, %arg2: memref<10xi8>) { - // expected-error@+1 {{mismatch xs element types}} - sparse_tensor.sort hybrid_quick_sort %arg0, %arg1, %arg2 : memref<10xindex>, memref<10xi8> - return -} - -// ----- +#MAP = affine_map<(i,j) -> (i,j)> func.func @sparse_sort_coo_x_type( %arg0: index, %arg1: memref) { // expected-error@+1 {{operand #1 must be 1D memref of integer or index values}} - sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1: memref + sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 {perm_map = #MAP} : memref return } // ----- +#MAP = affine_map<(i,j) -> (i,j)> + func.func @sparse_sort_coo_x_too_small(%arg0: memref<50xindex>) { %i20 = arith.constant 20 : index - // expected-error@+1 {{Expected dimension(xy) >= n * (nx + ny) got 50 < 60}} - sparse_tensor.sort_coo hybrid_quick_sort %i20, %arg0 {nx = 2 : index, ny = 1 : index} : memref<50xindex> + // expected-error@+1 {{Expected dimension(xy) >= n * (rank(perm_map) + ny) got 50 < 60}} + sparse_tensor.sort_coo hybrid_quick_sort %i20, %arg0 {perm_map = #MAP, ny = 1 : index} : memref<50xindex> return } // ----- +#MAP = affine_map<(i,j) -> (i,j)> + func.func @sparse_sort_coo_y_too_small(%arg0: memref<60xindex>, %arg1: memref<10xf32>) { %i20 = arith.constant 20 : index // expected-error@+1 {{Expected dimension(y) >= n got 10 < 20}} - sparse_tensor.sort_coo insertion_sort_stable %i20, %arg0 jointly %arg1 {nx = 2 : index, ny = 1 : index} : 
memref<60xindex> jointly memref<10xf32> + sparse_tensor.sort_coo insertion_sort_stable %i20, %arg0 jointly %arg1 {perm_map = #MAP, ny = 1 : index} : memref<60xindex> jointly memref<10xf32> return } // ----- +#NON_PERM_MAP = affine_map<(i,j) -> (i,i)> + +func.func @sparse_sort_coo_no_perm(%arg0: index, %arg1: memref) -> (memref) { + // expected-error@+1 {{Expected a permutation map, got (d0, d1) -> (d0, d0)}} + sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 {perm_map = #NON_PERM_MAP, ny = 1 : index}: memref + return %arg1 : memref +} + +// ----- + #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}> func.func @sparse_alloc_escapes(%arg0: index) -> tensor<10x?xf64, #CSR> { diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index d1262cb7aea02..d252fa559a154 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -612,79 +612,29 @@ func.func @sparse_tensor_foreach(%arg0: tensor<2x4xf64, #DCSR>, %arg1: f32) -> ( // ----- -// CHECK-LABEL: func @sparse_sort_1d0v( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: memref) -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[A]], %[[B]] : memref -// CHECK: return %[[B]] -func.func @sparse_sort_1d0v(%arg0: index, %arg1: memref) -> (memref) { - sparse_tensor.sort hybrid_quick_sort %arg0, %arg1 : memref - return %arg1 : memref -} - -// ----- - -// CHECK-LABEL: func @sparse_sort_1d2v( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: memref<20xindex>, -// CHECK-SAME: %[[C:.*]]: memref<10xindex>, -// CHECK-SAME: %[[D:.*]]: memref) -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[A]], %[[B]] jointly %[[C]], %[[D]] : memref<20xindex> jointly memref<10xindex>, memref -// CHECK: return %[[B]], %[[C]], %[[D]] -func.func @sparse_sort_1d2v(%arg0: index, %arg1: memref<20xindex>, %arg2: memref<10xindex>, %arg3: memref) -> (memref<20xindex>, memref<10xindex>, memref) { 
- sparse_tensor.sort hybrid_quick_sort %arg0, %arg1 jointly %arg2, %arg3 : memref<20xindex> jointly memref<10xindex>, memref - return %arg1, %arg2, %arg3 : memref<20xindex>, memref<10xindex>, memref -} - -// ----- - -// CHECK-LABEL: func @sparse_sort_2d1v( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: memref<10xi8>, -// CHECK-SAME: %[[C:.*]]: memref<20xi8>, -// CHECK-SAME: %[[D:.*]]: memref<10xf64>) -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[A]], %[[B]], %[[C]] jointly %[[D]] : memref<10xi8>, memref<20xi8> jointly memref<10xf64> -// CHECK: return %[[B]], %[[C]], %[[D]] -func.func @sparse_sort_2d1v(%arg0: index, %arg1: memref<10xi8>, %arg2: memref<20xi8>, %arg3: memref<10xf64>) -> (memref<10xi8>, memref<20xi8>, memref<10xf64>) { - sparse_tensor.sort hybrid_quick_sort %arg0, %arg1, %arg2 jointly %arg3 : memref<10xi8>, memref<20xi8> jointly memref<10xf64> - return %arg1, %arg2, %arg3 : memref<10xi8>, memref<20xi8>, memref<10xf64> -} - -// ----- - -// CHECK-LABEL: func @sparse_sort_stable( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: memref<10xi8>, -// CHECK-SAME: %[[C:.*]]: memref<20xi8>, -// CHECK-SAME: %[[D:.*]]: memref<10xf64>) -// CHECK: sparse_tensor.sort insertion_sort_stable %[[A]], %[[B]], %[[C]] jointly %[[D]] : memref<10xi8>, memref<20xi8> jointly memref<10xf64> -// CHECK: return %[[B]], %[[C]], %[[D]] -func.func @sparse_sort_stable(%arg0: index, %arg1: memref<10xi8>, %arg2: memref<20xi8>, %arg3: memref<10xf64>) -> (memref<10xi8>, memref<20xi8>, memref<10xf64>) { - sparse_tensor.sort insertion_sort_stable %arg0, %arg1, %arg2 jointly %arg3 : memref<10xi8>, memref<20xi8> jointly memref<10xf64> - return %arg1, %arg2, %arg3 : memref<10xi8>, memref<20xi8>, memref<10xf64> -} - -// ----- +#ID_MAP = affine_map<(i,j) -> (i,j)> // CHECK-LABEL: func @sparse_sort_coo( // CHECK-SAME: %[[A:.*]]: index, // CHECK-SAME: %[[B:.*]]: memref) -// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A]], %[[B]] {nx = 2 : index, ny = 1 : index} 
: memref +// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A]], %[[B]] {ny = 1 : index, perm_map = #{{.*}}} : memref // CHECK: return %[[B]] func.func @sparse_sort_coo(%arg0: index, %arg1: memref) -> (memref) { - sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 {nx = 2 : index, ny = 1 : index}: memref + sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 {perm_map = #ID_MAP, ny = 1 : index}: memref return %arg1 : memref } // ----- +#ID_MAP = affine_map<(i,j) -> (i,j)> + // CHECK-LABEL: func @sparse_sort_coo_stable( // CHECK-SAME: %[[A:.*]]: index, // CHECK-SAME: %[[B:.*]]: memref, // CHECK-SAME: %[[C:.*]]: memref) -// CHECK: sparse_tensor.sort_coo insertion_sort_stable %[[A]], %[[B]] jointly %[[C]] {nx = 2 : index, ny = 1 : index} +// CHECK: sparse_tensor.sort_coo insertion_sort_stable %[[A]], %[[B]] jointly %[[C]] {ny = 1 : index, perm_map = #{{.*}}} // CHECK: return %[[B]], %[[C]] func.func @sparse_sort_coo_stable(%arg0: index, %arg1: memref, %arg2: memref) -> (memref, memref) { - sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2 {nx = 2 : index, ny = 1 : index}: memref jointly memref + sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2 {perm_map = #ID_MAP, ny = 1 : index}: memref jointly memref return %arg1, %arg2 : memref, memref } diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir index b31ac3ef3a254..5c308dc3c5623 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir @@ -116,7 +116,7 @@ // CHECK: } {"Emitted from" = "linalg.generic"} // CHECK: scf.yield %[[VAL_64:.*]] : index // CHECK: } {"Emitted from" = "linalg.generic"} -// CHECK: sparse_tensor.sort hybrid_quick_sort %[[VAL_65:.*]], %[[VAL_33]] : memref +// CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[VAL_65:.*]], %[[VAL_33]] // CHECK: %[[VAL_66:.*]]:4 = scf.for %[[VAL_67:.*]] = 
%[[VAL_10]] to %[[VAL_65]] step %[[VAL_11]] iter_args(%[[VAL_68:.*]] = %[[VAL_36]], %[[VAL_69:.*]] = %[[VAL_37]], %[[VAL_70:.*]] = %[[VAL_38]], %[[VAL_71:.*]] = %[[VAL_39]]) -> (memref, memref, memref, !sparse_tensor.storage_specifier // CHECK: %[[VAL_72:.*]] = memref.load %[[VAL_32]]{{\[}}%[[VAL_67]]] : memref<4xindex> // CHECK: %[[VAL_73:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_72]]] : memref<4xf64> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort.mlir deleted file mode 100644 index 9e8ecad9cf282..0000000000000 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort.mlir +++ /dev/null @@ -1,187 +0,0 @@ -//-------------------------------------------------------------------------------------------------- -// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS. -// -// Set-up that's shared across all tests in this directory. In principle, this -// config could be moved to lit.local.cfg. However, there are downstream users that -// do not use these LIT config files. Hence why this is kept inline. 
-// -// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true -// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts} -// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}" -// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}" -// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils -// DEFINE: %{run_opts} = -e entry -entry-point-result=void -// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs} -// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs} -// -// DEFINE: %{env} = -//-------------------------------------------------------------------------------------------------- - -// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false -// RUN: %{compile} | %{run} | FileCheck %s -// -// Do the same run, but now with vectorization. -// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true -// RUN: %{compile} | %{run} | FileCheck %s -// -// Do the same run, but now with VLA vectorization. -// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %} - -module { - func.func private @printMemref1dI32(%ptr : memref) attributes { llvm.emit_c_interface } - - // Stores 5 values to the memref buffer. - func.func @storeValuesTo(%b: memref, %v0: i32, %v1: i32, %v2: i32, - %v3: i32, %v4: i32) -> () { - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i2 = arith.constant 2 : index - %i3 = arith.constant 3 : index - %i4 = arith.constant 4 : index - memref.store %v0, %b[%i0] : memref - memref.store %v1, %b[%i1] : memref - memref.store %v2, %b[%i2] : memref - memref.store %v3, %b[%i3] : memref - memref.store %v4, %b[%i4] : memref - return - } - - // The main driver. 
- func.func @entry() { - %c0 = arith.constant 0 : i32 - %c1 = arith.constant 1 : i32 - %c2 = arith.constant 2 : i32 - %c3 = arith.constant 3 : i32 - %c4 = arith.constant 4 : i32 - %c5 = arith.constant 5 : i32 - %c6 = arith.constant 6 : i32 - %c7 = arith.constant 7 : i32 - %c8 = arith.constant 8 : i32 - %c9 = arith.constant 9 : i32 - %c10 = arith.constant 10 : i32 - %c100 = arith.constant 100 : i32 - - %i0 = arith.constant 0 : index - %i4 = arith.constant 4 : index - %i5 = arith.constant 5 : index - - // Prepare a buffer. - %x0s = memref.alloc() : memref<5xi32> - %x0 = memref.cast %x0s : memref<5xi32> to memref - call @storeValuesTo(%x0, %c10, %c2, %c0, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - - // Sort 0 elements. - // Quick sort. - // CHECK: [10, 2, 0, 5, 1] - sparse_tensor.sort quick_sort %i0, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Stable sort. - // CHECK: [10, 2, 0, 5, 1] - sparse_tensor.sort insertion_sort_stable %i0, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Heap sort. - // CHECK: [10, 2, 0, 5, 1] - sparse_tensor.sort heap_sort %i0, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Hybrid sort. - // CHECK: [10, 2, 0, 5, 1] - sparse_tensor.sort hybrid_quick_sort %i0, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - - // Sort the first 4 elements, with the last valid value untouched. - // Quick sort. - // CHECK: [0, 2, 5, 10, 1] - sparse_tensor.sort quick_sort %i4, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Stable sort. - // CHECK: [0, 2, 5, 10, 1] - call @storeValuesTo(%x0, %c10, %c2, %c0, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort insertion_sort_stable %i4, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Heap sort. 
- // CHECK: [0, 2, 5, 10, 1] - call @storeValuesTo(%x0, %c10, %c2, %c0, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort heap_sort %i4, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - // Hybrid sort. - // CHECK: [0, 2, 5, 10, 1] - sparse_tensor.sort hybrid_quick_sort %i4, %x0 : memref - call @printMemref1dI32(%x0) : (memref) -> () - - // Prepare more buffers of different dimensions. - %x1s = memref.alloc() : memref<10xi32> - %x1 = memref.cast %x1s : memref<10xi32> to memref - %x2s = memref.alloc() : memref<6xi32> - %x2 = memref.cast %x2s : memref<6xi32> to memref - %y0s = memref.alloc() : memref<7xi32> - %y0 = memref.cast %y0s : memref<7xi32> to memref - - // Sort "parallel arrays". - // CHECK: [1, 1, 2, 5, 10] - // CHECK: [3, 3, 1, 10, 1 - // CHECK: [9, 9, 4, 7, 2 - // CHECK: [7, 8, 10, 9, 6 - call @storeValuesTo(%x0, %c10, %c2, %c1, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x1, %c1, %c1, %c3, %c10, %c3) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x2, %c2, %c4, %c9, %c7, %c9) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%y0, %c6, %c10, %c8, %c9, %c7) - : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort quick_sort %i5, %x0, %x1, %x2 jointly %y0 - : memref, memref, memref jointly memref - call @printMemref1dI32(%x0) : (memref) -> () - call @printMemref1dI32(%x1) : (memref) -> () - call @printMemref1dI32(%x2) : (memref) -> () - call @printMemref1dI32(%y0) : (memref) -> () - // Stable sort. 
- // CHECK: [1, 1, 2, 5, 10] - // CHECK: [3, 3, 1, 10, 1 - // CHECK: [9, 9, 4, 7, 2 - // CHECK: [8, 7, 10, 9, 6 - call @storeValuesTo(%x0, %c10, %c2, %c1, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x1, %c1, %c1, %c3, %c10, %c3) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x2, %c2, %c4, %c9, %c7, %c9) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%y0, %c6, %c10, %c8, %c9, %c7) - : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort insertion_sort_stable %i5, %x0, %x1, %x2 jointly %y0 - : memref, memref, memref jointly memref - call @printMemref1dI32(%x0) : (memref) -> () - call @printMemref1dI32(%x1) : (memref) -> () - call @printMemref1dI32(%x2) : (memref) -> () - call @printMemref1dI32(%y0) : (memref) -> () - // Heap sort. - // CHECK: [1, 1, 2, 5, 10] - // CHECK: [3, 3, 1, 10, 1 - // CHECK: [9, 9, 4, 7, 2 - // CHECK: [7, 8, 10, 9, 6 - call @storeValuesTo(%x0, %c10, %c2, %c1, %c5, %c1) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x1, %c1, %c1, %c3, %c10, %c3) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%x2, %c2, %c4, %c9, %c7, %c9) - : (memref, i32, i32, i32, i32, i32) -> () - call @storeValuesTo(%y0, %c6, %c10, %c8, %c9, %c7) - : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort heap_sort %i5, %x0, %x1, %x2 jointly %y0 - : memref, memref, memref jointly memref - call @printMemref1dI32(%x0) : (memref) -> () - call @printMemref1dI32(%x1) : (memref) -> () - call @printMemref1dI32(%x2) : (memref) -> () - call @printMemref1dI32(%y0) : (memref) -> () - - // Release the buffers. 
- memref.dealloc %x0 : memref - memref.dealloc %x1 : memref - memref.dealloc %x2 : memref - memref.dealloc %y0 : memref - return - } -} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir index ca5dd00d02aff..394b9a8448b54 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir @@ -28,6 +28,8 @@ // Do the same run, but now with VLA vectorization. // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %} +#ID_MAP = affine_map<(d0, d1, d2) -> (d1, d2, d0)> + module { // Stores 5 values to the memref buffer. func.func @storeValuesTo(%b: memref, %v0: i32, %v1: i32, %v2: i32, @@ -94,11 +96,11 @@ module { %y1 = memref.cast %y1s : memref<7xi32> to memref // Sort "parallel arrays". - // CHECK: ( 1, 1, 3, 3, 10 ) - // CHECK: ( 2, 10, 1, 1, 5 ) - // CHECK: ( 4, 2, 9, 9, 7 ) - // CHECK: ( 10, 6, 7, 8, 9 ) - // CHECK: ( 7, 5, 7, 4, 9 ) + // CHECK: ( 1, 1, 2, 5, 10 ) + // CHECK: ( 9, 9, 4, 7, 2 ) + // CHECK: ( 3, 3, 1, 10, 1 ) + // CHECK: ( 7, 8, 10, 9, 6 ) + // CHECK: ( 7, 4, 7, 9, 5 ) call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3) : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1) @@ -109,24 +111,25 @@ module { : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7) : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort_coo quick_sort %i5, %xy jointly %y1 {nx = 3 : index, ny = 1 : index} + sparse_tensor.sort_coo quick_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index} : memref jointly memref - %x0v = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> - vector.print %x0v : vector<5xi32> + // Dumps memory in the same order as the perm_map such that the output is ordered. 
%x1v = vector.transfer_read %x1[%i0], %c100: memref>, vector<5xi32> vector.print %x1v : vector<5xi32> %x2v = vector.transfer_read %x2[%i0], %c100: memref>, vector<5xi32> vector.print %x2v : vector<5xi32> + %x0v = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> + vector.print %x0v : vector<5xi32> %y0v = vector.transfer_read %y0[%i0], %c100: memref>, vector<5xi32> vector.print %y0v : vector<5xi32> %y1v = vector.transfer_read %y1[%i0], %c100: memref, vector<5xi32> vector.print %y1v : vector<5xi32> // Stable sort. - // CHECK: ( 1, 1, 3, 3, 10 ) - // CHECK: ( 2, 10, 1, 1, 5 ) - // CHECK: ( 4, 2, 9, 9, 7 ) - // CHECK: ( 10, 6, 8, 7, 9 ) - // CHECK: ( 7, 5, 4, 7, 9 ) + // CHECK: ( 1, 1, 2, 5, 10 ) + // CHECK: ( 9, 9, 4, 7, 2 ) + // CHECK: ( 3, 3, 1, 10, 1 ) + // CHECK: ( 8, 7, 10, 9, 6 ) + // CHECK: ( 4, 7, 7, 9, 5 ) call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3) : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1) @@ -137,24 +140,24 @@ module { : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7) : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort_coo insertion_sort_stable %i5, %xy jointly %y1 {nx = 3 : index, ny = 1 : index} + sparse_tensor.sort_coo insertion_sort_stable %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index} : memref jointly memref - %x0v2 = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> - vector.print %x0v2 : vector<5xi32> %x1v2 = vector.transfer_read %x1[%i0], %c100: memref>, vector<5xi32> vector.print %x1v2 : vector<5xi32> %x2v2 = vector.transfer_read %x2[%i0], %c100: memref>, vector<5xi32> vector.print %x2v2 : vector<5xi32> + %x0v2 = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> + vector.print %x0v2 : vector<5xi32> %y0v2 = vector.transfer_read %y0[%i0], %c100: memref>, vector<5xi32> vector.print %y0v2 : vector<5xi32> %y1v2 = vector.transfer_read %y1[%i0], %c100: memref, vector<5xi32> vector.print 
%y1v2 : vector<5xi32> // Heap sort. - // CHECK: ( 1, 1, 3, 3, 10 ) - // CHECK: ( 2, 10, 1, 1, 5 ) - // CHECK: ( 4, 2, 9, 9, 7 ) - // CHECK: ( 10, 6, 8, 7, 9 ) - // CHECK: ( 7, 5, 4, 7, 9 ) + // CHECK: ( 1, 1, 2, 5, 10 ) + // CHECK: ( 9, 9, 4, 7, 2 ) + // CHECK: ( 3, 3, 1, 10, 1 ) + // CHECK: ( 7, 8, 10, 9, 6 ) + // CHECK: ( 7, 4, 7, 9, 5 ) call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3) : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1) @@ -165,14 +168,14 @@ module { : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7) : (memref, i32, i32, i32, i32, i32) -> () - sparse_tensor.sort_coo heap_sort %i5, %xy jointly %y1 {nx = 3 : index, ny = 1 : index} + sparse_tensor.sort_coo heap_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index} : memref jointly memref - %x0v3 = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> - vector.print %x0v3 : vector<5xi32> %x1v3 = vector.transfer_read %x1[%i0], %c100: memref>, vector<5xi32> vector.print %x1v3 : vector<5xi32> %x2v3 = vector.transfer_read %x2[%i0], %c100: memref>, vector<5xi32> vector.print %x2v3 : vector<5xi32> + %x0v3 = vector.transfer_read %x0[%i0], %c100: memref>, vector<5xi32> + vector.print %x0v3 : vector<5xi32> %y0v3 = vector.transfer_read %y0[%i0], %c100: memref>, vector<5xi32> vector.print %y0v3 : vector<5xi32> %y1v3 = vector.transfer_read %y1[%i0], %c100: memref, vector<5xi32> From c2e92cb490b1d9a7ecf0c97d3c2d2dc6d2d4fc70 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 19 Sep 2023 19:33:29 -0500 Subject: [PATCH 26/57] [Docs] Fix table after previous document update Summary: Someone broke this table in the documentation. Fix it by adding the proper spacing. 
--- clang/docs/OpenMPSupport.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 293a56d68f2d6..e5d95ee76f191 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -42,7 +42,7 @@ General improvements GPU devices support -==================== +=================== Data-sharing modes ------------------ @@ -204,7 +204,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | misc | library shutdown (omp_pause_resource[_all]) | :good:`done` | D55078 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | metadirectives | :part:`mostly done` | D91944 | +| misc | metadirectives | :part:`mostly done` | D91944 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | misc | conditional modifier for lastprivate clause | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -267,7 +267,7 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | indirect clause on declare target directive | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | allow virtual functions calls for mapped object on device | :part:`partial` | | +| device | allow virtual functions calls for mapped object on device | :part:`partial` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | interop construct | :part:`partial` | parsing/sema done: D98558, D98834, D98815 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ From dd477ebd235e17e9a87cada3fe83fcd4491e53d4 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 20 Sep 2023 03:34:39 +0300 Subject: [PATCH 27/57] [Sparc] Remove LEA instructions (NFCI) (#65850) LEA_ADDri and LEAX_ADDri are printed / encoded the same way as ADDri. I had to change the type of simm13Op so that it can be used in both 32- and 64-bit modes. This required the changes in operands of some InstAliases. 
--- .../Sparc/Disassembler/SparcDisassembler.cpp | 4 +- .../Sparc/MCTargetDesc/SparcInstPrinter.cpp | 10 +---- .../Sparc/MCTargetDesc/SparcInstPrinter.h | 2 +- llvm/lib/Target/Sparc/SparcInstr64Bit.td | 6 --- llvm/lib/Target/Sparc/SparcInstrAliases.td | 38 +++++++++---------- llvm/lib/Target/Sparc/SparcInstrInfo.td | 17 ++++----- llvm/test/CodeGen/SPARC/fp128-split.ll | 8 ++-- 7 files changed, 35 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index f1f7a171c9eaf..828d638723587 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -342,7 +342,7 @@ static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address, static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address, const MCDisassembler *Decoder) { - unsigned tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13)); - MI.addOperand(MCOperand::createImm(tgt)); + assert(isUInt<13>(insn)); + MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn))); return MCDisassembler::Success; } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp index c2353bb8269bf..ef77648504716 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp @@ -148,15 +148,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &O, const char *Modifier) { - // If this is an ADD operand, emit it like normal operands. 
- if (Modifier && !strcmp(Modifier, "arith")) { - printOperand(MI, opNum, STI, O); - O << ", "; - printOperand(MI, opNum + 1, STI, O); - return; - } - + raw_ostream &O) { const MCOperand &Op1 = MI->getOperand(opNum); const MCOperand &Op2 = MI->getOperand(opNum + 1); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h index 5cc79a21943f4..cb691a3420da7 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h @@ -47,7 +47,7 @@ class SparcInstPrinter : public MCInstPrinter { void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, raw_ostream &OS); void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &OS, const char *Modifier = nullptr); + raw_ostream &OS); void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, raw_ostream &OS); bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/llvm/lib/Target/Sparc/SparcInstr64Bit.td index 45028f156166d..189efc32cb266 100644 --- a/llvm/lib/Target/Sparc/SparcInstr64Bit.td +++ b/llvm/lib/Target/Sparc/SparcInstr64Bit.td @@ -171,12 +171,6 @@ def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd), "add $rs1, $rs2, $rd, $sym", [(set i64:$rd, (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>; - -// "LEA" form of add -def LEAX_ADDri : F3_2<2, 0b000000, - (outs I64Regs:$rd), (ins (MEMri $rs1, $simm13):$addr), - "add ${addr:arith}, $rd", - [(set iPTR:$rd, ADDRri:$addr)]>; } def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>; diff --git a/llvm/lib/Target/Sparc/SparcInstrAliases.td b/llvm/lib/Target/Sparc/SparcInstrAliases.td index 5ee8ff978a242..5d247ac641c73 100644 --- a/llvm/lib/Target/Sparc/SparcInstrAliases.td +++ b/llvm/lib/Target/Sparc/SparcInstrAliases.td @@ -486,41 +486,41 @@ let Predicates = [HasV9] in { def : InstAlias<"inc $rd", 
(ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>; // inc simm13, rd -> add rd, simm13, rd -def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // inccc rd -> addcc rd, 1, rd def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; // inccc simm13, rd -> addcc rd, simm13, rd -def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // dec rd -> sub rd, 1, rd def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>; // dec simm13, rd -> sub rd, simm13, rd -def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // deccc rd -> subcc rd, 1, rd def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; // deccc simm13, rd -> subcc rd, simm13, rd -def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0 def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>; -def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>; +def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, simm13Op:$simm13), 0>; // bset reg_or_imm, rd -> or rd,reg_or_imm,rd def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; -def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, 
IntRegs:$rd, IntRegs:$rs2), 0>; -def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // btog reg_or_imm, rd -> xor rd,reg_or_imm,rd def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; -def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; +def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, simm13Op:$simm13), 0>; // clr rd -> or %g0, %g0, rd @@ -537,7 +537,7 @@ def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>; // mov reg_or_imm, rd -> or %g0, reg_or_imm, rd def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>; -def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>; +def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, simm13Op:$simm13)>; // mov specialreg, rd -> rd specialreg, rd def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>; @@ -547,13 +547,13 @@ def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>; // mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; -def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, simm13Op:$simm13), 0>; def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, simm13Op:$simm13), 0>; def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, simm13Op:$simm13), 0>; def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; 
+def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, simm13Op:$simm13), 0>; // End of Section A.3 @@ -566,23 +566,23 @@ let EmitPriority = 0 in // (aka: omit the first arg when it's g0. This is not in the manual, but is // supported by gnu and solaris as) def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; -def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, simm13Op:$simm13), 0>; def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, simm13Op:$simm13), 0>; def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, simm13Op:$simm13), 0>; def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, simm13Op:$simm13), 0>; def : InstAlias<"pwr $rs2, %psr", (PWRPSRrr G0, IntRegs:$rs2), 0>; -def : InstAlias<"pwr $simm13, %psr", (PWRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"pwr $simm13, %psr", (PWRPSRri G0, simm13Op:$simm13), 0>; // wrpr %reg, %rd -> wrpr %reg, %g0, %rd // wrpr imm, %rd -> wrpr %g0, imm, %rd // Nonstandard GNU extensions. 
let Predicates = [HasV9] in { def : InstAlias<"wrpr $rs1, $rd", (WRPRrr PRRegs:$rd, IntRegs:$rs1, G0), 0>; - def : InstAlias<"wrpr $simm13, $rd", (WRPRri PRRegs:$rd, G0, i32imm:$simm13), 0>; + def : InstAlias<"wrpr $simm13, $rd", (WRPRri PRRegs:$rd, G0, simm13Op:$simm13), 0>; } // flush -> flush %g0 diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 0ae0dfbb00503..3e814643f39e6 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -117,7 +117,7 @@ def SETHIimm_not : PatLeaf<(i32 imm), [{ // Addressing modes. def ADDRrr : ComplexPattern; -def ADDRri : ComplexPattern; +def ADDRri : ComplexPattern; // Constrained operands for the shift operations. class ShiftAmtImmAsmOperand : AsmOperandClass { @@ -221,7 +221,7 @@ def calltarget : Operand { let ParserMatchClass = SparcCallTargetAsmOperand; } -def simm13Op : Operand { +def simm13Op : Operand { let DecoderMethod = "DecodeSIMM13"; let EncoderMethod = "getSImm13OpValue"; } @@ -815,13 +815,6 @@ defm SRA : F3_S<"sra", 0b100111, 0, sra, i32, shift_imm5, IntRegs>; // Section B.13 - Add Instructions, p. 108 defm ADD : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>; -// "LEA" forms of add (patterns to make tblgen happy) -let Predicates = [Is32Bit], isCodeGenOnly = 1 in - def LEA_ADDri : F3_2<2, 0b000000, - (outs IntRegs:$rd), (ins (MEMri $rs1, $simm13):$addr), - "add ${addr:arith}, $rd", - [(set iPTR:$rd, ADDRri:$addr)]>; - let Defs = [ICC] in defm ADDCC : F3_12<"addcc", 0b010000, addc, IntRegs, i32, simm13Op>; @@ -1831,6 +1824,12 @@ def : Pat<(i32 simm13:$val), def : Pat<(i32 imm:$val), (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>; +// Frame index. 
+def to_tframeindex : SDNodeXFormgetTargetFrameIndex(N->getIndex(), N->getValueType(0)); +}]>; +def : Pat<(i32 (frameindex:$ptr)), (ADDri (i32 (to_tframeindex $ptr)), (i32 0))>; +def : Pat<(i64 (frameindex:$ptr)), (ADDri (i64 (to_tframeindex $ptr)), (i64 0))>; // Global addresses, constant pool entries let Predicates = [Is32Bit] in { diff --git a/llvm/test/CodeGen/SPARC/fp128-split.ll b/llvm/test/CodeGen/SPARC/fp128-split.ll index f9ac4b681a6c0..7526706e0cb6f 100644 --- a/llvm/test/CodeGen/SPARC/fp128-split.ll +++ b/llvm/test/CodeGen/SPARC/fp128-split.ll @@ -11,8 +11,8 @@ define fp128 @testcase(fp128 %0) { ; CHECK: liveins: $q0 ; CHECK: [[COPY:%[0-9]+]]:qfpregs = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64 - ; CHECK: [[LEAX_ADDri:%[0-9]+]]:i64regs = LEAX_ADDri %stack.0, 0 - ; CHECK: [[ORXri:%[0-9]+]]:i64regs = ORXri killed [[LEAX_ADDri]], 8 + ; CHECK: [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0 + ; CHECK: [[ORXri:%[0-9]+]]:i64regs = ORXri killed [[ADDri]], 8 ; CHECK: STDFrr [[ORXri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8) ; CHECK: [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64 ; CHECK: STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16) @@ -32,8 +32,8 @@ define fp128 @testcase(fp128 %0) { ; CHECK: [[COPY7:%[0-9]+]]:i64regs = COPY [[ADDEri]] ; CHECK: [[SLLXri:%[0-9]+]]:i64regs = SLLXri killed [[COPY7]], 32 ; CHECK: [[ORXrr:%[0-9]+]]:i64regs = ORXrr killed [[SLLXri]], killed [[SRLri]] - ; CHECK: [[LEAX_ADDri1:%[0-9]+]]:i64regs = LEAX_ADDri %stack.1, 0 - ; CHECK: [[ORXri1:%[0-9]+]]:i64regs = ORXri killed [[LEAX_ADDri1]], 8 + ; CHECK: [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0 + ; CHECK: [[ORXri1:%[0-9]+]]:i64regs = ORXri killed [[ADDri1]], 8 ; CHECK: STXrr [[ORXri1]], $g0, killed [[ORXrr]] :: (store (s64) into %stack.1 + 8, basealign 16) ; CHECK: [[SRLri1:%[0-9]+]]:i64regs = SRLri killed [[ADDEri1]], 0 ; CHECK: [[COPY8:%[0-9]+]]:i64regs = COPY [[ADDEri2]] From 
b927490d73eebe0a7e8071047d79a330145b0488 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Tue, 19 Sep 2023 17:37:04 -0700 Subject: [PATCH 28/57] [lldb][NFCI] Remove unused struct ConstString::StringIsEqual This doesn't seem to be used at all, no need to keep it around. --- lldb/include/lldb/Utility/ConstString.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/lldb/include/lldb/Utility/ConstString.h b/lldb/include/lldb/Utility/ConstString.h index c23c6fd3546e5..cbea4cbf916a4 100644 --- a/lldb/include/lldb/Utility/ConstString.h +++ b/lldb/include/lldb/Utility/ConstString.h @@ -78,22 +78,6 @@ class ConstString { /// from \a cstr. explicit ConstString(const char *cstr, size_t max_cstr_len); - /// C string equality binary predicate function object for ConstString - /// objects. - struct StringIsEqual { - /// C equality test. - /// - /// Two C strings are equal when they are contained in ConstString objects - /// when their pointer values are equal to each other. - /// - /// \return - /// Returns \b true if the C string in \a lhs is equal to - /// the C string value in \a rhs, \b false otherwise. - bool operator()(const char *lhs, const char *rhs) const { - return lhs == rhs; - } - }; - /// Convert to bool operator. /// /// This allows code to check a ConstString object to see if it contains a From 613a09d99e880850d71d5d40a0605d3c399fc021 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Wed, 20 Sep 2023 06:38:50 +0530 Subject: [PATCH 29/57] [builtins][NFC] Avoid using CRT_LDBL_128BIT in tests (#66832) https://reviews.llvm.org/D153812 removed this macro from implementations. Clean this up from tests as well. 
--- compiler-rt/test/builtins/Unit/compiler_rt_fmaxl_test.c | 4 ++-- compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c | 4 ++-- compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c | 4 ++-- compiler-rt/test/builtins/Unit/floattitf_test.c | 4 ++-- compiler-rt/test/builtins/Unit/floatuntitf_test.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_fmaxl_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_fmaxl_test.c index b3c570bcc6428..7b99514aad4e3 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_fmaxl_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_fmaxl_test.c @@ -6,7 +6,7 @@ #include #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) int test__compiler_rt_fmaxl(fp_t x, fp_t y) { fp_t crt_value = __compiler_rt_fmaxl(x, y); @@ -43,7 +43,7 @@ fp_t cases[] = { #endif int main() { -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) const unsigned N = sizeof(cases) / sizeof(cases[0]); unsigned i, j; for (i = 0; i < N; ++i) { diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c index d3e8c4f7f9765..f48e67e7887cd 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c @@ -6,7 +6,7 @@ #include "fp_lib.h" #include "int_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) int test__compiler_rt_logbl(fp_t x) { #if defined(__ve__) @@ -42,7 +42,7 @@ double cases[] = { #endif int main() { -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) const unsigned N = sizeof(cases) / sizeof(cases[0]); unsigned i; for (i = 0; i < N; ++i) { diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c index 
70b29890b982f..79193334c65d9 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c @@ -8,7 +8,7 @@ #include #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) int test__compiler_rt_scalbnl(const char *mode, fp_t x, int y) { #if defined(__ve__) @@ -67,7 +67,7 @@ int iterate_cases(const char *mode) { #endif int main() { -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) if (iterate_cases("default")) return 1; // Skip rounding mode tests (fesetround) because compiler-rt's quad-precision diff --git a/compiler-rt/test/builtins/Unit/floattitf_test.c b/compiler-rt/test/builtins/Unit/floattitf_test.c index e4ce977e5da02..abd3006135b38 100644 --- a/compiler-rt/test/builtins/Unit/floattitf_test.c +++ b/compiler-rt/test/builtins/Unit/floattitf_test.c @@ -7,7 +7,7 @@ #include #include -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) /* Returns: convert a ti_int to a fp_t, rounding toward even. */ @@ -39,7 +39,7 @@ char assumption_3[sizeof(fp_t) * CHAR_BIT == 128] = {0}; #endif int main() { -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) if (test__floattitf(0, 0.0)) return 1; diff --git a/compiler-rt/test/builtins/Unit/floatuntitf_test.c b/compiler-rt/test/builtins/Unit/floatuntitf_test.c index da3fe2340b233..db25f2b0230cd 100644 --- a/compiler-rt/test/builtins/Unit/floatuntitf_test.c +++ b/compiler-rt/test/builtins/Unit/floatuntitf_test.c @@ -7,7 +7,7 @@ #include #include -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) /* Returns: convert a tu_int to a fp_t, rounding toward even. 
*/ @@ -39,7 +39,7 @@ char assumption_3[sizeof(fp_t) * CHAR_BIT == 128] = {0}; #endif int main() { -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) if (test__floatuntitf(0, 0.0)) return 1; From ab94fbba572cd50a1e85dbc023915b1f76b98106 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:16:29 +0800 Subject: [PATCH 30/57] [RISCV] Prefer Zcmp push/pop instead of save-restore calls. (#66046) Zcmp push/pop can reduce code size more than save-restore calls. There are two reasons: 1. Call for save-restore calls needs 4-8 bytes, but Zcmp push/pop only needs 2 bytes. 2. Zcmp push/pop can also handle small shifts of sp. --- .../Target/RISCV/RISCVMachineFunctionInfo.h | 6 +- llvm/test/CodeGen/RISCV/push-pop-popret.ll | 120 ++++++++---------- 2 files changed, 59 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 099ebb4014ca4..6ee5790b272ad 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -110,7 +110,8 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { bool useSaveRestoreLibCalls(const MachineFunction &MF) const { // We cannot use fixed locations for the callee saved spill slots if the // function uses a varargs save area, or is an interrupt handler. - return MF.getSubtarget().enableSaveRestore() && + return !isPushable(MF) && + MF.getSubtarget().enableSaveRestore() && VarArgsSaveSize == 0 && !MF.getFrameInfo().hasTailCall() && !MF.getFunction().hasFnAttribute("interrupt"); } @@ -131,8 +132,7 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { // We cannot use fixed locations for the callee saved spill slots if the // function uses a varargs save area. // TODO: Use a seperate placement for vararg registers to enable Zcmp.
- return !useSaveRestoreLibCalls(MF) && - MF.getSubtarget().hasStdExtZcmp() && + return MF.getSubtarget().hasStdExtZcmp() && !MF.getTarget().Options.DisableFramePointerElim(MF) && VarArgsSaveSize == 0; } diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 84e4062ca333d..ffa7cb6389d52 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -41,27 +41,25 @@ define i32 @foo() { ; ; RV32IZCMP-SR-LABEL: foo: ; RV32IZCMP-SR: # %bb.0: -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_0 -; RV32IZCMP-SR-NEXT: addi sp, sp, -512 +; RV32IZCMP-SR-NEXT: cm.push {ra}, -64 +; RV32IZCMP-SR-NEXT: addi sp, sp, -464 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 528 ; RV32IZCMP-SR-NEXT: .cfi_offset ra, -4 ; RV32IZCMP-SR-NEXT: mv a0, sp ; RV32IZCMP-SR-NEXT: call test@plt -; RV32IZCMP-SR-NEXT: li a0, 0 -; RV32IZCMP-SR-NEXT: addi sp, sp, 512 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_0 +; RV32IZCMP-SR-NEXT: addi sp, sp, 464 +; RV32IZCMP-SR-NEXT: cm.popretz {ra}, 64 ; ; RV64IZCMP-SR-LABEL: foo: ; RV64IZCMP-SR: # %bb.0: -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_0 -; RV64IZCMP-SR-NEXT: addi sp, sp, -512 +; RV64IZCMP-SR-NEXT: cm.push {ra}, -64 +; RV64IZCMP-SR-NEXT: addi sp, sp, -464 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 528 ; RV64IZCMP-SR-NEXT: .cfi_offset ra, -8 ; RV64IZCMP-SR-NEXT: mv a0, sp ; RV64IZCMP-SR-NEXT: call test@plt -; RV64IZCMP-SR-NEXT: li a0, 0 -; RV64IZCMP-SR-NEXT: addi sp, sp, 512 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_0 +; RV64IZCMP-SR-NEXT: addi sp, sp, 464 +; RV64IZCMP-SR-NEXT: cm.popretz {ra}, 64 ; ; RV32I-LABEL: foo: ; RV32I: # %bb.0: @@ -131,10 +129,10 @@ define i32 @pushpopret0(i32 signext %size){ ; ; RV32IZCMP-SR-LABEL: pushpopret0: ; RV32IZCMP-SR: # %bb.0: # %entry -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV32IZCMP-SR-NEXT: .cfi_offset ra, -4 -; RV32IZCMP-SR-NEXT: 
.cfi_offset s0, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset ra, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset s0, -4 ; RV32IZCMP-SR-NEXT: addi s0, sp, 16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV32IZCMP-SR-NEXT: addi a0, a0, 15 @@ -142,16 +140,15 @@ define i32 @pushpopret0(i32 signext %size){ ; RV32IZCMP-SR-NEXT: sub a0, sp, a0 ; RV32IZCMP-SR-NEXT: mv sp, a0 ; RV32IZCMP-SR-NEXT: call callee_void@plt -; RV32IZCMP-SR-NEXT: li a0, 0 ; RV32IZCMP-SR-NEXT: addi sp, s0, -16 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV32IZCMP-SR-NEXT: cm.popretz {ra, s0}, 16 ; ; RV64IZCMP-SR-LABEL: pushpopret0: ; RV64IZCMP-SR: # %bb.0: # %entry -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV64IZCMP-SR-NEXT: .cfi_offset ra, -8 -; RV64IZCMP-SR-NEXT: .cfi_offset s0, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset ra, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset s0, -8 ; RV64IZCMP-SR-NEXT: addi s0, sp, 16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV64IZCMP-SR-NEXT: slli a0, a0, 32 @@ -161,9 +158,8 @@ define i32 @pushpopret0(i32 signext %size){ ; RV64IZCMP-SR-NEXT: sub a0, sp, a0 ; RV64IZCMP-SR-NEXT: mv sp, a0 ; RV64IZCMP-SR-NEXT: call callee_void@plt -; RV64IZCMP-SR-NEXT: li a0, 0 ; RV64IZCMP-SR-NEXT: addi sp, s0, -16 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV64IZCMP-SR-NEXT: cm.popretz {ra, s0}, 16 ; ; RV32I-LABEL: pushpopret0: ; RV32I: # %bb.0: # %entry @@ -255,10 +251,10 @@ define i32 @pushpopret1(i32 signext %size) { ; ; RV32IZCMP-SR-LABEL: pushpopret1: ; RV32IZCMP-SR: # %bb.0: # %entry -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV32IZCMP-SR-NEXT: .cfi_offset ra, -4 -; RV32IZCMP-SR-NEXT: .cfi_offset s0, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset ra, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset s0, -4 ; RV32IZCMP-SR-NEXT: addi s0, sp, 16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV32IZCMP-SR-NEXT: addi a0, a0, 15 @@ -268,14 +264,14 @@ 
define i32 @pushpopret1(i32 signext %size) { ; RV32IZCMP-SR-NEXT: call callee_void@plt ; RV32IZCMP-SR-NEXT: li a0, 1 ; RV32IZCMP-SR-NEXT: addi sp, s0, -16 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV64IZCMP-SR-LABEL: pushpopret1: ; RV64IZCMP-SR: # %bb.0: # %entry -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV64IZCMP-SR-NEXT: .cfi_offset ra, -8 -; RV64IZCMP-SR-NEXT: .cfi_offset s0, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset ra, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset s0, -8 ; RV64IZCMP-SR-NEXT: addi s0, sp, 16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV64IZCMP-SR-NEXT: slli a0, a0, 32 @@ -287,7 +283,7 @@ define i32 @pushpopret1(i32 signext %size) { ; RV64IZCMP-SR-NEXT: call callee_void@plt ; RV64IZCMP-SR-NEXT: li a0, 1 ; RV64IZCMP-SR-NEXT: addi sp, s0, -16 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV32I-LABEL: pushpopret1: ; RV32I: # %bb.0: # %entry @@ -379,10 +375,10 @@ define i32 @pushpopretneg1(i32 signext %size) { ; ; RV32IZCMP-SR-LABEL: pushpopretneg1: ; RV32IZCMP-SR: # %bb.0: # %entry -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV32IZCMP-SR-NEXT: .cfi_offset ra, -4 -; RV32IZCMP-SR-NEXT: .cfi_offset s0, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset ra, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset s0, -4 ; RV32IZCMP-SR-NEXT: addi s0, sp, 16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV32IZCMP-SR-NEXT: addi a0, a0, 15 @@ -392,14 +388,14 @@ define i32 @pushpopretneg1(i32 signext %size) { ; RV32IZCMP-SR-NEXT: call callee_void@plt ; RV32IZCMP-SR-NEXT: li a0, -1 ; RV32IZCMP-SR-NEXT: addi sp, s0, -16 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV64IZCMP-SR-LABEL: pushpopretneg1: ; RV64IZCMP-SR: # %bb.0: # %entry -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_1 
+; RV64IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV64IZCMP-SR-NEXT: .cfi_offset ra, -8 -; RV64IZCMP-SR-NEXT: .cfi_offset s0, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset ra, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset s0, -8 ; RV64IZCMP-SR-NEXT: addi s0, sp, 16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV64IZCMP-SR-NEXT: slli a0, a0, 32 @@ -411,7 +407,7 @@ define i32 @pushpopretneg1(i32 signext %size) { ; RV64IZCMP-SR-NEXT: call callee_void@plt ; RV64IZCMP-SR-NEXT: li a0, -1 ; RV64IZCMP-SR-NEXT: addi sp, s0, -16 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV32I-LABEL: pushpopretneg1: ; RV32I: # %bb.0: # %entry @@ -503,10 +499,10 @@ define i32 @pushpopret2(i32 signext %size) { ; ; RV32IZCMP-SR-LABEL: pushpopret2: ; RV32IZCMP-SR: # %bb.0: # %entry -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV32IZCMP-SR-NEXT: .cfi_offset ra, -4 -; RV32IZCMP-SR-NEXT: .cfi_offset s0, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset ra, -8 +; RV32IZCMP-SR-NEXT: .cfi_offset s0, -4 ; RV32IZCMP-SR-NEXT: addi s0, sp, 16 ; RV32IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV32IZCMP-SR-NEXT: addi a0, a0, 15 @@ -516,14 +512,14 @@ define i32 @pushpopret2(i32 signext %size) { ; RV32IZCMP-SR-NEXT: call callee_void@plt ; RV32IZCMP-SR-NEXT: li a0, 2 ; RV32IZCMP-SR-NEXT: addi sp, s0, -16 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV64IZCMP-SR-LABEL: pushpopret2: ; RV64IZCMP-SR: # %bb.0: # %entry -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_1 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0}, -16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 16 -; RV64IZCMP-SR-NEXT: .cfi_offset ra, -8 -; RV64IZCMP-SR-NEXT: .cfi_offset s0, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset ra, -16 +; RV64IZCMP-SR-NEXT: .cfi_offset s0, -8 ; RV64IZCMP-SR-NEXT: addi s0, sp, 16 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa s0, 0 ; RV64IZCMP-SR-NEXT: slli a0, a0, 32 
@@ -535,7 +531,7 @@ define i32 @pushpopret2(i32 signext %size) { ; RV64IZCMP-SR-NEXT: call callee_void@plt ; RV64IZCMP-SR-NEXT: li a0, 2 ; RV64IZCMP-SR-NEXT: addi sp, s0, -16 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_1 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0}, 16 ; ; RV32I-LABEL: pushpopret2: ; RV32I: # %bb.0: # %entry @@ -1220,7 +1216,7 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; ; RV32IZCMP-SR-LABEL: many_args: ; RV32IZCMP-SR: # %bb.0: # %entry -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_5 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s4}, -32 ; RV32IZCMP-SR-NEXT: lui a0, %hi(var0) ; RV32IZCMP-SR-NEXT: lw a6, %lo(var0)(a0) ; RV32IZCMP-SR-NEXT: lw a7, %lo(var0+4)(a0) @@ -1259,11 +1255,11 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV32IZCMP-SR-NEXT: sw t0, %lo(var0+8)(a0) ; RV32IZCMP-SR-NEXT: sw a7, %lo(var0+4)(a0) ; RV32IZCMP-SR-NEXT: sw a6, %lo(var0)(a0) -; RV32IZCMP-SR-NEXT: tail __riscv_restore_5 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s4}, 32 ; ; RV64IZCMP-SR-LABEL: many_args: ; RV64IZCMP-SR: # %bb.0: # %entry -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_5 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s4}, -48 ; RV64IZCMP-SR-NEXT: lui a0, %hi(var0) ; RV64IZCMP-SR-NEXT: lw a6, %lo(var0)(a0) ; RV64IZCMP-SR-NEXT: lw a7, %lo(var0+4)(a0) @@ -1302,7 +1298,7 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV64IZCMP-SR-NEXT: sw t0, %lo(var0+8)(a0) ; RV64IZCMP-SR-NEXT: sw a7, %lo(var0+4)(a0) ; RV64IZCMP-SR-NEXT: sw a6, %lo(var0)(a0) -; RV64IZCMP-SR-NEXT: tail __riscv_restore_5 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s4}, 48 ; ; RV32I-LABEL: many_args: ; RV32I: # %bb.0: # %entry @@ -1456,7 +1452,7 @@ define void @alloca(i32 %n) nounwind { ; ; RV32IZCMP-SR-LABEL: alloca: ; RV32IZCMP-SR: # %bb.0: -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_2 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s1}, -16 ; RV32IZCMP-SR-NEXT: addi s0, sp, 16 ; RV32IZCMP-SR-NEXT: mv s1, sp ; 
RV32IZCMP-SR-NEXT: addi a0, a0, 15 @@ -1466,11 +1462,11 @@ define void @alloca(i32 %n) nounwind { ; RV32IZCMP-SR-NEXT: call notdead@plt ; RV32IZCMP-SR-NEXT: mv sp, s1 ; RV32IZCMP-SR-NEXT: addi sp, s0, -16 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_2 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s1}, 16 ; ; RV64IZCMP-SR-LABEL: alloca: ; RV64IZCMP-SR: # %bb.0: -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_2 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s1}, -32 ; RV64IZCMP-SR-NEXT: addi s0, sp, 32 ; RV64IZCMP-SR-NEXT: mv s1, sp ; RV64IZCMP-SR-NEXT: slli a0, a0, 32 @@ -1482,7 +1478,7 @@ define void @alloca(i32 %n) nounwind { ; RV64IZCMP-SR-NEXT: call notdead@plt ; RV64IZCMP-SR-NEXT: mv sp, s1 ; RV64IZCMP-SR-NEXT: addi sp, s0, -32 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_2 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s1}, 32 ; ; RV32I-LABEL: alloca: ; RV32I: # %bb.0: @@ -1790,15 +1786,15 @@ define void @foo_no_irq() nounwind{ ; ; RV32IZCMP-SR-LABEL: foo_no_irq: ; RV32IZCMP-SR: # %bb.0: -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_0 +; RV32IZCMP-SR-NEXT: cm.push {ra}, -16 ; RV32IZCMP-SR-NEXT: call foo_test_irq@plt -; RV32IZCMP-SR-NEXT: tail __riscv_restore_0 +; RV32IZCMP-SR-NEXT: cm.popret {ra}, 16 ; ; RV64IZCMP-SR-LABEL: foo_no_irq: ; RV64IZCMP-SR: # %bb.0: -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_0 +; RV64IZCMP-SR-NEXT: cm.push {ra}, -16 ; RV64IZCMP-SR-NEXT: call foo_test_irq@plt -; RV64IZCMP-SR-NEXT: tail __riscv_restore_0 +; RV64IZCMP-SR-NEXT: cm.popret {ra}, 16 ; ; RV32I-LABEL: foo_no_irq: ; RV32I: # %bb.0: @@ -2739,8 +2735,7 @@ define void @callee_no_irq() nounwind{ ; ; RV32IZCMP-SR-LABEL: callee_no_irq: ; RV32IZCMP-SR: # %bb.0: -; RV32IZCMP-SR-NEXT: call t0, __riscv_save_12 -; RV32IZCMP-SR-NEXT: addi sp, sp, -32 +; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -96 ; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill @@ -2819,13 +2814,11 @@ define void @callee_no_irq() nounwind{ ; 
RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) -; RV32IZCMP-SR-NEXT: addi sp, sp, 32 -; RV32IZCMP-SR-NEXT: tail __riscv_restore_12 +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-SR-LABEL: callee_no_irq: ; RV64IZCMP-SR: # %bb.0: -; RV64IZCMP-SR-NEXT: call t0, __riscv_save_12 -; RV64IZCMP-SR-NEXT: addi sp, sp, -48 +; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -160 ; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill @@ -2904,8 +2897,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) -; RV64IZCMP-SR-NEXT: addi sp, sp, 48 -; RV64IZCMP-SR-NEXT: tail __riscv_restore_12 +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32I-LABEL: callee_no_irq: ; RV32I: # %bb.0: From 36b37c775c285bbff9b57630e7ea9d00b918cc91 Mon Sep 17 00:00:00 2001 From: Argyrios Kyrtzidis Date: Tue, 19 Sep 2023 18:18:23 -0700 Subject: [PATCH 31/57] [DependencyScanningFilesystem] Make sure the local/shared cache filename lookups use only absolute paths (#66122) Previously a relative path would be used as a key for cache lookup and if the same relative path was used from another compiler invocation with a different working directory then the first cache entry was erroneously returned. 
--- .../DependencyScanningFilesystem.h | 18 ++++- .../DependencyScanningFilesystem.cpp | 79 +++++++++++++++---- clang/test/ClangScanDeps/relative-filenames.c | 38 +++++++++ 3 files changed, 117 insertions(+), 18 deletions(-) create mode 100644 clang/test/ClangScanDeps/relative-filenames.c diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h index 4b4e3c7eb2ecd..dbe219b6dd8d7 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h @@ -215,6 +215,7 @@ class DependencyScanningFilesystemLocalCache { public: /// Returns entry associated with the filename or nullptr if none is found. const CachedFileSystemEntry *findEntryByFilename(StringRef Filename) const { + assert(llvm::sys::path::is_absolute_gnu(Filename)); auto It = Cache.find(Filename); return It == Cache.end() ? nullptr : It->getValue(); } @@ -224,6 +225,7 @@ class DependencyScanningFilesystemLocalCache { const CachedFileSystemEntry & insertEntryForFilename(StringRef Filename, const CachedFileSystemEntry &Entry) { + assert(llvm::sys::path::is_absolute_gnu(Filename)); const auto *InsertedEntry = Cache.insert({Filename, &Entry}).first->second; assert(InsertedEntry == &Entry && "entry already present"); return *InsertedEntry; @@ -282,13 +284,14 @@ class DependencyScanningWorkerFilesystem : public llvm::vfs::ProxyFileSystem { public: DependencyScanningWorkerFilesystem( DependencyScanningFilesystemSharedCache &SharedCache, - IntrusiveRefCntPtr FS) - : ProxyFileSystem(std::move(FS)), SharedCache(SharedCache) {} + IntrusiveRefCntPtr FS); llvm::ErrorOr status(const Twine &Path) override; llvm::ErrorOr> openFileForRead(const Twine &Path) override; + std::error_code setCurrentWorkingDirectory(const Twine &Path) override; + /// Returns entry for the given filename. 
/// /// Attempts to use the local and shared caches first, then falls back to @@ -304,8 +307,11 @@ class DependencyScanningWorkerFilesystem : public llvm::vfs::ProxyFileSystem { /// For a filename that's not yet associated with any entry in the caches, /// uses the underlying filesystem to either look up the entry based in the /// shared cache indexed by unique ID, or creates new entry from scratch. + /// \p FilenameForLookup will always be an absolute path, and different than + /// \p OriginalFilename if \p OriginalFilename is relative. llvm::ErrorOr - computeAndStoreResult(StringRef Filename); + computeAndStoreResult(StringRef OriginalFilename, + StringRef FilenameForLookup); /// Scan for preprocessor directives for the given entry if necessary and /// returns a wrapper object with reference semantics. @@ -388,6 +394,12 @@ class DependencyScanningWorkerFilesystem : public llvm::vfs::ProxyFileSystem { /// The local cache is used by the worker thread to cache file system queries /// locally instead of querying the global cache every time. DependencyScanningFilesystemLocalCache LocalCache; + + /// The working directory to use for making relative paths absolute before + /// using them for cache lookups. 
+ llvm::ErrorOr WorkingDirForCacheLookup; + + void updateWorkingDirForCacheLookup(); }; } // end namespace dependencies diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp index 31404855e3b1d..3e53c8fc57408 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -96,6 +96,7 @@ DependencyScanningFilesystemSharedCache:: DependencyScanningFilesystemSharedCache::CacheShard & DependencyScanningFilesystemSharedCache::getShardForFilename( StringRef Filename) const { + assert(llvm::sys::path::is_absolute_gnu(Filename)); return CacheShards[llvm::hash_value(Filename) % NumShards]; } @@ -109,6 +110,7 @@ DependencyScanningFilesystemSharedCache::getShardForUID( const CachedFileSystemEntry * DependencyScanningFilesystemSharedCache::CacheShard::findEntryByFilename( StringRef Filename) const { + assert(llvm::sys::path::is_absolute_gnu(Filename)); std::lock_guard LockGuard(CacheLock); auto It = EntriesByFilename.find(Filename); return It == EntriesByFilename.end() ? 
nullptr : It->getValue(); @@ -189,6 +191,14 @@ static bool shouldCacheStatFailures(StringRef Filename) { return shouldScanForDirectivesBasedOnExtension(Filename); } +DependencyScanningWorkerFilesystem::DependencyScanningWorkerFilesystem( + DependencyScanningFilesystemSharedCache &SharedCache, + IntrusiveRefCntPtr FS) + : ProxyFileSystem(std::move(FS)), SharedCache(SharedCache), + WorkingDirForCacheLookup(llvm::errc::invalid_argument) { + updateWorkingDirForCacheLookup(); +} + bool DependencyScanningWorkerFilesystem::shouldScanForDirectives( StringRef Filename) { return shouldScanForDirectivesBasedOnExtension(Filename); @@ -215,44 +225,62 @@ DependencyScanningWorkerFilesystem::findEntryByFilenameWithWriteThrough( } llvm::ErrorOr -DependencyScanningWorkerFilesystem::computeAndStoreResult(StringRef Filename) { - llvm::ErrorOr Stat = getUnderlyingFS().status(Filename); +DependencyScanningWorkerFilesystem::computeAndStoreResult( + StringRef OriginalFilename, StringRef FilenameForLookup) { + llvm::ErrorOr Stat = + getUnderlyingFS().status(OriginalFilename); if (!Stat) { - if (!shouldCacheStatFailures(Filename)) + if (!shouldCacheStatFailures(OriginalFilename)) return Stat.getError(); const auto &Entry = - getOrEmplaceSharedEntryForFilename(Filename, Stat.getError()); - return insertLocalEntryForFilename(Filename, Entry); + getOrEmplaceSharedEntryForFilename(FilenameForLookup, Stat.getError()); + return insertLocalEntryForFilename(FilenameForLookup, Entry); } if (const auto *Entry = findSharedEntryByUID(*Stat)) - return insertLocalEntryForFilename(Filename, *Entry); + return insertLocalEntryForFilename(FilenameForLookup, *Entry); auto TEntry = - Stat->isDirectory() ? TentativeEntry(*Stat) : readFile(Filename); + Stat->isDirectory() ? 
TentativeEntry(*Stat) : readFile(OriginalFilename); const CachedFileSystemEntry *SharedEntry = [&]() { if (TEntry) { const auto &UIDEntry = getOrEmplaceSharedEntryForUID(std::move(*TEntry)); - return &getOrInsertSharedEntryForFilename(Filename, UIDEntry); + return &getOrInsertSharedEntryForFilename(FilenameForLookup, UIDEntry); } - return &getOrEmplaceSharedEntryForFilename(Filename, TEntry.getError()); + return &getOrEmplaceSharedEntryForFilename(FilenameForLookup, + TEntry.getError()); }(); - return insertLocalEntryForFilename(Filename, *SharedEntry); + return insertLocalEntryForFilename(FilenameForLookup, *SharedEntry); } llvm::ErrorOr DependencyScanningWorkerFilesystem::getOrCreateFileSystemEntry( - StringRef Filename, bool DisableDirectivesScanning) { - if (const auto *Entry = findEntryByFilenameWithWriteThrough(Filename)) - return scanForDirectivesIfNecessary(*Entry, Filename, + StringRef OriginalFilename, bool DisableDirectivesScanning) { + StringRef FilenameForLookup; + SmallString<256> PathBuf; + if (llvm::sys::path::is_absolute_gnu(OriginalFilename)) { + FilenameForLookup = OriginalFilename; + } else if (!WorkingDirForCacheLookup) { + return WorkingDirForCacheLookup.getError(); + } else { + StringRef RelFilename = OriginalFilename; + RelFilename.consume_front("./"); + PathBuf = *WorkingDirForCacheLookup; + llvm::sys::path::append(PathBuf, RelFilename); + FilenameForLookup = PathBuf.str(); + } + assert(llvm::sys::path::is_absolute_gnu(FilenameForLookup)); + if (const auto *Entry = + findEntryByFilenameWithWriteThrough(FilenameForLookup)) + return scanForDirectivesIfNecessary(*Entry, OriginalFilename, DisableDirectivesScanning) .unwrapError(); - auto MaybeEntry = computeAndStoreResult(Filename); + auto MaybeEntry = computeAndStoreResult(OriginalFilename, FilenameForLookup); if (!MaybeEntry) return MaybeEntry.getError(); - return scanForDirectivesIfNecessary(*MaybeEntry, Filename, + return scanForDirectivesIfNecessary(*MaybeEntry, OriginalFilename, 
DisableDirectivesScanning) .unwrapError(); } @@ -330,3 +358,24 @@ DependencyScanningWorkerFilesystem::openFileForRead(const Twine &Path) { return Result.getError(); return DepScanFile::create(Result.get()); } + +std::error_code DependencyScanningWorkerFilesystem::setCurrentWorkingDirectory( + const Twine &Path) { + std::error_code EC = ProxyFileSystem::setCurrentWorkingDirectory(Path); + updateWorkingDirForCacheLookup(); + return EC; +} + +void DependencyScanningWorkerFilesystem::updateWorkingDirForCacheLookup() { + llvm::ErrorOr CWD = + getUnderlyingFS().getCurrentWorkingDirectory(); + if (!CWD) { + WorkingDirForCacheLookup = CWD.getError(); + } else if (!llvm::sys::path::is_absolute_gnu(*CWD)) { + WorkingDirForCacheLookup = llvm::errc::invalid_argument; + } else { + WorkingDirForCacheLookup = *CWD; + } + assert(!WorkingDirForCacheLookup || + llvm::sys::path::is_absolute_gnu(*WorkingDirForCacheLookup)); +} diff --git a/clang/test/ClangScanDeps/relative-filenames.c b/clang/test/ClangScanDeps/relative-filenames.c new file mode 100644 index 0000000000000..03f2be7ec4c1f --- /dev/null +++ b/clang/test/ClangScanDeps/relative-filenames.c @@ -0,0 +1,38 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format make -j 1 > %t/result.txt +// RUN: FileCheck %s -input-file=%t/result.txt + +// CHECK: {{/|\\}}dir1{{/|\\}}t1.c +// CHECK: {{/|\\}}dir1{{/|\\}}head.h +// CHECK: {{/|\\}}dir2{{/|\\}}t2.c +// CHECK: {{/|\\}}dir2{{/|\\}}head.h + +//--- cdb.json.template +[ + { + "directory": "DIR/dir1", + "command": "clang -fsyntax-only t1.c", + "file": "t1.c" + }, + { + "directory": "DIR/dir2", + "command": "clang -fsyntax-only t2.c", + "file": "t2.c" + } +] + +//--- dir1/t1.c +#include "head.h" + +//--- dir1/head.h +#ifndef BBB +#define BBB +#endif + +//--- dir2/t2.c +#include "head.h" + +//--- dir2/head.h From c557621176f5f38b5757a325cc72be0a11a91c78 Mon Sep 17 
00:00:00 2001 From: Vitaly Buka Date: Mon, 18 Sep 2023 12:35:54 -0700 Subject: [PATCH 32/57] [NFC][hwasan] Make ShowHeapOrGlobalCandidate a method (#66682) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 148 ++++++++++++----------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 6272e7116846c..3740cc4fc51d6 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -319,77 +319,6 @@ static uptr GetGlobalSizeFromDescriptor(uptr ptr) { return 0; } -static void ShowHeapOrGlobalCandidate(uptr untagged_addr, tag_t *candidate, - tag_t *left, tag_t *right) { - Decorator d; - uptr mem = ShadowToMem(reinterpret_cast(candidate)); - HwasanChunkView chunk = FindHeapChunkByAddress(mem); - if (chunk.IsAllocated()) { - uptr offset; - const char *whence; - if (untagged_addr < chunk.End() && untagged_addr >= chunk.Beg()) { - offset = untagged_addr - chunk.Beg(); - whence = "inside"; - } else if (candidate == left) { - offset = untagged_addr - chunk.End(); - whence = "after"; - } else { - offset = chunk.Beg() - untagged_addr; - whence = "before"; - } - Printf("%s", d.Error()); - Printf("\nCause: heap-buffer-overflow\n"); - Printf("%s", d.Default()); - Printf("%s", d.Location()); - Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", - untagged_addr, offset, whence, chunk.UsedSize(), chunk.Beg(), - chunk.End()); - Printf("%s", d.Allocation()); - Printf("allocated by thread T%u here:\n", chunk.GetAllocThreadId()); - Printf("%s", d.Default()); - GetStackTraceFromId(chunk.GetAllocStackId()).Print(); - return; - } - // Check whether the address points into a loaded library. If so, this is - // most likely a global variable. 
- const char *module_name; - uptr module_address; - Symbolizer *sym = Symbolizer::GetOrInit(); - if (sym->GetModuleNameAndOffsetForPC(mem, &module_name, &module_address)) { - Printf("%s", d.Error()); - Printf("\nCause: global-overflow\n"); - Printf("%s", d.Default()); - DataInfo info; - Printf("%s", d.Location()); - if (sym->SymbolizeData(mem, &info) && info.start) { - Printf( - "%p is located %zd bytes %s a %zd-byte global variable " - "%s [%p,%p) in %s\n", - untagged_addr, - candidate == left ? untagged_addr - (info.start + info.size) - : info.start - untagged_addr, - candidate == left ? "after" : "before", info.size, info.name, - info.start, info.start + info.size, module_name); - } else { - uptr size = GetGlobalSizeFromDescriptor(mem); - if (size == 0) - // We couldn't find the size of the global from the descriptors. - Printf( - "%p is located %s a global variable in " - "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate == left ? "after" : "before", mem, - module_name, module_address); - else - Printf( - "%p is located %s a %zd-byte global variable in " - "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate == left ? 
"after" : "before", size, mem, - module_name, module_address); - } - Printf("%s", d.Default()); - } -} - void ReportStats() {} static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows, @@ -479,6 +408,8 @@ class BaseReport { protected: void PrintAddressDescription() const; + void PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, + tag_t *right) const; ScopedReport scoped_report; StackTrace *stack = nullptr; @@ -498,6 +429,77 @@ class BaseReport { } heap; }; +void BaseReport::PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, + tag_t *right) const { + Decorator d; + uptr mem = ShadowToMem(reinterpret_cast(candidate)); + HwasanChunkView chunk = FindHeapChunkByAddress(mem); + if (chunk.IsAllocated()) { + uptr offset; + const char *whence; + if (untagged_addr < chunk.End() && untagged_addr >= chunk.Beg()) { + offset = untagged_addr - chunk.Beg(); + whence = "inside"; + } else if (candidate == left) { + offset = untagged_addr - chunk.End(); + whence = "after"; + } else { + offset = chunk.Beg() - untagged_addr; + whence = "before"; + } + Printf("%s", d.Error()); + Printf("\nCause: heap-buffer-overflow\n"); + Printf("%s", d.Default()); + Printf("%s", d.Location()); + Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", + untagged_addr, offset, whence, chunk.UsedSize(), chunk.Beg(), + chunk.End()); + Printf("%s", d.Allocation()); + Printf("allocated by thread T%u here:\n", chunk.GetAllocThreadId()); + Printf("%s", d.Default()); + GetStackTraceFromId(chunk.GetAllocStackId()).Print(); + return; + } + // Check whether the address points into a loaded library. If so, this is + // most likely a global variable. 
+ const char *module_name; + uptr module_address; + Symbolizer *sym = Symbolizer::GetOrInit(); + if (sym->GetModuleNameAndOffsetForPC(mem, &module_name, &module_address)) { + Printf("%s", d.Error()); + Printf("\nCause: global-overflow\n"); + Printf("%s", d.Default()); + DataInfo info; + Printf("%s", d.Location()); + if (sym->SymbolizeData(mem, &info) && info.start) { + Printf( + "%p is located %zd bytes %s a %zd-byte global variable " + "%s [%p,%p) in %s\n", + untagged_addr, + candidate == left ? untagged_addr - (info.start + info.size) + : info.start - untagged_addr, + candidate == left ? "after" : "before", info.size, info.name, + info.start, info.start + info.size, module_name); + } else { + uptr size = GetGlobalSizeFromDescriptor(mem); + if (size == 0) + // We couldn't find the size of the global from the descriptors. + Printf( + "%p is located %s a global variable in " + "\n #0 0x%x (%s+0x%x)\n", + untagged_addr, candidate == left ? "after" : "before", mem, + module_name, module_address); + else + Printf( + "%p is located %s a %zd-byte global variable in " + "\n #0 0x%x (%s+0x%x)\n", + untagged_addr, candidate == left ? 
"after" : "before", size, mem, + module_name, module_address); + } + Printf("%s", d.Default()); + } +} + void BaseReport::PrintAddressDescription() const { Decorator d; int num_descriptions_printed = 0; @@ -565,7 +567,7 @@ void BaseReport::PrintAddressDescription() const { if (!stack_allocations_count && candidate && candidate_distance <= kCloseCandidateDistance) { - ShowHeapOrGlobalCandidate(untagged_addr, candidate, left, right); + PrintHeapOrGlobalCandidate(candidate, left, right); num_descriptions_printed++; } @@ -607,7 +609,7 @@ void BaseReport::PrintAddressDescription() const { }); if (candidate && num_descriptions_printed == 0) { - ShowHeapOrGlobalCandidate(untagged_addr, candidate, left, right); + PrintHeapOrGlobalCandidate(candidate, left, right); num_descriptions_printed++; } From 97abf2e75597abf7324ae31b8ca80dae5f89846f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 18 Sep 2023 13:47:11 -0700 Subject: [PATCH 33/57] [NFC][hwasan] Find overflow candidate early (#66682) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 144 ++++++++++++++--------- 1 file changed, 89 insertions(+), 55 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 3740cc4fc51d6..d9a23ad29bc4b 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -404,12 +404,28 @@ class BaseReport { stack_allocations[stack_allocations_count++].CopyFrom(t); } }); + + candidate = FindBufferOverflowCandidate(); } protected: + struct OverflowCandidate { + uptr untagged_addr = 0; + bool after = false; + bool is_close = false; + + struct { + uptr begin = 0; + uptr end = 0; + u32 thread_id = 0; + u32 stack_id = 0; + bool is_allocated = false; + } heap; + }; + + OverflowCandidate FindBufferOverflowCandidate() const; void PrintAddressDescription() const; - void PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, - tag_t *right) const; + void PrintHeapOrGlobalCandidate() const; ScopedReport 
scoped_report; StackTrace *stack = nullptr; @@ -427,24 +443,64 @@ class BaseReport { bool from_small_heap = false; bool is_allocated = false; } heap; + + OverflowCandidate candidate; }; -void BaseReport::PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, - tag_t *right) const { - Decorator d; - uptr mem = ShadowToMem(reinterpret_cast(candidate)); - HwasanChunkView chunk = FindHeapChunkByAddress(mem); +BaseReport::OverflowCandidate BaseReport::FindBufferOverflowCandidate() const { + // Check if this looks like a heap buffer overflow by scanning + // the shadow left and right and looking for the first adjacent + // object with a different memory tag. If that tag matches ptr_tag, + // check the allocator if it has a live chunk there. + tag_t *tag_ptr = reinterpret_cast(MemToShadow(untagged_addr)); + tag_t *candidate_tag_ptr = nullptr, *left = tag_ptr, *right = tag_ptr; + uptr candidate_distance = 0; + for (; candidate_distance < 1000; candidate_distance++) { + if (MemIsShadow(reinterpret_cast(left)) && TagsEqual(ptr_tag, left)) { + candidate_tag_ptr = left; + break; + } + --left; + if (MemIsShadow(reinterpret_cast(right)) && + TagsEqual(ptr_tag, right)) { + candidate_tag_ptr = right; + break; + } + ++right; + } + + OverflowCandidate result = {}; + constexpr auto kCloseCandidateDistance = 1; + result.is_close = candidate_distance <= kCloseCandidateDistance; + + result.after = candidate_tag_ptr == left; + result.untagged_addr = + ShadowToMem(reinterpret_cast(candidate_tag_ptr)); + HwasanChunkView chunk = FindHeapChunkByAddress(result.untagged_addr); if (chunk.IsAllocated()) { + result.heap.is_allocated = true; + result.heap.begin = chunk.Beg(); + result.heap.end = chunk.End(); + result.heap.thread_id = chunk.GetAllocThreadId(); + result.heap.stack_id = chunk.GetAllocStackId(); + } + return result; +} + +void BaseReport::PrintHeapOrGlobalCandidate() const { + Decorator d; + if (candidate.heap.is_allocated) { uptr offset; const char *whence; - if (untagged_addr < 
chunk.End() && untagged_addr >= chunk.Beg()) { - offset = untagged_addr - chunk.Beg(); + if (candidate.heap.begin <= untagged_addr && + untagged_addr < candidate.heap.end) { + offset = untagged_addr - candidate.heap.begin; whence = "inside"; - } else if (candidate == left) { - offset = untagged_addr - chunk.End(); + } else if (candidate.after) { + offset = untagged_addr - candidate.heap.end; whence = "after"; } else { - offset = chunk.Beg() - untagged_addr; + offset = candidate.heap.begin - untagged_addr; whence = "before"; } Printf("%s", d.Error()); @@ -452,12 +508,13 @@ void BaseReport::PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, Printf("%s", d.Default()); Printf("%s", d.Location()); Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", - untagged_addr, offset, whence, chunk.UsedSize(), chunk.Beg(), - chunk.End()); + untagged_addr, offset, whence, + candidate.heap.end - candidate.heap.begin, candidate.heap.begin, + candidate.heap.end); Printf("%s", d.Allocation()); - Printf("allocated by thread T%u here:\n", chunk.GetAllocThreadId()); + Printf("allocated by thread T%u here:\n", candidate.heap.thread_id); Printf("%s", d.Default()); - GetStackTraceFromId(chunk.GetAllocStackId()).Print(); + GetStackTraceFromId(candidate.heap.stack_id).Print(); return; } // Check whether the address points into a loaded library. 
If so, this is @@ -465,36 +522,37 @@ void BaseReport::PrintHeapOrGlobalCandidate(tag_t *candidate, tag_t *left, const char *module_name; uptr module_address; Symbolizer *sym = Symbolizer::GetOrInit(); - if (sym->GetModuleNameAndOffsetForPC(mem, &module_name, &module_address)) { + if (sym->GetModuleNameAndOffsetForPC(candidate.untagged_addr, &module_name, + &module_address)) { Printf("%s", d.Error()); Printf("\nCause: global-overflow\n"); Printf("%s", d.Default()); DataInfo info; Printf("%s", d.Location()); - if (sym->SymbolizeData(mem, &info) && info.start) { + if (sym->SymbolizeData(candidate.untagged_addr, &info) && info.start) { Printf( "%p is located %zd bytes %s a %zd-byte global variable " "%s [%p,%p) in %s\n", untagged_addr, - candidate == left ? untagged_addr - (info.start + info.size) - : info.start - untagged_addr, - candidate == left ? "after" : "before", info.size, info.name, + candidate.after ? untagged_addr - (info.start + info.size) + : info.start - untagged_addr, + candidate.after ? "after" : "before", info.size, info.name, info.start, info.start + info.size, module_name); } else { - uptr size = GetGlobalSizeFromDescriptor(mem); + uptr size = GetGlobalSizeFromDescriptor(candidate.untagged_addr); if (size == 0) // We couldn't find the size of the global from the descriptors. Printf( "%p is located %s a global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate == left ? "after" : "before", mem, - module_name, module_address); + untagged_addr, candidate.after ? "after" : "before", + candidate.untagged_addr, module_name, module_address); else Printf( "%p is located %s a %zd-byte global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate == left ? "after" : "before", size, mem, - module_name, module_address); + untagged_addr, candidate.after ? 
"after" : "before", size, + candidate.untagged_addr, module_name, module_address); } Printf("%s", d.Default()); } @@ -524,7 +582,7 @@ void BaseReport::PrintAddressDescription() const { // Check stack first. If the address is on the stack of a live thread, we // know it cannot be a heap / global overflow. for (uptr i = 0; i < stack_allocations_count; ++i) { - auto &allocations = stack_allocations[i]; + const auto &allocations = stack_allocations[i]; // TODO(fmayer): figure out how to distinguish use-after-return and // stack-buffer-overflow. Printf("%s", d.Error()); @@ -542,32 +600,8 @@ void BaseReport::PrintAddressDescription() const { num_descriptions_printed++; } - // Check if this looks like a heap buffer overflow by scanning - // the shadow left and right and looking for the first adjacent - // object with a different memory tag. If that tag matches ptr_tag, - // check the allocator if it has a live chunk there. - tag_t *tag_ptr = reinterpret_cast(MemToShadow(untagged_addr)); - tag_t *candidate = nullptr, *left = tag_ptr, *right = tag_ptr; - uptr candidate_distance = 0; - for (; candidate_distance < 1000; candidate_distance++) { - if (MemIsShadow(reinterpret_cast(left)) && TagsEqual(ptr_tag, left)) { - candidate = left; - break; - } - --left; - if (MemIsShadow(reinterpret_cast(right)) && - TagsEqual(ptr_tag, right)) { - candidate = right; - break; - } - ++right; - } - - constexpr auto kCloseCandidateDistance = 1; - - if (!stack_allocations_count && candidate && - candidate_distance <= kCloseCandidateDistance) { - PrintHeapOrGlobalCandidate(candidate, left, right); + if (!stack_allocations_count && candidate.untagged_addr && candidate.is_close) { + PrintHeapOrGlobalCandidate(); num_descriptions_printed++; } @@ -608,8 +642,8 @@ void BaseReport::PrintAddressDescription() const { } }); - if (candidate && num_descriptions_printed == 0) { - PrintHeapOrGlobalCandidate(candidate, left, right); + if (candidate.untagged_addr && num_descriptions_printed == 0) { + 
PrintHeapOrGlobalCandidate(); num_descriptions_printed++; } From 1c91b1ebad6d70ccb68dd4e8a88711627baa484a Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 19 Sep 2023 19:28:08 -0700 Subject: [PATCH 34/57] [NFC][hwasan] Clang-format c557621176f5f38b5757a325cc72be0a11a91c78 --- compiler-rt/lib/hwasan/hwasan_report.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index d9a23ad29bc4b..969741d8564a0 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -474,8 +474,7 @@ BaseReport::OverflowCandidate BaseReport::FindBufferOverflowCandidate() const { result.is_close = candidate_distance <= kCloseCandidateDistance; result.after = candidate_tag_ptr == left; - result.untagged_addr = - ShadowToMem(reinterpret_cast(candidate_tag_ptr)); + result.untagged_addr = ShadowToMem(reinterpret_cast(candidate_tag_ptr)); HwasanChunkView chunk = FindHeapChunkByAddress(result.untagged_addr); if (chunk.IsAllocated()) { result.heap.is_allocated = true; @@ -600,7 +599,8 @@ void BaseReport::PrintAddressDescription() const { num_descriptions_printed++; } - if (!stack_allocations_count && candidate.untagged_addr && candidate.is_close) { + if (!stack_allocations_count && candidate.untagged_addr && + candidate.is_close) { PrintHeapOrGlobalCandidate(); num_descriptions_printed++; } From 5670ef44f81ee831216f84f708d517cc79fab516 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 18 Sep 2023 15:26:36 -0700 Subject: [PATCH 35/57] [NFC][hwasan] Extract a few BaseReport::Copy methods (#66682) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 38 ++++++++++++++---------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 969741d8564a0..0edfa7a53b369 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ 
b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -390,21 +390,8 @@ class BaseReport { if (MemIsShadow(untagged_addr)) return; - HwasanChunkView chunk = FindHeapChunkByAddress(untagged_addr); - heap.begin = chunk.Beg(); - if (heap.begin) { - heap.size = chunk.ActualSize(); - heap.from_small_heap = chunk.FromSmallHeap(); - heap.is_allocated = chunk.IsAllocated(); - } - - hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { - if (stack_allocations_count < ARRAY_SIZE(stack_allocations) && - t->AddrIsInStack(untagged_addr)) { - stack_allocations[stack_allocations_count++].CopyFrom(t); - } - }); - + CopyHeapChunk(); + CopyStackAllocations(); candidate = FindBufferOverflowCandidate(); } @@ -423,6 +410,8 @@ class BaseReport { } heap; }; + void CopyHeapChunk(); + void CopyStackAllocations(); OverflowCandidate FindBufferOverflowCandidate() const; void PrintAddressDescription() const; void PrintHeapOrGlobalCandidate() const; @@ -447,6 +436,25 @@ class BaseReport { OverflowCandidate candidate; }; +void BaseReport::CopyHeapChunk() { + HwasanChunkView chunk = FindHeapChunkByAddress(untagged_addr); + heap.begin = chunk.Beg(); + if (heap.begin) { + heap.size = chunk.ActualSize(); + heap.from_small_heap = chunk.FromSmallHeap(); + heap.is_allocated = chunk.IsAllocated(); + } +} + +void BaseReport::CopyStackAllocations() { + hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { + if (stack_allocations_count < ARRAY_SIZE(stack_allocations) && + t->AddrIsInStack(untagged_addr)) { + stack_allocations[stack_allocations_count++].CopyFrom(t); + } + }); +} + BaseReport::OverflowCandidate BaseReport::FindBufferOverflowCandidate() const { // Check if this looks like a heap buffer overflow by scanning // the shadow left and right and looking for the first adjacent From 22602c48d1e0974764daa2c5558452e963b801f2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 18 Sep 2023 16:37:20 -0700 Subject: [PATCH 36/57] [NFC][hwasan] Extract announce_by_id (#66682) --- 
compiler-rt/lib/hwasan/hwasan_report.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 0edfa7a53b369..c7683698a76de 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -586,6 +586,13 @@ void BaseReport::PrintAddressDescription() const { untagged_addr - heap.begin, d.Default()); } + auto announce_by_id = [](u32 thread_id) { + hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { + if (thread_id == t->unique_id()) + t->Announce(); + }); + }; + // Check stack first. If the address is on the stack of a live thread, we // know it cannot be a heap / global overflow. for (uptr i = 0; i < stack_allocations_count; ++i) { @@ -598,11 +605,7 @@ void BaseReport::PrintAddressDescription() const { Printf("Address %p is located in stack of thread T%zd\n", untagged_addr, allocations.thread_id()); Printf("%s", d.Default()); - hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { - if (allocations.thread_id() == t->unique_id()) - t->Announce(); - }); - + announce_by_id(allocations.thread_id()); PrintStackAllocations(allocations.get(), ptr_tag, untagged_addr); num_descriptions_printed++; } From 41934f2d1f8f5d4f7e2139cbf47318957822d38b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 18 Sep 2023 16:38:15 -0700 Subject: [PATCH 37/57] [NFC][hwasan] Collect heap allocations early (#66682) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 100 ++++++++++++++--------- 1 file changed, 61 insertions(+), 39 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index c7683698a76de..442c5b736611d 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -391,7 +391,7 @@ class BaseReport { return; CopyHeapChunk(); - CopyStackAllocations(); + CopyAllocations(); candidate = FindBufferOverflowCandidate(); } @@ -411,7 
+411,7 @@ class BaseReport { }; void CopyHeapChunk(); - void CopyStackAllocations(); + void CopyAllocations(); OverflowCandidate FindBufferOverflowCandidate() const; void PrintAddressDescription() const; void PrintHeapOrGlobalCandidate() const; @@ -434,6 +434,15 @@ class BaseReport { } heap; OverflowCandidate candidate; + + uptr heap_allocations_count = 0; + struct { + HeapAllocationRecord har = {}; + uptr ring_index = 0; + uptr num_matching_addrs = 0; + uptr num_matching_addrs_4b = 0; + u32 free_thread_id = 0; + } heap_allocations[256]; }; void BaseReport::CopyHeapChunk() { @@ -446,12 +455,28 @@ void BaseReport::CopyHeapChunk() { } } -void BaseReport::CopyStackAllocations() { +void BaseReport::CopyAllocations() { hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { if (stack_allocations_count < ARRAY_SIZE(stack_allocations) && t->AddrIsInStack(untagged_addr)) { stack_allocations[stack_allocations_count++].CopyFrom(t); } + + if (heap_allocations_count < ARRAY_SIZE(heap_allocations)) { + // Scan all threads' ring buffers to find if it's a heap-use-after-free. + HeapAllocationRecord har; + uptr ring_index, num_matching_addrs, num_matching_addrs_4b; + if (FindHeapAllocation(t->heap_allocations(), tagged_addr, &har, + &ring_index, &num_matching_addrs, + &num_matching_addrs_4b)) { + auto &ha = heap_allocations[heap_allocations_count++]; + ha.har = har; + ha.ring_index = ring_index; + ha.num_matching_addrs = num_matching_addrs; + ha.num_matching_addrs_4b = num_matching_addrs_4b; + ha.free_thread_id = t->unique_id(); + } + } }); } @@ -616,42 +641,39 @@ void BaseReport::PrintAddressDescription() const { num_descriptions_printed++; } - hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { - // Scan all threads' ring buffers to find if it's a heap-use-after-free. 
- HeapAllocationRecord har; - uptr ring_index, num_matching_addrs, num_matching_addrs_4b; - if (FindHeapAllocation(t->heap_allocations(), tagged_addr, &har, - &ring_index, &num_matching_addrs, - &num_matching_addrs_4b)) { - Printf("%s", d.Error()); - Printf("\nCause: use-after-free\n"); - Printf("%s", d.Location()); - Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n", - untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), - har.requested_size, UntagAddr(har.tagged_addr), - UntagAddr(har.tagged_addr) + har.requested_size); - Printf("%s", d.Allocation()); - Printf("freed by thread T%u here:\n", t->unique_id()); - Printf("%s", d.Default()); - GetStackTraceFromId(har.free_context_id).Print(); - - Printf("%s", d.Allocation()); - Printf("previously allocated by thread T%u here:\n", har.alloc_thread_id); - Printf("%s", d.Default()); - GetStackTraceFromId(har.alloc_context_id).Print(); - - // Print a developer note: the index of this heap object - // in the thread's deallocation ring buffer. 
- Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ring_index + 1, - flags()->heap_history_size); - Printf("hwasan_dev_note_num_matching_addrs: %zd\n", num_matching_addrs); - Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n", - num_matching_addrs_4b); - - t->Announce(); - num_descriptions_printed++; - } - }); + for (uptr i = 0; i < heap_allocations_count; ++i) { + const auto &ha = heap_allocations[i]; + const HeapAllocationRecord har = ha.har; + + Printf("%s", d.Error()); + Printf("\nCause: use-after-free\n"); + Printf("%s", d.Location()); + Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n", + untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), + har.requested_size, UntagAddr(har.tagged_addr), + UntagAddr(har.tagged_addr) + har.requested_size); + Printf("%s", d.Allocation()); + Printf("freed by thread T%u here:\n", ha.free_thread_id); + Printf("%s", d.Default()); + GetStackTraceFromId(har.free_context_id).Print(); + + Printf("%s", d.Allocation()); + Printf("previously allocated by thread T%u here:\n", har.alloc_thread_id); + Printf("%s", d.Default()); + GetStackTraceFromId(har.alloc_context_id).Print(); + + // Print a developer note: the index of this heap object + // in the thread's deallocation ring buffer. 
+ Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ha.ring_index + 1, + flags()->heap_history_size); + Printf("hwasan_dev_note_num_matching_addrs: %zd\n", ha.num_matching_addrs); + Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n", + ha.num_matching_addrs_4b); + + announce_by_id(ha.free_thread_id); + // TODO: announce_by_id(har.alloc_thread_id); + num_descriptions_printed++; + } if (candidate.untagged_addr && num_descriptions_printed == 0) { PrintHeapOrGlobalCandidate(); From 2b7f11a6523b716a385e344a778a6ccea5255e38 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 19 Sep 2023 18:09:14 -0400 Subject: [PATCH 38/57] [libc++] Warn if an unsupported compiler is used This makes it obvious that libc++ is used in an unsupported configuration, and the compiler probably has to be updated. It often happens that people try to use libc++ and don't realize that their compiler is too old. Differential Revision: https://reviews.llvm.org/D158214 --- libcxx/include/__config | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/libcxx/include/__config b/libcxx/include/__config index bf2564e2732ba..f301b6450992d 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -25,10 +25,27 @@ # define _LIBCPP_CLANG_VER (__clang_major__ * 100 + __clang_minor__) #elif defined(__GNUC__) # define _LIBCPP_COMPILER_GCC +# define _LIBCPP_GCC_VER (__GNUC__ * 100 + __GNUC_MINOR__) #endif #ifdef __cplusplus +// Warn if a compiler version is used that is not supported anymore +// LLVM RELEASE Update the minimum compiler versions +# if defined(_LIBCPP_CLANG_VER) +# if _LIBCPP_CLANG_VER < 1500 +# warning "Libc++ only supports Clang 15 and later" +# endif +# elif defined(_LIBCPP_APPLE_CLANG_VER) +# if _LIBCPP_APPLE_CLANG_VER < 1400 +# warning "Libc++ only supports AppleClang 14 and later" +# endif +# elif defined(_LIBCPP_GCC_VER) +# if _LIBCPP_GCC_VER < 1300 +# warning "Libc++ only supports GCC 13 and later" +# endif +# endif + // The attributes supported by 
clang are documented at https://clang.llvm.org/docs/AttributeReference.html // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. From a495b2f8cbcadf52dab0d29b7fafdc97f2a8b1da Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Sep 2023 21:02:27 -0700 Subject: [PATCH 39/57] [ELF][test] Improve tests about non-SHF_ALLOC sections relocated by non-ABS relocations --- lld/test/ELF/non-abs-reloc.s | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/lld/test/ELF/non-abs-reloc.s b/lld/test/ELF/non-abs-reloc.s index 6053afbdc3faf..2212421c2373d 100644 --- a/lld/test/ELF/non-abs-reloc.s +++ b/lld/test/ELF/non-abs-reloc.s @@ -1,17 +1,27 @@ // REQUIRES: x86 -// RUN: split-file %s %t -// RUN: llvm-mc -filetype=obj -triple=i386 %t/asm -o %t.o -// RUN: ld.lld -T %t/lds %t.o -o %t.exe 2>&1 | FileCheck %s --implicit-check-not=warning: --implicit-check-not=error: +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=i386 --defsym X86_32=1 asm -o a.o +// RUN: ld.lld -T lds a.o -o a 2>&1 | FileCheck %s --implicit-check-not=warning: // CHECK: warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_386_PC32 against symbol '_start' // CHECK-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_386_PC32 against symbol '_start' -// RUN: llvm-objdump -D --no-show-raw-insn %t.exe | FileCheck --check-prefix=DISASM %s +// RUN: llvm-objdump -D --no-show-raw-insn a | FileCheck --check-prefix=DISASM %s // DISASM: Disassembly of section .nonalloc: // DISASM-EMPTY: // DISASM-NEXT: <.nonalloc>: // DISASM-NEXT: 0: nop -// DISASM-NEXT: 1: calll 0x0 -// DISASM-NEXT: 6: calll 0x0 +// DISASM-NEXT: 1: call{{.}} 0x0 +// DISASM-NEXT: 6: call{{.}} 0x0 + +/// There is currently no error for -r. 
See also https://github.com/ClangBuiltLinux/linux/issues/1937 +// RUN: ld.lld -T lds -r a.o -o /dev/null --fatal-warnings + +// RUN: llvm-mc -filetype=obj -triple=x86_64 asm -o b.o +// RUN: ld.lld -T lds b.o -o b 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=warning: +// RUN: llvm-objdump -D --no-show-raw-insn b | FileCheck --check-prefix=DISASM %s +// RUN: ld.lld -T lds -r b.o -o /dev/null --fatal-warnings +// CHECK2: warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_X86_64_PC32 against symbol '_start' +// CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_X86_64_PC32 against symbol '_start' //--- lds SECTIONS { @@ -34,5 +44,7 @@ _start: // GCC may relocate DW_AT_GNU_call_site_value with R_386_GOTOFF. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98946 +.ifdef X86_32 .section .debug_random .long .L0@gotoff +.endif From 678c1f142c0688a092bf36c98294e5302b105a41 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Sep 2023 21:04:50 -0700 Subject: [PATCH 40/57] [ELF] Remove a R_ARM_PCA special case from relocateNonAlloc https://reviews.llvm.org/D75042 added a special case about R_ARM_PCA to relocateNonAlloc. This is untested and actually unused in the wild. 
--- lld/ELF/InputSection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index f97ca96bf4a85..3023cbfae4a59 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -975,7 +975,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { std::string msg = getLocation(offset) + ": has non-ABS relocation " + toString(type) + " against symbol '" + toString(sym) + "'"; - if (expr != R_PC && expr != R_ARM_PCA) { + if (expr != R_PC) { error(msg); return; } From a93e76dd8778a5793c408eb503a46502bcf9b49c Mon Sep 17 00:00:00 2001 From: martinboehme Date: Wed, 20 Sep 2023 06:28:21 +0200 Subject: [PATCH 41/57] [clang][dataflow] Reorder checks to protect against a null pointer dereference. (#66764) I've received a report of a null pointer dereference happening on the `LocDst->getType()` dereference. I wasn't unfortunately able to find a repro, but I'd argue the new version is better for the reduced indentation alone. --- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index b510114a7a355..2414a1cc026af 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -531,17 +531,18 @@ class TransferVisitor : public ConstStmtVisitor { auto *LocDst = cast_or_null(Env.getStorageLocation(*Arg0)); + if (LocSrc == nullptr || LocDst == nullptr) + return; + // The assignment operators are different from the type of the destination - // in this model (i.e. in one of their base classes). This must be very rare - // and we just bail. + // in this model (i.e. in one of their base classes). This must be very + // rare and we just bail. 
if (Method->getThisObjectType().getCanonicalType().getUnqualifiedType() != LocDst->getType().getCanonicalType().getUnqualifiedType()) return; - if (LocSrc != nullptr && LocDst != nullptr) { - copyRecord(*LocSrc, *LocDst, Env); - Env.setStorageLocation(*S, *LocDst); - } + copyRecord(*LocSrc, *LocDst, Env); + Env.setStorageLocation(*S, *LocDst); } } From 8cad4dd00077226f9fca5d176b7ef6ed9f668008 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Sep 2023 22:14:25 -0700 Subject: [PATCH 42/57] [MC,X86] Property report error for modifiers with incorrect size --- .../X86/MCTargetDesc/X86ELFObjectWriter.cpp | 44 ++++++++++++------- llvm/test/MC/ELF/relocation-386.s | 14 ++++++ llvm/test/MC/ELF/relocation.s | 6 +++ 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index d083bf245af22..373e29bf6a835 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -140,8 +140,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, } llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTOFF: - assert(Type == RT64_64); assert(!IsPCRel); + if (Type != RT64_64) + Ctx.reportError(Loc, "unsupported relocation type"); return ELF::R_X86_64_GOTOFF64; case MCSymbolRefExpr::VK_TPOFF: assert(!IsPCRel); @@ -229,7 +230,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 }; -static unsigned getRelocType32(MCContext &Ctx, +static unsigned getRelocType32(MCContext &Ctx, SMLoc Loc, MCSymbolRefExpr::VariantKind Modifier, X86_32RelType Type, bool IsPCRel, MCFixupKind Kind) { @@ -252,7 +253,8 @@ static unsigned getRelocType32(MCContext &Ctx, } llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOT: - assert(Type == RT32_32); + if (Type != RT32_32) + break; if (IsPCRel) return 
ELF::R_386_GOTPC; // Older versions of ld.bfd/ld.gold/lld do not support R_386_GOT32X and we @@ -264,49 +266,61 @@ static unsigned getRelocType32(MCContext &Ctx, ? ELF::R_386_GOT32X : ELF::R_386_GOT32; case MCSymbolRefExpr::VK_GOTOFF: - assert(Type == RT32_32); assert(!IsPCRel); + if (Type != RT32_32) + break; return ELF::R_386_GOTOFF; case MCSymbolRefExpr::VK_TLSCALL: return ELF::R_386_TLS_DESC_CALL; case MCSymbolRefExpr::VK_TLSDESC: return ELF::R_386_TLS_GOTDESC; case MCSymbolRefExpr::VK_TPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_LE_32; case MCSymbolRefExpr::VK_DTPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_LDO_32; case MCSymbolRefExpr::VK_TLSGD: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_GD; case MCSymbolRefExpr::VK_GOTTPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_IE_32; case MCSymbolRefExpr::VK_PLT: - assert(Type == RT32_32); + if (Type != RT32_32) + break; return ELF::R_386_PLT32; case MCSymbolRefExpr::VK_INDNTPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_IE; case MCSymbolRefExpr::VK_NTPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_LE; case MCSymbolRefExpr::VK_GOTNTPOFF: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_GOTIE; case MCSymbolRefExpr::VK_TLSLDM: - assert(Type == RT32_32); + if (Type != RT32_32) + break; assert(!IsPCRel); return ELF::R_386_TLS_LDM; } + Ctx.reportError(Loc, "unsupported relocation type"); + return ELF::R_386_NONE; } unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -329,7 +343,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; case RT64_64: 
Ctx.reportError(Fixup.getLoc(), "unsupported relocation type"); - break; + return ELF::R_386_NONE; case RT64_32: case RT64_32S: RelType = RT32_32; @@ -341,7 +355,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, RelType = RT32_8; break; } - return getRelocType32(Ctx, Modifier, RelType, IsPCRel, Kind); + return getRelocType32(Ctx, Fixup.getLoc(), Modifier, RelType, IsPCRel, Kind); } std::unique_ptr diff --git a/llvm/test/MC/ELF/relocation-386.s b/llvm/test/MC/ELF/relocation-386.s index e49b25a25ce53..dd252f5ff74cb 100644 --- a/llvm/test/MC/ELF/relocation-386.s +++ b/llvm/test/MC/ELF/relocation-386.s @@ -1,5 +1,6 @@ // RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -relax-relocations=false -o - | llvm-readobj -r - | FileCheck %s --check-prefix=CHECK --check-prefix=I386 // RUN: llvm-mc -filetype=obj -triple i386-pc-elfiamcu %s -relax-relocations=false -o - | llvm-readobj -r - | FileCheck %s --check-prefix=CHECK --check-prefix=IAMCU +// RUN: not llvm-mc -filetype=obj -triple=i686 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: // Test that we produce the correct relocation types and that the relocations // correctly point to the section or the symbol. 
@@ -127,6 +128,19 @@ bar2: .word foo .byte foo +.ifdef ERR +// ERR: [[#@LINE+1]]:7: error: unsupported relocation type +.quad foo@GOT +// ERR: [[#@LINE+1]]:8: error: unsupported relocation type +.short foo@GOTOFF +// ERR: [[#@LINE+1]]:7: error: unsupported relocation type +.dc.w foo@TPOFF +// ERR: [[#@LINE+1]]:7: error: unsupported relocation type +.dc.w foo@INDNTPOFF +// ERR: [[#@LINE+1]]:7: error: unsupported relocation type +.dc.w foo@NTPOFF +.endif + .section zedsec,"awT",@progbits zed: .long 0 diff --git a/llvm/test/MC/ELF/relocation.s b/llvm/test/MC/ELF/relocation.s index 8802330c90b76..797e31f529b3d 100644 --- a/llvm/test/MC/ELF/relocation.s +++ b/llvm/test/MC/ELF/relocation.s @@ -1,4 +1,5 @@ // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -S --sr - | FileCheck %s +// RUN: not llvm-mc -filetype=obj -triple x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: // Test that we produce the correct relocation. 
@@ -110,3 +111,8 @@ weak_sym: // CHECK-NEXT: 0x105 R_X86_64_PC32 pr23272 0x0 // CHECK-NEXT: ] // CHECK-NEXT: } + +.ifdef ERR +// ERR: [[#@LINE+1]]:7: error: unsupported relocation type +.long foo@gotoff +.endif From cb97761e85dd60239c89d20c9815135248d060a2 Mon Sep 17 00:00:00 2001 From: Brandon Wu <33961136+4vtomat@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:06:45 +0800 Subject: [PATCH 43/57] [RISCV] Install sifive_vector.h to riscv-resource-headers (#66330) --- clang/lib/Headers/CMakeLists.txt | 7 ++++++- clang/test/Headers/riscv-sifive-header.c | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 clang/test/Headers/riscv-sifive-header.c diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 334bc13799409..8e3bbdeb1d16b 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -477,7 +477,7 @@ add_header_target("loongarch-resource-headers" "${loongarch_files}") add_header_target("mips-resource-headers" "${mips_msa_files}") add_header_target("ppc-resource-headers" "${ppc_files};${ppc_wrapper_files}") add_header_target("ppc-htm-resource-headers" "${ppc_htm_files}") -add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files}") +add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files};${sifive_files}") add_header_target("systemz-resource-headers" "${systemz_files}") add_header_target("ve-resource-headers" "${ve_files}") add_header_target("webassembly-resource-headers" "${webassembly_files}") @@ -623,6 +623,11 @@ install( EXCLUDE_FROM_ALL COMPONENT riscv-resource-headers) +install( + FILES ${sifive_files} + DESTINATION ${header_install_dir} + COMPONENT riscv-resource-headers) + install( FILES ${systemz_files} DESTINATION ${header_install_dir} diff --git a/clang/test/Headers/riscv-sifive-header.c b/clang/test/Headers/riscv-sifive-header.c new file mode 100644 index 0000000000000..d90f4990c49f7 --- /dev/null +++ 
b/clang/test/Headers/riscv-sifive-header.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple riscv64 -target-feature +v %s +// REQUIRES: riscv-registered-target + +// expected-no-diagnostics + +#include From 3978f37c0f1e368849ff594d9d603ec600010f7e Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Fri, 18 Aug 2023 21:33:56 +0530 Subject: [PATCH 44/57] [InferAlignment] Create tests for InferAlignment pass These tests are in preparation for the InferAlignment pass. They consist mainly of tests that break when alignment inference is disabled in LoadInst and StoreInst within InstCombine. Differential Revision: https://reviews.llvm.org/D158530 --- llvm/test/Transforms/InferAlignment/alloca.ll | 59 +++++ llvm/test/Transforms/InferAlignment/atomic.ll | 97 +++++++ .../Transforms/InferAlignment/attributes.ll | 44 ++++ llvm/test/Transforms/InferAlignment/gep-2d.ll | 73 ++++++ .../Transforms/InferAlignment/gep-array.ll | 72 +++++ .../InferAlignment/irregular-size.ll | 55 ++++ .../InferAlignment/propagate-assume.ll | 248 ++++++++++++++++++ .../test/Transforms/InferAlignment/ptrmask.ll | 77 ++++++ .../InferAlignment/undef-and-null.ll | 26 ++ llvm/test/Transforms/InferAlignment/vector.ll | 111 ++++++++ .../Transforms/InferAlignment/volatile.ll | 32 +++ llvm/test/Transforms/InferAlignment/vscale.ll | 36 +++ 12 files changed, 930 insertions(+) create mode 100644 llvm/test/Transforms/InferAlignment/alloca.ll create mode 100644 llvm/test/Transforms/InferAlignment/atomic.ll create mode 100644 llvm/test/Transforms/InferAlignment/attributes.ll create mode 100644 llvm/test/Transforms/InferAlignment/gep-2d.ll create mode 100644 llvm/test/Transforms/InferAlignment/gep-array.ll create mode 100644 llvm/test/Transforms/InferAlignment/irregular-size.ll create mode 100644 llvm/test/Transforms/InferAlignment/propagate-assume.ll create mode 100644 llvm/test/Transforms/InferAlignment/ptrmask.ll create mode 100644 
llvm/test/Transforms/InferAlignment/undef-and-null.ll create mode 100644 llvm/test/Transforms/InferAlignment/vector.ll create mode 100644 llvm/test/Transforms/InferAlignment/volatile.ll create mode 100644 llvm/test/Transforms/InferAlignment/vscale.ll diff --git a/llvm/test/Transforms/InferAlignment/alloca.ll b/llvm/test/Transforms/InferAlignment/alloca.ll new file mode 100644 index 0000000000000..b64413336d347 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/alloca.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +; ------------------------------------------------------------------------------ +; Scalar type +; ------------------------------------------------------------------------------ + +define void @alloca_local(i8 %x, i32 %y) { +; CHECK-LABEL: define void @alloca_local +; CHECK-SAME: (i8 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 1 +; CHECK-NEXT: [[LOAD_I8:%.*]] = load i8, ptr [[ALLOCA]], align 1 +; CHECK-NEXT: [[LOAD_I32:%.*]] = load i32, ptr [[ALLOCA]], align 1 +; CHECK-NEXT: store i8 [[X]], ptr [[ALLOCA]], align 1 +; CHECK-NEXT: store i32 [[Y]], ptr [[ALLOCA]], align 1 +; CHECK-NEXT: ret void +; + %alloca = alloca i32, align 1 + + %load.i8 = load i8, ptr %alloca, align 1 + %load.i32 = load i32, ptr %alloca, align 1 + + store i8 %x, ptr %alloca, align 1 + store i32 %y, ptr %alloca, align 1 + + ret void +} + +; ------------------------------------------------------------------------------ +; Struct type +; ------------------------------------------------------------------------------ + +%struct.pair = type { { i32, i32 }, { i32, i32 } } + +define void @alloca_struct(i32 %x) { +; CHECK-LABEL: define void @alloca_struct +; CHECK-SAME: (i32 [[X:%.*]]) { +; CHECK-NEXT: [[ALLOCA_STRUCT:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[STRUCT_PAIR]], ptr 
[[ALLOCA_STRUCT]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i32, i32 }, ptr [[GEP_0]], i64 0, i32 1 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_0]], align 1 +; CHECK-NEXT: store i32 0, ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr [[GEP_1]], align 1 +; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %alloca.struct = alloca %struct.pair + + %gep.0 = getelementptr %struct.pair, ptr %alloca.struct, i64 0, i32 1 + %gep.1 = getelementptr { i32, i32 }, ptr %gep.0, i64 0, i32 1 + + %load.2 = load i32, ptr %gep.0, align 1 + store i32 0, ptr %gep.0, align 1 + + %load.1 = load i32, ptr %gep.1, align 1 + store i32 0, ptr %gep.1, align 1 + + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/atomic.ll b/llvm/test/Transforms/InferAlignment/atomic.ll new file mode 100644 index 0000000000000..23efc4381fd32 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/atomic.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S < %s -passes=no-op-function | FileCheck %s + +; ------------------------------------------------------------------------------ +; load/store of null +; ------------------------------------------------------------------------------ + +define void @load_null() { +; CHECK-LABEL: define void @load_null() { +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr null unordered, align 4 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr null monotonic, align 4 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr null seq_cst, align 4 +; CHECK-NEXT: ret void +; + %x.0 = load atomic i32, ptr null unordered, align 4 + %x.1 = load atomic i32, ptr null monotonic, align 4 + %x.2 = load atomic i32, ptr null seq_cst, align 4 + ret void +} + +define void @store_null() { +; CHECK-LABEL: define void @store_null() { +; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4 +; CHECK-NEXT: store atomic i32 0, ptr null 
monotonic, align 4 +; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4 +; CHECK-NEXT: ret void +; + store atomic i32 0, ptr null unordered, align 4 + store atomic i32 0, ptr null monotonic, align 4 + store atomic i32 0, ptr null seq_cst, align 4 + ret void +} + +; ------------------------------------------------------------------------------ +; load/store of global +; ------------------------------------------------------------------------------ +@c = global i64 42 + +define void @load_nonnull() { +; CHECK-LABEL: define void @load_nonnull() { +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr @c unordered, align 4 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr @c monotonic, align 4 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr @c seq_cst, align 4 +; CHECK-NEXT: ret void +; + %x.0 = load atomic i32, ptr @c unordered, align 4 + %x.1 = load atomic i32, ptr @c monotonic, align 4 + %x.2 = load atomic i32, ptr @c seq_cst, align 4 + ret void +} + +define void @store_nonnull() { +; CHECK-LABEL: define void @store_nonnull() { +; CHECK-NEXT: store atomic i32 0, ptr @c unordered, align 4 +; CHECK-NEXT: store atomic i32 0, ptr @c monotonic, align 4 +; CHECK-NEXT: store atomic i32 0, ptr @c seq_cst, align 4 +; CHECK-NEXT: ret void +; + store atomic i32 0, ptr @c unordered, align 4 + store atomic i32 0, ptr @c monotonic, align 4 + store atomic i32 0, ptr @c seq_cst, align 4 + ret void +} + +; ------------------------------------------------------------------------------ +; load/store of alloca +; ------------------------------------------------------------------------------ + +define void @load_alloca() { +; CHECK-LABEL: define void @load_alloca() { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr [[ALLOCA]] unordered, align 1 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr [[ALLOCA]] monotonic, align 1 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr [[ALLOCA]] seq_cst, align 1 +; CHECK-NEXT: ret void +; + 
%alloca = alloca i32 + %x.0 = load atomic i32, ptr %alloca unordered, align 1 + %x.1 = load atomic i32, ptr %alloca monotonic, align 1 + %x.2 = load atomic i32, ptr %alloca seq_cst, align 1 + ret void +} + +define void @store_alloca() { +; CHECK-LABEL: define void @store_alloca() { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] unordered, align 1 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] monotonic, align 1 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] seq_cst, align 1 +; CHECK-NEXT: ret void +; + %alloca = alloca i32 + store atomic i32 0, ptr %alloca unordered, align 1 + store atomic i32 0, ptr %alloca monotonic, align 1 + store atomic i32 0, ptr %alloca seq_cst, align 1 + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/attributes.ll b/llvm/test/Transforms/InferAlignment/attributes.ll new file mode 100644 index 0000000000000..6dce9a11d661f --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/attributes.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +define void @attribute(ptr align 32 %a) { +; CHECK-LABEL: define void @attribute +; CHECK-SAME: (ptr align 32 [[A:%.*]]) { +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 1 +; CHECK-NEXT: store i32 123, ptr [[A]], align 1 +; CHECK-NEXT: ret void +; + %load = load i32, ptr %a, align 1 + store i32 123, ptr %a, align 1 + ret void +} + +define void @attribute_through_call(ptr align 32 %a) { +; CHECK-LABEL: define void @attribute_through_call +; CHECK-SAME: (ptr align 32 [[A:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call ptr @call(ptr [[A]]) +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1 +; CHECK-NEXT: store i32 123, ptr [[RES]], align 1 +; CHECK-NEXT: ret void +; + %res = call ptr @call(ptr %a) + %load = load i32, ptr %res, align 1 + store i32 123, ptr %res, align 1 + ret void +} + +define void 
@attribute_return_value(ptr %a) { +; CHECK-LABEL: define void @attribute_return_value +; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call align 32 ptr @call(ptr [[A]]) +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1 +; CHECK-NEXT: store i32 123, ptr [[RES]], align 1 +; CHECK-NEXT: ret void +; + %res = call align 32 ptr @call(ptr %a) + %load = load i32, ptr %res, align 1 + store i32 123, ptr %res, align 1 + ret void +} + +declare ptr @call(ptr returned) diff --git a/llvm/test/Transforms/InferAlignment/gep-2d.ll b/llvm/test/Transforms/InferAlignment/gep-2d.ll new file mode 100644 index 0000000000000..b88a9be988ccf --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/gep-2d.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +; A multi-dimensional array in a nested loop.inner doing vector stores that +; aren't yet aligned. InferAlignment can understand the addressing in the +; Nice case to prove 16 byte alignment. In the Awkward case, the inner +; array dimension is not even, so the stores to it won't always be aligned. +; +; InferAlignment should prove alignment in exactly one of the two cases. 
+ +@Nice = global [1001 x [20000 x double]] zeroinitializer, align 32 +@Awkward = global [1001 x [20001 x double]] zeroinitializer, align 32 + +define void @nested_loop() { +; CHECK-LABEL: define void @nested_loop() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_OUTER:%.*]] +; CHECK: loop.outer: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_OUTER_TAIL:%.*]] ] +; CHECK-NEXT: br label [[LOOP_INNER:%.*]] +; CHECK: loop.inner: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[LOOP_OUTER]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER_TAIL:%.*]] ] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 [[I]], i64 [[J]] +; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <2 x double>, ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 [[I]], i64 [[J]] +; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_2]], align 8 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <2 x double>, ptr [[GEP_2]], align 8 +; CHECK-NEXT: br label [[LOOP_INNER_TAIL]] +; CHECK: loop.inner.tail: +; CHECK-NEXT: [[J_NEXT]] = add i64 [[J]], 2 +; CHECK-NEXT: [[J_CMP:%.*]] = icmp eq i64 [[J_NEXT]], 556 +; CHECK-NEXT: br i1 [[J_CMP]], label [[LOOP_OUTER_TAIL]], label [[LOOP_INNER]] +; CHECK: loop.outer.tail: +; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; CHECK-NEXT: [[I_CMP:%.*]] = icmp eq i64 [[I_NEXT]], 991 +; CHECK-NEXT: br i1 [[I_CMP]], label [[RETURN:%.*]], label [[LOOP_OUTER]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + br label %loop.outer + +loop.outer: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop.outer.tail ] + br label %loop.inner + +loop.inner: + %j = phi i64 [ 0, %loop.outer ], [ %j.next, %loop.inner.tail ] + + %gep.1 = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 %i, i64 %j + store <2 x double>, ptr %gep.1, align 8 + %load.1 = load <2 x double>, ptr %gep.1, align 8 + + %gep.2 
= getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 %i, i64 %j + store <2 x double>, ptr %gep.2, align 8 + %load.2 = load <2 x double>, ptr %gep.2, align 8 + + br label %loop.inner.tail + +loop.inner.tail: + %j.next = add i64 %j, 2 + %j.cmp = icmp eq i64 %j.next, 556 + br i1 %j.cmp, label %loop.outer.tail, label %loop.inner + +loop.outer.tail: + %i.next = add i64 %i, 1 + %i.cmp = icmp eq i64 %i.next, 991 + br i1 %i.cmp, label %return, label %loop.outer + +return: + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/gep-array.ll b/llvm/test/Transforms/InferAlignment/gep-array.ll new file mode 100644 index 0000000000000..76ba55eee649e --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/gep-array.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes=no-op-function -S < %s | FileCheck %s + +; ------------------------------------------------------------------------------ +; Array of pair +; ------------------------------------------------------------------------------ + +; Check that we improve the alignment information. +; The base pointer is 16-byte aligned and we access the field at offsets of 8 +; bytes. +; Every element in the @array.simple array is 16-byte aligned so any access from +; the following gep is 8-byte aligned. 
+ +%pair.simple = type { ptr, i32 } +@array.simple = global [4 x %pair.simple] zeroinitializer, align 16 + +define void @simple_pair(i64 %idx) { +; CHECK-LABEL: define void @simple_pair +; CHECK-SAME: (i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x %pair.simple], ptr @array.simple, i64 0, i64 [[IDX]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 1 +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 1 +; CHECK-NEXT: ret void +; + %gep = getelementptr inbounds [4 x %pair.simple], ptr @array.simple, i64 0, i64 %idx, i32 1 + + %res = load i32, ptr %gep, align 1 + store i32 0, ptr %gep, align 1 + + ret void +} + +; ------------------------------------------------------------------------------ +; Array of pair of arrays +; ------------------------------------------------------------------------------ + +%pair.array = type { [3 x i32], [3 x i32] } +@array.array = internal global [3 x %pair.array] zeroinitializer + +define void @load_nested() { +; CHECK-LABEL: define void @load_nested() { +; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr @array.array, align 4 +; CHECK-NEXT: [[X_1:%.*]] = load i32, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 +; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 +; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 +; CHECK-NEXT: [[X_4:%.*]] = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 +; CHECK-NEXT: ret void +; + %x.0 = load i32, ptr @array.array, align 4 + %x.1 = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 + %x.2 = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 + %x.3 = load i32, ptr getelementptr ([3 x 
%pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 + %x.4 = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 + ret void +} + +define void @store_nested() { +; CHECK-LABEL: define void @store_nested() { +; CHECK-NEXT: store i32 1, ptr @array.array, align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 +; CHECK-NEXT: ret void +; + store i32 1, ptr @array.array, align 4 + store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 + store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 + store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 + store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/irregular-size.ll b/llvm/test/Transforms/InferAlignment/irregular-size.ll new file mode 100644 index 0000000000000..caec3f55b6121 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/irregular-size.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +define void @non_pow2_size(i177 %X) { +; CHECK-LABEL: define void @non_pow2_size +; CHECK-SAME: (i177 [[X:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = alloca i177, align 1 +; CHECK-NEXT: [[L1:%.*]] = load i177, ptr [[A]], align 1 +; 
CHECK-NEXT: store i177 [[X]], ptr [[A]], align 1 +; CHECK-NEXT: ret void +; + %A = alloca i177, align 1 + %L1 = load i177, ptr %A, align 1 + store i177 %X, ptr %A, align 1 + ret void +} + +; TODO: For non-byte-sized vectors, current implementation assumes there is +; padding to the next byte boundary between elements. +@vector_i4 = constant [16 x <2 x i4>] zeroinitializer, align 8 + +define void @load_vector_i4(i4 %X) { +; CHECK-LABEL: define void @load_vector_i4 +; CHECK-SAME: (i4 [[X:%.*]]) { +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 4 +; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 8 +; CHECK-NEXT: [[RES_0:%.*]] = load i4, ptr [[PTR_0]], align 1 +; CHECK-NEXT: [[RES_1:%.*]] = load i4, ptr [[PTR_1]], align 1 +; CHECK-NEXT: [[RES_2:%.*]] = load i4, ptr [[PTR_2]], align 1 +; CHECK-NEXT: [[RES_3:%.*]] = load i4, ptr [[PTR_3]], align 1 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_0]], align 1 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_1]], align 1 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_2]], align 1 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_3]], align 1 +; CHECK-NEXT: ret void +; + %ptr.0 = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 1 + %ptr.1 = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 2 + %ptr.2 = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 4 + %ptr.3 = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 8 + + %res.0 = load i4, ptr %ptr.0, align 1 + %res.1 = load i4, ptr %ptr.1, align 1 + %res.2 = load i4, ptr %ptr.2, align 1 + %res.3 = load i4, ptr %ptr.3, align 1 + + store i4 %X, ptr %ptr.0, align 1 + store i4 %X, ptr %ptr.1, align 1 + store i4 %X, ptr %ptr.2, align 1 + store i4 %X, ptr %ptr.3, align 1 + + ret void +} diff --git 
a/llvm/test/Transforms/InferAlignment/propagate-assume.ll b/llvm/test/Transforms/InferAlignment/propagate-assume.ll new file mode 100644 index 0000000000000..a5c7afa0393ba --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/propagate-assume.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +; ------------------------------------------------------------------------------ +; Simple test +; ------------------------------------------------------------------------------ + +define void @simple_forwardpropagate(ptr %a) { +; CHECK-LABEL: define void @simple_forwardpropagate +; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + %ptrint = ptrtoint ptr %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + + %load.a = load i32, ptr %a, align 4 + store i32 345, ptr %a, align 4 + + ret void +} + +define void @simple_backpropagate(ptr %a) { +; CHECK-LABEL: define void @simple_backpropagate +; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: ret void +; + %load.a = load i32, ptr %a, align 4 + store i32 345, ptr %a, align 4 + + %ptrint = ptrtoint ptr %a to i64 + %maskedptr = and i64 
%ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + + ret void +} + +define void @simple_forwardpropagate_bundle(ptr %a) { +; CHECK-LABEL: define void @simple_forwardpropagate_bundle +; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %a, i32 32)] + %load.a = load i32, ptr %a, align 4 + store i32 345, ptr %a, align 4 + ret void +} + +define void @simple_backpropagate_bundle(ptr %a) { +; CHECK-LABEL: define void @simple_backpropagate_bundle +; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] +; CHECK-NEXT: ret void +; + %load.a = load i32, ptr %a, align 4 + store i32 345, ptr %a, align 4 + call void @llvm.assume(i1 true) ["align"(ptr %a, i32 32)] + ret void +} + +; ------------------------------------------------------------------------------ +; Complex test +; ------------------------------------------------------------------------------ + +define void @loop_forwardpropagate(ptr %a, ptr %b) { +; CHECK-LABEL: define void @loop_forwardpropagate +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: [[PTRINT2:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[MASKEDPTR2:%.*]] = and i64 [[PTRINT2]], 63 +; CHECK-NEXT: [[MASKEDCOND2:%.*]] = icmp eq i64 [[MASKEDPTR2]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKEDCOND2]]) +; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[LOAD_B]], 1 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[I_NEXT]], 1648 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %ptrint = ptrtoint ptr %a to i64 + %maskedptr = and i64 %ptrint, 63 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + + %ptrint2 = ptrtoint ptr %b to i64 + %maskedptr2 = and i64 %ptrint2, 63 + %maskedcond2 = icmp eq i64 %maskedptr2, 0 + tail call void @llvm.assume(i1 %maskedcond2) + + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + + %gep.b = getelementptr inbounds i32, ptr %b, i64 %i + %load.b = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %load.b, 1 + + %gep.a = getelementptr inbounds i32, ptr %a, i64 %i + store i32 %add, ptr %gep.a, align 4 + + %i.next = add nuw nsw i64 %i, 16 + %cmp = icmp slt i64 %i.next, 1648 + + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +define void @loop_forwardpropagate_bundle(ptr %a, ptr %b) { +; CHECK-LABEL: define void @loop_forwardpropagate_bundle +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 64) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[B]], i32 64) ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] +; 
CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[LOAD_B]], 1 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[I_NEXT]], 1648 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.assume(i1 true) ["align"(ptr %a, i32 64)] + tail call void @llvm.assume(i1 true) ["align"(ptr %b, i32 64)] + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + + %gep.b = getelementptr inbounds i32, ptr %b, i64 %i + %load.b = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %load.b, 1 + + %gep.a = getelementptr inbounds i32, ptr %a, i64 %i + store i32 %add, ptr %gep.a, align 4 + + %i.next = add nuw nsw i64 %i, 16 + %cmp = icmp slt i64 %i.next, 1648 + + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Check that assume is propagated backwards through all +; operations that are `isGuaranteedToTransferExecutionToSuccessor` +; (it should reach the load and mark it as `align 32`). 
+define void @complex_backpropagate(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: define void @complex_backpropagate +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[B]], align 4 +; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 4 +; CHECK-NEXT: [[OBJ_SIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[C]], i1 false, i1 false, i1 false) +; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: ret void +; + %alloca = alloca i64 + %load.a = load i32, ptr %a, align 4 + + %load.b = load i32, ptr %b + store i32 %load.b, ptr %a + + %obj.size = call i64 @llvm.objectsize.i64.p0(ptr %c, i1 false) + store i64 %obj.size, ptr %alloca + + %ptrint = ptrtoint ptr %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + + ret void +} + +define void @complex_backpropagate_bundle(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: define void @complex_backpropagate_bundle +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[B]], align 4 +; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 4 +; CHECK-NEXT: [[OBJ_SIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[C]], i1 false, i1 false, i1 false) +; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] +; CHECK-NEXT: ret void +; + %alloca = alloca i64 + %load.a = load i32, ptr %a, align 
4 + + %load.b = load i32, ptr %b + store i32 %load.b, ptr %a + + %obj.size = call i64 @llvm.objectsize.i64.p0(ptr %c, i1 false) + store i64 %obj.size, ptr %alloca + + tail call void @llvm.assume(i1 true) ["align"(ptr %a, i32 32)] + + ret void +} + +declare i64 @llvm.objectsize.i64.p0(ptr, i1) +declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/InferAlignment/ptrmask.ll b/llvm/test/Transforms/InferAlignment/ptrmask.ll new file mode 100644 index 0000000000000..1db2d09321648 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/ptrmask.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +; ------------------------------------------------------------------------------ +; load instructions +; ------------------------------------------------------------------------------ + +define void @load(ptr align 1 %ptr) { +; CHECK-LABEL: define void @load +; CHECK-SAME: (ptr align 1 [[PTR:%.*]]) { +; CHECK-NEXT: [[ALIGNED_0:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -2) +; CHECK-NEXT: [[ALIGNED_1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) +; CHECK-NEXT: [[ALIGNED_2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr [[ALIGNED_0]], align 1 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[ALIGNED_1]], align 1 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x i8>, ptr [[ALIGNED_2]], align 1 +; CHECK-NEXT: ret void +; + %aligned.0 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -2) + %aligned.1 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -4) + %aligned.2 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) + + %load.0 = load <16 x i8>, ptr %aligned.0, align 1 + %load.1 = load <16 x i8>, ptr %aligned.1, align 1 + %load.2 = load <16 x i8>, ptr %aligned.2, align 1 + + ret void +} + +; ------------------------------------------------------------------------------ +; store 
instructions +; ------------------------------------------------------------------------------ + +define void @store(ptr align 1 %ptr) { +; CHECK-LABEL: define void @store +; CHECK-SAME: (ptr align 1 [[PTR:%.*]]) { +; CHECK-NEXT: [[ALIGNED_0:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -2) +; CHECK-NEXT: [[ALIGNED_1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) +; CHECK-NEXT: [[ALIGNED_2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_0]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_1]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_2]], align 1 +; CHECK-NEXT: ret void +; + %aligned.0 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -2) + %aligned.1 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -4) + %aligned.2 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) + + store <16 x i8> zeroinitializer, ptr %aligned.0, align 1 + store <16 x i8> zeroinitializer, ptr %aligned.1, align 1 + store <16 x i8> zeroinitializer, ptr %aligned.2, align 1 + + ret void +} + +; ------------------------------------------------------------------------------ +; Overaligned pointer +; ------------------------------------------------------------------------------ + +; Underlying alignment greater than alignment forced by ptrmask +define void @ptrmask_overaligned(ptr align 16 %ptr) { +; CHECK-LABEL: define void @ptrmask_overaligned +; CHECK-SAME: (ptr align 16 [[PTR:%.*]]) { +; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED]], align 1 +; CHECK-NEXT: ret void +; + %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) + + %load = load <16 x i8>, ptr %aligned, align 1 + store <16 x i8> zeroinitializer, ptr %aligned, align 1 + + ret void +} + +declare ptr @llvm.ptrmask.p0.i64(ptr, i64) 
diff --git a/llvm/test/Transforms/InferAlignment/undef-and-null.ll b/llvm/test/Transforms/InferAlignment/undef-and-null.ll new file mode 100644 index 0000000000000..76b751a4d411f --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/undef-and-null.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes=no-op-function -S < %s | FileCheck %s + +define void @load_undef_null(ptr %P) { +; CHECK-LABEL: define void @load_undef_null +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[RET_0:%.*]] = load i32, ptr undef, align 4 +; CHECK-NEXT: [[RET_1:%.*]] = load i32, ptr null, align 4 +; CHECK-NEXT: ret void +; + %ret.0 = load i32, ptr undef + %ret.1 = load i32, ptr null + ret void +} + +define void @store_undef_null(ptr %P) { +; CHECK-LABEL: define void @store_undef_null +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 123, ptr undef, align 4 +; CHECK-NEXT: store i32 124, ptr null, align 4 +; CHECK-NEXT: ret void +; + store i32 123, ptr undef + store i32 124, ptr null + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/vector.ll b/llvm/test/Transforms/InferAlignment/vector.ll new file mode 100644 index 0000000000000..1599b583f3244 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/vector.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +; InferAlignment should be able to prove vector alignment in the +; presence of a few mild address computation tricks. 
+ +; ------------------------------------------------------------------------------ +; alloca +; ------------------------------------------------------------------------------ + +define void @alloca(<2 x i64> %y) { +; CHECK-LABEL: define void @alloca +; CHECK-SAME: (<2 x i64> [[Y:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x i64>, align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[ALLOCA]], align 1 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[ALLOCA]], align 1 +; CHECK-NEXT: ret void +; + %alloca = alloca <2 x i64> + %load = load <2 x i64>, ptr %alloca, align 1 + store <2 x i64> %y, ptr %alloca, align 1 + ret void +} + +; ------------------------------------------------------------------------------ +; global +; ------------------------------------------------------------------------------ + +@x.vector = external global <2 x i64>, align 16 + +define void @global(<2 x i64> %y) { +; CHECK-LABEL: define void @global +; CHECK-SAME: (<2 x i64> [[Y:%.*]]) { +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr @x.vector, align 1 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr @x.vector, align 1 +; CHECK-NEXT: ret void +; + %load = load <2 x i64>, ptr @x.vector, align 1 + store <2 x i64> %y, ptr @x.vector, align 1 + ret void +} + +; ------------------------------------------------------------------------------ +; getelementptr +; ------------------------------------------------------------------------------ + +@vector = external global <2 x i64>, align 16 +@vector.arr = external global [13 x <2 x i64>], align 16 + +; ------------------------------------------------------------------------------ +; 1d access +; ------------------------------------------------------------------------------ + +define void @vector_singular(i32 %i, <2 x i64> %y) { +; CHECK-LABEL: define void @vector_singular +; CHECK-SAME: (i32 [[I:%.*]], <2 x i64> [[Y:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <2 x i64>, ptr @vector, i32 [[I]] +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr 
[[GEP]], align 1 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 1 +; CHECK-NEXT: ret void +; + %gep = getelementptr <2 x i64>, ptr @vector, i32 %i + %load = load <2 x i64>, ptr %gep, align 1 + store <2 x i64> %y, ptr %gep, align 1 + ret void +} + +; ------------------------------------------------------------------------------ +; 2d access +; ------------------------------------------------------------------------------ + +define void @vector_array(i32 %i, i32 %j, <2 x i64> %y) { +; CHECK-LABEL: define void @vector_array +; CHECK-SAME: (i32 [[I:%.*]], i32 [[J:%.*]], <2 x i64> [[Y:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [13 x <2 x i64>], ptr @vector.arr, i32 [[I]], i32 [[J]] +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 1 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 1 +; CHECK-NEXT: ret void +; + %gep = getelementptr [13 x <2 x i64>], ptr @vector.arr, i32 %i, i32 %j + %load = load <2 x i64>, ptr %gep, align 1 + store <2 x i64> %y, ptr %gep, align 1 + ret void +} + +; ------------------------------------------------------------------------------ +; non-vector array type +; ------------------------------------------------------------------------------ + +; When we see a unaligned load or store from an insufficiently aligned global or +; alloca, increase the alignment, turning it into an aligned load or store. 
+@x.array = internal global [4 x i32] zeroinitializer + +define void @nonvector_array() { +; CHECK-LABEL: define void @nonvector_array() { +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr @x.array, align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @x.array, align 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr @x.array, i16 0, i16 2 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[GEP]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[GEP]], align 1 +; CHECK-NEXT: ret void +; + %load.0 = load <16 x i8>, ptr @x.array, align 1 + store <16 x i8> zeroinitializer, ptr @x.array, align 1 + + %gep = getelementptr [4 x i32], ptr @x.array, i16 0, i16 2 + %load.1 = load <16 x i8>, ptr %gep, align 1 + store <16 x i8> zeroinitializer, ptr %gep, align 1 + + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/volatile.ll b/llvm/test/Transforms/InferAlignment/volatile.ll new file mode 100644 index 0000000000000..f2991b6fc2d2d --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/volatile.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=no-op-function -S | FileCheck %s + +define void @load_volatile() { +; CHECK-LABEL: define void @load_volatile() { +; CHECK-NEXT: [[A:%.*]] = alloca { i32 }, align 8 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load volatile i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_B:%.*]] = load volatile i32, ptr [[B]], align 4 +; CHECK-NEXT: ret void +; + %a = alloca { i32 } + %b = alloca i32 + %load.a = load volatile i32, ptr %a + %load.b = load volatile i32, ptr %b + ret void +} + +define void @store_volatile() { +; CHECK-LABEL: define void @store_volatile() { +; CHECK-NEXT: [[A:%.*]] = alloca { i32 }, align 8 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store volatile i32 123, ptr [[A]], align 4 +; CHECK-NEXT: store volatile i32 123, ptr [[B]], align 4 +; 
CHECK-NEXT: ret void +; + %a = alloca { i32 } + %b = alloca i32 + store volatile i32 123, ptr %a + store volatile i32 123, ptr %b + ret void +} diff --git a/llvm/test/Transforms/InferAlignment/vscale.ll b/llvm/test/Transforms/InferAlignment/vscale.ll new file mode 100644 index 0000000000000..5152d50a6bb77 --- /dev/null +++ b/llvm/test/Transforms/InferAlignment/vscale.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes=no-op-function -S < %s | FileCheck %s + +; <4 x i32> -> 16 byte alignment +define void @alignment_sustain(ptr align 16 %ptr) { +; CHECK-LABEL: define void @alignment_sustain +; CHECK-SAME: (ptr align 16 [[PTR:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[PTR]], i32 3 +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i32>, ptr [[GEP]], align 16 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GEP]], align 16 +; CHECK-NEXT: ret void +; + %gep = getelementptr , ptr %ptr, i32 3 + + %load = load <4 x i32>, ptr %gep, align 16 + store <4 x i32> zeroinitializer, ptr %gep, align 16 + + ret void +} + +; <8 x i32> -> 32 byte alignment +define void @alignment_increase(ptr align 32 %ptr) { +; CHECK-LABEL: define void @alignment_increase +; CHECK-SAME: (ptr align 32 [[PTR:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[PTR]], i32 3 +; CHECK-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[GEP]], align 16 +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[GEP]], align 16 +; CHECK-NEXT: ret void +; + %gep = getelementptr , ptr %ptr, i32 3 + + %load = load <8 x i32>, ptr %gep, align 16 + store <8 x i32> zeroinitializer, ptr %gep, align 16 + + ret void +} From 0f152a55d3e4e71f7c795bf555e40c8895b97077 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Thu, 20 Jul 2023 10:57:19 +0530 Subject: [PATCH 45/57] [InferAlignment] Implement InferAlignmentPass This pass aims to infer alignment for instructions as a separate pass, to reduce 
redundant work done by InstCombine running multiple times. It runs late in the pipeline, just before the back-end passes where this information is most useful. Differential Revision: https://reviews.llvm.org/D158529 --- .../llvm/Transforms/Scalar/InferAlignment.h | 27 ++++++ llvm/include/llvm/Transforms/Utils/Local.h | 9 ++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 10 ++ llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + llvm/lib/Transforms/Scalar/InferAlignment.cpp | 91 +++++++++++++++++++ llvm/lib/Transforms/Utils/Local.cpp | 11 +-- llvm/test/Transforms/InferAlignment/alloca.ll | 20 ++-- llvm/test/Transforms/InferAlignment/atomic.ll | 38 ++++---- .../Transforms/InferAlignment/attributes.ll | 14 +-- llvm/test/Transforms/InferAlignment/gep-2d.ll | 6 +- .../Transforms/InferAlignment/gep-array.ll | 18 ++-- .../InferAlignment/irregular-size.ll | 20 ++-- .../InferAlignment/propagate-assume.ll | 38 ++++---- .../test/Transforms/InferAlignment/ptrmask.ll | 18 ++-- .../InferAlignment/undef-and-null.ll | 6 +- llvm/test/Transforms/InferAlignment/vector.ll | 26 +++--- .../Transforms/InferAlignment/volatile.ll | 6 +- llvm/test/Transforms/InferAlignment/vscale.ll | 6 +- 20 files changed, 250 insertions(+), 117 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Scalar/InferAlignment.h create mode 100644 llvm/lib/Transforms/Scalar/InferAlignment.cpp diff --git a/llvm/include/llvm/Transforms/Scalar/InferAlignment.h b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h new file mode 100644 index 0000000000000..f6fc5f3b8d987 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h @@ -0,0 +1,27 @@ +//===- InferAlignment.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Infer alignment for loads, stores and other memory operations based on +// trailing zero known bits information. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H +#define LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct InferAlignmentPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 81de70a2fb4cd..c752bc7f4a31f 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -213,6 +213,15 @@ AllocaInst *DemoteRegToStack(Instruction &X, /// deleted and it returns the pointer to the alloca inserted. AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr); +/// If the specified pointer points to an object that we control, try to modify +/// the object's alignment to PrefAlign. Returns a minimum known alignment of +/// the value after the operation, which may be lower than PrefAlign. +/// +/// Increasing value alignment isn't often possible though. If alignment is +/// important, a more reliable approach is to simply align all global variables +/// and allocation instructions to their preferred alignment from the beginning. +Align tryEnforceAlignment(Value *V, Align PrefAlign, const DataLayout &DL); + +/// Try to ensure that the alignment of \p V is at least \p PrefAlign bytes. If +/// the owning object can be modified and has an alignment less than \p +/// PrefAlign, it will be increased and \p PrefAlign returned. 
If the alignment diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5c7f26109930c..985ff88139323 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -171,6 +171,7 @@ #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/InferAlignment.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Scalar/LICM.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index b12b4ee3e0e59..529743cc8bd2e 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -88,6 +88,7 @@ #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/InferAlignment.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Scalar/LICM.h" @@ -274,6 +275,11 @@ cl::opt EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); +cl::opt EnableInferAlignmentPass( + "enable-infer-alignment-pass", cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Enable the InferAlignment pass, disabling alignment inference in " + "InstCombine")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1140,6 +1146,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + if (EnableInferAlignmentPass) + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have 
significantly shortened a loop body; unroll // again. Unroll small loops to hide loop backedge latency and saturate any @@ -1257,6 +1265,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); } + if (EnableInferAlignmentPass) + FPM.addPass(InferAlignmentPass()); FPM.addPass(InstCombinePass()); // This is needed for two reasons: diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index b9aa015d02dd9..df9f14920f291 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -337,6 +337,7 @@ FUNCTION_PASS("gvn-hoist", GVNHoistPass()) FUNCTION_PASS("gvn-sink", GVNSinkPass()) FUNCTION_PASS("helloworld", HelloWorldPass()) FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass()) +FUNCTION_PASS("infer-alignment", InferAlignmentPass()) FUNCTION_PASS("instcount", InstCountPass()) FUNCTION_PASS("instsimplify", InstSimplifyPass()) FUNCTION_PASS("invalidate", InvalidateAllAnalysesPass()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index eb008c15903a7..2dd27037a17de 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_component_library(LLVMScalarOpts InductiveRangeCheckElimination.cpp IndVarSimplify.cpp InferAddressSpaces.cpp + InferAlignment.cpp InstSimplifyPass.cpp JumpThreading.cpp LICM.cpp diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp new file mode 100644 index 0000000000000..b75b8d486fbbe --- /dev/null +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -0,0 +1,91 @@ +//===- InferAlignment.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Infer alignment for loads, stores and other memory operations based on +// trailing zero known bits information. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/InferAlignment.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +static bool tryToImproveAlign( + const DataLayout &DL, Instruction *I, + function_ref Fn) { + if (auto *LI = dyn_cast(I)) { + Value *PtrOp = LI->getPointerOperand(); + Align OldAlign = LI->getAlign(); + Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(LI->getType())); + if (NewAlign > OldAlign) { + LI->setAlignment(NewAlign); + return true; + } + } else if (auto *SI = dyn_cast(I)) { + Value *PtrOp = SI->getPointerOperand(); + Value *ValOp = SI->getValueOperand(); + Align OldAlign = SI->getAlign(); + Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(ValOp->getType())); + if (NewAlign > OldAlign) { + SI->setAlignment(NewAlign); + return true; + } + } + // TODO: Also handle memory intrinsics. + return false; +} + +bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { + const DataLayout &DL = F.getParent()->getDataLayout(); + bool Changed = false; + + // Enforce preferred type alignment if possible. We do this as a separate + // pass first, because it may improve the alignments we infer below. 
+ for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + Changed |= tryToImproveAlign( + DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { + if (PrefAlign > OldAlign) + return std::max(OldAlign, + tryEnforceAlignment(PtrOp, PrefAlign, DL)); + return OldAlign; + }); + } + } + + // Compute alignment from known bits. + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + Changed |= tryToImproveAlign( + DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { + KnownBits Known = computeKnownBits(PtrOp, DL, 0, &AC, &I, &DT); + unsigned TrailZ = std::min(Known.countMinTrailingZeros(), + +Value::MaxAlignmentExponent); + return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + }); + } + } + + return Changed; +} + +PreservedAnalyses InferAlignmentPass::run(Function &F, + FunctionAnalysisManager &AM) { + AssumptionCache &AC = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + inferAlignment(F, AC, DT); + // Changes to alignment shouldn't invalidate analyses. + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index e11c4aac66ad9..ddb47e693a643 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1388,15 +1388,8 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { return Changed; } -/// If the specified pointer points to an object that we control, try to modify -/// the object's alignment to PrefAlign. Returns a minimum known alignment of -/// the value after the operation, which may be lower than PrefAlign. -/// -/// Increating value alignment isn't often possible though. If alignment is -/// important, a more reliable approach is to simply align all global variables -/// and allocation instructions to their preferred alignment from the beginning. 
-static Align tryEnforceAlignment(Value *V, Align PrefAlign, - const DataLayout &DL) { +Align llvm::tryEnforceAlignment(Value *V, Align PrefAlign, + const DataLayout &DL) { V = V->stripPointerCasts(); if (AllocaInst *AI = dyn_cast(V)) { diff --git a/llvm/test/Transforms/InferAlignment/alloca.ll b/llvm/test/Transforms/InferAlignment/alloca.ll index b64413336d347..986f291889a2a 100644 --- a/llvm/test/Transforms/InferAlignment/alloca.ll +++ b/llvm/test/Transforms/InferAlignment/alloca.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s ; ------------------------------------------------------------------------------ ; Scalar type @@ -8,11 +8,11 @@ define void @alloca_local(i8 %x, i32 %y) { ; CHECK-LABEL: define void @alloca_local ; CHECK-SAME: (i8 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 1 -; CHECK-NEXT: [[LOAD_I8:%.*]] = load i8, ptr [[ALLOCA]], align 1 -; CHECK-NEXT: [[LOAD_I32:%.*]] = load i32, ptr [[ALLOCA]], align 1 -; CHECK-NEXT: store i8 [[X]], ptr [[ALLOCA]], align 1 -; CHECK-NEXT: store i32 [[Y]], ptr [[ALLOCA]], align 1 +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LOAD_I8:%.*]] = load i8, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[LOAD_I32:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: store i8 [[X]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: store i32 [[Y]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: ret void ; %alloca = alloca i32, align 1 @@ -38,10 +38,10 @@ define void @alloca_struct(i32 %x) { ; CHECK-NEXT: [[ALLOCA_STRUCT:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 8 ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[STRUCT_PAIR]], ptr [[ALLOCA_STRUCT]], i64 0, i32 1 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i32, i32 }, ptr [[GEP_0]], i64 0, i32 1 -; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_0]], align 1 -; 
CHECK-NEXT: store i32 0, ptr [[GEP_0]], align 1 -; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr [[GEP_1]], align 1 -; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_0]], align 8 +; CHECK-NEXT: store i32 0, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr [[GEP_1]], align 4 +; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4 ; CHECK-NEXT: ret void ; %alloca.struct = alloca %struct.pair diff --git a/llvm/test/Transforms/InferAlignment/atomic.ll b/llvm/test/Transforms/InferAlignment/atomic.ll index 23efc4381fd32..949e60a61edfa 100644 --- a/llvm/test/Transforms/InferAlignment/atomic.ll +++ b/llvm/test/Transforms/InferAlignment/atomic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S < %s -passes=no-op-function | FileCheck %s +; RUN: opt -S < %s -passes=infer-alignment | FileCheck %s ; ------------------------------------------------------------------------------ ; load/store of null @@ -7,9 +7,9 @@ define void @load_null() { ; CHECK-LABEL: define void @load_null() { -; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr null unordered, align 4 -; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr null monotonic, align 4 -; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr null seq_cst, align 4 +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr null unordered, align 4294967296 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr null monotonic, align 4294967296 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr null seq_cst, align 4294967296 ; CHECK-NEXT: ret void ; %x.0 = load atomic i32, ptr null unordered, align 4 @@ -20,9 +20,9 @@ define void @load_null() { define void @store_null() { ; CHECK-LABEL: define void @store_null() { -; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4 -; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4 -; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4 +; CHECK-NEXT: store 
atomic i32 0, ptr null unordered, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4294967296 ; CHECK-NEXT: ret void ; store atomic i32 0, ptr null unordered, align 4 @@ -38,9 +38,9 @@ define void @store_null() { define void @load_nonnull() { ; CHECK-LABEL: define void @load_nonnull() { -; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr @c unordered, align 4 -; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr @c monotonic, align 4 -; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr @c seq_cst, align 4 +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr @c unordered, align 8 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr @c monotonic, align 8 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr @c seq_cst, align 8 ; CHECK-NEXT: ret void ; %x.0 = load atomic i32, ptr @c unordered, align 4 @@ -51,9 +51,9 @@ define void @load_nonnull() { define void @store_nonnull() { ; CHECK-LABEL: define void @store_nonnull() { -; CHECK-NEXT: store atomic i32 0, ptr @c unordered, align 4 -; CHECK-NEXT: store atomic i32 0, ptr @c monotonic, align 4 -; CHECK-NEXT: store atomic i32 0, ptr @c seq_cst, align 4 +; CHECK-NEXT: store atomic i32 0, ptr @c unordered, align 8 +; CHECK-NEXT: store atomic i32 0, ptr @c monotonic, align 8 +; CHECK-NEXT: store atomic i32 0, ptr @c seq_cst, align 8 ; CHECK-NEXT: ret void ; store atomic i32 0, ptr @c unordered, align 4 @@ -69,9 +69,9 @@ define void @store_nonnull() { define void @load_alloca() { ; CHECK-LABEL: define void @load_alloca() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr [[ALLOCA]] unordered, align 1 -; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr [[ALLOCA]] monotonic, align 1 -; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr [[ALLOCA]] seq_cst, align 1 +; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr [[ALLOCA]] unordered, align 4 +; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr [[ALLOCA]] 
monotonic, align 4 +; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr [[ALLOCA]] seq_cst, align 4 ; CHECK-NEXT: ret void ; %alloca = alloca i32 @@ -84,9 +84,9 @@ define void @load_alloca() { define void @store_alloca() { ; CHECK-LABEL: define void @store_alloca() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] unordered, align 1 -; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] monotonic, align 1 -; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] seq_cst, align 1 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] unordered, align 4 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] monotonic, align 4 +; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] seq_cst, align 4 ; CHECK-NEXT: ret void ; %alloca = alloca i32 diff --git a/llvm/test/Transforms/InferAlignment/attributes.ll b/llvm/test/Transforms/InferAlignment/attributes.ll index 6dce9a11d661f..c74dec9ac90d9 100644 --- a/llvm/test/Transforms/InferAlignment/attributes.ll +++ b/llvm/test/Transforms/InferAlignment/attributes.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s define void @attribute(ptr align 32 %a) { ; CHECK-LABEL: define void @attribute ; CHECK-SAME: (ptr align 32 [[A:%.*]]) { -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 1 -; CHECK-NEXT: store i32 123, ptr [[A]], align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: store i32 123, ptr [[A]], align 32 ; CHECK-NEXT: ret void ; %load = load i32, ptr %a, align 1 @@ -17,8 +17,8 @@ define void @attribute_through_call(ptr align 32 %a) { ; CHECK-LABEL: define void @attribute_through_call ; CHECK-SAME: (ptr align 32 [[A:%.*]]) { ; CHECK-NEXT: [[RES:%.*]] = call ptr @call(ptr [[A]]) -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1 -; CHECK-NEXT: store i32 123, ptr [[RES]], align 
1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 32 +; CHECK-NEXT: store i32 123, ptr [[RES]], align 32 ; CHECK-NEXT: ret void ; %res = call ptr @call(ptr %a) @@ -31,8 +31,8 @@ define void @attribute_return_value(ptr %a) { ; CHECK-LABEL: define void @attribute_return_value ; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: [[RES:%.*]] = call align 32 ptr @call(ptr [[A]]) -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1 -; CHECK-NEXT: store i32 123, ptr [[RES]], align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 32 +; CHECK-NEXT: store i32 123, ptr [[RES]], align 32 ; CHECK-NEXT: ret void ; %res = call align 32 ptr @call(ptr %a) diff --git a/llvm/test/Transforms/InferAlignment/gep-2d.ll b/llvm/test/Transforms/InferAlignment/gep-2d.ll index b88a9be988ccf..4ce9e11f401c8 100644 --- a/llvm/test/Transforms/InferAlignment/gep-2d.ll +++ b/llvm/test/Transforms/InferAlignment/gep-2d.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s ; A multi-dimensional array in a nested loop.inner doing vector stores that ; aren't yet aligned. 
InferAlignment can understand the addressing in the @@ -21,8 +21,8 @@ define void @nested_loop() { ; CHECK: loop.inner: ; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[LOOP_OUTER]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER_TAIL:%.*]] ] ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 [[I]], i64 [[J]] -; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_1]], align 8 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <2 x double>, ptr [[GEP_1]], align 8 +; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_1]], align 16 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <2 x double>, ptr [[GEP_1]], align 16 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 [[I]], i64 [[J]] ; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_2]], align 8 ; CHECK-NEXT: [[LOAD_2:%.*]] = load <2 x double>, ptr [[GEP_2]], align 8 diff --git a/llvm/test/Transforms/InferAlignment/gep-array.ll b/llvm/test/Transforms/InferAlignment/gep-array.ll index 76ba55eee649e..6f6051144b710 100644 --- a/llvm/test/Transforms/InferAlignment/gep-array.ll +++ b/llvm/test/Transforms/InferAlignment/gep-array.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -passes=no-op-function -S < %s | FileCheck %s +; RUN: opt -passes=infer-alignment -S < %s | FileCheck %s ; ------------------------------------------------------------------------------ ; Array of pair @@ -18,8 +18,8 @@ define void @simple_pair(i64 %idx) { ; CHECK-LABEL: define void @simple_pair ; CHECK-SAME: (i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x %pair.simple], ptr @array.simple, i64 0, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 1 -; CHECK-NEXT: store i32 0, ptr [[GEP]], align 1 +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 8 +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 8 ; CHECK-NEXT: ret void ; %gep = getelementptr inbounds [4 
x %pair.simple], ptr @array.simple, i64 0, i64 %idx, i32 1 @@ -39,11 +39,11 @@ define void @simple_pair(i64 %idx) { define void @load_nested() { ; CHECK-LABEL: define void @load_nested() { -; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr @array.array, align 4 +; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr @array.array, align 16 ; CHECK-NEXT: [[X_1:%.*]] = load i32, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 -; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 +; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 8 ; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 -; CHECK-NEXT: [[X_4:%.*]] = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 +; CHECK-NEXT: [[X_4:%.*]] = load i32, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 16 ; CHECK-NEXT: ret void ; %x.0 = load i32, ptr @array.array, align 4 @@ -56,11 +56,11 @@ define void @load_nested() { define void @store_nested() { ; CHECK-LABEL: define void @store_nested() { -; CHECK-NEXT: store i32 1, ptr @array.array, align 4 +; CHECK-NEXT: store i32 1, ptr @array.array, align 16 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 2), align 8 ; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x 
%pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %pair.array], ptr @array.array, i64 0, i64 0, i32 0, i64 4), align 16 ; CHECK-NEXT: ret void ; store i32 1, ptr @array.array, align 4 diff --git a/llvm/test/Transforms/InferAlignment/irregular-size.ll b/llvm/test/Transforms/InferAlignment/irregular-size.ll index caec3f55b6121..9413c8ac5be46 100644 --- a/llvm/test/Transforms/InferAlignment/irregular-size.ll +++ b/llvm/test/Transforms/InferAlignment/irregular-size.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s define void @non_pow2_size(i177 %X) { ; CHECK-LABEL: define void @non_pow2_size ; CHECK-SAME: (i177 [[X:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = alloca i177, align 1 -; CHECK-NEXT: [[L1:%.*]] = load i177, ptr [[A]], align 1 -; CHECK-NEXT: store i177 [[X]], ptr [[A]], align 1 +; CHECK-NEXT: [[A:%.*]] = alloca i177, align 8 +; CHECK-NEXT: [[L1:%.*]] = load i177, ptr [[A]], align 8 +; CHECK-NEXT: store i177 [[X]], ptr [[A]], align 8 ; CHECK-NEXT: ret void ; %A = alloca i177, align 1 @@ -27,13 +27,13 @@ define void @load_vector_i4(i4 %X) { ; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 4 ; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 8 ; CHECK-NEXT: [[RES_0:%.*]] = load i4, ptr [[PTR_0]], align 1 -; CHECK-NEXT: [[RES_1:%.*]] = load i4, ptr [[PTR_1]], align 1 -; CHECK-NEXT: [[RES_2:%.*]] = load i4, ptr [[PTR_2]], align 1 -; CHECK-NEXT: [[RES_3:%.*]] = load i4, ptr [[PTR_3]], align 1 +; CHECK-NEXT: [[RES_1:%.*]] = load i4, ptr [[PTR_1]], align 2 +; CHECK-NEXT: [[RES_2:%.*]] = load i4, ptr [[PTR_2]], align 4 +; CHECK-NEXT: [[RES_3:%.*]] = load i4, ptr [[PTR_3]], align 8 ; CHECK-NEXT: store i4 [[X]], ptr [[PTR_0]], align 1 -; CHECK-NEXT: 
store i4 [[X]], ptr [[PTR_1]], align 1 -; CHECK-NEXT: store i4 [[X]], ptr [[PTR_2]], align 1 -; CHECK-NEXT: store i4 [[X]], ptr [[PTR_3]], align 1 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_1]], align 2 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_2]], align 4 +; CHECK-NEXT: store i4 [[X]], ptr [[PTR_3]], align 8 ; CHECK-NEXT: ret void ; %ptr.0 = getelementptr [16 x <2 x i4>], ptr @vector_i4, i64 0, i64 1 diff --git a/llvm/test/Transforms/InferAlignment/propagate-assume.ll b/llvm/test/Transforms/InferAlignment/propagate-assume.ll index a5c7afa0393ba..8cf0cb35035ed 100644 --- a/llvm/test/Transforms/InferAlignment/propagate-assume.ll +++ b/llvm/test/Transforms/InferAlignment/propagate-assume.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s ; ------------------------------------------------------------------------------ ; Simple test @@ -12,8 +12,8 @@ define void @simple_forwardpropagate(ptr %a) { ; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 ; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: store i32 345, ptr [[A]], align 32 ; CHECK-NEXT: ret void ; %ptrint = ptrtoint ptr %a to i64 @@ -30,8 +30,8 @@ define void @simple_forwardpropagate(ptr %a) { define void @simple_backpropagate(ptr %a) { ; CHECK-LABEL: define void @simple_backpropagate ; CHECK-SAME: (ptr [[A:%.*]]) { -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: store i32 345, ptr [[A]], align 32 ; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr 
[[A]] to i64 ; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 ; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 @@ -53,8 +53,8 @@ define void @simple_forwardpropagate_bundle(ptr %a) { ; CHECK-LABEL: define void @simple_forwardpropagate_bundle ; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: store i32 345, ptr [[A]], align 32 ; CHECK-NEXT: ret void ; call void @llvm.assume(i1 true) ["align"(ptr %a, i32 32)] @@ -66,8 +66,8 @@ define void @simple_forwardpropagate_bundle(ptr %a) { define void @simple_backpropagate_bundle(ptr %a) { ; CHECK-LABEL: define void @simple_backpropagate_bundle ; CHECK-SAME: (ptr [[A:%.*]]) { -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: store i32 345, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: store i32 345, ptr [[A]], align 32 ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] ; CHECK-NEXT: ret void ; @@ -97,10 +97,10 @@ define void @loop_forwardpropagate(ptr %a, ptr %b) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] -; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[LOAD_B]], 1 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[I_NEXT]], 1648 ; CHECK-NEXT: br i1 [[CMP]], label 
[[FOR_BODY]], label [[FOR_END:%.*]] @@ -149,10 +149,10 @@ define void @loop_forwardpropagate_bundle(ptr %a, ptr %b) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] -; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[GEP_B]], align 64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[LOAD_B]], 1 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[I_NEXT]], 1648 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] @@ -190,11 +190,11 @@ define void @complex_backpropagate(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: define void @complex_backpropagate ; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 ; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 4 +; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 32 ; CHECK-NEXT: [[OBJ_SIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[C]], i1 false, i1 false, i1 false) -; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 8 ; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 ; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 @@ -222,11 +222,11 @@ define void @complex_backpropagate_bundle(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: define void @complex_backpropagate_bundle ; CHECK-SAME: (ptr [[A:%.*]], ptr 
[[B:%.*]], ptr [[C:%.*]]) { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 -; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load i32, ptr [[A]], align 32 ; CHECK-NEXT: [[LOAD_B:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 4 +; CHECK-NEXT: store i32 [[LOAD_B]], ptr [[A]], align 32 ; CHECK-NEXT: [[OBJ_SIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[C]], i1 false, i1 false, i1 false) -; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: store i64 [[OBJ_SIZE]], ptr [[ALLOCA]], align 8 ; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i32 32) ] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InferAlignment/ptrmask.ll b/llvm/test/Transforms/InferAlignment/ptrmask.ll index 1db2d09321648..52a8bcecba13d 100644 --- a/llvm/test/Transforms/InferAlignment/ptrmask.ll +++ b/llvm/test/Transforms/InferAlignment/ptrmask.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s ; ------------------------------------------------------------------------------ ; load instructions @@ -11,9 +11,9 @@ define void @load(ptr align 1 %ptr) { ; CHECK-NEXT: [[ALIGNED_0:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -2) ; CHECK-NEXT: [[ALIGNED_1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) ; CHECK-NEXT: [[ALIGNED_2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr [[ALIGNED_0]], align 1 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[ALIGNED_1]], align 1 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x i8>, ptr [[ALIGNED_2]], align 1 +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr [[ALIGNED_0]], align 2 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[ALIGNED_1]], align 4 +; 
CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x i8>, ptr [[ALIGNED_2]], align 8 ; CHECK-NEXT: ret void ; %aligned.0 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -2) @@ -37,9 +37,9 @@ define void @store(ptr align 1 %ptr) { ; CHECK-NEXT: [[ALIGNED_0:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -2) ; CHECK-NEXT: [[ALIGNED_1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) ; CHECK-NEXT: [[ALIGNED_2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_0]], align 1 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_1]], align 1 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_2]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_0]], align 2 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_1]], align 4 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED_2]], align 8 ; CHECK-NEXT: ret void ; %aligned.0 = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -2) @@ -62,8 +62,8 @@ define void @ptrmask_overaligned(ptr align 16 %ptr) { ; CHECK-LABEL: define void @ptrmask_overaligned ; CHECK-SAME: (ptr align 16 [[PTR:%.*]]) { ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED]], align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 16 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[ALIGNED]], align 16 ; CHECK-NEXT: ret void ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) diff --git a/llvm/test/Transforms/InferAlignment/undef-and-null.ll b/llvm/test/Transforms/InferAlignment/undef-and-null.ll index 76b751a4d411f..86f6d62eca9e7 100644 --- a/llvm/test/Transforms/InferAlignment/undef-and-null.ll +++ b/llvm/test/Transforms/InferAlignment/undef-and-null.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -passes=no-op-function -S < %s | FileCheck %s +; RUN: opt -passes=infer-alignment -S < %s | FileCheck %s define void @load_undef_null(ptr %P) { ; CHECK-LABEL: define void @load_undef_null ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: [[RET_0:%.*]] = load i32, ptr undef, align 4 -; CHECK-NEXT: [[RET_1:%.*]] = load i32, ptr null, align 4 +; CHECK-NEXT: [[RET_1:%.*]] = load i32, ptr null, align 4294967296 ; CHECK-NEXT: ret void ; %ret.0 = load i32, ptr undef @@ -17,7 +17,7 @@ define void @store_undef_null(ptr %P) { ; CHECK-LABEL: define void @store_undef_null ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: store i32 123, ptr undef, align 4 -; CHECK-NEXT: store i32 124, ptr null, align 4 +; CHECK-NEXT: store i32 124, ptr null, align 4294967296 ; CHECK-NEXT: ret void ; store i32 123, ptr undef diff --git a/llvm/test/Transforms/InferAlignment/vector.ll b/llvm/test/Transforms/InferAlignment/vector.ll index 1599b583f3244..e3dcfe346d7e7 100644 --- a/llvm/test/Transforms/InferAlignment/vector.ll +++ b/llvm/test/Transforms/InferAlignment/vector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s ; InferAlignment should be able to prove vector alignment in the ; presence of a few mild address computation tricks. 
@@ -12,8 +12,8 @@ define void @alloca(<2 x i64> %y) { ; CHECK-LABEL: define void @alloca ; CHECK-SAME: (<2 x i64> [[Y:%.*]]) { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x i64>, align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[ALLOCA]], align 1 -; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[ALLOCA]], align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[ALLOCA]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[ALLOCA]], align 16 ; CHECK-NEXT: ret void ; %alloca = alloca <2 x i64> @@ -31,8 +31,8 @@ define void @alloca(<2 x i64> %y) { define void @global(<2 x i64> %y) { ; CHECK-LABEL: define void @global ; CHECK-SAME: (<2 x i64> [[Y:%.*]]) { -; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr @x.vector, align 1 -; CHECK-NEXT: store <2 x i64> [[Y]], ptr @x.vector, align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr @x.vector, align 16 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr @x.vector, align 16 ; CHECK-NEXT: ret void ; %load = load <2 x i64>, ptr @x.vector, align 1 @@ -55,8 +55,8 @@ define void @vector_singular(i32 %i, <2 x i64> %y) { ; CHECK-LABEL: define void @vector_singular ; CHECK-SAME: (i32 [[I:%.*]], <2 x i64> [[Y:%.*]]) { ; CHECK-NEXT: [[GEP:%.*]] = getelementptr <2 x i64>, ptr @vector, i32 [[I]] -; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 1 -; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 16 ; CHECK-NEXT: ret void ; %gep = getelementptr <2 x i64>, ptr @vector, i32 %i @@ -73,8 +73,8 @@ define void @vector_array(i32 %i, i32 %j, <2 x i64> %y) { ; CHECK-LABEL: define void @vector_array ; CHECK-SAME: (i32 [[I:%.*]], i32 [[J:%.*]], <2 x i64> [[Y:%.*]]) { ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [13 x <2 x i64>], ptr @vector.arr, i32 [[I]], i32 [[J]] -; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 1 -; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 1 +; CHECK-NEXT: 
[[LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y]], ptr [[GEP]], align 16 ; CHECK-NEXT: ret void ; %gep = getelementptr [13 x <2 x i64>], ptr @vector.arr, i32 %i, i32 %j @@ -93,11 +93,11 @@ define void @vector_array(i32 %i, i32 %j, <2 x i64> %y) { define void @nonvector_array() { ; CHECK-LABEL: define void @nonvector_array() { -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr @x.array, align 1 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @x.array, align 1 +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x i8>, ptr @x.array, align 16 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @x.array, align 16 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr @x.array, i16 0, i16 2 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[GEP]], align 1 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[GEP]], align 1 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x i8>, ptr [[GEP]], align 8 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[GEP]], align 8 ; CHECK-NEXT: ret void ; %load.0 = load <16 x i8>, ptr @x.array, align 1 diff --git a/llvm/test/Transforms/InferAlignment/volatile.ll b/llvm/test/Transforms/InferAlignment/volatile.ll index f2991b6fc2d2d..88ff0d0faf922 100644 --- a/llvm/test/Transforms/InferAlignment/volatile.ll +++ b/llvm/test/Transforms/InferAlignment/volatile.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=no-op-function -S | FileCheck %s +; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s define void @load_volatile() { ; CHECK-LABEL: define void @load_volatile() { ; CHECK-NEXT: [[A:%.*]] = alloca { i32 }, align 8 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[LOAD_A:%.*]] = load volatile i32, ptr [[A]], align 4 +; CHECK-NEXT: [[LOAD_A:%.*]] = load volatile i32, ptr [[A]], align 8 ; CHECK-NEXT: [[LOAD_B:%.*]] = load volatile i32, ptr [[B]], align 4 ; CHECK-NEXT: ret void ; @@ -20,7 +20,7 @@ 
define void @store_volatile() { ; CHECK-LABEL: define void @store_volatile() { ; CHECK-NEXT: [[A:%.*]] = alloca { i32 }, align 8 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store volatile i32 123, ptr [[A]], align 4 +; CHECK-NEXT: store volatile i32 123, ptr [[A]], align 8 ; CHECK-NEXT: store volatile i32 123, ptr [[B]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InferAlignment/vscale.ll b/llvm/test/Transforms/InferAlignment/vscale.ll index 5152d50a6bb77..8abac031c6fa4 100644 --- a/llvm/test/Transforms/InferAlignment/vscale.ll +++ b/llvm/test/Transforms/InferAlignment/vscale.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -passes=no-op-function -S < %s | FileCheck %s +; RUN: opt -passes=infer-alignment -S < %s | FileCheck %s ; <4 x i32> -> 16 byte alignment define void @alignment_sustain(ptr align 16 %ptr) { @@ -23,8 +23,8 @@ define void @alignment_increase(ptr align 32 %ptr) { ; CHECK-LABEL: define void @alignment_increase ; CHECK-SAME: (ptr align 32 [[PTR:%.*]]) { ; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[PTR]], i32 3 -; CHECK-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[GEP]], align 16 -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[GEP]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[GEP]], align 32 +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[GEP]], align 32 ; CHECK-NEXT: ret void ; %gep = getelementptr , ptr %ptr, i32 3 From 0104f37f1626057a856b57f8acdd2b7407a8b01f Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Wed, 23 Aug 2023 16:08:09 +0530 Subject: [PATCH 46/57] [InstCombine] Use a cl::opt to control calls to getOrEnforceKnownAlignment in LoadInst and StoreInst This is in preparation for the InferAlignment pass which handles inferring alignment for instructions separately. 
It is better to handle this as a separate pass as inferring alignment is quite costly, and InstCombine running multiple times in the pass pipeline makes it even more so. Differential Revision: https://reviews.llvm.org/D158527 --- .../InstCombineLoadStoreAlloca.cpp | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e176d1bea25a0..3767ecd6539f3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -36,6 +36,8 @@ static cl::opt MaxCopiedFromConstantUsers( cl::desc("Maximum users to visit in copy from constant transform"), cl::Hidden); +extern cl::opt EnableInferAlignmentPass; + /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived) /// pointer to an alloca. Ignore any reads of the pointer, return false if we /// see any stores or other unknown uses. If we see pointer arithmetic, keep @@ -1048,11 +1050,13 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { if (Instruction *Res = combineLoadToOperationType(*this, LI)) return Res; - // Attempt to improve the alignment. - Align KnownAlign = getOrEnforceKnownAlignment( - Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT); - if (KnownAlign > LI.getAlign()) - LI.setAlignment(KnownAlign); + if (!EnableInferAlignmentPass) { + // Attempt to improve the alignment. + Align KnownAlign = getOrEnforceKnownAlignment( + Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT); + if (KnownAlign > LI.getAlign()) + LI.setAlignment(KnownAlign); + } // Replace GEP indices if possible. 
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) @@ -1445,11 +1449,13 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (combineStoreToValueType(*this, SI)) return eraseInstFromFunction(SI); - // Attempt to improve the alignment. - const Align KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT); - if (KnownAlign > SI.getAlign()) - SI.setAlignment(KnownAlign); + if (!EnableInferAlignmentPass) { + // Attempt to improve the alignment. + const Align KnownAlign = getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT); + if (KnownAlign > SI.getAlign()) + SI.setAlignment(KnownAlign); + } // Try to canonicalize the stored type. if (unpackStoreToAggregate(*this, SI)) From 3e992d81afc3925a8685eb15f794dd4a6ba3e97e Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Wed, 23 Aug 2023 17:43:34 +0530 Subject: [PATCH 47/57] [InferAlignment] Enable InferAlignment pass by default This gives an improvement of 0.6%: https://llvm-compile-time-tracker.com/compare.php?from=7d35fe6d08e2b9b786e1c8454cd2391463832167&to=0456c8e8a42be06b62ad4c3e3cf34b21f2633d1e&stat=instructions:u Differential Revision: https://reviews.llvm.org/D158600 --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Analysis/BasicAA/featuretest.ll | 2 +- llvm/test/Analysis/ValueTracking/assume.ll | 2 +- .../CodeGen/AMDGPU/implicit-arg-v5-opt.ll | 2 +- .../CodeGen/AMDGPU/reqd-work-group-size.ll | 4 +- llvm/test/Other/new-pm-defaults.ll | 2 + llvm/test/Other/new-pm-lto-defaults.ll | 2 + .../Other/new-pm-thinlto-postlink-defaults.ll | 2 + .../new-pm-thinlto-postlink-pgo-defaults.ll | 2 + ...-pm-thinlto-postlink-samplepgo-defaults.ll | 2 + .../InstCombine/2009-01-08-AlignAlloca.ll | 4 +- .../2009-02-20-InstCombine-SROA.ll | 60 +++++++------- .../Transforms/InstCombine/addrspacecast.ll | 2 +- .../Transforms/InstCombine/align-2d-gep.ll | 65 --------------- 
.../test/Transforms/InstCombine/align-addr.ll | 27 +++---- .../test/Transforms/InstCombine/align-attr.ll | 4 +- .../InstCombine/alloca-cast-debuginfo.ll | 4 +- llvm/test/Transforms/InstCombine/alloca.ll | 32 ++++++-- .../Transforms/InstCombine/apint-shift.ll | 2 +- .../Transforms/InstCombine/assume-align.ll | 4 +- .../InstCombine/assume-loop-align.ll | 4 +- llvm/test/Transforms/InstCombine/assume.ll | 13 ++- .../InstCombine/assume_inevitable.ll | 5 +- llvm/test/Transforms/InstCombine/atomic.ll | 22 ++--- .../constant-fold-address-space-pointer.ll | 8 +- .../InstCombine/constant-fold-gep.ll | 37 +++------ .../dbg-scalable-store-fixed-frag.ll | 12 +-- .../InstCombine/fcmp-denormals-are-zero.ll | 80 +++++++++---------- .../Transforms/InstCombine/fp-ret-bitcast.ll | 4 +- .../Transforms/InstCombine/gep-custom-dl.ll | 2 +- .../Transforms/InstCombine/getelementptr.ll | 8 +- llvm/test/Transforms/InstCombine/load-cmp.ll | 4 +- .../load-combine-metadata-dominance.ll | 2 +- llvm/test/Transforms/InstCombine/load.ll | 10 +-- .../InstCombine/loadstore-alignment.ll | 20 ++--- .../InstCombine/memcpy-from-global.ll | 4 +- .../merging-multiple-stores-into-successor.ll | 31 +++---- llvm/test/Transforms/InstCombine/phi.ll | 4 +- .../InstCombine/pr33689_same_bitwidth.ll | 4 +- llvm/test/Transforms/InstCombine/pr44552.ll | 2 +- llvm/test/Transforms/InstCombine/pr59613.ll | 2 +- .../InstCombine/scalable-cast-of-alloc.ll | 32 ++++---- llvm/test/Transforms/InstCombine/select.ll | 12 +-- llvm/test/Transforms/InstCombine/store.ll | 2 +- .../InstCombine/trivial-dse-calls.ll | 2 +- .../test/Transforms/InstCombine/vscale_gep.ll | 4 +- .../Transforms/LoopVectorize/X86/pr42674.ll | 4 +- .../LoopVectorize/X86/small-size.ll | 10 +-- .../LoopVectorize/multiple-address-spaces.ll | 4 +- .../Transforms/LoopVectorize/non-const-n.ll | 6 +- .../inlining-alignment-assumptions.ll | 2 +- 51 files changed, 260 insertions(+), 322 deletions(-) delete mode 100644 llvm/test/Transforms/InstCombine/align-2d-gep.ll 
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 529743cc8bd2e..29cf2f75fd6ca 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -276,7 +276,7 @@ cl::opt EnableMemProfContextDisambiguation( cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); cl::opt EnableInferAlignmentPass( - "enable-infer-alignment-pass", cl::init(false), cl::Hidden, cl::ZeroOrMore, + "enable-infer-alignment-pass", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable the InferAlignment pass, disabling alignment inference in " "InstCombine")); diff --git a/llvm/test/Analysis/BasicAA/featuretest.ll b/llvm/test/Analysis/BasicAA/featuretest.ll index f78fa7cf44eda..f556c95747a19 100644 --- a/llvm/test/Analysis/BasicAA/featuretest.ll +++ b/llvm/test/Analysis/BasicAA/featuretest.ll @@ -131,7 +131,7 @@ define i32 @gep_distance_test3(ptr %A) { ; ; USE_ASSUME-LABEL: @gep_distance_test3( ; USE_ASSUME-NEXT: [[C:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; USE_ASSUME-NEXT: store i8 42, ptr [[C]], align 4 +; USE_ASSUME-NEXT: store i8 42, ptr [[C]], align 1 ; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4), "nonnull"(ptr [[A]]), "align"(ptr [[A]], i64 4) ] ; USE_ASSUME-NEXT: ret i32 0 ; diff --git a/llvm/test/Analysis/ValueTracking/assume.ll b/llvm/test/Analysis/ValueTracking/assume.ll index a5533a4f1db10..cc098e1013832 100644 --- a/llvm/test/Analysis/ValueTracking/assume.ll +++ b/llvm/test/Analysis/ValueTracking/assume.ll @@ -100,7 +100,7 @@ define dso_local i32 @test4a(ptr readonly %0, i1 %cond) { ; CHECK: A: ; CHECK-NEXT: br i1 false, label [[TMP4:%.*]], label [[TMP2:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 ; CHECK-NEXT: br label [[TMP4]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP3]], [[TMP2]] ], [ poison, [[A]] ] diff 
--git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll index 7774677b09ef9..954218f339fa2 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll index f9f817302b6da..ecdc3845efc4b 100644 --- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll @@ -1,5 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s target datalayout = "n32" diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 016dfad98c69f..ad8fab8bad3b7 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -246,6 +246,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: 
InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass @@ -257,6 +258,7 @@ ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: SROAPass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-O-NEXT: Running pass: LCSSAPass diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index c444197e0db70..63ea58caa5b4a 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -116,6 +116,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo +; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo ; CHECK-O23SZ-NEXT: Running pass: SROAPass on foo @@ -128,6 +129,7 @@ ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo ; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo ; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo +; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 79010c3eb8080..46d00a083b92c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -173,6 +173,7 @@ ; CHECK-POSTLINK-O-NEXT: 
Running analysis: LoopAccessAnalysis on foo ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass @@ -184,6 +185,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-POSTLINK-O-NEXT: Running pass: SROAPass +; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index bfa3ed6e4b757..d0e1b8fb9ab30 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -160,6 +160,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass @@ -171,6 +172,7 @@ ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: SROAPass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-O-NEXT: Running pass: LCSSAPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 004ec790e9847..4b033ad238e2f 100644 --- 
a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass @@ -178,6 +179,7 @@ ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: SROAPass +; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-O-NEXT: Running pass: LCSSAPass diff --git a/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll b/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll index 00bce165efa2a..dee71b2290acf 100644 --- a/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll +++ b/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll @@ -12,10 +12,10 @@ define i32 @bar(i64 %key_token2) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IOSPEC:%.*]] = alloca [[STRUCT_KEY:%.*]], align 8 ; CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 0, ptr [[IOSPEC]], align 8 +; CHECK-NEXT: store i32 0, ptr [[IOSPEC]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i32, i32 }, ptr [[IOSPEC]], i32 0, i32 1 ; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4 -; CHECK-NEXT: store i64 [[KEY_TOKEN2:%.*]], ptr [[IOSPEC]], align 8 +; CHECK-NEXT: store i64 [[KEY_TOKEN2:%.*]], ptr [[IOSPEC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) 
@foo(ptr nonnull byval([[STRUCT_KEY]]) align 4 [[IOSPEC]], ptr nonnull [[RET]]) #[[ATTR0:[0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[RET]], align 4 ; CHECK-NEXT: ret i32 [[TMP2]] diff --git a/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll b/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll index 4316018bb4e37..5ed8d9507ca78 100644 --- a/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll +++ b/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll @@ -27,59 +27,59 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: [[TMP1:%.*]] = getelementptr %"struct.std::_Vector_base >::_Vector_impl", ptr [[X:%.*]], i32 0, i32 1 ; IC-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 ; IC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[X]], align 4 -; IC-NEXT: store ptr [[TMP3]], ptr [[__FIRST_ADDR_I_I]], align 8 -; IC-NEXT: store ptr [[TMP2]], ptr [[__LAST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP3]], ptr [[__FIRST_ADDR_I_I]], align 4 +; IC-NEXT: store ptr [[TMP2]], ptr [[__LAST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP2]] to i32 ; IC-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP3]] to i32 ; IC-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[TMP5]] ; IC-NEXT: [[TMP7:%.*]] = ashr i32 [[TMP6]], 4 ; IC-NEXT: br label [[BB12_I_I:%.*]] ; IC: bb.i.i: -; IC-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 ; IC-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP9]], [[TMP10]] ; IC-NEXT: br i1 [[TMP11]], label [[BB1_I_I:%.*]], label [[BB2_I_I:%.*]] ; IC: bb1.i.i: -; IC-NEXT: [[TMP12:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP12:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT:%.*]] ; IC: 
bb2.i.i: -; IC-NEXT: [[TMP13:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP13:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 1 -; IC-NEXT: store ptr [[TMP14]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP14]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 ; IC-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] ; IC-NEXT: br i1 [[TMP17]], label [[BB4_I_I:%.*]], label [[BB5_I_I:%.*]] ; IC: bb4.i.i: -; IC-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb5.i.i: -; IC-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 1 -; IC-NEXT: store ptr [[TMP20]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP20]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 ; IC-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], [[TMP22]] ; IC-NEXT: br i1 [[TMP23]], label [[BB7_I_I:%.*]], label [[BB8_I_I:%.*]] ; IC: bb7.i.i: -; IC-NEXT: [[TMP24:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP24:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb8.i.i: -; IC-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP25]], i32 1 -; IC-NEXT: store ptr 
[[TMP26]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP26]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 ; IC-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], [[TMP28]] ; IC-NEXT: br i1 [[TMP29]], label [[BB10_I_I:%.*]], label [[BB11_I_I:%.*]] ; IC: bb10.i.i: -; IC-NEXT: [[TMP30:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP30:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb11.i.i: -; IC-NEXT: [[TMP31:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP31:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 1 -; IC-NEXT: store ptr [[TMP32]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP32]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP33:%.*]] = add i32 [[__TRIP_COUNT_0_I_I:%.*]], -1 ; IC-NEXT: br label [[BB12_I_I]] ; IC: bb12.i.i: @@ -87,9 +87,9 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[__TRIP_COUNT_0_I_I]], 0 ; IC-NEXT: br i1 [[TMP34]], label [[BB_I_I:%.*]], label [[BB13_I_I:%.*]] ; IC: bb13.i.i: -; IC-NEXT: [[TMP35:%.*]] = load ptr, ptr [[__LAST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP35:%.*]] = load ptr, ptr [[__LAST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[TMP35]] to i32 -; IC-NEXT: [[TMP37:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP37:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i32 ; IC-NEXT: [[TMP39:%.*]] = sub i32 [[TMP36]], [[TMP38]] ; IC-NEXT: [[TMP40:%.*]] = ashr i32 [[TMP39]], 2 @@ -99,49 +99,49 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: i32 3, label [[BB14_I_I:%.*]] ; IC-NEXT: ] ; IC: bb14.i.i: -; IC-NEXT: [[TMP41:%.*]] = 
load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP41:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 ; IC-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP42]], [[TMP43]] ; IC-NEXT: br i1 [[TMP44]], label [[BB16_I_I:%.*]], label [[BB17_I_I:%.*]] ; IC: bb16.i.i: -; IC-NEXT: [[TMP45:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP45:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb17.i.i: -; IC-NEXT: [[TMP46:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP46:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 1 -; IC-NEXT: store ptr [[TMP47]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP47]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB18_I_I]] ; IC: bb18.i.i: -; IC-NEXT: [[TMP48:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP48:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 ; IC-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP49]], [[TMP50]] ; IC-NEXT: br i1 [[TMP51]], label [[BB20_I_I:%.*]], label [[BB21_I_I:%.*]] ; IC: bb20.i.i: -; IC-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb21.i.i: -; IC-NEXT: [[TMP53:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP53:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP53]], i32 1 -; IC-NEXT: store ptr [[TMP54]], ptr [[__FIRST_ADDR_I_I]], 
align 8 +; IC-NEXT: store ptr [[TMP54]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB22_I_I]] ; IC: bb22.i.i: -; IC-NEXT: [[TMP55:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP55:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 ; IC-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP0]], align 4 ; IC-NEXT: [[TMP58:%.*]] = icmp eq i32 [[TMP56]], [[TMP57]] ; IC-NEXT: br i1 [[TMP58]], label [[BB24_I_I:%.*]], label [[BB25_I_I:%.*]] ; IC: bb24.i.i: -; IC-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb25.i.i: -; IC-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[TMP60]], i32 1 -; IC-NEXT: store ptr [[TMP61]], ptr [[__FIRST_ADDR_I_I]], align 8 +; IC-NEXT: store ptr [[TMP61]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB26_I_I]] ; IC: bb26.i.i: -; IC-NEXT: [[TMP62:%.*]] = load ptr, ptr [[__LAST_ADDR_I_I]], align 8 +; IC-NEXT: [[TMP62:%.*]] = load ptr, ptr [[__LAST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: _ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit: ; IC-NEXT: [[DOT0_0_I_I:%.*]] = phi ptr [ [[TMP62]], [[BB26_I_I]] ], [ [[TMP59]], [[BB24_I_I]] ], [ [[TMP52]], [[BB20_I_I]] ], [ [[TMP45]], [[BB16_I_I]] ], [ [[TMP30]], [[BB10_I_I]] ], [ [[TMP24]], [[BB7_I_I]] ], [ [[TMP18]], [[BB4_I_I]] ], [ [[TMP12]], [[BB1_I_I]] ] diff --git a/llvm/test/Transforms/InstCombine/addrspacecast.ll b/llvm/test/Transforms/InstCombine/addrspacecast.ll index 6b48cfb8fc4a5..cbb88b9a09c93 100644 --- 
a/llvm/test/Transforms/InstCombine/addrspacecast.ll +++ b/llvm/test/Transforms/InstCombine/addrspacecast.ll @@ -173,7 +173,7 @@ end: define void @constant_fold_null() #0 { ; CHECK-LABEL: @constant_fold_null( -; CHECK-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4294967296 +; CHECK-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) null to ptr addrspace(4) diff --git a/llvm/test/Transforms/InstCombine/align-2d-gep.ll b/llvm/test/Transforms/InstCombine/align-2d-gep.ll deleted file mode 100644 index a2606fba2bf83..0000000000000 --- a/llvm/test/Transforms/InstCombine/align-2d-gep.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s -target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" - -; A multi-dimensional array in a nested loop doing vector stores that -; aren't yet aligned. Instcombine can understand the addressing in the -; Nice case to prove 16 byte alignment. In the Awkward case, the inner -; array dimension is not even, so the stores to it won't always be -; aligned. Instcombine should prove alignment in exactly one of the two -; stores. 
- -@Nice = global [1001 x [20000 x double]] zeroinitializer, align 32 -@Awkward = global [1001 x [20001 x double]] zeroinitializer, align 32 - -define void @foo() nounwind { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BB7_OUTER:%.*]] -; CHECK: bb7.outer: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT26:%.*]], [[BB11:%.*]] ] -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[BB7_OUTER]] ], [ [[INDVAR_NEXT:%.*]], [[BB1]] ] -; CHECK-NEXT: [[T4:%.*]] = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 [[I]], i64 [[J]] -; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[T4]], align 16 -; CHECK-NEXT: [[S4:%.*]] = getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 [[I]], i64 [[J]] -; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[S4]], align 8 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[J]], 2 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 556 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB11]], label [[BB1]] -; CHECK: bb11: -; CHECK-NEXT: [[INDVAR_NEXT26]] = add i64 [[I]], 1 -; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i64 [[INDVAR_NEXT26]], 991 -; CHECK-NEXT: br i1 [[EXITCOND27]], label [[RETURN_SPLIT:%.*]], label [[BB7_OUTER]] -; CHECK: return.split: -; CHECK-NEXT: ret void -; -entry: - br label %bb7.outer - -bb7.outer: - %i = phi i64 [ 0, %entry ], [ %indvar.next26, %bb11 ] - br label %bb1 - -bb1: - %j = phi i64 [ 0, %bb7.outer ], [ %indvar.next, %bb1 ] - - %t4 = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 %i, i64 %j - store <2 x double>, ptr %t4, align 8 - - %s4 = getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 %i, i64 %j - store <2 x double>, ptr %s4, align 8 - - %indvar.next = add i64 %j, 2 - %exitcond = icmp eq i64 %indvar.next, 556 - br i1 %exitcond, label %bb11, label %bb1 - -bb11: - %indvar.next26 = add i64 %i, 1 - %exitcond27 = icmp eq i64 %indvar.next26, 991 - br i1 %exitcond27, 
label %return.split, label %bb7.outer - -return.split: - ret void -} diff --git a/llvm/test/Transforms/InstCombine/align-addr.ll b/llvm/test/Transforms/InstCombine/align-addr.ll index d02f4ef800ef9..23f620310d7c2 100644 --- a/llvm/test/Transforms/InstCombine/align-addr.ll +++ b/llvm/test/Transforms/InstCombine/align-addr.ll @@ -2,9 +2,6 @@ ; RUN: opt < %s -passes=instcombine -S | FileCheck %s target datalayout = "E-p:64:64:64-p1:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" -; Instcombine should be able to prove vector alignment in the -; presence of a few mild address computation tricks. - define void @test0(ptr %b, i64 %n, i64 %u, i64 %y) nounwind { ; CHECK-LABEL: @test0( ; CHECK-NEXT: entry: @@ -20,7 +17,7 @@ define void @test0(ptr %b, i64 %n, i64 %u, i64 %y) nounwind { ; CHECK-NEXT: [[J:%.*]] = mul i64 [[I]], [[V]] ; CHECK-NEXT: [[H:%.*]] = add i64 [[J]], [[Z]] ; CHECK-NEXT: [[T8:%.*]] = getelementptr double, ptr [[E]], i64 [[H]] -; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[T8]], align 16 +; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[T8]], align 8 ; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN]], label [[BB]] @@ -58,7 +55,7 @@ return: define <16 x i8> @test1(<2 x i64> %x) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr @GLOBAL, align 16 +; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr @GLOBAL, align 1 ; CHECK-NEXT: ret <16 x i8> [[TMP]] ; entry: @@ -70,7 +67,7 @@ entry: define <16 x i8> @test1_as1(<2 x i64> %x) { ; CHECK-LABEL: @test1_as1( -; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) @GLOBAL_as1, align 16 +; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) @GLOBAL_as1, align 1 ; CHECK-NEXT: ret <16 x i8> [[TMP]] ; %tmp = load <16 x i8>, ptr addrspace(1) @GLOBAL_as1, align 1 @@ -81,7 +78,7 @@ 
define <16 x i8> @test1_as1(<2 x i64> %x) { define <16 x i8> @test1_as1_gep(<2 x i64> %x) { ; CHECK-LABEL: @test1_as1_gep( -; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) getelementptr inbounds ([8 x i32], ptr addrspace(1) @GLOBAL_as1_gep, i32 0, i32 4), align 16 +; CHECK-NEXT: [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) getelementptr inbounds ([8 x i32], ptr addrspace(1) @GLOBAL_as1_gep, i32 0, i32 4), align 1 ; CHECK-NEXT: ret <16 x i8> [[TMP]] ; %tmp = load <16 x i8>, ptr addrspace(1) getelementptr ([8 x i32], ptr addrspace(1) @GLOBAL_as1_gep, i16 0, i16 4), align 1 @@ -138,7 +135,7 @@ define <16 x i8> @ptrmask_align_unknown_ptr_align1(ptr align 1 %ptr, i64 %mask) define <16 x i8> @ptrmask_align_unknown_ptr_align8(ptr align 8 %ptr, i64 %mask) { ; CHECK-LABEL: @ptrmask_align_unknown_ptr_align8( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 [[MASK:%.*]]) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask) @@ -150,7 +147,7 @@ define <16 x i8> @ptrmask_align_unknown_ptr_align8(ptr align 8 %ptr, i64 %mask) define <16 x i8> @ptrmask_align2_ptr_align1(ptr align 1 %ptr) { ; CHECK-LABEL: @ptrmask_align2_ptr_align1( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -2) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 2 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -2) @@ -162,7 +159,7 @@ define <16 x i8> @ptrmask_align2_ptr_align1(ptr align 1 %ptr) { define <16 x i8> @ptrmask_align4_ptr_align1(ptr align 1 %ptr) { ; CHECK-LABEL: @ptrmask_align4_ptr_align1( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CHECK-NEXT: 
[[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -4) @@ -174,7 +171,7 @@ define <16 x i8> @ptrmask_align4_ptr_align1(ptr align 1 %ptr) { define <16 x i8> @ptrmask_align8_ptr_align1(ptr align 1 %ptr) { ; CHECK-LABEL: @ptrmask_align8_ptr_align1( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) @@ -187,7 +184,7 @@ define <16 x i8> @ptrmask_align8_ptr_align1(ptr align 1 %ptr) { define <16 x i8> @ptrmask_align8_ptr_align8(ptr align 8 %ptr) { ; CHECK-LABEL: @ptrmask_align8_ptr_align8( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) @@ -200,7 +197,7 @@ define <16 x i8> @ptrmask_align8_ptr_align8(ptr align 8 %ptr) { define <16 x i8> @ptrmask_align8_ptr_align16(ptr align 16 %ptr) { ; CHECK-LABEL: @ptrmask_align8_ptr_align16( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 -8) @@ -213,7 +210,7 @@ define <16 x i8> @ptrmask_align8_ptr_align16(ptr align 16 %ptr) { define <16 x i8> @ptrmask_align8_ptr_align1_smallmask(ptr align 1 %ptr) { ; CHECK-LABEL: @ptrmask_align8_ptr_align1_smallmask( ; 
CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i32(ptr %ptr, i32 -8) @@ -226,7 +223,7 @@ define <16 x i8> @ptrmask_align8_ptr_align1_smallmask(ptr align 1 %ptr) { define <16 x i8> @ptrmask_align8_ptr_align1_bigmask(ptr align 1 %ptr) { ; CHECK-LABEL: @ptrmask_align8_ptr_align1_bigmask( ; CHECK-NEXT: [[ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i128(ptr [[PTR:%.*]], i128 -8) -; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ALIGNED]], align 1 ; CHECK-NEXT: ret <16 x i8> [[LOAD]] ; %aligned = call ptr @llvm.ptrmask.p0.i128(ptr %ptr, i128 -8) diff --git a/llvm/test/Transforms/InstCombine/align-attr.ll b/llvm/test/Transforms/InstCombine/align-attr.ll index 687aa604a4438..e7b17e72a8171 100644 --- a/llvm/test/Transforms/InstCombine/align-attr.ll +++ b/llvm/test/Transforms/InstCombine/align-attr.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" define i32 @foo1(ptr align 32 %a) #0 { ; CHECK-LABEL: @foo1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: @@ -20,7 +20,7 @@ define i32 @foo2(ptr align 32 %a) #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = call ptr @func1(ptr [[A:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll b/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll index 6b7725098535e..9c0f7ec04d4a2 100644 --- a/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll +++ 
b/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll @@ -26,10 +26,10 @@ target triple = "x86_64-pc-windows-msvc19.11.25508" define void @f(ptr %p) !dbg !11 { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8 +; CHECK-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 ; CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[LOCAL]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 8, !dbg [[DBG24:![0-9]+]], !tbaa [[TBAA25:![0-9]+]] -; CHECK-NEXT: store i64 [[TMP0]], ptr [[LOCAL]], align 8, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA25]] +; CHECK-NEXT: store i64 [[TMP0]], ptr [[LOCAL]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA25]] ; CHECK-NEXT: call void @escape(ptr nonnull [[LOCAL]]), !dbg [[DBG30:![0-9]+]] ; CHECK-NEXT: ret void, !dbg [[DBG31:![0-9]+]] ; diff --git a/llvm/test/Transforms/InstCombine/alloca.ll b/llvm/test/Transforms/InstCombine/alloca.ll index 24129b0a1986d..a64de28ee8397 100644 --- a/llvm/test/Transforms/InstCombine/alloca.ll +++ b/llvm/test/Transforms/InstCombine/alloca.ll @@ -132,7 +132,7 @@ define void @test6() { ; NODL-NEXT: entry: ; NODL-NEXT: [[A:%.*]] = alloca { i32 }, align 8 ; NODL-NEXT: [[B:%.*]] = alloca i32, align 4 -; NODL-NEXT: store volatile i32 123, ptr [[A]], align 8 +; NODL-NEXT: store volatile i32 123, ptr [[A]], align 4 ; NODL-NEXT: tail call void @f(ptr nonnull [[B]]) ; NODL-NEXT: ret void ; @@ -186,13 +186,29 @@ declare ptr @llvm.stacksave() declare void @llvm.stackrestore(ptr) define void @test9(ptr %a) { -; ALL-LABEL: @test9( -; ALL-NEXT: entry: -; ALL-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_TYPE:%.*]] }>, align 8 -; ALL-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 4 -; ALL-NEXT: store i64 [[TMP0]], ptr [[ARGMEM]], align 8 -; ALL-NEXT: call void @test9_aux(ptr nonnull inalloca(<{ [[STRUCT_TYPE]] }>) [[ARGMEM]]) -; ALL-NEXT: ret void +; CHECK-LABEL: 
@test9( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_TYPE:%.*]] }>, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: store i64 [[TMP0]], ptr [[ARGMEM]], align 4 +; CHECK-NEXT: call void @test9_aux(ptr nonnull inalloca(<{ [[STRUCT_TYPE]] }>) [[ARGMEM]]) +; CHECK-NEXT: ret void +; +; P32-LABEL: @test9( +; P32-NEXT: entry: +; P32-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_TYPE:%.*]] }>, align 1 +; P32-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 4 +; P32-NEXT: store i64 [[TMP0]], ptr [[ARGMEM]], align 4 +; P32-NEXT: call void @test9_aux(ptr nonnull inalloca(<{ [[STRUCT_TYPE]] }>) [[ARGMEM]]) +; P32-NEXT: ret void +; +; NODL-LABEL: @test9( +; NODL-NEXT: entry: +; NODL-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_TYPE:%.*]] }>, align 8 +; NODL-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 4 +; NODL-NEXT: store i64 [[TMP0]], ptr [[ARGMEM]], align 8 +; NODL-NEXT: call void @test9_aux(ptr nonnull inalloca(<{ [[STRUCT_TYPE]] }>) [[ARGMEM]]) +; NODL-NEXT: ret void ; entry: %inalloca.save = call ptr @llvm.stacksave() diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll index cbe8ed993b400..2d862ff6debd1 100644 --- a/llvm/test/Transforms/InstCombine/apint-shift.ll +++ b/llvm/test/Transforms/InstCombine/apint-shift.ll @@ -565,7 +565,7 @@ define i40 @test26(i40 %A) { define i177 @ossfuzz_9880(i177 %X) { ; CHECK-LABEL: @ossfuzz_9880( ; CHECK-NEXT: [[A:%.*]] = alloca i177, align 8 -; CHECK-NEXT: [[L1:%.*]] = load i177, ptr [[A]], align 8 +; CHECK-NEXT: [[L1:%.*]] = load i177, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i177 [[L1]], -1 ; CHECK-NEXT: [[B5_NEG:%.*]] = sext i1 [[TMP1]] to i177 ; CHECK-NEXT: [[B14:%.*]] = add i177 [[L1]], [[B5_NEG]] diff --git a/llvm/test/Transforms/InstCombine/assume-align.ll b/llvm/test/Transforms/InstCombine/assume-align.ll index 80da69b2da3ed..798707f317d29 100644 --- 
a/llvm/test/Transforms/InstCombine/assume-align.ll +++ b/llvm/test/Transforms/InstCombine/assume-align.ll @@ -56,10 +56,10 @@ define void @f2(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: store i64 16, ptr [[TMP0]], align 8 +; CHECK-NEXT: store i64 16, ptr [[TMP0]], align 4 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: if.else: -; CHECK-NEXT: store i8 1, ptr [[TMP0]], align 8 +; CHECK-NEXT: store i8 1, ptr [[TMP0]], align 1 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/assume-loop-align.ll b/llvm/test/Transforms/InstCombine/assume-loop-align.ll index 24007bacd31ad..e7eb18c61b6bb 100644 --- a/llvm/test/Transforms/InstCombine/assume-loop-align.ll +++ b/llvm/test/Transforms/InstCombine/assume-loop-align.ll @@ -22,10 +22,10 @@ define void @foo(ptr %a, ptr %b) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 64 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 64 +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1648 diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index ee6d86610274b..52ce90e0515a5 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ 
b/llvm/test/Transforms/InstCombine/assume.ll @@ -7,12 +7,11 @@ target triple = "x86_64-unknown-linux-gnu" declare void @llvm.assume(i1) #1 -; Check that the alignment has been upgraded and that the assume has not -; been removed: +; Check that the assume has not been removed: define i32 @foo1(ptr %a) #0 { ; DEFAULT-LABEL: @foo1( -; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 32 +; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 ; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 ; DEFAULT-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 @@ -20,7 +19,7 @@ define i32 @foo1(ptr %a) #0 { ; DEFAULT-NEXT: ret i32 [[T0]] ; ; BUNDLES-LABEL: @foo1( -; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 32 +; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ] ; BUNDLES-NEXT: ret i32 [[T0]] ; @@ -40,12 +39,12 @@ define i32 @foo2(ptr %a) #0 { ; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 ; DEFAULT-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 ; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A]], align 32 +; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A]], align 4 ; DEFAULT-NEXT: ret i32 [[T0]] ; ; BUNDLES-LABEL: @foo2( ; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A:%.*]], i64 32) ] -; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A]], align 32 +; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A]], align 4 ; BUNDLES-NEXT: ret i32 [[T0]] ; %ptrint = ptrtoint ptr %a to i64 @@ -266,7 +265,7 @@ define i32 @bundle2(ptr %P) { define i1 @nonnull1(ptr %a) { ; CHECK-LABEL: @nonnull1( -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A:%.*]], align 8, !nonnull [[META6:![0-9]+]], !noundef [[META6]] +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A:%.*]], align 8, !nonnull !6, !noundef !6 ; CHECK-NEXT: tail call void 
@escape(ptr nonnull [[LOAD]]) ; CHECK-NEXT: ret i1 false ; diff --git a/llvm/test/Transforms/InstCombine/assume_inevitable.ll b/llvm/test/Transforms/InstCombine/assume_inevitable.ll index b86b84dba5232..2643c9b525cb5 100644 --- a/llvm/test/Transforms/InstCombine/assume_inevitable.ll +++ b/llvm/test/Transforms/InstCombine/assume_inevitable.ll @@ -3,15 +3,14 @@ ; Check that assume is propagated backwards through all ; operations that are `isGuaranteedToTransferExecutionToSuccessor` -; (it should reach the load and mark it as `align 32`). define i32 @assume_inevitable(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @assume_inevitable( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[M:%.*]] = alloca i64, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[LOADRES:%.*]] = load i32, ptr [[B:%.*]], align 4 ; CHECK-NEXT: [[LOADRES2:%.*]] = call i32 @llvm.annotation.i32.p0(i32 [[LOADRES]], ptr nonnull @.str, ptr nonnull @.str1, i32 2) -; CHECK-NEXT: store i32 [[LOADRES2]], ptr [[A]], align 32 +; CHECK-NEXT: store i32 [[LOADRES2]], ptr [[A]], align 4 ; CHECK-NEXT: [[DUMMY_EQ:%.*]] = icmp ugt i32 [[LOADRES]], 42 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[DUMMY_EQ]]) ; CHECK-NEXT: [[M_A:%.*]] = call ptr @llvm.ptr.annotation.p0.p0(ptr nonnull [[M]], ptr nonnull @.str, ptr nonnull @.str1, i32 2, ptr null) diff --git a/llvm/test/Transforms/InstCombine/atomic.ll b/llvm/test/Transforms/InstCombine/atomic.ll index e4a3d0f8e1a77..75ee53982dfe1 100644 --- a/llvm/test/Transforms/InstCombine/atomic.ll +++ b/llvm/test/Transforms/InstCombine/atomic.ll @@ -128,7 +128,7 @@ define i32 @test9() { define i32 @test9_no_null_opt() #0 { ; CHECK-LABEL: @test9_no_null_opt( -; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null unordered, align 4294967296 +; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null unordered, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load atomic i32, ptr null unordered, align 4 @@ -138,7 
+138,7 @@ define i32 @test9_no_null_opt() #0 { ; FIXME: Could also fold define i32 @test10() { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null monotonic, align 4294967296 +; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null monotonic, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load atomic i32, ptr null monotonic, align 4 @@ -147,7 +147,7 @@ define i32 @test10() { define i32 @test10_no_null_opt() #0 { ; CHECK-LABEL: @test10_no_null_opt( -; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null monotonic, align 4294967296 +; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null monotonic, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load atomic i32, ptr null monotonic, align 4 @@ -157,7 +157,7 @@ define i32 @test10_no_null_opt() #0 { ; Would this be legal to fold? Probably? define i32 @test11() { ; CHECK-LABEL: @test11( -; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null seq_cst, align 4294967296 +; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null seq_cst, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load atomic i32, ptr null seq_cst, align 4 @@ -166,7 +166,7 @@ define i32 @test11() { define i32 @test11_no_null_opt() #0 { ; CHECK-LABEL: @test11_no_null_opt( -; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null seq_cst, align 4294967296 +; CHECK-NEXT: [[X:%.*]] = load atomic i32, ptr null seq_cst, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load atomic i32, ptr null seq_cst, align 4 @@ -177,7 +177,7 @@ define i32 @test11_no_null_opt() #0 { ; ordering imposed. 
define i32 @test12() { ; CHECK-LABEL: @test12( -; CHECK-NEXT: store atomic i32 poison, ptr null unordered, align 4294967296 +; CHECK-NEXT: store atomic i32 poison, ptr null unordered, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null unordered, align 4 @@ -186,7 +186,7 @@ define i32 @test12() { define i32 @test12_no_null_opt() #0 { ; CHECK-LABEL: @test12_no_null_opt( -; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null unordered, align 4 @@ -196,7 +196,7 @@ define i32 @test12_no_null_opt() #0 { ; FIXME: Could also fold define i32 @test13() { ; CHECK-LABEL: @test13( -; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null monotonic, align 4 @@ -205,7 +205,7 @@ define i32 @test13() { define i32 @test13_no_null_opt() #0 { ; CHECK-LABEL: @test13_no_null_opt( -; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null monotonic, align 4 @@ -215,7 +215,7 @@ define i32 @test13_no_null_opt() #0 { ; Would this be legal to fold? Probably? 
define i32 @test14() { ; CHECK-LABEL: @test14( -; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null seq_cst, align 4 @@ -224,7 +224,7 @@ define i32 @test14() { define i32 @test14_no_null_opt() #0 { ; CHECK-LABEL: @test14_no_null_opt( -; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4294967296 +; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4 ; CHECK-NEXT: ret i32 0 ; store atomic i32 0, ptr null seq_cst, align 4 diff --git a/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll b/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll index e59f7529c6722..30d5cd66066bb 100644 --- a/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll +++ b/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll @@ -185,7 +185,7 @@ define i32 @constant_fold_bitcast_itof_load() { define <4 x float> @constant_fold_bitcast_vector_as() { ; CHECK-LABEL: @constant_fold_bitcast_vector_as( -; CHECK-NEXT: [[A:%.*]] = load <4 x float>, ptr addrspace(3) @g_v4f_as3, align 16 +; CHECK-NEXT: [[A:%.*]] = load <4 x float>, ptr addrspace(3) @g_v4f_as3, align 4 ; CHECK-NEXT: ret <4 x float> [[A]] ; %a = load <4 x float>, ptr addrspace(3) @g_v4f_as3, align 4 @@ -196,7 +196,7 @@ define <4 x float> @constant_fold_bitcast_vector_as() { define i32 @test_cast_gep_small_indices_as() { ; CHECK-LABEL: @test_cast_gep_small_indices_as( -; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(3) @i32_array_as3, align 16 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(3) @i32_array_as3, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load i32, ptr addrspace(3) @i32_array_as3, align 4 @@ -214,7 +214,7 @@ define i32 @test_cast_gep_small_indices_as() { define i32 @test_cast_gep_large_indices_as() { ; CHECK-LABEL: @test_cast_gep_large_indices_as( -; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(3) 
@i32_array_as3, align 16 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(3) @i32_array_as3, align 4 ; CHECK-NEXT: ret i32 [[X]] ; %x = load i32, ptr addrspace(3) @i32_array_as3, align 4 @@ -223,7 +223,7 @@ define i32 @test_cast_gep_large_indices_as() { define i32 @test_constant_cast_gep_struct_indices_as() { ; CHECK-LABEL: @test_constant_cast_gep_struct_indices_as( -; CHECK-NEXT: [[Y:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_FOO:%.*]], ptr addrspace(3) @constant_fold_global_ptr, i16 0, i32 2, i16 2), align 16 +; CHECK-NEXT: [[Y:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_FOO:%.*]], ptr addrspace(3) @constant_fold_global_ptr, i16 0, i32 2, i16 2), align 4 ; CHECK-NEXT: ret i32 [[Y]] ; %x = getelementptr %struct.foo, ptr addrspace(3) @constant_fold_global_ptr, i18 0, i32 2, i12 2 diff --git a/llvm/test/Transforms/InstCombine/constant-fold-gep.ll b/llvm/test/Transforms/InstCombine/constant-fold-gep.ll index 80a18c0abfadc..009c19dfa66cf 100644 --- a/llvm/test/Transforms/InstCombine/constant-fold-gep.ll +++ b/llvm/test/Transforms/InstCombine/constant-fold-gep.ll @@ -11,26 +11,26 @@ target datalayout = "E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32- define void @frob() { ; CHECK-LABEL: @frob( -; CHECK-NEXT: store i32 1, ptr @Y, align 16 +; CHECK-NEXT: store i32 1, ptr @Y, align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 0, i64 2), align 8 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 0, i64 2), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 0), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 1), align 16 +; CHECK-NEXT: store i32 1, ptr getelementptr 
inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 1), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 2), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 0), align 8 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 0), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 2), align 16 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 2), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 0), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 1), align 8 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 1), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 2), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 0), align 16 +; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 0), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 1), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 2), align 8 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 0), align 4 -; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 1), align 16 +; CHECK-NEXT: store i32 1, ptr 
getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 1), align 8 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 2), align 4 ; CHECK-NEXT: store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 1, i64 0, i32 0, i64 0), align 8 -; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %struct.X], ptr @Y, i64 2, i64 0, i32 0, i64 0), align 16 +; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %struct.X], ptr @Y, i64 2, i64 0, i32 0, i64 0), align 8 ; CHECK-NEXT: store i32 1, ptr getelementptr ([3 x %struct.X], ptr @Y, i64 1, i64 0, i32 0, i64 1), align 8 ; CHECK-NEXT: ret void ; @@ -97,25 +97,6 @@ entry: ret i16 %E } -; Check that we improve the alignment information. -; The base pointer is 16-byte aligned and we access the field at -; an offset of 8-byte. -; Every element in the @CallerInfos array is 16-byte aligned so -; any access from the following gep is 8-byte aligned. -%struct.CallerInfo = type { ptr, i32 } -@CallerInfos = global [128 x %struct.CallerInfo] zeroinitializer, align 16 - -define i32 @test_gep_in_struct(i64 %idx) { -; CHECK-LABEL: @test_gep_in_struct( -; CHECK-NEXT: [[NS7:%.*]] = getelementptr inbounds [128 x %struct.CallerInfo], ptr @CallerInfos, i64 0, i64 [[IDX:%.*]], i32 1 -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[NS7]], align 8 -; CHECK-NEXT: ret i32 [[RES]] -; - %NS7 = getelementptr inbounds [128 x %struct.CallerInfo], ptr @CallerInfos, i64 0, i64 %idx, i32 1 - %res = load i32, ptr %NS7, align 1 - ret i32 %res -} - @g = external global i8 @g2 = external global i8 diff --git a/llvm/test/Transforms/InstCombine/dbg-scalable-store-fixed-frag.ll b/llvm/test/Transforms/InstCombine/dbg-scalable-store-fixed-frag.ll index 5307ebe399379..a8a7ee4608f65 100644 --- a/llvm/test/Transforms/InstCombine/dbg-scalable-store-fixed-frag.ll +++ b/llvm/test/Transforms/InstCombine/dbg-scalable-store-fixed-frag.ll @@ -4,10 +4,10 @@ define i32 @foo( %x) { ; CHECK-LABEL: @foo( ; 
CHECK-NEXT: entry: -; CHECK-NEXT: [[ARR:%.*]] = alloca i32, align 8 +; CHECK-NEXT: [[ARR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @llvm.dbg.value(metadata undef, metadata [[META8:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: store [[X:%.*]], ptr [[ARR]], align 8 -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[ARR]], align 8 +; CHECK-NEXT: store [[X:%.*]], ptr [[ARR]], align 4 +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[ARR]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; entry: @@ -21,10 +21,10 @@ entry: define i32 @foo2( %x) { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARR:%.*]] = alloca [4 x i32], align 8 +; CHECK-NEXT: [[ARR:%.*]] = alloca [4 x i32], align 4 ; CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARR]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: store [[X:%.*]], ptr [[ARR]], align 8 -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[ARR]], align 8 +; CHECK-NEXT: store [[X:%.*]], ptr [[ARR]], align 4 +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[ARR]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/fcmp-denormals-are-zero.ll b/llvm/test/Transforms/InstCombine/fcmp-denormals-are-zero.ll index 216d06c7be81e..22c422c816039 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-denormals-are-zero.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-denormals-are-zero.ll @@ -8,13 +8,13 @@ define void @denormal_input_preserve_sign_fcmp_olt_smallest_normalized(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_fcmp_olt_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp oeq float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp oeq double [[F64:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store 
volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp oeq half [[F16:%.*]], 0xH0000 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF32_FLAGS:%.*]] = fcmp oeq float [[F32]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32_FLAGS]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32_FLAGS]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -41,11 +41,11 @@ define void @denormal_input_preserve_sign_fcmp_olt_smallest_normalized(float %f3 define void @denormal_input_preserve_sign_fcmp_uge_smallest_normalized(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_fcmp_uge_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp une float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp une double [[F64:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp une half [[F16:%.*]], 0xH0000 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -67,11 +67,11 @@ define void @denormal_input_preserve_sign_fcmp_uge_smallest_normalized(float %f3 define void @denormal_input_preserve_sign_fcmp_oge_smallest_normalized(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_fcmp_oge_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp one float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp one 
double [[F64:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp one half [[F16:%.*]], 0xH0000 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -93,11 +93,11 @@ define void @denormal_input_preserve_sign_fcmp_oge_smallest_normalized(float %f3 define void @denormal_input_preserve_sign_fcmp_ult_smallest_normalized(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_fcmp_ult_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp ueq float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp ueq double [[F64:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp ueq half [[F16:%.*]], 0xH0000 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -117,11 +117,11 @@ define void @denormal_input_preserve_sign_fcmp_ult_smallest_normalized(float %f3 define void @denormal_input_preserve_sign_vector_fcmp_olt_smallest_normalized(<2 x float> %f32, <2 x double> %f64, <2 x half> %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_vector_fcmp_olt_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp oeq <2 x float> [[F32:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp oeq <2 x double> [[F64:%.*]], 
zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp oeq <2 x half> [[F16:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %f32) @@ -141,11 +141,11 @@ define void @denormal_input_preserve_sign_vector_fcmp_olt_smallest_normalized(<2 define void @denormal_input_preserve_sign_vector_fcmp_uge_smallest_normalized(<2 x float> %f32, <2 x double> %f64, <2 x half> %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_vector_fcmp_uge_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp une <2 x float> [[F32:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp une <2 x double> [[F64:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp une <2 x half> [[F16:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %f32) @@ -165,11 +165,11 @@ define void @denormal_input_preserve_sign_vector_fcmp_uge_smallest_normalized(<2 define void @denormal_input_preserve_sign_vector_fcmp_oge_smallest_normalized(<2 x float> %f32, <2 x double> %f64, <2 x half> %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_vector_fcmp_oge_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp one <2 x float> [[F32:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 
4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp one <2 x double> [[F64:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp one <2 x half> [[F16:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %f32) @@ -189,11 +189,11 @@ define void @denormal_input_preserve_sign_vector_fcmp_oge_smallest_normalized(<2 define void @denormal_input_preserve_sign_vector_fcmp_ult_smallest_normalized(<2 x float> %f32, <2 x double> %f64, <2 x half> %f16) #0 { ; CHECK-LABEL: @denormal_input_preserve_sign_vector_fcmp_ult_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp ueq <2 x float> [[F32:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp ueq <2 x double> [[F64:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp ueq <2 x half> [[F16:%.*]], zeroinitializer -; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile <2 x i1> [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %f32) @@ -215,11 +215,11 @@ define void @denormal_input_preserve_sign_vector_fcmp_ult_smallest_normalized(<2 define void @denormal_input_positive_zero_fcmp_olt_smallest_normalized(float %f32, double %f64, half %f16) #1 { ; CHECK-LABEL: @denormal_input_positive_zero_fcmp_olt_smallest_normalized( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp oeq 
float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp oeq double [[F64:%.*]], 0.000000e+00 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp oeq half [[F16:%.*]], 0xH0000 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -241,13 +241,13 @@ define void @denormal_input_ieee(float %f32, double %f64, half %f16) #2 { ; CHECK-LABEL: @denormal_input_ieee( ; CHECK-NEXT: [[F32_FABS:%.*]] = call float @llvm.fabs.f32(float [[F32:%.*]]) ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp olt float [[F32_FABS]], 0x3810000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[F64_FABS:%.*]] = call double @llvm.fabs.f64(double [[F64:%.*]]) ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp olt double [[F64_FABS]], 0x10000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[F16_FABS:%.*]] = call half @llvm.fabs.f16(half [[F16:%.*]]) ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp olt half [[F16_FABS]], 0xH0400 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -268,13 +268,13 @@ define void @denormal_input_ieee(float %f32, double %f64, half %f16) #2 { define void @denormal_input_preserve_sign_f32_only(float %f32, double %f64, half %f16) #3 { ; CHECK-LABEL: @denormal_input_preserve_sign_f32_only( ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp oeq float [[F32:%.*]], 0.000000e+00 -; CHECK-NEXT: 
store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[F64_FABS:%.*]] = call double @llvm.fabs.f64(double [[F64:%.*]]) ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp olt double [[F64_FABS]], 0x10000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[F16_FABS:%.*]] = call half @llvm.fabs.f16(half [[F16:%.*]]) ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp olt half [[F16_FABS]], 0xH0400 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -295,13 +295,13 @@ define void @wrong_fcmp_type_ole(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @wrong_fcmp_type_ole( ; CHECK-NEXT: [[F32_FABS:%.*]] = call float @llvm.fabs.f32(float [[F32:%.*]]) ; CHECK-NEXT: [[CMPF32:%.*]] = fcmp ole float [[F32_FABS]], 0x3810000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[F64_FABS:%.*]] = call double @llvm.fabs.f64(double [[F64:%.*]]) ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp ole double [[F64_FABS]], 0x10000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[F16_FABS:%.*]] = call half @llvm.fabs.f16(half [[F16:%.*]]) ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp ole half [[F16_FABS]], 0xH0400 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %f32.fabs = call float @llvm.fabs.f32(float %f32) @@ -321,11 +321,11 @@ define void @wrong_fcmp_type_ole(float %f32, double %f64, half %f16) #0 { define void @missing_fabs(float %f32, double %f64, half %f16) #0 { ; CHECK-LABEL: @missing_fabs( ; CHECK-NEXT: 
[[CMPF32:%.*]] = fcmp olt float [[F32:%.*]], 0x3810000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF32]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF64:%.*]] = fcmp olt double [[F64:%.*]], 0x10000000000000 -; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF64]], ptr @var, align 1 ; CHECK-NEXT: [[CMPF16:%.*]] = fcmp olt half [[F16:%.*]], 0xH0400 -; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 4 +; CHECK-NEXT: store volatile i1 [[CMPF16]], ptr @var, align 1 ; CHECK-NEXT: ret void ; %cmpf32 = fcmp olt float %f32, 0x3810000000000000 diff --git a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll index f4be83ace0c0e..15eb3e15ea44a 100644 --- a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll +++ b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll @@ -16,8 +16,8 @@ define void @bork() nounwind { ; CHECK-LABEL: @bork( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COLOR:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[COLOR]], align 8 -; CHECK-NEXT: [[TMP105:%.*]] = load ptr, ptr @"\01L_OBJC_SELECTOR_REFERENCES_81", align 8 +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[COLOR]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load ptr, ptr @"\01L_OBJC_SELECTOR_REFERENCES_81", align 4 ; CHECK-NEXT: [[TMP107:%.*]] = call float @objc_msgSend_fpret(ptr [[TMP103]], ptr [[TMP105]]) #[[ATTR0:[0-9]+]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll index e80f1b242a52c..41285c78f03ed 100644 --- a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll @@ -62,7 +62,7 @@ define void @test_evaluate_gep_nested_as_ptrs(ptr addrspace(2) %B) { define void @test_evaluate_gep_as_ptrs_array(ptr addrspace(2) %B) { ; CHECK-LABEL: 
@test_evaluate_gep_as_ptrs_array( -; CHECK-NEXT: store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds ([4 x ptr addrspace(2)], ptr addrspace(1) @arst, i32 0, i32 2), align 16 +; CHECK-NEXT: store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds ([4 x ptr addrspace(2)], ptr addrspace(1) @arst, i32 0, i32 2), align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 9046cb6b4529e..fb6147b688975 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -91,7 +91,7 @@ define void @test5_as1(i8 %B) { ; This should be turned into a constexpr instead of being an instruction define void @test_evaluate_gep_nested_as_ptrs(ptr addrspace(2) %B) { ; CHECK-LABEL: @test_evaluate_gep_nested_as_ptrs( -; CHECK-NEXT: store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) @global_as1_as2_ptr, align 8 +; CHECK-NEXT: store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) @global_as1_as2_ptr, align 4 ; CHECK-NEXT: ret void ; store ptr addrspace(2) %B, ptr addrspace(1) @global_as1_as2_ptr @@ -458,7 +458,7 @@ define i32 @test20_as1(ptr addrspace(1) %P, i32 %A, i32 %B) { define i32 @test21() { ; CHECK-LABEL: @test21( ; CHECK-NEXT: [[PBOB1:%.*]] = alloca [[INTSTRUCT:%.*]], align 8 -; CHECK-NEXT: [[RVAL:%.*]] = load i32, ptr [[PBOB1]], align 8 +; CHECK-NEXT: [[RVAL:%.*]] = load i32, ptr [[PBOB1]], align 4 ; CHECK-NEXT: ret i32 [[RVAL]] ; %pbob1 = alloca %intstruct @@ -668,11 +668,11 @@ define i1 @test31(ptr %A) { define ptr @test32(ptr %v) { ; CHECK-LABEL: @test32( ; CHECK-NEXT: [[A:%.*]] = alloca [4 x ptr], align 16 -; CHECK-NEXT: store ptr null, ptr [[A]], align 16 +; CHECK-NEXT: store ptr null, ptr [[A]], align 8 ; CHECK-NEXT: [[D:%.*]] = getelementptr inbounds { [16 x i8] }, ptr [[A]], i64 0, i32 0, i64 8 ; CHECK-NEXT: store ptr [[V:%.*]], ptr [[D]], align 8 ; CHECK-NEXT: [[F:%.*]] = getelementptr 
inbounds [4 x ptr], ptr [[A]], i64 0, i64 2 -; CHECK-NEXT: [[G:%.*]] = load ptr, ptr [[F]], align 16 +; CHECK-NEXT: [[G:%.*]] = load ptr, ptr [[F]], align 8 ; CHECK-NEXT: ret ptr [[G]] ; %A = alloca [4 x ptr], align 16 diff --git a/llvm/test/Transforms/InstCombine/load-cmp.ll b/llvm/test/Transforms/InstCombine/load-cmp.ll index 7fb6e7a3a37b3..56f6e042b3caf 100644 --- a/llvm/test/Transforms/InstCombine/load-cmp.ll +++ b/llvm/test/Transforms/InstCombine/load-cmp.ll @@ -216,7 +216,7 @@ define i1 @test10_struct(i32 %x) { define i1 @test10_struct_noinbounds(i32 %x) { ; CHECK-LABEL: @test10_struct_noinbounds( ; CHECK-NEXT: [[P:%.*]] = getelementptr [[FOO:%.*]], ptr @GS, i32 [[X:%.*]], i32 0 -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[P]], align 8 +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[Q]], 9 ; CHECK-NEXT: ret i1 [[R]] ; @@ -254,7 +254,7 @@ define i1 @test10_struct_noinbounds_i16(i16 %x) { ; CHECK-LABEL: @test10_struct_noinbounds_i16( ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 ; CHECK-NEXT: [[P:%.*]] = getelementptr [[FOO:%.*]], ptr @GS, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[P]], align 8 +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[Q]], 0 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/load-combine-metadata-dominance.ll b/llvm/test/Transforms/InstCombine/load-combine-metadata-dominance.ll index 07a15d01cf43c..13dfc4a59877a 100644 --- a/llvm/test/Transforms/InstCombine/load-combine-metadata-dominance.ll +++ b/llvm/test/Transforms/InstCombine/load-combine-metadata-dominance.ll @@ -125,7 +125,7 @@ define void @combine_metadata_dominance6(ptr %p) { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[A:%.*]] = load ptr, ptr [[P]], align 8, !align !2, !noundef !0 -; CHECK-NEXT: store i32 0, ptr [[A]], align 8 +; CHECK-NEXT: store i32 0, ptr [[A]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git 
a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll index 0da1918018f86..fb6c50d5fd8fc 100644 --- a/llvm/test/Transforms/InstCombine/load.ll +++ b/llvm/test/Transforms/InstCombine/load.ll @@ -175,9 +175,9 @@ define <16 x i8> @test13(<2 x i64> %x) { define i8 @test14(i8 %x, i32 %y) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i8 [[X:%.*]], ptr [[A]], align 4 +; CHECK-NEXT: store i8 [[X:%.*]], ptr [[A]], align 1 ; CHECK-NEXT: store i32 [[Y:%.*]], ptr [[A]], align 4 -; CHECK-NEXT: [[R:%.*]] = load i8, ptr [[A]], align 4 +; CHECK-NEXT: [[R:%.*]] = load i8, ptr [[A]], align 1 ; CHECK-NEXT: ret i8 [[R]] ; %a = alloca i32 @@ -193,9 +193,9 @@ define i8 @test14(i8 %x, i32 %y) { define i8 @test15(i8 %x, i32 %y) { ; CHECK-LABEL: @test15( -; CHECK-NEXT: store i8 [[X:%.*]], ptr @test15_global, align 4 +; CHECK-NEXT: store i8 [[X:%.*]], ptr @test15_global, align 1 ; CHECK-NEXT: store i32 [[Y:%.*]], ptr @test15_global, align 4 -; CHECK-NEXT: [[R:%.*]] = load i8, ptr @test15_global, align 4 +; CHECK-NEXT: [[R:%.*]] = load i8, ptr @test15_global, align 1 ; CHECK-NEXT: ret i8 [[R]] ; store i8 %x, ptr @test15_global @@ -420,7 +420,7 @@ define i32 @load_via_strip_invariant_group() { define i4 @test_vector_load_i4_non_byte_sized() { ; CHECK-LABEL: @test_vector_load_i4_non_byte_sized( -; CHECK-NEXT: [[RES0:%.*]] = load i4, ptr @foo, align 8 +; CHECK-NEXT: [[RES0:%.*]] = load i4, ptr @foo, align 1 ; CHECK-NEXT: ret i4 [[RES0]] ; %ptr0 = getelementptr i8, ptr @foo, i64 0 diff --git a/llvm/test/Transforms/InstCombine/loadstore-alignment.ll b/llvm/test/Transforms/InstCombine/loadstore-alignment.ll index c71135fb99637..0fc82a1d53436 100644 --- a/llvm/test/Transforms/InstCombine/loadstore-alignment.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-alignment.ll @@ -9,7 +9,7 @@ target datalayout = "E-p:64:64:64-p1:64:64:64-p2:32:32:32-a0:0:8-f32:32:32-f64:6 define <2 x i64> @static_hem() { ; CHECK-LABEL: 
@static_hem( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 1 ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %t = getelementptr <2 x i64>, ptr @x, i32 7 @@ -21,7 +21,7 @@ define <2 x i64> @hem(i32 %i) { ; CHECK-LABEL: @hem( ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[T:%.*]] = getelementptr <2 x i64>, ptr @x, i64 [[TMP1]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 1 ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %t = getelementptr <2 x i64>, ptr @x, i32 %i @@ -34,7 +34,7 @@ define <2 x i64> @hem_2d(i32 %i, i32 %j) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[J:%.*]] to i64 ; CHECK-NEXT: [[T:%.*]] = getelementptr [13 x <2 x i64>], ptr @xx, i64 [[TMP1]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 1 ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %t = getelementptr [13 x <2 x i64>], ptr @xx, i32 %i, i32 %j @@ -44,7 +44,7 @@ define <2 x i64> @hem_2d(i32 %i, i32 %j) { define <2 x i64> @foo() { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @x, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @x, align 1 ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %tmp1 = load <2 x i64>, ptr @x, align 1 @@ -55,7 +55,7 @@ define <2 x i64> @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: [[T:%.*]] = alloca <2 x i64>, align 16 ; CHECK-NEXT: call void @kip(ptr nonnull [[T]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[T]], align 1 ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %t = alloca <2 x i64> @@ -66,7 +66,7 @@ define <2 x i64> @bar() { define void @static_hem_store(<2 x i64> %y) { ; CHECK-LABEL: @static_hem_store( 
-; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 16 +; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 1 ; CHECK-NEXT: ret void ; %t = getelementptr <2 x i64>, ptr @x, i32 7 @@ -78,7 +78,7 @@ define void @hem_store(i32 %i, <2 x i64> %y) { ; CHECK-LABEL: @hem_store( ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[T:%.*]] = getelementptr <2 x i64>, ptr @x, i64 [[TMP1]] -; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 1 ; CHECK-NEXT: ret void ; %t = getelementptr <2 x i64>, ptr @x, i32 %i @@ -91,7 +91,7 @@ define void @hem_2d_store(i32 %i, i32 %j, <2 x i64> %y) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[J:%.*]] to i64 ; CHECK-NEXT: [[T:%.*]] = getelementptr [13 x <2 x i64>], ptr @xx, i64 [[TMP1]], i64 [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 1 ; CHECK-NEXT: ret void ; %t = getelementptr [13 x <2 x i64>], ptr @xx, i32 %i, i32 %j @@ -101,7 +101,7 @@ define void @hem_2d_store(i32 %i, i32 %j, <2 x i64> %y) { define void @foo_store(<2 x i64> %y) { ; CHECK-LABEL: @foo_store( -; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr @x, align 16 +; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr @x, align 1 ; CHECK-NEXT: ret void ; store <2 x i64> %y, ptr @x, align 1 @@ -112,7 +112,7 @@ define void @bar_store(<2 x i64> %y) { ; CHECK-LABEL: @bar_store( ; CHECK-NEXT: [[T:%.*]] = alloca <2 x i64>, align 16 ; CHECK-NEXT: call void @kip(ptr nonnull [[T]]) -; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 16 +; CHECK-NEXT: store <2 x i64> [[Y:%.*]], ptr [[T]], align 1 ; CHECK-NEXT: ret void ; %t = alloca <2 x i64> diff --git a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll index 
349673974b8fa..ea9b16e1382ee 100644 --- a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll +++ b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll @@ -10,7 +10,7 @@ define float @test1(i32 %hash, float %x, float %y, float %z, float %w) { ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP3]], 124 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[TMP753:%.*]] = getelementptr [128 x float], ptr @C.0.1248, i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP753]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP753]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = fmul float [[TMP9]], [[X:%.*]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], 0.000000e+00 ; CHECK-NEXT: [[TMP17_SUM52:%.*]] = or i32 [[TMP5]], 1 @@ -22,7 +22,7 @@ define float @test1(i32 %hash, float %x, float %y, float %z, float %w) { ; CHECK-NEXT: [[TMP27_SUM50:%.*]] = or i32 [[TMP5]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP27_SUM50]] to i64 ; CHECK-NEXT: [[TMP2849:%.*]] = getelementptr [128 x float], ptr @C.0.1248, i64 0, i64 [[TMP2]] -; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP2849]], align 8 +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP2849]], align 4 ; CHECK-NEXT: [[TMP31:%.*]] = fmul float [[TMP29]], [[Z:%.*]] ; CHECK-NEXT: [[TMP33:%.*]] = fadd float [[TMP31]], [[TMP23]] ; CHECK-NEXT: [[TMP37_SUM48:%.*]] = or i32 [[TMP5]], 3 diff --git a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll index e60d80cdf2da7..b35fceef372c6 100644 --- a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll +++ b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll @@ -30,12 +30,11 @@ define void @_Z4testv() { ; CHECK-NEXT: [[I11:%.*]] = trunc i64 [[I7]] to i32 ; CHECK-NEXT: br label [[BB12]] ; CHECK: bb12: -; CHECK-NEXT: [[STOREMERGE1:%.*]] = phi i32 [ [[I11]], [[BB10]] ], [ 1, [[BB9]] ] ; CHECK-NEXT: 
[[STOREMERGE:%.*]] = phi i32 [ 1, [[BB9]] ], [ [[I11]], [[BB10]] ] -; CHECK-NEXT: store i32 [[STOREMERGE1]], ptr @arr_2, align 4 +; CHECK-NEXT: store i32 [[STOREMERGE]], ptr @arr_2, align 4 ; CHECK-NEXT: store i16 [[I4]], ptr @arr_4, align 2 ; CHECK-NEXT: [[I8:%.*]] = sext i16 [[I4]] to i32 -; CHECK-NEXT: store i32 [[I8]], ptr @arr_3, align 16 +; CHECK-NEXT: store i32 [[I8]], ptr @arr_3, align 4 ; CHECK-NEXT: store i32 [[STOREMERGE]], ptr getelementptr inbounds ([0 x i32], ptr @arr_2, i64 0, i64 1), align 4 ; CHECK-NEXT: store i16 [[I4]], ptr getelementptr inbounds ([0 x i16], ptr @arr_4, i64 0, i64 1), align 2 ; CHECK-NEXT: store i32 [[I8]], ptr getelementptr inbounds ([8 x i32], ptr @arr_3, i64 0, i64 1), align 4 @@ -109,13 +108,13 @@ define i32 @diff_types_diff_width_no_merge(i1 %cond, i32 %a, i64 %b) { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] ; CHECK: A: -; CHECK-NEXT: store i32 [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: store i32 [[A:%.*]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[SINK:%.*]] ; CHECK: B: -; CHECK-NEXT: store i64 [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[B:%.*]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[SINK]] ; CHECK: sink: -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 8 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: ret i32 [[VAL]] ; entry: @@ -135,10 +134,10 @@ sink: define <4 x i32> @vec_no_merge(i1 %cond, <2 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @vec_no_merge( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 16 +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] ; CHECK: A: -; CHECK-NEXT: store <2 x i32> [[A:%.*]], ptr [[ALLOCA]], align 16 +; CHECK-NEXT: store <2 x i32> [[A:%.*]], ptr [[ALLOCA]], align 8 ; CHECK-NEXT: br label [[SINK:%.*]] ; CHECK: B: ; CHECK-NEXT: store <4 x i32> 
[[B:%.*]], ptr [[ALLOCA]], align 16 @@ -199,13 +198,13 @@ define %struct.tup @multi_elem_struct_no_merge(i1 %cond, %struct.tup %a, half %b ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] ; CHECK: A: -; CHECK-NEXT: store [[STRUCT_TUP:%.*]] [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: store [[STRUCT_TUP:%.*]] [[A:%.*]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[SINK:%.*]] ; CHECK: B: -; CHECK-NEXT: store half [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: store half [[B:%.*]], ptr [[ALLOCA]], align 2 ; CHECK-NEXT: br label [[SINK]] ; CHECK: sink: -; CHECK-NEXT: [[VAL:%.*]] = load [[STRUCT_TUP]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: [[VAL:%.*]] = load [[STRUCT_TUP]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: ret [[STRUCT_TUP]] [[VAL]] ; entry: @@ -234,7 +233,7 @@ define i16 @same_types_diff_align_no_merge(i1 %cond, i16 %a, i16 %b) { ; CHECK-NEXT: store i16 [[B:%.*]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[SINK]] ; CHECK: sink: -; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ALLOCA]], align 2 ; CHECK-NEXT: ret i16 [[VAL]] ; entry: @@ -254,15 +253,17 @@ sink: define i64 @ptrtoint_merge(i1 %cond, i64 %a, ptr %b) { ; CHECK-LABEL: @ptrtoint_merge( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: BB0: +; CHECK-NEXT: store i64 [[A:%.*]], ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[SINK:%.*]] ; CHECK: BB1: -; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: store ptr [[B:%.*]], ptr [[ALLOCA]], align 8 ; CHECK-NEXT: br label [[SINK]] ; CHECK: sink: -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ [[A:%.*]], [[BB0]] ], [ [[TMP0]], [[BB1]] ] -; CHECK-NEXT: ret i64 [[STOREMERGE]] +; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: ret i64 [[VAL]] ; entry: %alloca = alloca ptr 
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll index 81bed68139c3f..bb8e37b4cfb00 100644 --- a/llvm/test/Transforms/InstCombine/phi.ll +++ b/llvm/test/Transforms/InstCombine/phi.ll @@ -508,8 +508,8 @@ define i32 @test16(ptr addrspace(1) %pointer1, i32 %flag, ptr %pointer2) ; CHECK-LABEL: @test16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[POINTER1_ADDR:%.*]] = alloca ptr addrspace(1), align 8 -; CHECK-NEXT: [[POINTER2_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[POINTER1_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +; CHECK-NEXT: [[POINTER2_ADDR:%.*]] = alloca ptr, align 4 ; CHECK-NEXT: store ptr addrspace(1) [[POINTER1:%.*]], ptr [[POINTER1_ADDR]], align 8 ; CHECK-NEXT: store ptr [[POINTER2:%.*]], ptr [[POINTER2_ADDR]], align 8 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 diff --git a/llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll b/llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll index 465c1db0e0ce1..4c569993370b4 100644 --- a/llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll +++ b/llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll @@ -20,9 +20,9 @@ define void @f(i1 %cond) { ; CHECK: bb2: ; CHECK-NEXT: [[T9:%.*]] = load ptr, ptr @b, align 2 ; CHECK-NEXT: store i16 0, ptr [[T9]], align 2 -; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[T1]], align 8 +; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[T1]], align 4 ; CHECK-NEXT: [[T11:%.*]] = add i32 [[T10]], -1 -; CHECK-NEXT: store i32 [[T11]], ptr [[T1]], align 8 +; CHECK-NEXT: store i32 [[T11]], ptr [[T1]], align 4 ; CHECK-NEXT: ret void ; bb0: diff --git a/llvm/test/Transforms/InstCombine/pr44552.ll b/llvm/test/Transforms/InstCombine/pr44552.ll index 86899c3d026a6..5301190d0123d 100644 --- a/llvm/test/Transforms/InstCombine/pr44552.ll +++ b/llvm/test/Transforms/InstCombine/pr44552.ll @@ -22,7 +22,7 @@ define i16 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: store i64 0, ptr @csmith_sink_, align 8 +; CHECK-NEXT: store i64 0, ptr @csmith_sink_, align 1 ; CHECK-NEXT: ret i16 0 ; entry: diff --git a/llvm/test/Transforms/InstCombine/pr59613.ll b/llvm/test/Transforms/InstCombine/pr59613.ll index a669a0d4207e9..2db3bf0decf70 100644 --- a/llvm/test/Transforms/InstCombine/pr59613.ll +++ b/llvm/test/Transforms/InstCombine/pr59613.ll @@ -4,7 +4,7 @@ ; This used to crash, depending on the particular worklist iteration order. define void @pr59613(<6 x i16> %0) { ; CHECK-LABEL: @pr59613( -; CHECK-NEXT: store <6 x i16> poison, ptr null, align 4294967296 +; CHECK-NEXT: store <6 x i16> poison, ptr null, align 16 ; CHECK-NEXT: ret void ; %cmp1 = icmp ne <6 x i16> %0, zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/scalable-cast-of-alloc.ll b/llvm/test/Transforms/InstCombine/scalable-cast-of-alloc.ll index ef0734b883f87..71799d18ed312 100644 --- a/llvm/test/Transforms/InstCombine/scalable-cast-of-alloc.ll +++ b/llvm/test/Transforms/InstCombine/scalable-cast-of-alloc.ll @@ -21,9 +21,9 @@ entry: define void @scalable4i32_to_fixed16i32(ptr %out) { ; CHECK-LABEL: @scalable4i32_to_fixed16i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP]], align 64 -; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, ptr [[TMP]], align 64 +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, ptr [[TMP]], align 16 ; CHECK-NEXT: store <16 x i32> [[RELOAD]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; @@ -55,9 +55,9 @@ entry: define void @scalable16i32_to_fixed16i32(ptr %out) { ; CHECK-LABEL: @scalable16i32_to_fixed16i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 -; CHECK-NEXT: store volatile <16 x i32> zeroinitializer, ptr [[TMP]], align 64 -; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, 
ptr [[TMP]], align 64 +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile <16 x i32> zeroinitializer, ptr [[TMP]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, ptr [[TMP]], align 16 ; CHECK-NEXT: store <16 x i32> [[RELOAD]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; @@ -72,9 +72,9 @@ entry: define void @scalable32i32_to_scalable16i32(ptr %out) { ; CHECK-LABEL: @scalable32i32_to_scalable16i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 -; CHECK-NEXT: store volatile zeroinitializer, ptr [[TMP]], align 64 -; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 64 +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile zeroinitializer, ptr [[TMP]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 16 ; CHECK-NEXT: store [[RELOAD]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; @@ -89,9 +89,9 @@ entry: define void @scalable32i16_to_scalable16i32(ptr %out) { ; CHECK-LABEL: @scalable32i16_to_scalable16i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 -; CHECK-NEXT: store volatile zeroinitializer, ptr [[TMP]], align 64 -; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 64 +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile zeroinitializer, ptr [[TMP]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 16 ; CHECK-NEXT: store [[RELOAD]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; @@ -106,11 +106,11 @@ entry: define void @scalable32i16_to_scalable16i32_multiuse(ptr %out, ptr %out2) { ; CHECK-LABEL: @scalable32i16_to_scalable16i32_multiuse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 -; CHECK-NEXT: store volatile zeroinitializer, ptr [[TMP]], align 64 -; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 64 +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile zeroinitializer, ptr 
[[TMP]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , ptr [[TMP]], align 16 ; CHECK-NEXT: store [[RELOAD]], ptr [[OUT:%.*]], align 16 -; CHECK-NEXT: [[RELOAD2:%.*]] = load volatile , ptr [[TMP]], align 64 +; CHECK-NEXT: [[RELOAD2:%.*]] = load volatile , ptr [[TMP]], align 16 ; CHECK-NEXT: store [[RELOAD2]], ptr [[OUT2:%.*]], align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index a2cdb951d31bd..82e163d93f0f1 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1212,8 +1212,8 @@ define ptr @test83(i1 %flag) { ; CHECK-NEXT: [[Y:%.*]] = alloca i64, align 8 ; CHECK-NEXT: call void @scribble_on_i64(ptr nonnull [[X]]) ; CHECK-NEXT: call void @scribble_on_i64(ptr nonnull [[Y]]) -; CHECK-NEXT: [[T:%.*]] = load i64, ptr [[X]], align 8 -; CHECK-NEXT: store i64 [[T]], ptr [[Y]], align 8 +; CHECK-NEXT: [[T:%.*]] = load i64, ptr [[X]], align 4 +; CHECK-NEXT: store i64 [[T]], ptr [[Y]], align 4 ; CHECK-NEXT: [[V:%.*]] = inttoptr i64 [[T]] to ptr ; CHECK-NEXT: ret ptr [[V]] ; @@ -1261,8 +1261,8 @@ define ptr @test85(i1 %flag) { ; CHECK-NEXT: [[Y:%.*]] = alloca i128, align 8 ; CHECK-NEXT: call void @scribble_on_i128(ptr nonnull [[X]]) ; CHECK-NEXT: call void @scribble_on_i128(ptr nonnull [[Y]]) -; CHECK-NEXT: [[T:%.*]] = load i128, ptr [[X]], align 8 -; CHECK-NEXT: store i128 [[T]], ptr [[Y]], align 8 +; CHECK-NEXT: [[T:%.*]] = load i128, ptr [[X]], align 4 +; CHECK-NEXT: store i128 [[T]], ptr [[Y]], align 4 ; CHECK-NEXT: [[X_VAL:%.*]] = load ptr, ptr [[X]], align 8 ; CHECK-NEXT: [[Y_VAL:%.*]] = load ptr, ptr [[Y]], align 8 ; CHECK-NEXT: [[V:%.*]] = select i1 [[FLAG:%.*]], ptr [[X_VAL]], ptr [[Y_VAL]] @@ -1290,8 +1290,8 @@ define i128 @test86(i1 %flag) { ; CHECK-NEXT: call void @scribble_on_i128(ptr nonnull [[Y]]) ; CHECK-NEXT: [[T:%.*]] = load ptr, ptr [[X]], align 8 ; CHECK-NEXT: store ptr [[T]], ptr [[Y]], align 8 -; 
CHECK-NEXT: [[X_VAL:%.*]] = load i128, ptr [[X]], align 8 -; CHECK-NEXT: [[Y_VAL:%.*]] = load i128, ptr [[Y]], align 8 +; CHECK-NEXT: [[X_VAL:%.*]] = load i128, ptr [[X]], align 4 +; CHECK-NEXT: [[Y_VAL:%.*]] = load i128, ptr [[Y]], align 4 ; CHECK-NEXT: [[V:%.*]] = select i1 [[FLAG:%.*]], i128 [[X_VAL]], i128 [[Y_VAL]] ; CHECK-NEXT: ret i128 [[V]] ; diff --git a/llvm/test/Transforms/InstCombine/store.ll b/llvm/test/Transforms/InstCombine/store.ll index fab280b366f44..53e0865e16c87 100644 --- a/llvm/test/Transforms/InstCombine/store.ll +++ b/llvm/test/Transforms/InstCombine/store.ll @@ -30,7 +30,7 @@ define void @store_into_undef(ptr %P) { define void @store_into_null(ptr %P) { ; CHECK-LABEL: @store_into_null( -; CHECK-NEXT: store i32 poison, ptr null, align 4294967296 +; CHECK-NEXT: store i32 poison, ptr null, align 4 ; CHECK-NEXT: ret void ; store i32 124, ptr null diff --git a/llvm/test/Transforms/InstCombine/trivial-dse-calls.ll b/llvm/test/Transforms/InstCombine/trivial-dse-calls.ll index 1d54fe0827ce4..feb98891035ba 100644 --- a/llvm/test/Transforms/InstCombine/trivial-dse-calls.ll +++ b/llvm/test/Transforms/InstCombine/trivial-dse-calls.ll @@ -124,7 +124,7 @@ define void @test_neg_unmodeled_write() { define i32 @test_neg_captured_by_call() { ; CHECK-LABEL: @test_neg_captured_by_call( ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[A2:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[A2:%.*]] = alloca ptr, align 4 ; CHECK-NEXT: call void @f2(ptr nonnull writeonly [[A]], ptr nonnull [[A2]]) #[[ATTR3]] ; CHECK-NEXT: [[A_COPY_CAST:%.*]] = load ptr, ptr [[A2]], align 8 ; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[A_COPY_CAST]], align 4 diff --git a/llvm/test/Transforms/InstCombine/vscale_gep.ll b/llvm/test/Transforms/InstCombine/vscale_gep.ll index 8bb4e2cb95ac9..2a1865f69fe30 100644 --- a/llvm/test/Transforms/InstCombine/vscale_gep.ll +++ b/llvm/test/Transforms/InstCombine/vscale_gep.ll @@ -41,7 +41,7 @@ define i32 @gep_alloca_inbounds_vscale_zero() { ; 
CHECK-LABEL: @gep_alloca_inbounds_vscale_zero( ; CHECK-NEXT: [[A:%.*]] = alloca , align 16 ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds , ptr [[A]], i64 0, i64 2 -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[TMP]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] ; %a = alloca @@ -55,7 +55,7 @@ define i32 @gep_alloca_inbounds_vscale_nonzero() { ; CHECK-LABEL: @gep_alloca_inbounds_vscale_nonzero( ; CHECK-NEXT: [[A:%.*]] = alloca , align 16 ; CHECK-NEXT: [[TMP:%.*]] = getelementptr , ptr [[A]], i64 1, i64 2 -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[TMP]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] ; %a = alloca diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll index 630fef820b809..97bb4a2b4db53 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll @@ -9,8 +9,8 @@ define zeroext i8 @sum() { ; CHECK-LABEL: @sum( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr getelementptr inbounds ([128 x i8], ptr @bytes, i64 0, i64 64), align 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr @bytes, align 16 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr getelementptr inbounds ([128 x i8], ptr @bytes, i64 0, i64 64), align 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr @bytes, align 1 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]]) ; CHECK-NEXT: ret i8 [[TMP0]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index fd99e3f2ddde0..6a6440dbed601 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -29,12 +29,12 @@ define void 
@example1() optsize { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP2]], align 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -90,7 +90,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP5]], align 16 +; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP5]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 @@ -106,7 +106,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP10]] -; CHECK-NEXT: store i32 [[X]], ptr [[TMP11]], align 8 +; 
CHECK-NEXT: store i32 [[X]], ptr [[TMP11]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll index a89482a943569..80348f9e46130 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll @@ -25,10 +25,10 @@ define i32 @main() #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [40000 x i8], ptr addrspace(1) @Y, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[TMP0]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i8> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [40000 x i8], ptr @X, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 4 +; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 40000 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/non-const-n.ll b/llvm/test/Transforms/LoopVectorize/non-const-n.ll index aa04da5639a72..295bf9111329c 100644 --- a/llvm/test/Transforms/LoopVectorize/non-const-n.ll +++ b/llvm/test/Transforms/LoopVectorize/non-const-n.ll @@ -20,12 +20,12 @@ define void @example1(i32 %n) nounwind uwtable ssp { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2048 x i32], 
ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll index 7fde8ef2914a8..b1cee80bde33f 100644 --- a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll +++ b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll @@ -19,7 +19,7 @@ define void @caller1(i1 %c, ptr align 1 %ptr) { ; ASSUMPTIONS-OFF-NEXT: br i1 [[C:%.*]], label [[COMMON_RET:%.*]], label [[FALSE2:%.*]] ; ASSUMPTIONS-OFF: common.ret: ; ASSUMPTIONS-OFF-NEXT: [[DOTSINK:%.*]] = phi i64 [ 3, [[FALSE2]] ], [ 2, [[TMP0:%.*]] ] -; ASSUMPTIONS-OFF-NEXT: store volatile i64 0, ptr [[PTR:%.*]], align 8 +; ASSUMPTIONS-OFF-NEXT: store volatile i64 0, ptr [[PTR:%.*]], align 4 ; ASSUMPTIONS-OFF-NEXT: store volatile i64 -1, ptr [[PTR]], align 4 ; ASSUMPTIONS-OFF-NEXT: store volatile i64 -1, ptr [[PTR]], align 4 ; ASSUMPTIONS-OFF-NEXT: store volatile i64 -1, ptr [[PTR]], align 4 From a317afaf00b0b898538855c36c203bb9117d6b86 Mon Sep 17 00:00:00 2001 From: Fangrui 
Song Date: Tue, 19 Sep 2023 23:44:55 -0700 Subject: [PATCH 48/57] [ELF][test] Improve -r tests for local symbols --- lld/test/ELF/relocatable-discard-locals.s | 9 ++++ lld/test/ELF/relocatable-local-sym.s | 50 +++++++++++++++++------ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/lld/test/ELF/relocatable-discard-locals.s b/lld/test/ELF/relocatable-discard-locals.s index 7f9640ceca19f..8941e8de7abc8 100644 --- a/lld/test/ELF/relocatable-discard-locals.s +++ b/lld/test/ELF/relocatable-discard-locals.s @@ -16,26 +16,31 @@ # DISCARD-LOCALS: 0: {{0+}} 0 NOTYPE LOCAL DEFAULT UND # DISCARD-LOCALS-NEXT: NOTYPE LOCAL DEFAULT {{.*}} .Lused # DISCARD-LOCALS-NEXT: NOTYPE LOCAL DEFAULT {{.*}} used +# DISCARD-LOCALS-NEXT: NOTYPE LOCAL DEFAULT {{.*}} .L.str # DISCARD-LOCALS-NEXT: NOTYPE LOCAL DEFAULT {{.*}} unused # DISCARD-LOCALS-NEXT: NOTYPE LOCAL DEFAULT {{.*}} unused_gc # DISCARD-LOCALS-NEXT: SECTION LOCAL DEFAULT {{.*}} .text # DISCARD-LOCALS-NEXT: SECTION LOCAL DEFAULT {{.*}} text # DISCARD-LOCALS-NEXT: SECTION LOCAL DEFAULT {{.*}} gc +# DISCARD-LOCALS-NEXT: SECTION LOCAL DEFAULT {{.*}} .rodata.str1.1 # DISCARD-LOCALS-NEXT: NOTYPE GLOBAL DEFAULT {{.*}} _start ## --discard-all removes all unused regular local symbols. 
# DISCARD-ALL: 0: {{0+}} 0 NOTYPE LOCAL DEFAULT UND # DISCARD-ALL-NEXT: NOTYPE LOCAL DEFAULT {{.*}} .Lused # DISCARD-ALL-NEXT: NOTYPE LOCAL DEFAULT {{.*}} used +# DISCARD-ALL-NEXT: NOTYPE LOCAL DEFAULT {{.*}} .L.str # DISCARD-ALL-NEXT: SECTION LOCAL DEFAULT {{.*}} .text # DISCARD-ALL-NEXT: SECTION LOCAL DEFAULT {{.*}} text # DISCARD-ALL-NEXT: SECTION LOCAL DEFAULT {{.*}} gc +# DISCARD-ALL-NEXT: SECTION LOCAL DEFAULT {{.*}} .rodata.str1.1 # DISCARD-ALL-NEXT: NOTYPE GLOBAL DEFAULT {{.*}} _start # REL: .rela.text { # REL-NEXT: R_X86_64_PLT32 text 0xFFFFFFFFFFFFFFFC # REL-NEXT: R_X86_64_PLT32 .Lused 0xFFFFFFFFFFFFFFFC # REL-NEXT: R_X86_64_PLT32 used 0xFFFFFFFFFFFFFFFC +# REL-NEXT: R_X86_64_PC32 .L.str 0xFFFFFFFFFFFFFFFC # REL-NEXT: } .globl _start @@ -43,6 +48,7 @@ _start: call text@plt jmp .Lused@plt call used@plt + leaq .L.str(%rip), %rdi .section text,"ax" .Lunused: @@ -54,3 +60,6 @@ used: .Lunused_gc: unused_gc: ret + +.section .rodata.str1.1,"aMS",@progbits,1 +.L.str: .asciz "a" diff --git a/lld/test/ELF/relocatable-local-sym.s b/lld/test/ELF/relocatable-local-sym.s index b894d6b9900c1..b925ff5bdb402 100644 --- a/lld/test/ELF/relocatable-local-sym.s +++ b/lld/test/ELF/relocatable-local-sym.s @@ -1,16 +1,42 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t1.o -# RUN: ld.lld -r %t1.o -o %t2.o -# RUN: llvm-readobj -r %t2.o | FileCheck %s +## Test relocations referencing non-STT_SECTION local symbols in SHF_ALLOC and non-SHF_ALLOC sections for -r. 
-# CHECK: Relocations [ -# CHECK-NEXT: Section ({{.*}}) .rela.text { -# CHECK-NEXT: 0x3 R_X86_64_PC32 .Lstr 0xFFFFFFFFFFFFFFFC -# CHECK-NEXT: } -# CHECK-NEXT: ] +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld -r -o %t %t.o %t.o +# RUN: llvm-readelf -r -x .nonalloc %t | FileCheck --check-prefix=RELA %s - leaq .Lstr(%rip), %rdi +# RUN: llvm-mc -filetype=obj -triple=i686 --defsym X86_32=1 %s -o %t1.o +# RUN: ld.lld -r -o %t1 %t1.o %t1.o +# RUN: llvm-readelf -r -x .nonalloc %t1 | FileCheck --check-prefix=REL %s - .section .rodata.str1.1,"aMS",@progbits,1 - .Lstr: - .asciz "abc\n" +# RELA: Relocation section '.rela.data' at offset {{.*}} contains 2 entries: +# RELA: Offset Info Type Symbol's Value Symbol's Name + Addend +# RELA-NEXT: 0000000000000000 {{.*}} R_X86_64_32 0000000000000000 ifunc + 9 +# RELA-NEXT: 0000000000000004 {{.*}} R_X86_64_32 0000000000000004 ifunc + 9 +# RELA: Relocation section '.rela.nonalloc' at offset {{.*}} contains 2 entries: +# RELA: Offset Info Type Symbol's Value Symbol's Name + Addend +# RELA-NEXT: 0000000000000000 {{.*}} R_X86_64_32 0000000000000000 ifunc + 9 +# RELA-NEXT: 0000000000000004 {{.*}} R_X86_64_32 0000000000000004 ifunc + 9 +# RELA: Hex dump of section '.nonalloc': +# RELA-NEXT: 0x00000000 00000000 00000000 ........ + +# REL: Offset Info Type Sym. Value Symbol's Name +# REL-NEXT: 00000000 {{.*}} R_386_32 00000000 ifunc +# REL-NEXT: 00000004 {{.*}} R_386_32 00000004 ifunc +# REL-EMPTY: +# REL: Offset Info Type Sym. Value Symbol's Name +# REL-NEXT: 00000000 {{.*}} R_386_32 00000000 ifunc +# REL-NEXT: 00000004 {{.*}} R_386_32 00000004 ifunc +# REL: Hex dump of section '.nonalloc': +# REL-NEXT: 0x00000000 09000000 09000000 ........ + +resolver: ret +.type ifunc, @gnu_indirect_function +.set ifunc, resolver + +.data +.long ifunc+9 + +.section .nonalloc +## The relocation references ifunc instead of the STT_SECTION symbol. 
+.long ifunc+9 From 695a5a6a66396b83263bbb3f1946fbaf41e422c3 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 20 Sep 2023 08:45:46 +0200 Subject: [PATCH 49/57] [mlir][IR] Trigger `notifyOperationRemoved` callback for nested ops (#66771) When cloning an op, the `notifyOperationInserted` callback is triggered for all nested ops. Similarly, the `notifyOperationRemoved` callback should be triggered for all nested ops when removing an op. Listeners may inspect the IR during a `notifyOperationRemoved` callback. Therefore, when multiple ops are removed in a single `RewriterBase::eraseOp` call, the notifications must be triggered in an order in which the ops could have been removed one-by-one: * Op removals must be interleaved with `notifyOperationRemoved` callbacks. A callback is triggered right before the respective op is removed. * Ops are removed post-order and in reverse order. Other traversal orders could delete an op that still has uses. (This is not avoidable in graph regions and with cyclic block graphs.) Differential Revision: Imported from https://reviews.llvm.org/D144193. --- mlir/include/mlir/IR/RegionKindInterface.h | 6 + .../Bufferization/Transforms/Bufferize.cpp | 9 +- mlir/lib/IR/PatternMatch.cpp | 72 +++++++- mlir/lib/IR/RegionKindInterface.cpp | 12 +- .../Utils/GreedyPatternRewriteDriver.cpp | 9 +- .../test-strict-pattern-driver.mlir | 160 +++++++++++++++++- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 8 + 7 files changed, 251 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/IR/RegionKindInterface.h b/mlir/include/mlir/IR/RegionKindInterface.h index 46bfe717533a8..d6d3aeeb9bd05 100644 --- a/mlir/include/mlir/IR/RegionKindInterface.h +++ b/mlir/include/mlir/IR/RegionKindInterface.h @@ -43,6 +43,12 @@ class HasOnlyGraphRegion : public TraitBase { /// not implement the RegionKindInterface. bool mayHaveSSADominance(Region ®ion); +/// Return "true" if the given region may be a graph region without SSA +/// dominance. 
This function returns "true" in case the owner op is an +/// unregistered op. It returns "false" if it is a registered op that does not +/// implement the RegionKindInterface. +bool mayBeGraphRegion(Region ®ion); + } // namespace mlir #include "mlir/IR/RegionKindInterface.h.inc" diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index cad78b3e65b23..c34f422292cb4 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -394,12 +394,9 @@ class BufferizationRewriter : public IRRewriter, public RewriterBase::Listener { protected: void notifyOperationRemoved(Operation *op) override { - // TODO: Walk can be removed when D144193 has landed. - op->walk([&](Operation *op) { - erasedOps.insert(op); - // Erase if present. - toMemrefOps.erase(op); - }); + erasedOps.insert(op); + // Erase if present. + toMemrefOps.erase(op); } void notifyOperationInserted(Operation *op) override { diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index db920c14ea08d..5e9b9b2a810a4 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -8,6 +8,8 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Iterators.h" +#include "mlir/IR/RegionKindInterface.h" using namespace mlir; @@ -275,7 +277,7 @@ void RewriterBase::replaceOp(Operation *op, ValueRange newValues) { for (auto it : llvm::zip(op->getResults(), newValues)) replaceAllUsesWith(std::get<0>(it), std::get<1>(it)); - // Erase the op. + // Erase op and notify listener. eraseOp(op); } @@ -295,7 +297,7 @@ void RewriterBase::replaceOp(Operation *op, Operation *newOp) { for (auto it : llvm::zip(op->getResults(), newOp->getResults())) replaceAllUsesWith(std::get<0>(it), std::get<1>(it)); - // Erase the old op. + // Erase op and notify listener. 
eraseOp(op); } @@ -303,9 +305,71 @@ void RewriterBase::replaceOp(Operation *op, Operation *newOp) { /// the given operation *must* be known to be dead. void RewriterBase::eraseOp(Operation *op) { assert(op->use_empty() && "expected 'op' to have no uses"); - if (auto *rewriteListener = dyn_cast_if_present(listener)) + auto *rewriteListener = dyn_cast_if_present(listener); + + // Fast path: If no listener is attached, the op can be dropped in one go. + if (!rewriteListener) { + op->erase(); + return; + } + + // Helper function that erases a single op. + auto eraseSingleOp = [&](Operation *op) { +#ifndef NDEBUG + // All nested ops should have been erased already. + assert( + llvm::all_of(op->getRegions(), [&](Region &r) { return r.empty(); }) && + "expected empty regions"); + // All users should have been erased already if the op is in a region with + // SSA dominance. + if (!op->use_empty() && op->getParentOp()) + assert(mayBeGraphRegion(*op->getParentRegion()) && + "expected that op has no uses"); +#endif // NDEBUG rewriteListener->notifyOperationRemoved(op); - op->erase(); + + // Explicitly drop all uses in case the op is in a graph region. + op->dropAllUses(); + op->erase(); + }; + + // Nested ops must be erased one-by-one, so that listeners have a consistent + // view of the IR every time a notification is triggered. Users must be + // erased before definitions. I.e., post-order, reverse dominance. + std::function eraseTree = [&](Operation *op) { + // Erase nested ops. + for (Region &r : llvm::reverse(op->getRegions())) { + // Erase all blocks in the right order. Successors should be erased + // before predecessors because successor blocks may use values defined + // in predecessor blocks. A post-order traversal of blocks within a + // region visits successors before predecessors. Repeat the traversal + // until the region is empty. (The block graph could be disconnected.) 
+ while (!r.empty()) { + SmallVector erasedBlocks; + for (Block *b : llvm::post_order(&r.front())) { + // Visit ops in reverse order. + for (Operation &op : + llvm::make_early_inc_range(ReverseIterator::makeIterable(*b))) + eraseTree(&op); + // Do not erase the block immediately. This is not supprted by the + // post_order iterator. + erasedBlocks.push_back(b); + } + for (Block *b : erasedBlocks) { + // Explicitly drop all uses in case there is a cycle in the block + // graph. + for (BlockArgument bbArg : b->getArguments()) + bbArg.dropAllUses(); + b->dropAllUses(); + b->erase(); + } + } + } + // Then erase the enclosing op. + eraseSingleOp(op); + }; + + eraseTree(op); } void RewriterBase::eraseBlock(Block *block) { diff --git a/mlir/lib/IR/RegionKindInterface.cpp b/mlir/lib/IR/RegionKindInterface.cpp index cbef3025a5dd6..007f4cf92dbc7 100644 --- a/mlir/lib/IR/RegionKindInterface.cpp +++ b/mlir/lib/IR/RegionKindInterface.cpp @@ -18,9 +18,17 @@ using namespace mlir; #include "mlir/IR/RegionKindInterface.cpp.inc" bool mlir::mayHaveSSADominance(Region ®ion) { - auto regionKindOp = - dyn_cast_if_present(region.getParentOp()); + auto regionKindOp = dyn_cast(region.getParentOp()); if (!regionKindOp) return true; return regionKindOp.hasSSADominance(region.getRegionNumber()); } + +bool mlir::mayBeGraphRegion(Region ®ion) { + if (!region.getParentOp()->isRegistered()) + return true; + auto regionKindOp = dyn_cast(region.getParentOp()); + if (!regionKindOp) + return false; + return !regionKindOp.hasSSADominance(region.getRegionNumber()); +} diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index fba4944f130c2..8e2bfe557c555 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -421,8 +421,7 @@ bool GreedyPatternRewriteDriver::processWorklist() { // If the operation is trivially dead - remove it. 
if (isOpTriviallyDead(op)) { - notifyOperationRemoved(op); - op->erase(); + eraseOp(op); changed = true; LLVM_DEBUG(logResultWithLine("success", "operation is trivially dead")); @@ -567,10 +566,8 @@ void GreedyPatternRewriteDriver::notifyOperationRemoved(Operation *op) { config.listener->notifyOperationRemoved(op); addOperandsToWorklist(op->getOperands()); - op->walk([this](Operation *operation) { - worklist.remove(operation); - folder.notifyRemoval(operation); - }); + worklist.remove(op); + folder.notifyRemoval(op); if (config.strictMode != GreedyRewriteStrictness::AnyOp) strictModeFilteredOps.erase(op); diff --git a/mlir/test/Transforms/test-strict-pattern-driver.mlir b/mlir/test/Transforms/test-strict-pattern-driver.mlir index 5df2d6d1fdeeb..a5ab8f97c74ce 100644 --- a/mlir/test/Transforms/test-strict-pattern-driver.mlir +++ b/mlir/test/Transforms/test-strict-pattern-driver.mlir @@ -12,9 +12,9 @@ // CHECK-EN-LABEL: func @test_erase // CHECK-EN-SAME: pattern_driver_all_erased = true, pattern_driver_changed = true} -// CHECK-EN: test.arg0 -// CHECK-EN: test.arg1 -// CHECK-EN-NOT: test.erase_op +// CHECK-EN: "test.arg0" +// CHECK-EN: "test.arg1" +// CHECK-EN-NOT: "test.erase_op" func.func @test_erase() { %0 = "test.arg0"() : () -> (i32) %1 = "test.arg1"() : () -> (i32) @@ -51,13 +51,13 @@ func.func @test_replace_with_new_op() { // CHECK-EN-LABEL: func @test_replace_with_erase_op // CHECK-EN-SAME: {pattern_driver_all_erased = true, pattern_driver_changed = true} -// CHECK-EN-NOT: test.replace_with_new_op -// CHECK-EN-NOT: test.erase_op +// CHECK-EN-NOT: "test.replace_with_new_op" +// CHECK-EN-NOT: "test.erase_op" // CHECK-EX-LABEL: func @test_replace_with_erase_op // CHECK-EX-SAME: {pattern_driver_all_erased = true, pattern_driver_changed = true} -// CHECK-EX-NOT: test.replace_with_new_op -// CHECK-EX: test.erase_op +// CHECK-EX-NOT: "test.replace_with_new_op" +// CHECK-EX: "test.erase_op" func.func @test_replace_with_erase_op() { "test.replace_with_new_op"() 
{create_erase_op} : () -> () return @@ -83,3 +83,149 @@ func.func @test_trigger_rewrite_through_block() { // in turn, replaces the successor with bb3. "test.implicit_change_op"() [^bb1] : () -> () } + +// ----- + +// CHECK-AN: notifyOperationRemoved: test.foo_b +// CHECK-AN: notifyOperationRemoved: test.foo_a +// CHECK-AN: notifyOperationRemoved: test.graph_region +// CHECK-AN: notifyOperationRemoved: test.erase_op +// CHECK-AN-LABEL: func @test_remove_graph_region() +// CHECK-AN-NEXT: return +func.func @test_remove_graph_region() { + "test.erase_op"() ({ + test.graph_region { + %0 = "test.foo_a"(%1) : (i1) -> (i1) + %1 = "test.foo_b"(%0) : (i1) -> (i1) + } + }) : () -> () + return +} + +// ----- + +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.bar +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.foo +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.dummy_op +// CHECK-AN: notifyOperationRemoved: test.erase_op +// CHECK-AN-LABEL: func @test_remove_cyclic_blocks() +// CHECK-AN-NEXT: return +func.func @test_remove_cyclic_blocks() { + "test.erase_op"() ({ + %x = "test.dummy_op"() : () -> (i1) + cf.br ^bb1(%x: i1) + ^bb1(%arg0: i1): + "test.foo"(%x) : (i1) -> () + cf.br ^bb2(%arg0: i1) + ^bb2(%arg1: i1): + "test.bar"(%x) : (i1) -> () + cf.br ^bb1(%arg1: i1) + }) : () -> () + return +} + +// ----- + +// CHECK-AN: notifyOperationRemoved: test.dummy_op +// CHECK-AN: notifyOperationRemoved: test.bar +// CHECK-AN: notifyOperationRemoved: test.qux +// CHECK-AN: notifyOperationRemoved: test.qux_unreachable +// CHECK-AN: notifyOperationRemoved: test.nested_dummy +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.foo +// CHECK-AN: notifyOperationRemoved: test.erase_op +// CHECK-AN-LABEL: func @test_remove_dead_blocks() +// CHECK-AN-NEXT: return +func.func @test_remove_dead_blocks() { + "test.erase_op"() ({ + "test.dummy_op"() : () 
-> (i1) + // The following blocks are not reachable. Still, ^bb2 should be deleted + // befire ^bb1. + ^bb1(%arg0: i1): + "test.foo"() : () -> () + cf.br ^bb2(%arg0: i1) + ^bb2(%arg1: i1): + "test.nested_dummy"() ({ + "test.qux"() : () -> () + // The following block is unreachable. + ^bb3: + "test.qux_unreachable"() : () -> () + }) : () -> () + "test.bar"() : () -> () + }) : () -> () + return +} + +// ----- + +// test.nested_* must be deleted before test.foo. +// test.bar must be deleted before test.foo. + +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.bar +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.nested_b +// CHECK-AN: notifyOperationRemoved: test.nested_a +// CHECK-AN: notifyOperationRemoved: test.nested_d +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.nested_e +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.nested_c +// CHECK-AN: notifyOperationRemoved: test.foo +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.dummy_op +// CHECK-AN: notifyOperationRemoved: test.erase_op +// CHECK-AN-LABEL: func @test_remove_nested_ops() +// CHECK-AN-NEXT: return +func.func @test_remove_nested_ops() { + "test.erase_op"() ({ + %x = "test.dummy_op"() : () -> (i1) + cf.br ^bb1(%x: i1) + ^bb1(%arg0: i1): + "test.foo"() ({ + "test.nested_a"() : () -> () + "test.nested_b"() : () -> () + ^dead1: + "test.nested_c"() : () -> () + cf.br ^dead3 + ^dead2: + "test.nested_d"() : () -> () + ^dead3: + "test.nested_e"() : () -> () + cf.br ^dead2 + }) : () -> () + cf.br ^bb2(%arg0: i1) + ^bb2(%arg1: i1): + "test.bar"(%x) : (i1) -> () + cf.br ^bb1(%arg1: i1) + }) : () -> () + return +} + +// ----- + +// CHECK-AN: notifyOperationRemoved: test.qux +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: notifyOperationRemoved: test.foo +// CHECK-AN: notifyOperationRemoved: cf.br +// CHECK-AN: 
notifyOperationRemoved: test.bar +// CHECK-AN: notifyOperationRemoved: cf.cond_br +// CHECK-AN-LABEL: func @test_remove_diamond( +// CHECK-AN-NEXT: return +func.func @test_remove_diamond(%c: i1) { + "test.erase_op"() ({ + cf.cond_br %c, ^bb1, ^bb2 + ^bb1: + "test.foo"() : () -> () + cf.br ^bb3 + ^bb2: + "test.bar"() : () -> () + cf.br ^bb3 + ^bb3: + "test.qux"() : () -> () + }) : () -> () + return +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index e23ed105e3833..2e3bc76009ca2 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -239,6 +239,12 @@ struct TestPatternDriver llvm::cl::init(GreedyRewriteConfig().maxIterations)}; }; +struct DumpNotifications : public RewriterBase::Listener { + void notifyOperationRemoved(Operation *op) override { + llvm::outs() << "notifyOperationRemoved: " << op->getName() << "\n"; + } +}; + struct TestStrictPatternDriver : public PassWrapper> { public: @@ -275,7 +281,9 @@ struct TestStrictPatternDriver } }); + DumpNotifications dumpNotifications; GreedyRewriteConfig config; + config.listener = &dumpNotifications; if (strictMode == "AnyOp") { config.strictMode = GreedyRewriteStrictness::AnyOp; } else if (strictMode == "ExistingAndNewOps") { From da94bf0d561109529e4ab3dabfcbb8b6c258ea39 Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Wed, 20 Sep 2023 08:52:29 +0200 Subject: [PATCH 50/57] [Workflow] Add new code format helper. (#66684) This helper will format python files with black/darker and C/C++ files with clang-format. The format helper is written so that we can expand it with new formatters in the future like clang-tidy. 
--- .github/workflows/pr-code-format.yml | 54 ++++ .github/workflows/pr-python-format.yml | 39 --- llvm/utils/git/code-format-helper.py | 233 ++++++++++++++++++ llvm/utils/git/requirements_formatting.txt | 52 ++++ llvm/utils/git/requirements_formatting.txt.in | 3 + 5 files changed, 342 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/pr-code-format.yml delete mode 100644 .github/workflows/pr-python-format.yml create mode 100644 llvm/utils/git/code-format-helper.py create mode 100644 llvm/utils/git/requirements_formatting.txt create mode 100644 llvm/utils/git/requirements_formatting.txt.in diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml new file mode 100644 index 0000000000000..102e1a263b15a --- /dev/null +++ b/.github/workflows/pr-code-format.yml @@ -0,0 +1,54 @@ +name: "Check code formatting" +on: pull_request +permissions: + pull-requests: write + +jobs: + code_formatter: + runs-on: ubuntu-latest + steps: + - name: Fetch LLVM sources + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 2 + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v39 + with: + separator: "," + + - name: "Listed files" + run: | + echo "Formatting files:" + echo "${{ steps.changed-files.outputs.all_changed_files }}" + + - name: Install clang-format + uses: aminya/setup-cpp@v1 + with: + clangformat: 16.0.6 + + - name: Setup Python env + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt' + + - name: Install python dependencies + run: pip install -r llvm/utils/git/requirements_formatting.txt + + - name: Run code formatter + env: + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} + START_REV: ${{ github.event.pull_request.base.sha }} + END_REV: ${{ github.event.pull_request.head.sha }} + CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + python 
llvm/utils/git/code-format-helper.py \ + --token ${{ secrets.GITHUB_TOKEN }} \ + --issue-number $GITHUB_PR_NUMBER \ + --start-rev $START_REV \ + --end-rev $END_REV \ + --changed-files "$CHANGED_FILES" diff --git a/.github/workflows/pr-python-format.yml b/.github/workflows/pr-python-format.yml deleted file mode 100644 index c612295882654..0000000000000 --- a/.github/workflows/pr-python-format.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: "Check Python Formatting" -on: - pull_request: - # run on .py - paths: - - '**.py' - -jobs: - python_formatting: - runs-on: ubuntu-latest - steps: - - name: Fetch LLVM sources - uses: actions/checkout@v4 - with: - persist-credentials: false - fetch-depth: 2 - - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v39 - with: - files: '**/*.py' - - - name: "Listed files" - run: | - echo "Formatting files:" - echo "${{ steps.changed-files.outputs.all_changed_files }}" - - - name: Setup Python env - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Python Formatting - uses: akaihola/darker@1.7.2 - with: - options: "--check --diff --color" - version: "~=1.7.2" - src: "${{ steps.changed-files.outputs.all_changed_files }}" diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py new file mode 100644 index 0000000000000..8d3c30b309d01 --- /dev/null +++ b/llvm/utils/git/code-format-helper.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# +# ====- code-format-helper, runs code formatters from the ci --*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + +import argparse +import os +import subprocess +import sys +from functools import cached_property + +import github +from github import IssueComment, PullRequest + + +class FormatHelper: + COMMENT_TAG = "" + name = "unknown" + + @property + def comment_tag(self) -> str: + return self.COMMENT_TAG.replace("fmt", self.name) + + def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: + pass + + def pr_comment_text(self, diff: str) -> str: + return f""" +{self.comment_tag} + +:warning: {self.friendly_name}, {self.name} found issues in your code. :warning: + +
+ +You can test this locally with the following command: + + +``````````bash +{self.instructions} +`````````` + +
+ +
+ +View the diff from {self.name} here. + + +``````````diff +{diff} +`````````` + +
+""" + + def find_comment( + self, pr: PullRequest.PullRequest + ) -> IssueComment.IssueComment | None: + for comment in pr.as_issue().get_comments(): + if self.comment_tag in comment.body: + return comment + return None + + def update_pr(self, diff: str, args: argparse.Namespace): + repo = github.Github(args.token).get_repo(args.repo) + pr = repo.get_issue(args.issue_number).as_pull_request() + + existing_comment = self.find_comment(pr) + pr_text = self.pr_comment_text(diff) + + if existing_comment: + existing_comment.edit(pr_text) + else: + pr.as_issue().create_comment(pr_text) + + def update_pr_success(self, args: argparse.Namespace): + repo = github.Github(args.token).get_repo(args.repo) + pr = repo.get_issue(args.issue_number).as_pull_request() + + existing_comment = self.find_comment(pr) + if existing_comment: + existing_comment.edit( + f""" +{self.comment_tag} +:white_check_mark: With the latest revision this PR passed the {self.friendly_name}. +""" + ) + + def run(self, changed_files: [str], args: argparse.Namespace): + diff = self.format_run(changed_files, args) + if diff: + self.update_pr(diff, args) + return False + else: + self.update_pr_success(args) + return True + + +class ClangFormatHelper(FormatHelper): + name = "clang-format" + friendly_name = "C/C++ code formatter" + + @property + def instructions(self): + return " ".join(self.cf_cmd) + + @cached_property + def libcxx_excluded_files(self): + with open("libcxx/utils/data/ignore_format.txt", "r") as ifd: + return [excl.strip() for excl in ifd.readlines()] + + def should_be_excluded(self, path: str) -> bool: + if path in self.libcxx_excluded_files: + print(f"Excluding file {path}") + return True + return False + + def filter_changed_files(self, changed_files: [str]) -> [str]: + filtered_files = [] + for path in changed_files: + _, ext = os.path.splitext(path) + if ext in (".cpp", ".c", ".h", ".hpp", ".hxx", ".cxx"): + if not self.should_be_excluded(path): + filtered_files.append(path) + return 
filtered_files + + def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: + cpp_files = self.filter_changed_files(changed_files) + if not cpp_files: + return + cf_cmd = [ + "git-clang-format", + "--diff", + args.start_rev, + args.end_rev, + "--", + ] + cpp_files + print(f"Running: {' '.join(cf_cmd)}") + self.cf_cmd = cf_cmd + proc = subprocess.run(cf_cmd, capture_output=True) + + # formatting needed + if proc.returncode == 1: + return proc.stdout.decode("utf-8") + + return None + + +class DarkerFormatHelper(FormatHelper): + name = "darker" + friendly_name = "Python code formatter" + + @property + def instructions(self): + return " ".join(self.darker_cmd) + + def filter_changed_files(self, changed_files: [str]) -> [str]: + filtered_files = [] + for path in changed_files: + name, ext = os.path.splitext(path) + if ext == ".py": + filtered_files.append(path) + + return filtered_files + + def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: + py_files = self.filter_changed_files(changed_files) + if not py_files: + return + darker_cmd = [ + "darker", + "--check", + "--diff", + "-r", + f"{args.start_rev}..{args.end_rev}", + ] + py_files + print(f"Running: {' '.join(darker_cmd)}") + self.darker_cmd = darker_cmd + proc = subprocess.run(darker_cmd, capture_output=True) + + # formatting needed + if proc.returncode == 1: + return proc.stdout.decode("utf-8") + + return None + + +ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper()) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--token", type=str, required=True, help="GitHub authentiation token" + ) + parser.add_argument( + "--repo", + type=str, + default=os.getenv("GITHUB_REPOSITORY", "llvm/llvm-project"), + help="The GitHub repository that we are working with in the form of / (e.g. 
llvm/llvm-project)", + ) + parser.add_argument("--issue-number", type=int, required=True) + parser.add_argument( + "--start-rev", + type=str, + required=True, + help="Compute changes from this revision.", + ) + parser.add_argument( + "--end-rev", type=str, required=True, help="Compute changes to this revision" + ) + parser.add_argument( + "--changed-files", + type=str, + help="Comma separated list of files that has been changed", + ) + + args = parser.parse_args() + + changed_files = [] + if args.changed_files: + changed_files = args.changed_files.split(",") + + exit_code = 0 + for fmt in ALL_FORMATTERS: + if not fmt.run(changed_files, args): + exit_code = 1 + + sys.exit(exit_code) diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt new file mode 100644 index 0000000000000..ff744f0d4225f --- /dev/null +++ b/llvm/utils/git/requirements_formatting.txt @@ -0,0 +1,52 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --output-file=llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_formatting.txt.in +# +black==23.9.1 + # via + # -r llvm/utils/git/requirements_formatting.txt.in + # darker +certifi==2023.7.22 + # via requests +cffi==1.15.1 + # via + # cryptography + # pynacl +charset-normalizer==3.2.0 + # via requests +click==8.1.7 + # via black +cryptography==41.0.3 + # via pyjwt +darker==1.7.2 + # via -r llvm/utils/git/requirements_formatting.txt.in +deprecated==1.2.14 + # via pygithub +idna==3.4 + # via requests +mypy-extensions==1.0.0 + # via black +packaging==23.1 + # via black +pathspec==0.11.2 + # via black +platformdirs==3.10.0 + # via black +pycparser==2.21 + # via cffi +pygithub==1.59.1 + # via -r llvm/utils/git/requirements_formatting.txt.in +pyjwt[crypto]==2.8.0 + # via pygithub +pynacl==1.5.0 + # via pygithub +requests==2.31.0 + # via pygithub +toml==0.10.2 + # via darker +urllib3==2.0.4 + # via requests +wrapt==1.15.0 + # via 
deprecated diff --git a/llvm/utils/git/requirements_formatting.txt.in b/llvm/utils/git/requirements_formatting.txt.in new file mode 100644 index 0000000000000..4aac571af1cf5 --- /dev/null +++ b/llvm/utils/git/requirements_formatting.txt.in @@ -0,0 +1,3 @@ +black~=23.0 +darker==1.7.2 +PyGithub==1.59.1 From 747c40e04c4edc48fff57fc2629770f6c4f5f05e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 20 Sep 2023 06:56:36 +0000 Subject: [PATCH 51/57] [gn build] Port 0f152a55d3e4 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index 58f7e05d92d61..bed26df94e2c4 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -33,6 +33,7 @@ static_library("Scalar") { "IndVarSimplify.cpp", "InductiveRangeCheckElimination.cpp", "InferAddressSpaces.cpp", + "InferAlignment.cpp", "InstSimplifyPass.cpp", "JumpThreading.cpp", "LICM.cpp", From 976df42e6af5bcbd906e1bc0dc62a1aaf23a360d Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Wed, 20 Sep 2023 15:20:13 +0800 Subject: [PATCH 52/57] [RISCV] Fix bugs about register list of Zcmp push/pop. (#66073) The pr does two things. One is to fix internal compiler error when we need to spill callee saves but none of them is GPR, another is to fix wrong register number for pushed registers are {ra, s0-s11}. 
--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 41 +++--- llvm/test/CodeGen/RISCV/callee-saved-gprs.ll | 136 +++++++++++++++++++ llvm/test/CodeGen/RISCV/zcmp-with-float.ll | 38 ++++++ 3 files changed, 194 insertions(+), 21 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/zcmp-with-float.ll diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index cbcf41a979c9e..50e98e6b8ea99 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -226,37 +226,38 @@ getRestoreLibCallName(const MachineFunction &MF, return RestoreLibCalls[LibCallID]; } -// Return encoded value for PUSH/POP instruction, representing -// registers to store/load. -static unsigned getPushPopEncoding(const Register MaxReg) { +// Return encoded value and register count for PUSH/POP instruction, +// representing registers to store/load. +static std::pair +getPushPopEncodingAndNum(const Register MaxReg) { switch (MaxReg) { default: llvm_unreachable("Unexpected Reg for Push/Pop Inst"); case RISCV::X27: /*s11*/ case RISCV::X26: /*s10*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S11; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S11, 13); case RISCV::X25: /*s9*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S9; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S9, 11); case RISCV::X24: /*s8*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S8; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S8, 10); case RISCV::X23: /*s7*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S7; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S7, 9); case RISCV::X22: /*s6*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S6; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S6, 8); case RISCV::X21: /*s5*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S5; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S5, 7); case RISCV::X20: /*s4*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S4; + 
return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S4, 6); case RISCV::X19: /*s3*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S3; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S3, 5); case RISCV::X18: /*s2*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S2; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S2, 4); case RISCV::X9: /*s1*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0_S1; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0_S1, 3); case RISCV::X8: /*s0*/ - return llvm::RISCVZC::RLISTENCODE::RA_S0; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA_S0, 2); case RISCV::X1: /*ra*/ - return llvm::RISCVZC::RLISTENCODE::RA; + return std::make_pair(llvm::RISCVZC::RLISTENCODE::RA, 1); } } @@ -1360,14 +1361,12 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( RISCVMachineFunctionInfo *RVFI = MF->getInfo(); if (RVFI->isPushable(*MF)) { Register MaxReg = getMaxPushPopReg(*MF, CSI); - unsigned PushedRegNum = - getPushPopEncoding(MaxReg) - llvm::RISCVZC::RLISTENCODE::RA + 1; - RVFI->setRVPushRegs(PushedRegNum); - RVFI->setRVPushStackSize(alignTo((STI.getXLen() / 8) * PushedRegNum, 16)); - if (MaxReg != RISCV::NoRegister) { + auto [RegEnc, PushedRegNum] = getPushPopEncodingAndNum(MaxReg); + RVFI->setRVPushRegs(PushedRegNum); + RVFI->setRVPushStackSize(alignTo((STI.getXLen() / 8) * PushedRegNum, 16)); + // Use encoded number to represent registers to spill. - unsigned RegEnc = getPushPopEncoding(MaxReg); RVFI->setRVPushRlist(RegEnc); MachineInstrBuilder PushBuilder = BuildMI(MBB, MI, DL, TII.get(RISCV::CM_PUSH)) diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index ca290988b58ca..09ecbbc7e8feb 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -2128,6 +2128,142 @@ entry: ret void } +; Check .cfi_offset of s11 is correct for Zcmp. 
+define void @bar() { +; RV32I-LABEL: bar: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw s11, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset s11, -4 +; RV32I-NEXT: #APP +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: lw s11, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32I-WITH-FP-LABEL: bar: +; RV32I-WITH-FP: # %bb.0: # %entry +; RV32I-WITH-FP-NEXT: addi sp, sp, -16 +; RV32I-WITH-FP-NEXT: .cfi_def_cfa_offset 16 +; RV32I-WITH-FP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-WITH-FP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-WITH-FP-NEXT: sw s11, 4(sp) # 4-byte Folded Spill +; RV32I-WITH-FP-NEXT: .cfi_offset ra, -4 +; RV32I-WITH-FP-NEXT: .cfi_offset s0, -8 +; RV32I-WITH-FP-NEXT: .cfi_offset s11, -12 +; RV32I-WITH-FP-NEXT: addi s0, sp, 16 +; RV32I-WITH-FP-NEXT: .cfi_def_cfa s0, 0 +; RV32I-WITH-FP-NEXT: #APP +; RV32I-WITH-FP-NEXT: li s11, 0 +; RV32I-WITH-FP-NEXT: #NO_APP +; RV32I-WITH-FP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-WITH-FP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-WITH-FP-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32I-WITH-FP-NEXT: addi sp, sp, 16 +; RV32I-WITH-FP-NEXT: ret +; +; RV32IZCMP-LABEL: bar: +; RV32IZCMP: # %bb.0: # %entry +; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -64 +; RV32IZCMP-NEXT: .cfi_def_cfa_offset 64 +; RV32IZCMP-NEXT: .cfi_offset s11, -4 +; RV32IZCMP-NEXT: #APP +; RV32IZCMP-NEXT: li s11, 0 +; RV32IZCMP-NEXT: #NO_APP +; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 64 +; +; RV32IZCMP-WITH-FP-LABEL: bar: +; RV32IZCMP-WITH-FP: # %bb.0: # %entry +; RV32IZCMP-WITH-FP-NEXT: addi sp, sp, -16 +; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa_offset 16 +; RV32IZCMP-WITH-FP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IZCMP-WITH-FP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32IZCMP-WITH-FP-NEXT: sw s11, 4(sp) # 4-byte Folded Spill +; RV32IZCMP-WITH-FP-NEXT: .cfi_offset ra, -4 +; 
RV32IZCMP-WITH-FP-NEXT: .cfi_offset s0, -8 +; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s11, -12 +; RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 16 +; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa s0, 0 +; RV32IZCMP-WITH-FP-NEXT: #APP +; RV32IZCMP-WITH-FP-NEXT: li s11, 0 +; RV32IZCMP-WITH-FP-NEXT: #NO_APP +; RV32IZCMP-WITH-FP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IZCMP-WITH-FP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32IZCMP-WITH-FP-NEXT: lw s11, 4(sp) # 4-byte Folded Reload +; RV32IZCMP-WITH-FP-NEXT: addi sp, sp, 16 +; RV32IZCMP-WITH-FP-NEXT: ret +; +; RV64I-LABEL: bar: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset s11, -8 +; RV64I-NEXT: #APP +; RV64I-NEXT: li s11, 0 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64I-WITH-FP-LABEL: bar: +; RV64I-WITH-FP: # %bb.0: # %entry +; RV64I-WITH-FP-NEXT: addi sp, sp, -32 +; RV64I-WITH-FP-NEXT: .cfi_def_cfa_offset 32 +; RV64I-WITH-FP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-WITH-FP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-WITH-FP-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64I-WITH-FP-NEXT: .cfi_offset ra, -8 +; RV64I-WITH-FP-NEXT: .cfi_offset s0, -16 +; RV64I-WITH-FP-NEXT: .cfi_offset s11, -24 +; RV64I-WITH-FP-NEXT: addi s0, sp, 32 +; RV64I-WITH-FP-NEXT: .cfi_def_cfa s0, 0 +; RV64I-WITH-FP-NEXT: #APP +; RV64I-WITH-FP-NEXT: li s11, 0 +; RV64I-WITH-FP-NEXT: #NO_APP +; RV64I-WITH-FP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-WITH-FP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-WITH-FP-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64I-WITH-FP-NEXT: addi sp, sp, 32 +; RV64I-WITH-FP-NEXT: ret +; +; RV64IZCMP-LABEL: bar: +; RV64IZCMP: # %bb.0: # %entry +; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -112 +; RV64IZCMP-NEXT: .cfi_def_cfa_offset 112 +; RV64IZCMP-NEXT: .cfi_offset s11, -8 +; 
RV64IZCMP-NEXT: #APP +; RV64IZCMP-NEXT: li s11, 0 +; RV64IZCMP-NEXT: #NO_APP +; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 112 +; +; RV64IZCMP-WITH-FP-LABEL: bar: +; RV64IZCMP-WITH-FP: # %bb.0: # %entry +; RV64IZCMP-WITH-FP-NEXT: addi sp, sp, -32 +; RV64IZCMP-WITH-FP-NEXT: .cfi_def_cfa_offset 32 +; RV64IZCMP-WITH-FP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64IZCMP-WITH-FP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64IZCMP-WITH-FP-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64IZCMP-WITH-FP-NEXT: .cfi_offset ra, -8 +; RV64IZCMP-WITH-FP-NEXT: .cfi_offset s0, -16 +; RV64IZCMP-WITH-FP-NEXT: .cfi_offset s11, -24 +; RV64IZCMP-WITH-FP-NEXT: addi s0, sp, 32 +; RV64IZCMP-WITH-FP-NEXT: .cfi_def_cfa s0, 0 +; RV64IZCMP-WITH-FP-NEXT: #APP +; RV64IZCMP-WITH-FP-NEXT: li s11, 0 +; RV64IZCMP-WITH-FP-NEXT: #NO_APP +; RV64IZCMP-WITH-FP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64IZCMP-WITH-FP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64IZCMP-WITH-FP-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64IZCMP-WITH-FP-NEXT: addi sp, sp, 32 +; RV64IZCMP-WITH-FP-NEXT: ret +entry: + tail call void asm sideeffect "li s11, 0", "~{s11}"() + ret void +} + define void @varargs(...) { ; RV32I-LABEL: varargs: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/zcmp-with-float.ll b/llvm/test/CodeGen/RISCV/zcmp-with-float.ll new file mode 100644 index 0000000000000..05ee92c89db7c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zcmp-with-float.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=riscv32 -mattr=+f,+zcmp -target-abi ilp32f -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+f,+zcmp -target-abi lp64f -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 + +declare void @callee() + +; Test the file could be compiled successfully. +; .cfi_offset of fs0 is wrong here. It should be fixed by #66613. 
+define float @foo(float %arg) { +; RV32-LABEL: foo: +; RV32: # %bb.0: # %entry +; RV32-NEXT: cm.push {ra}, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: fsw fs0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset fs0, -4 +; RV32-NEXT: fmv.s fs0, fa0 +; RV32-NEXT: call callee@plt +; RV32-NEXT: fmv.s fa0, fs0 +; RV32-NEXT: flw fs0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: cm.popret {ra}, 32 +; +; RV64-LABEL: foo: +; RV64: # %bb.0: # %entry +; RV64-NEXT: cm.push {ra}, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: fsw fs0, 12(sp) # 4-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset fs0, -4 +; RV64-NEXT: fmv.s fs0, fa0 +; RV64-NEXT: call callee@plt +; RV64-NEXT: fmv.s fa0, fs0 +; RV64-NEXT: flw fs0, 12(sp) # 4-byte Folded Reload +; RV64-NEXT: cm.popret {ra}, 32 +entry: + call void @callee() + ret float %arg +} From a68c7241ec86036ba8001a44817b6e1e0584522d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 20 Sep 2023 08:22:00 +0100 Subject: [PATCH 53/57] [AMDGPU] Run twoaddr tests with -early-live-intervals (#66775) Sample test case: %3 = V_FMAC_F32_e32 killed %0, %1, %2, implicit $mode, implicit $exec With LiveVariables this is converted to three-address form just because there is no "killed" flag on %2. 
To make it do the same thing with LiveIntervals I added a later use of %2: %3 = V_FMAC_F32_e32 killed %0, %1, %2, implicit $mode, implicit $exec S_ENDPGM 0, implicit %2 --- .../test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir | 10 ++++---- llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir | 9 ++++--- llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir | 15 +++++++---- llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir | 16 +++++++----- llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir | 25 ++++++++++--------- 5 files changed, 43 insertions(+), 32 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir index 367c1fb10af53..2fe4d9d8e9e9a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GFX10 %s # GFX10-LABEL: name: test_fmamk_reg_imm_f16 # GFX10: %2:vgpr_32 = IMPLICIT_DEF @@ -43,7 +44,7 @@ body: | ... # GFX10-LABEL: name: test_fmaak_f16 -# GFX10: %1:vgpr_32 = IMPLICIT_DEF +# GFX10: %1:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec # GFX10-NOT: V_MOV_B32 # GFX10: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec --- @@ -58,10 +59,11 @@ body: | %0 = IMPLICIT_DEF %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %1 ... 
# GFX10-LABEL: name: test_fmaak_inline_literal_f16 -# GFX10: %1:vgpr_32 = IMPLICIT_DEF +# GFX10: %1:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec # GFX10-NOT: V_MOV_B32 # GFX10: %2:vgpr_32 = V_FMAAK_F16 16384, killed %0, 49664, implicit $mode, implicit $exec @@ -78,7 +80,5 @@ body: | %1:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec %2:vgpr_32 = V_FMAC_F16_e32 16384, killed %0, %1, implicit $mode, implicit $exec - S_ENDPGM 0 - + S_ENDPGM 0, implicit %1 ... - diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir index db5cf285c1097..717893a957644 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f64 # GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec @@ -52,7 +53,7 @@ body: | %0 = IMPLICIT_DEF %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec %2 = V_FMAC_F64_e32 killed %0.sub0_sub1, %0.sub2_sub3, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # GCN-LABEL: name: test_fmaak_sgpr_src0_f64 @@ -72,7 +73,7 @@ body: | %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec %2 = IMPLICIT_DEF %3 = V_FMAC_F64_e32 killed %0, %1, %2, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %2 ... # GCN-LABEL: name: test_fmaak_inlineimm_src0_f64 @@ -90,7 +91,7 @@ body: | %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec %1 = IMPLICIT_DEF %2 = V_FMAC_F64_e32 4611686018427387904, %0, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... 
# GCN-LABEL: name: test_fmaak_otherimm_src0_f64 @@ -186,5 +187,5 @@ body: | %1 = COPY %0 %2 = V_MOV_B64_PSEUDO 123456, implicit $exec %3 = V_FMAC_F64_e32 killed %0, killed %1, %2, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %2 ... diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir index 7757c758134c7..412457fd0999f 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -1,5 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GCN %s # RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f32 # GCN: %2:vgpr_32 = IMPLICIT_DEF @@ -44,7 +46,7 @@ body: | ... # GCN-LABEL: name: test_fmaak_f32 -# GCN: %1:vgpr_32 = IMPLICIT_DEF +# GCN: %1:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec # GCN-NOT: V_MOV_B32 # GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec --- @@ -59,7 +61,7 @@ body: | %0 = IMPLICIT_DEF %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... @@ -82,7 +84,7 @@ body: | %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = IMPLICIT_DEF %3 = V_FMAC_F32_e32 killed %0, %1, %2, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %2 ... 
# GCN-LABEL: name: test_fmaak_inlineimm_src0_f32 @@ -102,7 +104,7 @@ body: | %0 = V_MOV_B32_e32 1078523331, implicit $exec %1 = IMPLICIT_DEF %2 = V_FMAC_F32_e32 1073741824, %0, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # GCN-LABEL: name: test_fmaak_otherimm_src0_f32 @@ -120,7 +122,7 @@ body: | %0 = V_MOV_B32_e32 1078523331, implicit $exec %1 = IMPLICIT_DEF %2 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32 @@ -204,6 +206,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_FMAC_F32_e64 1, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... # GCN-LABEL: name: test_fma_src1mods @@ -216,6 +219,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 1, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... # GCN-LABEL: name: test_fma_src2mods @@ -228,4 +232,5 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, %1, 1, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir index 1458e8135ef2d..d470e3adb2a88 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_madmk_reg_imm_f32 # GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec @@ -52,7 +53,7 @@ body: | %0 = IMPLICIT_DEF %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = V_MAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # GCN-LABEL: name: test_madmk_reg_imm_f16 @@ -107,6 +108,7 @@ body: | %0 = IMPLICIT_DEF %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = V_MAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %1 ... # Make sure constant bus restriction isn't violated if src0 is an SGPR. @@ -129,7 +131,7 @@ body: | %1 = V_MOV_B32_e32 1078523331, implicit $exec %2 = IMPLICIT_DEF %3 = V_MAC_F32_e32 killed %0, %1, %2, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %2 ... # This can still fold if this is an inline immediate. @@ -149,7 +151,7 @@ body: | %0 = V_MOV_B32_e32 1078523331, implicit $exec %1 = IMPLICIT_DEF %2 = V_MAC_F32_e32 1073741824, %0, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # Non-inline immediate uses constant bus already. @@ -168,7 +170,7 @@ body: | %0 = V_MOV_B32_e32 1078523331, implicit $exec %1 = IMPLICIT_DEF %2 = V_MAC_F32_e32 1120403456, %0, %1, implicit $mode, implicit $exec - + S_ENDPGM 0, implicit %1 ... # Non-inline immediate uses constant bus already. 
@@ -204,8 +206,7 @@ body: | %26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec %28:vgpr_32 = V_MAC_F16_e32 16384, killed %3, %26, implicit $mode, implicit $exec - S_ENDPGM 0 - + S_ENDPGM 0, implicit %26 ... # GCN-LABEL: name: test_mad_src0mods @@ -218,6 +219,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_MAC_F32_e64 1, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... # GCN-LABEL: name: test_mad_src1mods @@ -230,6 +232,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_MAC_F32_e64 0, %0, 1, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... # GCN-LABEL: name: test_mad_src2mods @@ -242,4 +245,5 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = V_MAC_F32_e64 0, %0, 0, %1, 1, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %2 ... diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir index a74d442534c12..fc5b942804ca4 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 # GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec @@ -15,7 +16,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... 
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 @@ -33,7 +34,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 @@ -51,7 +52,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 @@ -69,7 +70,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 @@ -87,7 +88,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 @@ -105,7 +106,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 @@ -123,7 +124,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... 
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 @@ -141,7 +142,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 @@ -159,7 +160,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 @@ -177,7 +178,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 @@ -195,7 +196,7 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... # GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 @@ -213,5 +214,5 @@ body: | %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec - + S_ENDPGM 0, implicit %0 ... From fcde8c88eb212ed39ff07fef2863bf3e1adb085e Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Wed, 20 Sep 2023 09:32:32 +0200 Subject: [PATCH 54/57] [TableGen][GlobalISel] Use `GIM_SwitchOpcode` in Combiners (#66864) The call to `initOpcodeValuesMap` was missing, causing the MatchTable to (unintentionally) not emit a `SwitchMatcher`. Also adds other code imported from `GlobalISelEmitter.cpp` to ensure rules are sorted by precedence as well. 
Overall this improves GlobalISel compile-time performance by a noticeable amount. See #66751 --- .../builtins/match-table-replacerreg.td | 46 +-- .../match-table-imms.td | 48 +-- .../match-table-patfrag-root.td | 42 ++- .../match-table-permutations.td | 332 +++++++++--------- .../match-table-variadics.td | 120 +++---- .../GlobalISelCombinerEmitter/match-table.td | 156 ++++---- .../TableGen/GlobalISelCombinerEmitter.cpp | 13 + 7 files changed, 400 insertions(+), 357 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td index dfde358405189..2d968977701fd 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td @@ -28,26 +28,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const int64_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 29, // Rule ID 0 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_FNEG, -// CHECK-NEXT: // MIs[0] dst -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_FNEG, -// CHECK-NEXT: // MIs[1] src -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCanReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // Combiner Rule #0: ReplaceMatched -// CHECK-NEXT: GIR_ReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, -// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // 
Label 0: @29 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 76, // Rule ID 1 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/65, 180, /*)*//*default:*//*Label 2*/ 192, +// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ 165, +// CHECK-NEXT: // Label 0: @120 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 164, // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_UNMERGE_VALUES, // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -71,7 +57,27 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_ReplaceRegWithTempReg, /*OldInsnID*/0, /*OldOpIdx*/1, /*TempRegID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @76 +// CHECK-NEXT: // Label 3: @164 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 1: @165 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 191, // Rule ID 0 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] tmp +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_FNEG, +// CHECK-NEXT: // MIs[1] src +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckCanReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, +// CHECK-NEXT: GIM_CheckIsSafeToFold, 
/*InsnID*/1, +// CHECK-NEXT: // Combiner Rule #0: ReplaceMatched +// CHECK-NEXT: GIR_ReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 4: @191 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 2: @192 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index b3e7ba28b0fa3..efe1b4b50dfda 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,9 +34,13 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const int64_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 28, // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/19, 126, /*)*//*default:*//*Label 3*/ 202, +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ 138, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ 165, +// CHECK-NEXT: // Label 0: @112 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 137, // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::COPY, // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -47,10 +51,26 @@ def MyCombiner: 
GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 0: @28 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 67, // Rule ID 1 // +// CHECK-NEXT: // Label 4: @137 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 1: @138 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 164, // Rule ID 2 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule2Enabled, +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckLiteralInt, /*MI*/0, /*Op*/1, 0, +// CHECK-NEXT: // Combiner Rule #2: CImmInstTest1 +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::G_CONSTANT, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // a +// CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/42, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 5: @164 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 2: @165 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 201, // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ZEXT, // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] Operand 1 @@ -65,21 +85,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @67 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 96, // Rule ID 2 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule2Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_CONSTANT, -// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, -// 
CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckLiteralInt, /*MI*/0, /*Op*/1, 0, -// CHECK-NEXT: // Combiner Rule #2: CImmInstTest1 -// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::G_CONSTANT, -// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // a -// CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/42, -// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 2: @96 +// CHECK-NEXT: // Label 6: @201 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 3: @202 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td index 21a31a9fc5bf5..b6296cf9024da 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td @@ -28,49 +28,55 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const int64_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 44, // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/118, 181, /*)*//*default:*//*Label 3*/ 176, +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ 68, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ 101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ 143, +// CHECK-NEXT: // Label 0: @68 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 100, // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ZEXT, // 
CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] __Test0_match_0.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[1] __Test0_match_0.x +// CHECK-NEXT: // MIs[0] __Test0_match_0.z // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [__Test0_match_0[0]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [__Test0_match_0[1]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // root // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 0: @44 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 79, // Rule ID 1 // +// CHECK-NEXT: // Label 4: @100 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 1: @101 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 142, // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_TRUNC, // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] __Test0_match_0.z +// CHECK-NEXT: // MIs[0] __Test0_match_0.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[1] __Test0_match_0.x // CHECK-NEXT: // No operand predicates +// 
CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [__Test0_match_0[1]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [__Test0_match_0[0]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // root // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @79 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 114, // Rule ID 2 // +// CHECK-NEXT: // Label 5: @142 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 2: @143 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 175, // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_FPEXT, // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] __Test0_match_0.z @@ -85,7 +91,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 2: @114 +// CHECK-NEXT: // Label 6: @175 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 3: @176 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-permutations.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-permutations.td index e6825ba6607ea..b0651c971c023 100644 --- 
a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-permutations.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-permutations.td @@ -161,139 +161,168 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 746, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_AND, -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 111, // Rule ID 0 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 84, // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[1] a.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[2] a.x +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[1] a.z // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[3] cst1 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[2] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[3] b.z +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[2] cst2 +// CHECK-NEXT: 
GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/2, /*OpIdx*/2, // MIs[4] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[4] c.z +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner21, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner22, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner23, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, +// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, +// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, +// CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[1], c[1]] +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 1: @84 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 172, // Rule ID 6 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] cst0 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[1] a.z +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] 
tmp +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[2] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[3] b.z +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[2] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/2, /*OpIdx*/2, // MIs[4] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[4] b.b +// CHECK-NEXT: // MIs[4] c.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/4, /*OpIdx*/1, // MIs[5] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[5] b.x -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[3] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/3, /*OpIdx*/2, // MIs[6] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[6] c.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/7, /*MI*/6, /*OpIdx*/1, // MIs[7] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/7, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[7] c.x +// CHECK-NEXT: // MIs[5] c.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner0, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner1, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner2, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner18, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner19, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner20, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: 
GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/5, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/6, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/7, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[0], c[0]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[1], c[0]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @111 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 208, // Rule ID 1 // +// CHECK-NEXT: // Label 2: @172 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 260, // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[1] a.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[2] a.x +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[1] a.z // CHECK-NEXT: // No operand predicates // 
CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[3] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[2] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[3] b.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[4] b.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/4, /*OpIdx*/1, // MIs[5] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[5] b.x +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[4] b.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[3] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/3, /*OpIdx*/2, // MIs[6] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[6] c.z +// CHECK-NEXT: // MIs[2] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/2, /*OpIdx*/2, // MIs[5] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[5] c.z // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner3, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner4, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner5, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner15, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, 
/*FnId*/GICXXPred_MI_Predicate_GICombiner16, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner17, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/5, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/6, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[0], c[1]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[0], c[1]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 2: @208 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 305, // Rule ID 2 // +// CHECK-NEXT: // Label 3: @260 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 357, // Rule ID 4 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[1] a.b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, -// CHECK-NEXT: 
// MIs[2] a.x +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[1] a.z // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[3] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[2] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[3] b.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[4] b.z +// CHECK-NEXT: // MIs[4] b.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[3] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/3, /*OpIdx*/2, // MIs[5] +// CHECK-NEXT: // MIs[2] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/2, /*OpIdx*/2, // MIs[5] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_ZEXT, // CHECK-NEXT: // MIs[5] c.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/5, /*OpIdx*/1, // MIs[6] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_TRUNC, // CHECK-NEXT: // MIs[6] c.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner6, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner7, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner8, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner12, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, 
/*FnId*/GICXXPred_MI_Predicate_GICombiner13, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner14, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, @@ -304,15 +333,15 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[1], c[0]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[0], c[0]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @305 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 393, // Rule ID 3 // +// CHECK-NEXT: // Label 4: @357 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 445, // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -356,38 +385,38 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @393 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 490, // Rule ID 4 // +// CHECK-NEXT: // Label 5: @445 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 542, // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // 
CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[1] a.z +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[1] a.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[2] a.x // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[2] cst1 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[3] b.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[3] cst1 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[4] b.x +// CHECK-NEXT: // MIs[4] b.z // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[2] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/2, /*OpIdx*/2, // MIs[5] +// CHECK-NEXT: // MIs[3] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/3, /*OpIdx*/2, // MIs[5] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_ZEXT, // CHECK-NEXT: // MIs[5] c.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/5, /*OpIdx*/1, // MIs[6] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_TRUNC, // CHECK-NEXT: // MIs[6] c.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: 
GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner12, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner13, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner14, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner6, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner7, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner8, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, @@ -398,138 +427,109 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[0], c[0]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[1], c[0]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @490 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 578, // Rule ID 5 // +// CHECK-NEXT: // Label 6: @542 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ 639, // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // 
MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[1] a.z +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[1] a.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[2] a.x // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[2] cst1 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[3] b.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[3] cst1 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[4] b.x -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[2] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/2, /*OpIdx*/2, // MIs[5] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[4] b.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/4, /*OpIdx*/1, // MIs[5] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[5] c.z +// CHECK-NEXT: // MIs[5] b.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner15, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner16, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner17, +// 
CHECK-NEXT: // MIs[3] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/3, /*OpIdx*/2, // MIs[6] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[6] c.z +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner3, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner4, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner5, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/5, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/6, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[0], c[1]] +// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[0], c[1]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @578 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ 666, // Rule ID 6 // +// CHECK-NEXT: // Label 7: @639 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ 745, // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No 
operand predicates // CHECK-NEXT: // MIs[0] cst0 // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[1] a.z +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[1] a.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/1, /*OpIdx*/1, // MIs[2] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[2] a.x // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[2] cst1 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[3] b.z -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[2] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/2, /*OpIdx*/2, // MIs[4] +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/0, /*OpIdx*/2, // MIs[3] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_AND, +// CHECK-NEXT: // MIs[3] cst1 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/3, /*OpIdx*/1, // MIs[4] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[4] c.b +// CHECK-NEXT: // MIs[4] b.b // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/5, /*MI*/4, /*OpIdx*/1, // MIs[5] // CHECK-NEXT: GIM_CheckOpcode, /*MI*/5, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[5] c.x -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner18, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner19, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, 
/*FnId*/GICXXPred_MI_Predicate_GICombiner20, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/5, -// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, -// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, -// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, -// CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[1], c[0]] -// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, -// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst -// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, -// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @666 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ 745, // Rule ID 7 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: // MIs[0] dst -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] cst0 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[1] a.z -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_AND, -// CHECK-NEXT: // MIs[2] cst1 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/3, /*MI*/2, /*OpIdx*/1, // MIs[3] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/3, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[3] b.z +// CHECK-NEXT: // MIs[5] b.x 
// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[2] cst2 -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/4, /*MI*/2, /*OpIdx*/2, // MIs[4] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/4, TargetOpcode::G_TRUNC, -// CHECK-NEXT: // MIs[4] c.z +// CHECK-NEXT: // MIs[3] cst2 +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/6, /*MI*/3, /*OpIdx*/2, // MIs[6] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/6, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[6] c.b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/7, /*MI*/6, /*OpIdx*/1, // MIs[7] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/7, TargetOpcode::G_TRUNC, +// CHECK-NEXT: // MIs[7] c.x // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner21, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner22, -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner23, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner0, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner1, +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/3, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/4, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/5, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/6, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/7, // CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::G_CONSTANT, // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_AddCImm, /*InsnID*/1, /*Type*/GILLT_s32, /*Imm*/0, -// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[1], b[1], c[1]] 
+// CHECK-NEXT: // Combiner Rule #0: Test0 @ [a[0], b[0], c[0]] // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::COPY, // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td index cbe20d067fe9b..5226795cd9d35 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td @@ -37,66 +37,66 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const int64_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 26, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_BUILD_VECTOR, -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 15, // Rule ID 1 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, -// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // Combiner Rule #1: InstTest1 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @15 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 25, // Rule ID 0 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] c -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] d -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // 
Combiner Rule #0: InstTest0 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 2: @25 -// CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 0: @26 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 52, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_UNMERGE_VALUES, -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 41, // Rule ID 2 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule2Enabled, -// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // Combiner Rule #2: InstTest2 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @41 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 51, // Rule ID 3 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule3Enabled, -// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] c -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] d -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // Combiner Rule #3: InstTest3 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @51 -// CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @52 +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/65, 69, /*)*//*default:*//*Label 2*/ 51, +// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ 9, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_BUILD_VECTOR*//*Label 1*/ 30, +// CHECK-NEXT: // Label 0: @9 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 19, // Rule ID 2 // +// CHECK-NEXT: GIM_CheckSimplePredicate, 
GICXXPred_Simple_IsRule2Enabled, +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] b +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // Combiner Rule #2: InstTest2 +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 3: @19 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 29, // Rule ID 3 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule3Enabled, +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] b +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] c +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] d +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // Combiner Rule #3: InstTest3 +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 4: @29 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 1: @30 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ 40, // Rule ID 1 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] b +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // Combiner Rule #1: InstTest1 +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 5: @40 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 50, // Rule ID 0 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] b +// CHECK-NEXT: // 
No operand predicates +// CHECK-NEXT: // MIs[0] c +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] d +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // Combiner Rule #0: InstTest0 +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 6: @50 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 2: @51 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 0f20e75d38a5d..b810c519d2ac3 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -132,82 +132,44 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // Verify match table. // CHECK: const int64_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static int64_t MatchTable0[] = { -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 0*/ 20, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_TRUNC, -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 1*/ 12, // Rule ID 0 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, -// CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 1: @12 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 2*/ 19, // Rule ID 1 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, -// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 2: @19 -// CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 0: @20 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ 30, // Rule ID 2 // -// 
CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SEXT, -// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/19, 126, /*)*//*default:*//*Label 6*/ 275, +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ 141, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ 216, 0, 0, 0, 0, +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ 231, 0, +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ 239, +// CHECK-NEXT: // Label 0: @112 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ 133, // Rule ID 4 // +// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasAnswerToEverything, +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule3Enabled, +// CHECK-NEXT: // MIs[0] a +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] b +// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[1] c +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner0, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, +// CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @30 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ 64, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::COPY, -// CHECK-NEXT: GIM_Try, /*On fail 
goto*//*Label 5*/ 42, // Rule ID 3 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule2Enabled, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // Combiner Rule #2: InstTest0 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner1, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @42 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ 63, // Rule ID 4 // -// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasAnswerToEverything, -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule3Enabled, -// CHECK-NEXT: // MIs[0] a -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] b -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[1] c -// CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_GICombiner0, -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // Combiner Rule #3: InstTest1 -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, -// CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @63 -// CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @64 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ 101, // Rule ID 5 // -// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule4Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_STORE, -// CHECK-NEXT: // MIs[0] tmp -// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, -// CHECK-NEXT: // MIs[1] ext +// CHECK-NEXT: // Label 7: @133 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ 140, // Rule ID 3 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule2Enabled, +// CHECK-NEXT: // MIs[0] a // 
CHECK-NEXT: // No operand predicates -// CHECK-NEXT: // MIs[0] ptr +// CHECK-NEXT: // MIs[0] b // CHECK-NEXT: // No operand predicates -// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // Combiner Rule #4: InOutInstTest0 -// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::G_STORE, -// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // ext -// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // ptr -// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*MergeInsnID's*/0, 1, GIU_MergeMemOperands_EndOfList, -// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, -// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner2, +// CHECK-NEXT: // Combiner Rule #2: InstTest0 +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner1, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @101 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ 143, // Rule ID 6 // +// CHECK-NEXT: // Label 8: @140 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 1: @141 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ 180, // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule5Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_AND, // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -224,10 +186,54 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 8: @143 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ 181, // Rule ID 7 // +// CHECK-NEXT: // Label 9: @180 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 2: @181 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ 215, // Rule ID 5 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule4Enabled, +// CHECK-NEXT: // MIs[0] tmp +// 
CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ZEXT, +// CHECK-NEXT: // MIs[1] ext +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: // MIs[0] ptr +// CHECK-NEXT: // No operand predicates +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, +// CHECK-NEXT: // Combiner Rule #4: InOutInstTest0 +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/TargetOpcode::G_STORE, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // ext +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // ptr +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*MergeInsnID's*/0, 1, GIU_MergeMemOperands_EndOfList, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner2, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 10: @215 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 3: @216 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ 223, // Rule ID 0 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule0Enabled, +// CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 11: @223 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ 230, // Rule ID 1 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, +// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 12: @230 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 4: @231 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ 238, // Rule ID 2 // +// CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule1Enabled, +// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 
'G_SEXT' +// CHECK-NEXT: GIR_CustomAction, GICXXCustomAction_CombineApplyGICombiner0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: // Label 13: @238 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 5: @239 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ 274, // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GICXXPred_Simple_IsRule6Enabled, -// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ZEXT, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // MIs[0] cst @@ -243,7 +249,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 9: @181 +// CHECK-NEXT: // Label 14: @274 +// CHECK-NEXT: GIM_Reject, +// CHECK-NEXT: // Label 6: @275 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 08a4db2fb3dec..b28915148ee51 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -3652,6 +3652,9 @@ void GICombinerEmitter::gatherRules( } void GICombinerEmitter::run(raw_ostream &OS) { + InstructionOpcodeMatcher::initOpcodeValuesMap(Target); + LLTOperandMatcher::initTypeIDValuesMap(); + Records.startTimer("Gather rules"); std::vector Rules; gatherRules(Rules, Combiner->getValueAsListOfDefs("Rules")); @@ -3666,6 +3669,16 @@ void GICombinerEmitter::run(raw_ostream &OS) { for (const auto &Rule : Rules) MaxTemporaries = std::max(MaxTemporaries, Rule.countRendererFns()); + llvm::stable_sort(Rules, [&](const RuleMatcher &A, const RuleMatcher &B) { + if (A.isHigherPriorityThan(B)) { + assert(!B.isHigherPriorityThan(A) && "Cannot be more important " + "and less important at " + "the same time"); + return true; + } + return false; + }); + const 
MatchTable Table = buildMatchTable(Rules); Records.startTimer("Emit combiner"); From 515a8263269278466b4fbbf22073bc6f84e6fd70 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:06:13 +0530 Subject: [PATCH 55/57] [NFC][InferAlignment] Swap extern declaration and definition of EnableInferAlignmentPass This prevents a linker issue when only InstCombine is linked without PassBuilder, like in the case of bugpoint. --- llvm/lib/Passes/PassBuilderPipelines.cpp | 5 +---- .../Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 29cf2f75fd6ca..5f5d92d277c1b 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -275,10 +275,7 @@ cl::opt EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); -cl::opt EnableInferAlignmentPass( - "enable-infer-alignment-pass", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Enable the InferAlignment pass, disabling alignment inference in " - "InstCombine")); +extern cl::opt EnableInferAlignmentPass; PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 3767ecd6539f3..ec4523d1a7ced 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -36,7 +36,10 @@ static cl::opt MaxCopiedFromConstantUsers( cl::desc("Maximum users to visit in copy from constant transform"), cl::Hidden); -extern cl::opt EnableInferAlignmentPass; +cl::opt EnableInferAlignmentPass( + "enable-infer-alignment-pass", cl::init(true), 
cl::Hidden, cl::ZeroOrMore, + cl::desc("Enable the InferAlignment pass, disabling alignment inference in " + "InstCombine")); /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived) /// pointer to an alloca. Ignore any reads of the pointer, return false if we From 1062c140f811fabb7a926fde74112ea4d2330b74 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 20 Sep 2023 10:00:28 +0200 Subject: [PATCH 56/57] [flang] Prevent IR name clashes between BIND(C) and external procedures (#66777) Defining a procedure with a BIND(C, NAME="...") where the binding label matches the assembly name of a non BIND(C) external procedure in the same file causes a failure when generating the LLVM IR because of the assembly symbol name clash. Prevent this crash with a clearer semantic error. --- flang/include/flang/Common/Fortran.h | 7 ++++ .../Transforms/ExternalNameConversion.cpp | 7 +--- flang/lib/Semantics/check-declarations.cpp | 42 +++++++++++++++++++ flang/test/Semantics/bind-c14.f90 | 35 ++++++++++++++++ 4 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 flang/test/Semantics/bind-c14.f90 diff --git a/flang/include/flang/Common/Fortran.h b/flang/include/flang/Common/Fortran.h index 15db21bf3473c..4007bfc7994f9 100644 --- a/flang/include/flang/Common/Fortran.h +++ b/flang/include/flang/Common/Fortran.h @@ -114,5 +114,12 @@ bool AreCompatibleCUDADataAttrs( static constexpr char blankCommonObjectName[] = "__BLNK__"; +// Get the assembly name for a non BIND(C) external symbol other than the blank +// common block. +inline std::string GetExternalAssemblyName( + std::string symbolName, bool underscoring) { + return underscoring ? 
std::move(symbolName) + "_" : std::move(symbolName); +} + } // namespace Fortran::common #endif // FORTRAN_COMMON_FORTRAN_H_ diff --git a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp index 64791d673dacd..e967a8f19d53a 100644 --- a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp +++ b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp @@ -38,11 +38,8 @@ mangleExternalName(const std::pair globalNames_; // Collection of external procedures without global definitions std::map externalNames_; + // Collection of target dependent assembly names of external and BIND(C) + // procedures. + std::map procedureAssemblyNames_; }; class DistinguishabilityHelper { @@ -277,6 +281,7 @@ void CheckHelper::Check(const Symbol &symbol) { CheckContiguous(symbol); } CheckGlobalName(symbol); + CheckProcedureAssemblyName(symbol); if (symbol.attrs().test(Attr::ASYNCHRONOUS) && !evaluate::IsVariable(symbol)) { messages_.Say( @@ -2623,6 +2628,43 @@ void CheckHelper::CheckGlobalName(const Symbol &symbol) { } } +void CheckHelper::CheckProcedureAssemblyName(const Symbol &symbol) { + if (!IsProcedure(symbol) || symbol != symbol.GetUltimate()) + return; + const std::string *bindName{symbol.GetBindName()}; + const bool hasExplicitBindingLabel{ + symbol.GetIsExplicitBindName() && bindName}; + if (hasExplicitBindingLabel || IsExternal(symbol)) { + const std::string assemblyName{hasExplicitBindingLabel + ? 
*bindName + : common::GetExternalAssemblyName( + symbol.name().ToString(), context_.underscoring())}; + auto pair{procedureAssemblyNames_.emplace(std::move(assemblyName), symbol)}; + if (!pair.second) { + const Symbol &other{*pair.first->second}; + const bool otherHasExplicitBindingLabel{ + other.GetIsExplicitBindName() && other.GetBindName()}; + if (otherHasExplicitBindingLabel != hasExplicitBindingLabel) { + // The BIND(C,NAME="...") binding label is the same as the name that + // will be used in LLVM IR for an external procedure declared without + // BIND(C) in the same file. While this is not forbidden by the + // standard, this name collision would lead to a crash when producing + // the IR. + if (auto *msg{messages_.Say(symbol.name(), + "%s procedure assembly name conflicts with %s procedure assembly name"_err_en_US, + hasExplicitBindingLabel ? "BIND(C)" : "Non BIND(C)", + hasExplicitBindingLabel ? "non BIND(C)" : "BIND(C)")}) { + msg->Attach(other.name(), "Conflicting declaration"_en_US); + } + context_.SetError(symbol); + context_.SetError(other); + } + // Otherwise, the global names also match and the conflict is analyzed + // by CheckGlobalName. + } + } +} + void CheckHelper::CheckBindC(const Symbol &symbol) { bool isExplicitBindC{symbol.attrs().test(Attr::BIND_C)}; if (isExplicitBindC) { diff --git a/flang/test/Semantics/bind-c14.f90 b/flang/test/Semantics/bind-c14.f90 new file mode 100644 index 0000000000000..40beac9a6dabd --- /dev/null +++ b/flang/test/Semantics/bind-c14.f90 @@ -0,0 +1,35 @@ +! 
RUN: %python %S/test_errors.py %s %flang_fc1 -funderscoring + +subroutine conflict1() +end subroutine + +!ERROR: BIND(C) procedure assembly name conflicts with non BIND(C) procedure assembly name +subroutine foo(x) bind(c, name="conflict1_") + real :: x +end subroutine + +subroutine no_conflict1() bind(c, name="") +end subroutine +subroutine foo2() bind(c, name="conflict2_") +end subroutine + +subroutine bar() + interface + subroutine no_conflict1() bind(c, name="") + end subroutine + ! ERROR: Non BIND(C) procedure assembly name conflicts with BIND(C) procedure assembly name + subroutine conflict2() + end subroutine + end interface + call no_conflict1() + call conflict2 +end subroutine + +subroutine no_conflict2() bind(c, name="no_conflict2_") +end subroutine + +subroutine _() +end subroutine + +subroutine dash_no_conflict() bind(c, name="") +end subroutine From fe5c18564174c1b488e45465c3896f9c74993310 Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Wed, 20 Sep 2023 10:01:37 +0200 Subject: [PATCH 57/57] Revert "[Workflow] Add new code format helper. (#66684)" This reverts commit da94bf0d561109529e4ab3dabfcbb8b6c258ea39. 
--- .github/workflows/pr-code-format.yml | 54 ---- .github/workflows/pr-python-format.yml | 39 +++ llvm/utils/git/code-format-helper.py | 233 ------------------ llvm/utils/git/requirements_formatting.txt | 52 ---- llvm/utils/git/requirements_formatting.txt.in | 3 - 5 files changed, 39 insertions(+), 342 deletions(-) delete mode 100644 .github/workflows/pr-code-format.yml create mode 100644 .github/workflows/pr-python-format.yml delete mode 100644 llvm/utils/git/code-format-helper.py delete mode 100644 llvm/utils/git/requirements_formatting.txt delete mode 100644 llvm/utils/git/requirements_formatting.txt.in diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml deleted file mode 100644 index 102e1a263b15a..0000000000000 --- a/.github/workflows/pr-code-format.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: "Check code formatting" -on: pull_request -permissions: - pull-requests: write - -jobs: - code_formatter: - runs-on: ubuntu-latest - steps: - - name: Fetch LLVM sources - uses: actions/checkout@v4 - with: - persist-credentials: false - fetch-depth: 2 - - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v39 - with: - separator: "," - - - name: "Listed files" - run: | - echo "Formatting files:" - echo "${{ steps.changed-files.outputs.all_changed_files }}" - - - name: Install clang-format - uses: aminya/setup-cpp@v1 - with: - clangformat: 16.0.6 - - - name: Setup Python env - uses: actions/setup-python@v4 - with: - python-version: '3.11' - cache: 'pip' - cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt' - - - name: Install python dependencies - run: pip install -r llvm/utils/git/requirements_formatting.txt - - - name: Run code formatter - env: - GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} - START_REV: ${{ github.event.pull_request.base.sha }} - END_REV: ${{ github.event.pull_request.head.sha }} - CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} - run: | - 
python llvm/utils/git/code-format-helper.py \ - --token ${{ secrets.GITHUB_TOKEN }} \ - --issue-number $GITHUB_PR_NUMBER \ - --start-rev $START_REV \ - --end-rev $END_REV \ - --changed-files "$CHANGED_FILES" diff --git a/.github/workflows/pr-python-format.yml b/.github/workflows/pr-python-format.yml new file mode 100644 index 0000000000000..c612295882654 --- /dev/null +++ b/.github/workflows/pr-python-format.yml @@ -0,0 +1,39 @@ +name: "Check Python Formatting" +on: + pull_request: + # run on .py + paths: + - '**.py' + +jobs: + python_formatting: + runs-on: ubuntu-latest + steps: + - name: Fetch LLVM sources + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 2 + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v39 + with: + files: '**/*.py' + + - name: "Listed files" + run: | + echo "Formatting files:" + echo "${{ steps.changed-files.outputs.all_changed_files }}" + + - name: Setup Python env + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Python Formatting + uses: akaihola/darker@1.7.2 + with: + options: "--check --diff --color" + version: "~=1.7.2" + src: "${{ steps.changed-files.outputs.all_changed_files }}" diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py deleted file mode 100644 index 8d3c30b309d01..0000000000000 --- a/llvm/utils/git/code-format-helper.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python3 -# -# ====- code-format-helper, runs code formatters from the ci --*- python -*--==# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ==-------------------------------------------------------------------------==# - -import argparse -import os -import subprocess -import sys -from functools import cached_property - -import github -from github import IssueComment, PullRequest - - -class FormatHelper: - COMMENT_TAG = "" - name = "unknown" - - @property - def comment_tag(self) -> str: - return self.COMMENT_TAG.replace("fmt", self.name) - - def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: - pass - - def pr_comment_text(self, diff: str) -> str: - return f""" -{self.comment_tag} - -:warning: {self.friendly_name}, {self.name} found issues in your code. :warning: - -
- -You can test this locally with the following command: - - -``````````bash -{self.instructions} -`````````` - -
- -
- -View the diff from {self.name} here. - - -``````````diff -{diff} -`````````` - -
-""" - - def find_comment( - self, pr: PullRequest.PullRequest - ) -> IssueComment.IssueComment | None: - for comment in pr.as_issue().get_comments(): - if self.comment_tag in comment.body: - return comment - return None - - def update_pr(self, diff: str, args: argparse.Namespace): - repo = github.Github(args.token).get_repo(args.repo) - pr = repo.get_issue(args.issue_number).as_pull_request() - - existing_comment = self.find_comment(pr) - pr_text = self.pr_comment_text(diff) - - if existing_comment: - existing_comment.edit(pr_text) - else: - pr.as_issue().create_comment(pr_text) - - def update_pr_success(self, args: argparse.Namespace): - repo = github.Github(args.token).get_repo(args.repo) - pr = repo.get_issue(args.issue_number).as_pull_request() - - existing_comment = self.find_comment(pr) - if existing_comment: - existing_comment.edit( - f""" -{self.comment_tag} -:white_check_mark: With the latest revision this PR passed the {self.friendly_name}. -""" - ) - - def run(self, changed_files: [str], args: argparse.Namespace): - diff = self.format_run(changed_files, args) - if diff: - self.update_pr(diff, args) - return False - else: - self.update_pr_success(args) - return True - - -class ClangFormatHelper(FormatHelper): - name = "clang-format" - friendly_name = "C/C++ code formatter" - - @property - def instructions(self): - return " ".join(self.cf_cmd) - - @cached_property - def libcxx_excluded_files(self): - with open("libcxx/utils/data/ignore_format.txt", "r") as ifd: - return [excl.strip() for excl in ifd.readlines()] - - def should_be_excluded(self, path: str) -> bool: - if path in self.libcxx_excluded_files: - print(f"Excluding file {path}") - return True - return False - - def filter_changed_files(self, changed_files: [str]) -> [str]: - filtered_files = [] - for path in changed_files: - _, ext = os.path.splitext(path) - if ext in (".cpp", ".c", ".h", ".hpp", ".hxx", ".cxx"): - if not self.should_be_excluded(path): - filtered_files.append(path) - return 
filtered_files - - def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: - cpp_files = self.filter_changed_files(changed_files) - if not cpp_files: - return - cf_cmd = [ - "git-clang-format", - "--diff", - args.start_rev, - args.end_rev, - "--", - ] + cpp_files - print(f"Running: {' '.join(cf_cmd)}") - self.cf_cmd = cf_cmd - proc = subprocess.run(cf_cmd, capture_output=True) - - # formatting needed - if proc.returncode == 1: - return proc.stdout.decode("utf-8") - - return None - - -class DarkerFormatHelper(FormatHelper): - name = "darker" - friendly_name = "Python code formatter" - - @property - def instructions(self): - return " ".join(self.darker_cmd) - - def filter_changed_files(self, changed_files: [str]) -> [str]: - filtered_files = [] - for path in changed_files: - name, ext = os.path.splitext(path) - if ext == ".py": - filtered_files.append(path) - - return filtered_files - - def format_run(self, changed_files: [str], args: argparse.Namespace) -> str | None: - py_files = self.filter_changed_files(changed_files) - if not py_files: - return - darker_cmd = [ - "darker", - "--check", - "--diff", - "-r", - f"{args.start_rev}..{args.end_rev}", - ] + py_files - print(f"Running: {' '.join(darker_cmd)}") - self.darker_cmd = darker_cmd - proc = subprocess.run(darker_cmd, capture_output=True) - - # formatting needed - if proc.returncode == 1: - return proc.stdout.decode("utf-8") - - return None - - -ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper()) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", type=str, required=True, help="GitHub authentiation token" - ) - parser.add_argument( - "--repo", - type=str, - default=os.getenv("GITHUB_REPOSITORY", "llvm/llvm-project"), - help="The GitHub repository that we are working with in the form of / (e.g. 
llvm/llvm-project)", - ) - parser.add_argument("--issue-number", type=int, required=True) - parser.add_argument( - "--start-rev", - type=str, - required=True, - help="Compute changes from this revision.", - ) - parser.add_argument( - "--end-rev", type=str, required=True, help="Compute changes to this revision" - ) - parser.add_argument( - "--changed-files", - type=str, - help="Comma separated list of files that has been changed", - ) - - args = parser.parse_args() - - changed_files = [] - if args.changed_files: - changed_files = args.changed_files.split(",") - - exit_code = 0 - for fmt in ALL_FORMATTERS: - if not fmt.run(changed_files, args): - exit_code = 1 - - sys.exit(exit_code) diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt deleted file mode 100644 index ff744f0d4225f..0000000000000 --- a/llvm/utils/git/requirements_formatting.txt +++ /dev/null @@ -1,52 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.11 -# by the following command: -# -# pip-compile --output-file=llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_formatting.txt.in -# -black==23.9.1 - # via - # -r llvm/utils/git/requirements_formatting.txt.in - # darker -certifi==2023.7.22 - # via requests -cffi==1.15.1 - # via - # cryptography - # pynacl -charset-normalizer==3.2.0 - # via requests -click==8.1.7 - # via black -cryptography==41.0.3 - # via pyjwt -darker==1.7.2 - # via -r llvm/utils/git/requirements_formatting.txt.in -deprecated==1.2.14 - # via pygithub -idna==3.4 - # via requests -mypy-extensions==1.0.0 - # via black -packaging==23.1 - # via black -pathspec==0.11.2 - # via black -platformdirs==3.10.0 - # via black -pycparser==2.21 - # via cffi -pygithub==1.59.1 - # via -r llvm/utils/git/requirements_formatting.txt.in -pyjwt[crypto]==2.8.0 - # via pygithub -pynacl==1.5.0 - # via pygithub -requests==2.31.0 - # via pygithub -toml==0.10.2 - # via darker -urllib3==2.0.4 - # via requests -wrapt==1.15.0 - # 
via deprecated diff --git a/llvm/utils/git/requirements_formatting.txt.in b/llvm/utils/git/requirements_formatting.txt.in deleted file mode 100644 index 4aac571af1cf5..0000000000000 --- a/llvm/utils/git/requirements_formatting.txt.in +++ /dev/null @@ -1,3 +0,0 @@ -black~=23.0 -darker==1.7.2 -PyGithub==1.59.1