From c9da4892a5ea8025e14e97e8414a372e132037af Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 19 Feb 2025 16:38:54 -0500 Subject: [PATCH 01/14] Rename aio_thread_count to intra_op_parallelism (#7056) Propagate API change. Signed-off-by: Olatunji Ruwase --- deepspeed/runtime/swap_tensor/aio_config.py | 20 ++++++++++++------- deepspeed/runtime/swap_tensor/constants.py | 6 +++--- .../runtime/swap_tensor/optimizer_utils.py | 2 +- .../partitioned_optimizer_swapper.py | 8 +++++--- .../swap_tensor/partitioned_param_swapper.py | 20 +++++++++++-------- .../pipelined_optimizer_swapper.py | 18 ++++++++++------- 6 files changed, 45 insertions(+), 29 deletions(-) diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py index 46c3f2a0c954..be6c7d93c86a 100644 --- a/deepspeed/runtime/swap_tensor/aio_config.py +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -10,7 +10,7 @@ AIO_DEFAULT_DICT = { AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, - AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, + AIO_INTRA_OP_PARALLELISM: AIO_INTRA_OP_PARALLELISM_DEFAULT, AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT, AIO_USE_GDS: AIO_USE_GDS_DEFAULT @@ -21,12 +21,18 @@ def get_aio_config(param_dict): if AIO in param_dict.keys() and param_dict[AIO] is not None: aio_dict = param_dict[AIO] aio_config = { - AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), - AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), - AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT), - AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), - AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT), - AIO_USE_GDS: get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT) + AIO_BLOCK_SIZE: + get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: + get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), + AIO_INTRA_OP_PARALLELISM: + get_scalar_param(aio_dict, AIO_INTRA_OP_PARALLELISM, AIO_INTRA_OP_PARALLELISM_DEFAULT), + AIO_SINGLE_SUBMIT: + get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: + get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT), + AIO_USE_GDS: + get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT) } if aio_config[AIO_USE_GDS]: diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py index cee20ac7b78c..c1207749eac6 100644 --- a/deepspeed/runtime/swap_tensor/constants.py +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -9,7 +9,7 @@ "aio": { "block_size": 1048576, "queue_depth": 8, - "thread_count": 1, + "intra_op_parallelism": 1, "single_submit": false, "overlap_events": true, "use_gds": false @@ -20,8 +20,8 @@ AIO_BLOCK_SIZE_DEFAULT = 1048576 AIO_QUEUE_DEPTH = "queue_depth" AIO_QUEUE_DEPTH_DEFAULT = 8 -AIO_THREAD_COUNT = "thread_count" -AIO_THREAD_COUNT_DEFAULT = 1 +AIO_INTRA_OP_PARALLELISM = "intra_op_parallelism" +AIO_INTRA_OP_PARALLELISM_DEFAULT = 1 AIO_SINGLE_SUBMIT = "single_submit" AIO_SINGLE_SUBMIT_DEFAULT = False AIO_OVERLAP_EVENTS = "overlap_events" diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 5d837e386a95..d7b0ea9634b2 100644 --- 
a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -130,7 +130,7 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume # Read/Write alignment for each thread during Intra-request parallelism self.min_aio_bytes = max(MIN_AIO_BYTES, aio_config[AIO_BLOCK_SIZE]) - self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_THREAD_COUNT] + self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_INTRA_OP_PARALLELISM] self.numel_alignment = self.aligned_bytes // self.swap_element_size # Swap buffer management diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index e53a280befe4..8b6cbe8fbb51 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -33,9 +33,11 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume largest_numel, device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) + self.aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) # Overlap swapping out self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle, diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 26fbf6164d54..f80fe1501c00 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -98,7 +98,7 @@ def _configure_aio(self, ds_config): # Read/Write alignment for each thread during Intra-request parallelism self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) - self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT] + self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_INTRA_OP_PARALLELISM] self.numel_alignment = self.aligned_bytes // self.swap_element_size self.elements_per_buffer = self.swap_config.buffer_size @@ -108,13 +108,17 @@ def _configure_aio(self, ds_config): self.available_buffer_ids = [i for i in range(self.param_buffer_count)] self.reserved_buffer_ids = [] - self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], - self.aio_config[AIO_THREAD_COUNT]) - - self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) + self.aio_read_handle = self.aio_handle(block_size=self.aio_config[AIO_BLOCK_SIZE], + queue_depth=self.aio_config[AIO_QUEUE_DEPTH], + single_submit=self.aio_config[AIO_SINGLE_SUBMIT], + overlap_events=self.aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=self.aio_config[AIO_INTRA_OP_PARALLELISM]) + + self.aio_write_handle = self.aio_handle(block_size=self.aio_config[AIO_BLOCK_SIZE], + queue_depth=self.aio_config[AIO_QUEUE_DEPTH], + single_submit=self.aio_config[AIO_SINGLE_SUBMIT], + 
overlap_events=self.aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=self.aio_config[AIO_INTRA_OP_PARALLELISM]) if self.use_gds: self.buffers = torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count), diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index 66a372877d38..8f6d72e35f63 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -56,13 +56,17 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) - - self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) + self.write_aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) + + self.read_aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) # Overlap gradient swap out self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle, From d98204b9f303e0938463334a435f871ce03f99e9 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Thu, 20 Feb 2025 06:15:03 +0800 Subject: [PATCH 02/14] add autoTP training zero2 tests (#7049) - add zero2 test - minor fix with transformer version update & ds master merge. Signed-off-by: inkcherry Co-authored-by: Olatunji Ruwase --- deepspeed/module_inject/replace_module.py | 4 ++++ deepspeed/runtime/engine.py | 2 +- deepspeed/runtime/utils.py | 5 +++-- tests/unit/model_parallelism/test_autotp_training.py | 4 ++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 9510f96b89c6..ed94a5021fee 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -335,6 +335,10 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): return new_module def set_lm_head(module): + if is_autotp_training_mode(): + # we need to handle autoTP training mode separately. + return + embedding_weight = None for n, p in module.named_parameters(): if "word_embeddings." in n or "embed_tokens." in n or "wte." 
in n: diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 8575df9d1d5d..4d932f8d5046 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -424,7 +424,7 @@ def _configure_tensor_parallel_states(self, model): # sanity check # currently, the compatibility between 'autotp' and 'zero > 1' has not been validated assert self.zero_optimization_stage( - ) <= 1, "Currently, the compatibility between 'autotp' and 'zero_stage > 1' has not been validated" + ) <= 2, "Currently, the compatibility between 'autotp' and 'zero_stage = 3' has not been validated" self.mpu = groups self.mpu._init_tp_mesh_device(tensor_model_parallel_size=self.autotp_size()) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 91fe7cbdcc96..9fd7a65a53ba 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -1134,9 +1134,10 @@ def compare_tensors_in_structures(inputs1: Union[List, Dict], inputs2: Union[Lis if inputs1.keys() != inputs2.keys(): return False for key in inputs1: - val1 = inputs1[key].to(get_accelerator().current_device()) - val2 = inputs2[key].to(get_accelerator().current_device()) + val1, val2 = inputs1[key], inputs2[key] if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + val1 = val1.to(get_accelerator().current_device()) + val2 = val2.to(get_accelerator().current_device()) if not torch.equal(val1, val2): return False elif val1 != val2: diff --git a/tests/unit/model_parallelism/test_autotp_training.py b/tests/unit/model_parallelism/test_autotp_training.py index 73e61b1d3398..7680b28ce6b5 100644 --- a/tests/unit/model_parallelism/test_autotp_training.py +++ b/tests/unit/model_parallelism/test_autotp_training.py @@ -360,7 +360,7 @@ def prepare_tp_model(hidden_dim, nlayers, linear_indices, allreduce_indices, gro return model, base_model -@pytest.mark.parametrize("zero_stage", [0, 1]) +@pytest.mark.parametrize("zero_stage", [0, 1, 2]) @pytest.mark.parametrize("tp_size", [2, 4]) class TestSave(DistributedTest): @@ -492,7 +492,7 @@ def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int): compare_lr_scheduler_states(trained_model, loaded_model) -@pytest.mark.parametrize("zero_stage", [0, 1]) +@pytest.mark.parametrize("zero_stage", [0, 1, 2]) @pytest.mark.parametrize("tp_size", [2, 4]) class TestTpGradNorm(DistributedTest): From e2dc3eeb1923073e32739596a4fd051417d4ff92 Mon Sep 17 00:00:00 2001 From: wukong1992 Date: Thu, 20 Feb 2025 21:36:17 +0800 Subject: [PATCH 03/14] Fix bf16 optimizer: remove duplicate loop (#7054) When using bf16 with MoE, refreshing the optimizer state from a bf16 checkpoint raised `IndexError: list index out of range`. Signed-off-by: shaomin Co-authored-by: shaomin Co-authored-by: Hongwei Chen <33092912+hwchen2017@users.noreply.github.com> --- deepspeed/runtime/bf16_optimizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index 6b63efbb23f7..78895e70df03 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -472,10 +472,10 @@ def state_dict(self): # Restore base optimizer fp32 weights from bfloat16 weights def _restore_from_bit16_weights(self): - for i, group in enumerate(self.bf16_groups): + for i, (bf16_partitions, + fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - for bf16_partitions, fp32_partition in zip(self.bf16_partitioned_groups,
self.fp32_groups_flat_partition): - fp32_partition.data.copy_(bf16_partitions[partition_id].data) + fp32_partition.data.copy_(bf16_partitions[partition_id].data) def refresh_fp32_params(self): self._restore_from_bit16_weights() From fa8967e209ee909de556fc7d363eeae1c8ee3b04 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 20 Feb 2025 07:27:28 -0800 Subject: [PATCH 04/14] Update version.txt after 0.16.4 release (#7063) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.16.4 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 5f2491c5adca..19270385eaf7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.16.4 +0.16.5 From 461d641f00a04e96f2d3c58af55ff6af4b50fc92 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 20 Feb 2025 07:27:54 -0800 Subject: [PATCH 05/14] Fix an outdated doc w.r.t. CUDA_VISIBLE_DEVICES (#7058) @jeffra and I fixed this many years ago, so bringing this doc to a correct state. --------- Signed-off-by: Stas Bekman --- docs/_tutorials/getting-started.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 36dcdf41d9d0..2c6e27d1319d 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -316,10 +316,14 @@ local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' as the hostname. -Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control -which devices should be used. For example, to use only gpu1 of the current -node, do: +Also note that `CUDA_VISIBLE_DEVICES` can be used with `deepspeed` to control +which devices should be used on a single node. So either of these would work +to launch just on devices 0 and 1 of the current node: ```bash -deepspeed --include localhost:1 ... +deepspeed --include localhost:0,1 ... +``` + +```bash +CUDA_VISIBLE_DEVICES=0,1 deepspeed ... ``` From cb20d44978f70a177f7f5a07f8881976ed314428 Mon Sep 17 00:00:00 2001 From: siqi654321 Date: Fri, 21 Feb 2025 04:46:21 +0800 Subject: [PATCH 06/14] Tecorigin SDAA accelerator (#6903) Description: This PR adds Tecorigin SDAA accelerator support. With this PR, DeepSpeed supports SDAA as a backend for training tasks.
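For reviewers, a minimal sketch of how the new backend is exercised (illustrative only; it assumes a machine with `torch_sdaa` installed and uses the existing `DS_ACCELERATOR` override so auto-detection is bypassed):

```python
import os

# Select the SDAA accelerator explicitly instead of relying on auto-detection.
os.environ["DS_ACCELERATOR"] = "sdaa"

import torch
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
print(acc.device_name())                 # 'sdaa'
print(acc.communication_backend_name())  # 'tccl'

# Place a tensor on the first SDAA device and query allocator state.
x = torch.ones(1024, device=acc.device_name(0))  # 'sdaa:0'
print(acc.memory_allocated(0))
```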
--------- Signed-off-by: siqi Co-authored-by: siqi Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/real_accelerator.py | 19 +- accelerator/sdaa_accelerator.py | 328 ++++++++++++++++++++++++++++++++ op_builder/sdaa/__init__.py | 36 ++++ op_builder/sdaa/builder.py | 60 ++++++ op_builder/sdaa/cpu_adam.py | 53 ++++++ op_builder/sdaa/fused_adam.py | 67 +++++++ op_builder/sdaa/no_impl.py | 59 ++++++ 7 files changed, 621 insertions(+), 1 deletion(-) create mode 100755 accelerator/sdaa_accelerator.py create mode 100755 op_builder/sdaa/__init__.py create mode 100755 op_builder/sdaa/builder.py create mode 100755 op_builder/sdaa/cpu_adam.py create mode 100755 op_builder/sdaa/fused_adam.py create mode 100755 op_builder/sdaa/no_impl.py diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index ac17fe2c67e5..9c1713ca4c6c 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -20,7 +20,7 @@ except ImportError as e: dsa2 = None -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu', 'mlu'] +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu', 'mlu', 'sdaa'] ds_accelerator = None @@ -80,6 +80,12 @@ def get_accelerator(): except ImportError as e: raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.") pass + elif accelerator_name == "sdaa": + try: + import torch_sdaa # noqa: F401 # type: ignore + except ImportError as e: + raise ValueError(f"SDAA_Accelerator requires torch_sdaa, which is not installed on this system.") + pass elif accelerator_name == "mps": try: import torch.mps @@ -137,6 +143,13 @@ def get_accelerator(): accelerator_name = "npu" except ImportError as e: pass + if accelerator_name is None: + try: + import torch_sdaa # noqa: F401,F811 # type: ignore + + accelerator_name = "sdaa" + except ImportError as e: + pass if accelerator_name is None: try: import torch.mps @@ -205,6 +218,10 @@ def get_accelerator(): from .npu_accelerator import NPU_Accelerator ds_accelerator = NPU_Accelerator() + elif accelerator_name == "sdaa": + from .sdaa_accelerator import SDAA_Accelerator + + ds_accelerator = SDAA_Accelerator() elif accelerator_name == "mps": from .mps_accelerator import MPS_Accelerator diff --git a/accelerator/sdaa_accelerator.py b/accelerator/sdaa_accelerator.py new file mode 100755 index 000000000000..26113d38dd15 --- /dev/null +++ b/accelerator/sdaa_accelerator.py @@ -0,0 +1,328 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +import importlib +import inspect +import functools + +from .abstract_accelerator import DeepSpeedAccelerator +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. +try: + import torch.sdaa +except ImportError: + pass + + +class SDAA_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'sdaa' + self._communication_backend_name = 'tccl' + self._compile_backend = "inductor" + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index is None: + return 'sdaa' + return 'sdaa:{}'.format(device_index) + + def device(self, device_index=None): + return torch.sdaa.device(device_index) + + def set_device(self, device_index): + torch.sdaa.set_device(device_index) + + def current_device(self): + return torch.sdaa.current_device() + + def current_device_name(self): + return 'sdaa:{}'.format(torch.sdaa.current_device()) + + def device_count(self): + return torch.sdaa.device_count() + + def synchronize(self, device_index=None): + return torch.sdaa.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.sdaa.set_rng_state(new_state) + + return torch.sdaa.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.sdaa.get_rng_state() + + return torch.sdaa.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.sdaa.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.sdaa.manual_seed_all(seed) + + def initial_seed(self): + return torch.sdaa.initial_seed() + + def default_generator(self, device_index): + return torch.sdaa.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.sdaa.Stream + + def stream(self, stream): + return torch.sdaa.stream(stream) + + def current_stream(self, device_index=None): + return torch.sdaa.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.sdaa.default_stream(device_index) + + @property + def Event(self): + return torch.sdaa.Event + + # Memory management + def empty_cache(self): + return torch.sdaa.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.sdaa.memory_allocated(device_index) + + def max_memory_allocated(self, 
device_index=None): + return torch.sdaa.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.sdaa.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.sdaa.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.sdaa.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.sdaa.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.sdaa, 'memory_stats'): + return torch.sdaa.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.sdaa, 'reset_peak_memory_stats'): + return torch.sdaa.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.sdaa, 'memory_reserved'): + return torch.sdaa.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.sdaa, 'max_memory_reserved'): + return torch.sdaa.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.sdaa.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return torch.sdaa.is_bf16_supported() + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.sdaa, 'amp'): + return torch.sdaa.amp + return None + + def is_available(self): + return torch.sdaa.is_available() + + def range_push(self, msg): + return + + def range_pop(self): + return + + def lazy_call(self, callback): + return torch.sdaa._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='sdaa') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='sdaa') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='sdaa') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='sdaa') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='sdaa') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='sdaa') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='sdaa') + + def pin_memory(self, tensor, align_bytes=1): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('sdaa:'): + return True + else: + return False + + def 
op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.sdaa" + except ImportError: + return "deepspeed.ops.op_builder.sdaa" + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/sdaa/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['SDAA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/op_builder/sdaa/__init__.py b/op_builder/sdaa/__init__.py new file mode 100755 index 000000000000..2a6eb8bbfa2c --- /dev/null +++ b/op_builder/sdaa/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. + +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +from .no_impl import NotImplementedBuilder +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder diff --git a/op_builder/sdaa/builder.py b/op_builder/sdaa/builder.py new file mode 100755 index 000000000000..81f0e98c0768 --- /dev/null +++ b/op_builder/sdaa/builder.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class SDAAOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/sdaa/cpu_adam.py b/op_builder/sdaa/cpu_adam.py new file mode 100755 index 000000000000..b38a71a6275d --- /dev/null +++ b/op_builder/sdaa/cpu_adam.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +from .builder import SDAAOpBuilder + + +class CPUAdamBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/sdaa/fused_adam.py b/op_builder/sdaa/fused_adam.py new file mode 100755 index 000000000000..73a2dff41459 --- /dev/null +++ b/op_builder/sdaa/fused_adam.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +from .builder import SDAAOpBuilder + +try: + import torch +except ImportError as e: + pass + + +class SDAAFusedAdam: + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + g_tensor_lis, p_tensor_lis, m_tensor_lis, v_tensor_lis = tensor_lists + torch.ops.sdaa.fused_adam(g_tensor_lis, p_tensor_lis, m_tensor_lis, v_tensor_lis, [], beta1, beta2, epsilon, + lr, weight_decay, adam_w_mode, step, bias_correction) + + +class FusedAdamBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return SDAAFusedAdam diff --git a/op_builder/sdaa/no_impl.py b/op_builder/sdaa/no_impl.py new file mode 100755 index 000000000000..10a8b8f48652 --- /dev/null +++ b/op_builder/sdaa/no_impl.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +from .builder import SDAAOpBuilder + + +class NotImplementedBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on SDAA backend.") + + def sources(self): + return [] + + def cxx_args(self): + return [] + + def extra_ldflags(self): + return [] + + def include_paths(self): + return [] From 8577bd244903f454e319953a6ae1c3838558ac69 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:33:18 -0800 Subject: [PATCH 07/14] Handle special case of libuv for Windows (#7064) More information on libuv in pytorch: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html Issue tracking the prevalence of the error on Windows (unresolved at the time of this PR): https://github.com/pytorch/pytorch/issues/139990 LibUV github: https://github.com/libuv/libuv Windows error: ``` File "C:\hostedtoolcache\windows\Python\3.12.7\x64\Lib\site-packages\torch\distributed\rendezvous.py", line 189, in _create_c10d_store return TCPStore( ^^^^^^^^^ RuntimeError: use_libuv was requested but PyTorch was build without libuv support ``` use_libuv isn't well supported on Windows in pytorch <2.4, so we need to guard around this case. --------- Signed-off-by: Logan Adams --- deepspeed/comm/torch.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index efa0640fb87b..1146832d7655 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -145,11 +145,22 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): - torch.distributed.init_process_group(backend, - timeout=timeout, - init_method=init_method, - rank=rank, - world_size=world_size) + if not required_torch_version(min_version=2.4): + # Windows torch builds do not come with lib_uv by default. 
+ # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html + use_libuv = False if os.name == "nt" else True + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size, + use_libuv=use_libuv) + else: + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' @disable_compiler_collective From 9f20148a77c1e986204d62437ea7541c9e918b08 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 21 Feb 2025 04:05:29 -0800 Subject: [PATCH 08/14] Update README with info on newest accelerator (#7065) Signed-off-by: Logan Adams --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index db77d419cce7..7cd3f4088abf 100755 --- a/README.md +++ b/README.md @@ -172,6 +172,7 @@ dynamically link them at runtime. | Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | | Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | | Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | +| Tecorigin | Scalable Data Analytics Accelerator | sdaa | Yes | No | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. From 38327e07f6cd39b63418f60cedd1651b0be3e7c3 Mon Sep 17 00:00:00 2001 From: Wei Wu <45323446+U-rara@users.noreply.github.com> Date: Fri, 21 Feb 2025 20:05:41 +0800 Subject: [PATCH 09/14] Bug Fix for offload_states API (#7050) @fukun07 and I discovered a bug when using the `offload_states` and `reload_states` APIs of the Zero3 optimizer. When using grouped parameters (for example, in weight decay or grouped lr scenarios), the order of the parameters mapping in `reload_states` ([here](https://github.com/deepspeedai/DeepSpeed/blob/14b3cce4aaedac69120d386953e2b4cae8c2cf2c/deepspeed/runtime/zero/stage3.py#L2953)) does not correspond with the initialization of `self.lp_param_buffer` ([here](https://github.com/deepspeedai/DeepSpeed/blob/14b3cce4aaedac69120d386953e2b4cae8c2cf2c/deepspeed/runtime/zero/stage3.py#L731)), which leads to misaligned parameter loading. This issue was overlooked by the corresponding unit tests ([here](https://github.com/deepspeedai/DeepSpeed/blob/master/tests/unit/runtime/zero/test_offload_states.py)), so we fixed the bug in our PR and added the corresponding unit tests. 
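As a minimal sketch of the affected usage (illustrative, with a toy model and an assumed ZeRO-3 config; intended to run under a DeepSpeed launcher):

```python
import torch
import deepspeed
from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 64))
# Grouped parameters (e.g., no weight decay on biases) trigger the ordering bug.
param_groups = [
    {"params": [p for n, p in model.named_parameters() if "bias" not in n], "weight_decay": 0.1},
    {"params": [p for n, p in model.named_parameters() if "bias" in n], "weight_decay": 0.0},
]
ds_config = {"train_batch_size": 1, "zero_optimization": {"stage": 3},
             "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
             "bf16": {"enabled": True}}
engine, _, _, _ = deepspeed.initialize(model=model, model_parameters=param_groups, config=ds_config)

engine.offload_states(include=[OffloadStateTypeEnum.lp_params],
                      device=OffloadDeviceEnum.cpu, pin_memory=True, non_blocking=False)
engine.reload_states()  # previously could copy partitions back in the wrong order
```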
--------- Signed-off-by: Wei Wu Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- deepspeed/runtime/zero/stage3.py | 12 ++++++------ tests/unit/runtime/zero/test_offload_states.py | 13 ++++++++++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 9cc58fdbac01..ee97b6278d9e 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -732,10 +732,7 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): # move parameters to flattened buffer if not self.offload_param: # partitioned params remain in GPU during training # move parameter partitions into a single contiguous flat buffer - parameter_partitions: List[Tensor] = [] - for sub_group in self.fp16_groups: - for param in sub_group: - parameter_partitions.append(param.ds_tensor) + parameter_partitions = self._get_parameter_partitions() # We need to keep the reference to this buffer to make sure you can free it in `offload_states` self.lp_param_buffer = __class__.defragment(parameter_partitions) @@ -786,6 +783,9 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) + def _get_parameter_partitions(self) -> List[Tensor]: + return [param.ds_tensor for sub_group in self.fp16_groups for param in sub_group] + def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): offset = 0 elements_in_sub_group = sum([t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) @@ -2954,8 +2954,8 @@ def reload_states(self, non_blocking: bool = False): self.lp_param_buffer.data = cpu_buffer.data.to(device, non_blocking=non_blocking) self._set_fp16_partitioned_groups_flat() - for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( - [p.ds_tensor for p in self.module.parameters()]): + parameter_partitions = self._get_parameter_partitions() + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer(parameter_partitions): tensor.data = self.lp_param_buffer.narrow(0, offset, tensor_numel) self.offloaded_states.remove(OffloadStateTypeEnum.lp_params) diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py index 9105a54661fa..44bff480e27b 100644 --- a/tests/unit/runtime/zero/test_offload_states.py +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -33,11 +33,11 @@ def compare_device(state) -> bool: assert compare_device(state), f"State {state} is not on device {device}" -def run_model(model, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): +def run_model(model, param_groups, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): # Currently we only support OffloadDeviceEnum.cpu offload_device = OffloadDeviceEnum.cpu - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=param_groups, config=config_dict) data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, @@ -124,5 +124,12 @@ def test_offload_states(self, included_state, pin_memory, non_blocking): with deepspeed.zero.Init(config_dict_or_path=config_dict): model = SimpleModel(hidden_dim, nlayers=4) + param_groups = [{ + "params": [p for n, p in model.named_parameters() if not 
'bias' in n], "weight_decay": 0.1 }, { "params": [p for n, p in model.named_parameters() if 'bias' in n], "weight_decay": 0.0 }] include = None if included_state is None else [included_state] - run_model(model, config_dict, hidden_dim, torch.bfloat16, include, pin_memory, non_blocking) + run_model(model, param_groups, config_dict, hidden_dim, torch.bfloat16, include, pin_memory, non_blocking) From 9d820e4c053160af0136a9cfc3a3a55d4297a023 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Sun, 23 Feb 2025 22:01:06 -0800 Subject: [PATCH 10/14] Fix TOCTOU issues, switch to fstat (#7067) Signed-off-by: Logan Adams --- csrc/aio/common/deepspeed_aio_common.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 81c315e9a558..9d7ff5093017 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -284,12 +284,13 @@ int open_file(const char* filename, const bool read_op) int regular_read(const char* filename, std::vector<char>& buffer) { - int64_t num_bytes; - const auto f_size = get_file_size(filename, num_bytes); - assert(f_size != -1); - buffer.resize(num_bytes); const auto fd = open(filename, O_RDONLY, 0600); assert(fd != -1); + struct stat fs; + const auto result = fstat(fd, &fs); + assert(result != -1); + int64_t num_bytes = fs.st_size; + buffer.resize(num_bytes); int64_t read_bytes = 0; auto r = 0; do { From e1903f0d0a666f2e1b901f50c76f252f87a001c5 Mon Sep 17 00:00:00 2001 From: Shelly Nahir <73890534+ShellyNR@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:50:09 +0200 Subject: [PATCH 11/14] Configure torch to avoid graph breaks caused by logger (#6999) Following changes in PyTorch trace rules, my previous PR to avoid graph breaks caused by the logger is no longer relevant, so I've instead added this functionality to torch dynamo: https://github.com/pytorch/pytorch/commit/16ea0ddcdbe49f5560bfbbc6ab331c3422e27b2e This commit allows the user to configure torch to ignore logger methods and avoid the associated graph breaks.
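Conceptually, the new hook works like the sketch below (illustrative; it assumes a torch build where `torch._dynamo.config` exposes `ignore_logger_methods`, which this patch gates on torch >= 2.6):

```python
import logging
import torch

logger = logging.getLogger("ds_example")

# Register the bound logger methods that dynamo should ignore while tracing,
# so calling them inside a compiled region no longer causes a graph break.
for name in ("info", "debug", "warning"):
    torch._dynamo.config.ignore_logger_methods.add(getattr(logger, name))
```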
To ignore all logger methods: `os.environ["DISABLE_LOGS_WHILE_COMPILING"] = "1"`. To ignore all logger methods except specific ones (for example, `info` and `isEnabledFor`): set `os.environ["DISABLE_LOGS_WHILE_COMPILING"] = "1"` together with `os.environ["LOGGER_METHODS_TO_EXCLUDE_FROM_DISABLE"] = "info, isEnabledFor"`. Signed-off-by: ShellyNR Co-authored-by: snahir --- deepspeed/utils/logging.py | 41 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index d5db29485db3..77173f2839ca 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -7,7 +7,8 @@ import logging import sys import os -from deepspeed.runtime.compiler import is_compile_supported, is_compiling +import torch +from deepspeed.utils.torch import required_torch_version log_levels = { "debug": logging.DEBUG, @@ -20,31 +21,6 @@ class LoggerFactory: - def create_warning_filter(logger): - warn = False - - def warn_once(record): - nonlocal warn - if is_compile_supported() and is_compiling() and not warn: - warn = True - logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to" - " disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1") - return True - - return warn_once - - @staticmethod - def logging_decorator(func): - - @functools.wraps(func) - def wrapper(*args, **kwargs): - if is_compiling(): - return - else: - return func(*args, **kwargs) - - return wrapper - @staticmethod def create_logger(name=None, level=logging.INFO): """create a logger @@ -70,12 +46,15 @@ def create_logger(name=None, level=logging.INFO): ch.setLevel(level) ch.setFormatter(formatter) logger_.addHandler(ch) - if os.getenv("DISABLE_LOGS_WHILE_COMPILING", "0") == "1": - for method in ['info', 'debug', 'error', 'warning', 'critical', 'exception']: + if required_torch_version(min_version=2.6) and os.getenv("DISABLE_LOGS_WHILE_COMPILING", "0") == "1": + excluded_set = { item.strip() for item in os.getenv("LOGGER_METHODS_TO_EXCLUDE_FROM_DISABLE", "").split(",") } + ignore_set = {'info', 'debug', 'error', 'warning', 'critical', 'exception', 'isEnabledFor'} - excluded_set + for method in ignore_set: original_logger = getattr(logger_, method) - setattr(logger_, method, LoggerFactory.logging_decorator(original_logger)) - else: - logger_.addFilter(LoggerFactory.create_warning_filter(logger_)) + torch._dynamo.config.ignore_logger_methods.add(original_logger) return logger_ From 4b7e2c909fb9c6b161ed9e62c647dea49b486e41 Mon Sep 17 00:00:00 2001 From: Yejing-Lai Date: Tue, 25 Feb 2025 00:57:26 +0800 Subject: [PATCH 12/14] Fix meta load tensor incompatible issue (#7073) The partition tensor doesn't need to move to the current device when meta load is used. Signed-off-by: Lai, Yejing Co-authored-by: Olatunji Ruwase --- deepspeed/module_inject/layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 2a24c2920466..5087d71a3d62 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -48,7 +48,8 @@ def move(tensor, device): # to save host resources when DP > 1。 if tensor.is_meta: - return torch.empty_like(tensor, device=device) + # Keep tensor in meta device if tensor is meta. + return tensor else: # Using new tensors help in freeing memory (after split for example) was done before by calling clone(). # Using copy=True instead of clone() will help in case of cpu --> cpu.
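For context on the fix above, a meta tensor carries only shape and dtype metadata, so returning it unchanged avoids any allocation. This is plain PyTorch behavior, independent of the patch:

```python
import torch

t = torch.empty(4, device="meta")
print(t.is_meta)         # True: metadata only, no storage allocated
print(t.shape, t.dtype)  # shape and dtype are still queryable

# torch.empty_like(t, device="cpu") would materialize real, uninitialized
# memory; keeping the meta tensor defers allocation until weights are loaded.
```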
From 1d30b58cba83ea03c1e81d20c8c077467983d769 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:40:24 -0800 Subject: [PATCH 13/14] Replace calls to `python setup.py sdist` with `python -m build --sdist` (#7069) With future changes coming to pip/Python packaging, we can no longer call `python setup.py ...` and need to replace those invocations: https://packaging.python.org/en/latest/guides/modernize-setup-py-project/#should-setup-py-be-deleted ![image](https://github.com/user-attachments/assets/ea39ef7b-3cbe-4916-86f0-bc46a5fce96d) This means we also need to install the `build` package, which is added here. Additionally, we pass the `--sdist` flag to build only the sdist rather than the wheel. --------- Signed-off-by: Logan Adams --- .github/workflows/no-torch.yml | 3 ++- .github/workflows/release.yml | 3 ++- build_win.bat | 2 +- docs/_tutorials/advanced-install.md | 2 +- docs/_tutorials/ds-sequence.md | 2 +- install.sh | 2 +- release/release.sh | 2 +- setup.py | 2 +- 8 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml index 5b89a6f36787..75c4ecb850c9 100644 --- a/.github/workflows/no-torch.yml +++ b/.github/workflows/no-torch.yml @@ -32,11 +32,12 @@ jobs: run: | pip uninstall torch --yes pip install setuptools + pip install build pip list - name: Build deepspeed run: | - DS_BUILD_STRING=" " python setup.py sdist + DS_BUILD_STRING=" " python -m build --sdist - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b2ccca69dfda..4bddbc26be4a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,7 +26,8 @@ jobs: - name: Build DeepSpeed run: | pip install setuptools - DS_BUILD_STRING=" " python setup.py sdist + pip install build + DS_BUILD_STRING=" " python -m build --sdist - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/build_win.bat b/build_win.bat index 81387471983a..627694dbe8a0 100644 --- a/build_win.bat +++ b/build_win.bat @@ -11,6 +11,6 @@ set DS_BUILD_GDS=0 set DS_BUILD_RAGGED_DEVICE_OPS=0 set DS_BUILD_SPARSE_ATTN=0 -python setup.py bdist_wheel +python -m build --wheel --no-isolation :end diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index d01378484172..b251485f8988 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -84,7 +84,7 @@ This should complete the full build 2-3 times faster. You can adjust `-j` to spe You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, PyTorch, Python, etc.) ```bash -DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel +DS_BUILD_OPS=1 python -m build --wheel --no-isolation --config-setting="--build-option=build_ext" --config-setting="--build-option=-j8" ``` This will create a pypi binary wheel under `dist`, e.g., ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` and then you can install it directly on multiple machines, in our example: diff --git a/docs/_tutorials/ds-sequence.md b/docs/_tutorials/ds-sequence.md index 7a6041f393f9..41a76b784782 100755 --- a/docs/_tutorials/ds-sequence.md +++ b/docs/_tutorials/ds-sequence.md @@ -111,7 +111,7 @@ pip install .
cd ${WORK_DIR} git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention cd flash-attention -python setup.py install +python -m pip install . ``` You may also want to ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the FlashAttention documentation for more details. diff --git a/install.sh b/install.sh index 8d56afe40493..8be574c6ec1f 100755 --- a/install.sh +++ b/install.sh @@ -152,7 +152,7 @@ if [ ! -f $hostfile ]; then fi echo "Building deepspeed wheel" -python setup.py $VERBOSE bdist_wheel +python -m build $VERBOSE --wheel --no-isolation if [ "$local_only" == "1" ]; then echo "Installing deepspeed" diff --git a/release/release.sh b/release/release.sh index a83fafcb9b1f..cc3ee2feae62 100644 --- a/release/release.sh +++ b/release/release.sh @@ -38,7 +38,7 @@ if [ $? != 0 ]; then exit 1 fi -DS_BUILD_STRING="" python setup.py sdist +DS_BUILD_STRING="" python -m build --sdist if [ ! -f dist/deepspeed-${version}.tar.gz ]; then echo "prepared version does not match version given ($version), bump version first?" diff --git a/setup.py b/setup.py index 6dc9afb8fbc2..0ad54bb99403 100755 --- a/setup.py +++ b/setup.py @@ -233,7 +233,7 @@ def op_enabled(op_name): version_str = open('version.txt', 'r').read().strip() # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. -# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel. +# Example: `DS_BUILD_STRING=".dev20201022" python -m build --no-isolation`. # Building wheel for distribution, update version file. if is_env_set('DS_BUILD_STRING'): From 729dfafca366961b82daddb6dea6294aecfa74bc Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 24 Feb 2025 18:45:11 -0800 Subject: [PATCH 14/14] Revert "Handle special case of libuv for Windows (#7064)" (#7076) This reverts commit 8577bd244903f454e319953a6ae1c3838558ac69. Fixes: #7072 --- deepspeed/comm/torch.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 1146832d7655..efa0640fb87b 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -145,22 +145,11 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): - if not required_torch_version(min_version=2.4): - # Windows torch builds do not come with lib_uv by default. - # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html - use_libuv = False if os.name == "nt" else True - torch.distributed.init_process_group(backend, - timeout=timeout, - init_method=init_method, - rank=rank, - world_size=world_size, - use_libuv=use_libuv) - else: - torch.distributed.init_process_group(backend, - timeout=timeout, - init_method=init_method, - rank=rank, - world_size=world_size) + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' @disable_compiler_collective
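Since the revert removes the in-library guard, a user-side sketch of the Windows workaround remains possible (assuming a torch build that honors the `USE_LIBUV` environment variable described in the TCPStore libuv tutorial linked above):

```python
import os

# Opt out of the libuv TCPStore backend on Windows before any rendezvous;
# torch reads USE_LIBUV when setting up the store.
if os.name == "nt":
    os.environ.setdefault("USE_LIBUV", "0")

import deepspeed

deepspeed.init_distributed(dist_backend="gloo")
```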