Commit bbf0752

update weight update process group

1 parent 9f3702f commit bbf0752

4 files changed: +152 -50 lines changed

trl/distributed_util.py

+89

@@ -0,0 +1,89 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from datetime import timedelta
+from typing import Any, Optional, Union
+
+import torch
+import torch.distributed
+from torch.distributed.distributed_c10d import (
+    Backend,
+    PrefixStore,
+    Store,
+    _new_process_group_helper,
+    _world,
+    default_pg_timeout,
+    rendezvous,
+)
+
+
+def init_process_group(
+    backend: Union[str, Backend] = None,
+    init_method: Optional[str] = None,
+    timeout: Optional[timedelta] = None,
+    world_size: int = -1,
+    rank: int = -1,
+    store: Optional[Store] = None,
+    group_name: str = None,
+    pg_options: Optional[Any] = None,
+):
+    """
+    Copy from pytorch to allow creating multiple main groups.
+    https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
+    Reference implementation from: https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
+    """
+    assert (store is None) or (init_method is None), "Cannot specify both init_method and store."
+
+    if store is not None:
+        assert world_size > 0, "world_size must be positive if using store"
+        assert rank >= 0, "rank must be non-negative if using store"
+    elif init_method is None:
+        init_method = "env://"
+
+    if backend:
+        backend = Backend(backend)
+    else:
+        backend = Backend("undefined")
+
+    if timeout is None:
+        timeout = default_pg_timeout
+
+    # backward compatible API
+    if store is None:
+        rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout)
+        store, rank, world_size = next(rendezvous_iterator)
+        store.set_timeout(timeout)
+
+        # Use a PrefixStore to avoid accidental overrides of keys used by
+        # different systems (e.g. RPC) in case the store is multi-tenant.
+        store = PrefixStore(group_name, store)
+
+    # NOTE: The pg_options parameter was renamed into backend_options in PyTorch 2.6.0
+    # https://github.com/pytorch/pytorch/commit/a0c7029a75628cd5fa8df83c0de0ea98ee7fd844
+    # We need to determine the appropriate parameter name based on PyTorch version
+    pg_options_param_name = "backend_options" if str(torch.__version__) >= "2.6" else "pg_options"
+    pg, _ = _new_process_group_helper(
+        world_size,
+        rank,
+        [],
+        backend,
+        store,
+        group_name=group_name,
+        **{pg_options_param_name: pg_options},
+        timeout=timeout,
+    )
+
+    _world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
+
+    return pg
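This helper mirrors `torch.distributed.init_process_group` but returns the created group instead of installing it as the default, so a training process and a vLLM worker can rendezvous over TCP without disturbing their existing groups. A minimal usage sketch follows; the endpoint, world size, and rank are illustrative values, not taken from the commit:

# Minimal sketch: two processes form a standalone weight-update group.
# The endpoint, world_size, and rank below are assumed for illustration.
from trl.distributed_util import init_process_group

pg = init_process_group(
    backend="nccl",                       # "hccl" on Ascend NPUs
    init_method="tcp://192.0.2.1:51216",  # both peers must use the same endpoint
    world_size=2,                         # e.g. one vLLM worker + one client
    rank=0,                               # the peer process would pass rank=1
    group_name="weight_update_group",
)

The call blocks until all `world_size` peers have joined the rendezvous, which is why the client and server sides of this commit initiate it at (roughly) the same time.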

trl/extras/vllm_client.py

+22 -18

@@ -20,6 +20,7 @@
 import torch
 from torch import nn

+from ..distributed_util import init_process_group
 from ..import_utils import is_requests_available, is_vllm_available


@@ -28,11 +29,6 @@
 from requests import ConnectionError


-if is_vllm_available():
-    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-    from vllm.distributed.utils import StatelessProcessGroup
-
-
 logger = logging.getLogger(__name__)


@@ -53,6 +49,8 @@ class VLLMClient:
         connection_timeout (`float`, *optional*, defaults to `0.0`):
             Total timeout duration in seconds to wait for the server to be up. If the server is not up after the
             timeout, a `ConnectionError` is raised.
+        backend (`str`, *optional*, defaults to `"nccl"`):
+            The backend to use for collective communication.

     Examples:
         Run the vLLM server with the model `Qwen/Qwen2.5-7B`:
@@ -80,7 +78,7 @@ class VLLMClient:
     """

     def __init__(
-        self, host: str = "0.0.0.0", server_port: int = 8000, group_port: int = 51216, connection_timeout: float = 0.0
+        self, host: str = "0.0.0.0", server_port: int = 8000, group_port: int = 51216, connection_timeout: float = 0.0, backend: str = "nccl"
     ):
         if not is_requests_available():
             raise ImportError("requests is not installed. Please install it with `pip install requests`.")
@@ -91,9 +89,10 @@ def __init__(
         self.host = host
         self.server_port = server_port
         self.group_port = group_port
+        self.backend = backend
         self.check_server(connection_timeout)  # check server and fail after timeout
-        self.init_communicator()
-        atexit.register(self.close_communicator)  # when the client object is deleted, close the weight update group
+        self.init_weight_update_group()
+        atexit.register(self.close_weight_update_group)  # when the client object is deleted, close the weight update group

     def check_server(self, total_timeout: float = 0.0, retry_interval: float = 2.0):
         """
@@ -188,7 +187,7 @@ def generate(
         else:
             raise Exception(f"Request failed: {response.status_code}, {response.text}")

-    def init_communicator(self):
+    def init_weight_update_group(self):
         """
         Initializes the weight update group in a distributed setup for model synchronization.
         """
@@ -204,15 +203,20 @@ def init_communicator(self):
         self.rank = tensor_parallel_size  # The client's rank is the last process

         # Initialize weight update group
-        url = f"http://{self.host}:{self.server_port}/init_communicator/"
+        url = f"http://{self.host}:{self.server_port}/init_weight_update_group/"
         # In the server side, the host is set to 0.0.0.0
-        response = self.session.post(url, json={"host": "0.0.0.0", "port": self.group_port, "world_size": world_size})
+        response = self.session.post(url, json={"host": "0.0.0.0", "port": self.group_port, "world_size": world_size, "backend": self.backend})
         if response.status_code != 200:
             raise Exception(f"Request failed: {response.status_code}, {response.text}")

         # Set up the communication group for weight broadcasting
-        pg = StatelessProcessGroup.create(host=self.host, port=self.group_port, rank=self.rank, world_size=world_size)
-        self.pynccl_comm = PyNcclCommunicator(pg, device="cuda:0")
+        self.weight_update_group = init_process_group(
+            backend=self.backend,
+            init_method=f"tcp://{self.host}:{self.group_port}",
+            world_size=world_size,
+            rank=self.rank,
+            group_name="weight_update_group",
+        )

     def update_named_param(self, name: str, weights: torch.Tensor):
         """
@@ -231,8 +235,8 @@ def update_named_param(self, name: str, weights: torch.Tensor):
             raise Exception(f"Request failed: {response.status_code}, {response.text}")

         # Broadcast the weights to the other processes
-        self.pynccl_comm.broadcast(weights, src=self.rank, stream=torch.cuda.current_stream())
-        self.pynccl_comm.group.barrier()
+        torch.distributed.broadcast(weights, src=self.rank, group=self.weight_update_group)
+        torch.distributed.barrier()

     def update_model_params(self, model: nn.Module):
         """
@@ -255,11 +259,11 @@ def reset_prefix_cache(self):
         if response.status_code != 200:
             raise Exception(f"Request failed: {response.status_code}, {response.text}")

-    def close_communicator(self):
+    def close_weight_update_group(self):
         """
-        Closes the weight update group and cleans up the communication group.
+        Closes the weight update group and cleans up associated resources.
         """
-        url = f"http://{self.host}:{self.server_port}/close_communicator/"
+        url = f"http://{self.host}:{self.server_port}/close_weight_update_group/"
         response = self.session.post(url)
         if response.status_code != 200:
             raise Exception(f"Request failed: {response.status_code}, {response.text}")
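With these changes the client opens the weight-update group in `__init__` and broadcasts over plain `torch.distributed` calls instead of a `PyNcclCommunicator`. A hedged usage sketch of the updated client, assuming a vLLM server is already running; the model name and addresses are illustrative:

# Sketch: push current trainer weights to a running TRL vLLM server.
# Host, port, and model below are assumed values for illustration.
from transformers import AutoModelForCausalLM
from trl.extras.vllm_client import VLLMClient

client = VLLMClient(host="0.0.0.0", server_port=8000, backend="nccl")  # init_weight_update_group() runs in __init__
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B").to("cuda")
client.update_model_params(model)  # broadcasts each named parameter over the group

Note the client's rank is `tensor_parallel_size` (the last rank in the group), so it acts as the broadcast source while every vLLM worker receives.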

trl/scripts/vllm_serve.py

+38 -30

@@ -20,8 +20,10 @@

 import torch
 import torch.distributed as dist
+from accelerate.utils import is_npu_available

 from trl import TrlParser
+from trl.distributed_util import init_process_group
 from trl.import_utils import is_fastapi_available, is_pydantic_available, is_uvicorn_available, is_vllm_available


@@ -39,11 +41,11 @@

 if is_vllm_available():
     from vllm import LLM, SamplingParams
-    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
     from vllm.distributed.parallel_state import get_world_group
-    from vllm.distributed.utils import StatelessProcessGroup
     from vllm.sampling_params import GuidedDecodingParams
     from vllm.worker.worker import Worker
+    if is_npu_available():
+        from vllm_ascend.worker.worker import NPUWorker as Worker
 else:
     Worker = object

@@ -72,13 +74,13 @@ def __init__(self, *args, **kwargs):

         super().__init__(*args, **kwargs)

-        # The following attributes are initialized when `init_communicator` method is called.
-        self.pynccl_comm = None  # Communicator for weight updates
+        # The following attributes are initialized when the `init_weight_update_group` method is called.
+        self.weight_update_group = None  # Process group for weight updates
         self.client_rank = None  # Source rank for broadcasting updated weights

-    def init_communicator(self, host: str, port: int, world_size: int) -> None:
+    def init_weight_update_group(self, host: str, port: int, world_size: int, backend: str) -> None:
         """
-        Initializes the weight update communicator using a stateless process group.
+        Initializes the weight update process group.

         This method creates a `StatelessProcessGroup` that allows external training processes to
         communicate with vLLM workers without interfering with the global torch distributed group.
@@ -90,18 +92,23 @@ def init_communicator(self, host: str, port: int, world_size: int) -> None:
             Port number to be used for communication.
         world_size (`int`):
             Total number of participating processes in the update group.
+        backend (`str`):
+            The backend to use for collective communication.
         """
-        if self.pynccl_comm is not None:
-            raise RuntimeError("Weight update group already initialized. Call close_communicator first.")
+        if self.weight_update_group is not None:
+            raise RuntimeError("Weight update group already initialized. Call close_weight_update_group first.")

         # Get the rank of the current worker in the global world group.
         rank = get_world_group().rank

         # Create a stateless process group to manage communication between training processes and vLLM workers.
-        pg = StatelessProcessGroup.create(host=host, port=port, rank=rank, world_size=world_size)
-
-        # Initialize the NCCL-based communicator for weight synchronization.
-        self.pynccl_comm = PyNcclCommunicator(pg, device=self.device)
+        self.weight_update_group = init_process_group(
+            backend=backend,
+            init_method=f"tcp://{host}:{port}",
+            world_size=world_size,
+            rank=rank,
+            group_name="weight_update_group",
+        )

         # The client process that sends updated weights has the highest rank (world_size - 1).
         self.client_rank = world_size - 1
@@ -118,29 +125,28 @@ def update_named_param(self, name: str, dtype: torch.dtype, shape: Sequence[int]
             shape (`Sequence[int]`):
                 Shape of the weight tensor.
         """
-        if self.pynccl_comm is None:
-            raise RuntimeError("Communicator not initialized. Call `init_communicator` first.")
+        if self.weight_update_group is None:
+            raise RuntimeError("Weight update group not initialized. Call `init_weight_update_group` first.")

         # Allocate memory for the incoming weight tensor on the correct device.
         weight = torch.empty(shape, dtype=dtype, device=self.device)

-        # Use NCCL to broadcast the updated weights from the client (src) to all workers.
-        self.pynccl_comm.broadcast(weight, src=self.client_rank, stream=torch.cuda.current_stream())
-        self.pynccl_comm.group.barrier()
+        # Broadcast the updated weights from the client (src) to all workers.
+        torch.distributed.broadcast(weight, src=self.client_rank, group=self.weight_update_group)

         # Load the received weights into the model.
         self.model_runner.model.load_weights(weights=[(name, weight)])

-    def close_communicator(self) -> None:
+    def close_weight_update_group(self) -> None:
         """
         Closes the communicator when weight synchronization is no longer needed.

         This method deletes the NCCL communicator to release associated resources.
         """

-        if self.pynccl_comm is not None:
-            del self.pynccl_comm
-            self.pynccl_comm = None  # Ensure attribute is reset to None
+        if self.weight_update_group is not None:
+            del self.weight_update_group
+            self.weight_update_group = None  # Ensure attribute is reset to None
         self.client_rank = None  # Ensure attribute is reset to None


@@ -345,13 +351,15 @@ async def generate(request: GenerateRequest):
         completion_ids = [list(output.token_ids) for outputs in all_outputs for output in outputs.outputs]
         return {"completion_ids": completion_ids}

-    class InitCommunicatorRequest(BaseModel):
+    class InitWeightUpdateGroupRequest(BaseModel):
         host: str
         port: int
         world_size: int
+        backend: str
+

-    @app.post("/init_communicator/")
-    async def init_communicator(request: InitCommunicatorRequest, background_tasks: BackgroundTasks):
+    @app.post("/init_weight_update_group/")
+    async def init_weight_update_group(request: InitWeightUpdateGroupRequest, background_tasks: BackgroundTasks):
         """
         Initializes the communicator for synchronizing model weights between a client and multiple server
         workers.
@@ -364,8 +372,8 @@ async def init_communicator(request: InitCommunicatorRequest, background_tasks:
         """
         background_tasks.add_task(
             llm.collective_rpc,
-            "init_communicator",
-            args=(request.host, request.port, script_args.tensor_parallel_size + 1),
+            "init_weight_update_group",
+            args=(request.host, request.port, script_args.tensor_parallel_size + 1, request.backend),
         )
         return {"message": "Request received, initializing communicator"}

@@ -406,13 +414,13 @@ async def reset_prefix_cache():
         success = llm.llm_engine.reset_prefix_cache()
         return {"message": "Request received, resetting prefix cache status: " + str(success)}

-    @app.post("/close_communicator/")
-    async def close_communicator():
+    @app.post("/close_weight_update_group/")
+    async def close_weight_update_group():
         """
         Closes the weight update group and cleans up associated resources.
         """
-        llm.collective_rpc("close_communicator")
-        return {"message": "Request received, closing communicator"}
+        llm.collective_rpc("close_weight_update_group")
+        return {"message": "Request received, closing weight update group"}

     # Start the server
     uvicorn.run(app, host=script_args.host, port=script_args.port)
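Because the endpoint names and request schema changed, any out-of-tree client must now send the new `backend` field. A sketch of the raw HTTP call that `VLLMClient` issues under the hood; the addresses and sizes are illustrative (the server itself derives the group size as `tensor_parallel_size + 1`, ignoring the posted `world_size`):

# Sketch: hand-rolled request against the renamed endpoint.
# For a server running with tensor parallel size 2, the group has 3 members
# (two vLLM workers plus the one client process).
import requests

response = requests.post(
    "http://0.0.0.0:8000/init_weight_update_group/",
    json={"host": "0.0.0.0", "port": 51216, "world_size": 3, "backend": "nccl"},
)
print(response.json())  # expected: {"message": "Request received, initializing communicator"}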

trl/trainer/grpo_trainer.py

+3 -2

@@ -22,7 +22,7 @@
 import torch
 import torch.utils.data
 import transformers
-from accelerate.utils import broadcast_object_list, gather, gather_object, is_peft_model, set_seed
+from accelerate.utils import broadcast_object_list, gather, gather_object, is_npu_available, is_peft_model, set_seed
 from datasets import Dataset, IterableDataset
 from packaging import version
 from torch import nn
@@ -474,8 +474,9 @@ def data_collator(features):  # No data collation is needed in GRPO
            )

        if self.accelerator.is_main_process:
+            backend = "hccl" if is_npu_available() else "nccl"
            self.vllm_client = VLLMClient(
-                args.vllm_server_host, args.vllm_server_port, connection_timeout=args.vllm_server_timeout
+                args.vllm_server_host, args.vllm_server_port, connection_timeout=args.vllm_server_timeout, backend=backend
            )

        # vLLM specific sampling arguments
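The trainer picks the collective backend from the detected hardware, which is what makes the NPU path work end to end. The same one-liner could be reused in a custom training loop; a minimal sketch with illustrative host/port/timeout values:

# Sketch: hardware-aware backend selection, mirroring the trainer change above.
from accelerate.utils import is_npu_available
from trl.extras.vllm_client import VLLMClient

backend = "hccl" if is_npu_available() else "nccl"  # Ascend NPUs use HCCL
client = VLLMClient("0.0.0.0", 8000, connection_timeout=120.0, backend=backend)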
