EricLBuehler · EricLBuehler · Feb 11, 2025 · Feb 11, 2025
diff --git a/docs/DISTRIBUTED.md b/docs/DISTRIBUTED.md
@@ -10,6 +10,8 @@ TP splits the model into shards and benefits from fast single-node interconnects
 
 > Note: In mistral.rs, if NCCL is enabled, then automatic device mapping *will not* be used.
 
+**Important**: To build for NCCL, be sure to add the `nccl` feature flag (for example: `--features nccl,cuda`).
+
 See the following environment variables:
 
 |Name|Function|Usage|

diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml
@@ -30,3 +30,4 @@ metal = ["mistralrs-core/metal"]
 flash-attn = ["cuda", "mistralrs-core/flash-attn"]
 accelerate = ["mistralrs-core/accelerate"]
 mkl = ["mistralrs-core/mkl"]
+nccl = ["mistralrs-core/nccl"]
diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml
@@ -88,7 +88,6 @@ safetensors.workspace = true
 pyo3_macros = ["pyo3"]
 cuda = [
     "candle-core/cuda",
-    "candle-core/nccl",
     "candle-nn/cuda",
     "dep:bindgen_cuda",
     "mistralrs-quant/cuda",
@@ -110,6 +109,7 @@ flash-attn = ["cuda", "dep:candle-flash-attn"]
 flash-attn-v3 = ["cuda", "dep:candle-flash-attn-v3"]
 accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "mistralrs-quant/accelerate"]
 mkl = ["candle-core/mkl", "candle-nn/mkl"]
+nccl = ["cuda", "mistralrs-quant/nccl"]
 
 [build-dependencies]
 bindgen_cuda = { version = "0.1.5", optional = true }
diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs
@@ -301,7 +301,8 @@ impl Loader for NormalLoader {
         let use_nccl = available_devices.iter().all(|dev| dev.is_cuda())
             && available_devices.len() > 1
             && (std::env::var("MISTRALRS_NO_NCCL").is_err()
-                || std::env::var("MISTRALRS_NO_NCCL").is_ok_and(|x| x != "1"));
+                || std::env::var("MISTRALRS_NO_NCCL").is_ok_and(|x| x != "1"))
+            && cfg!(feature = "nccl");
 
         // If auto, convert to Map if not using nccl
         if use_nccl {
@@ -463,6 +464,11 @@ impl Loader for NormalLoader {
         let multi_progress = Arc::new(MultiProgress::new());
 
         let mut parallel_models = if use_nccl {
+            #[cfg(not(feature = "nccl"))]
+            warn!(
+                "NCCL support was not included in the build, be sure to build with `--features nccl`."
+            );
+
             // NCCL case!
 
             let pipeline_parallel_size = std::env::var("MISTRALRS_PIPELINE_PARALLEL")

diff --git a/mistralrs-quant/Cargo.toml b/mistralrs-quant/Cargo.toml
@@ -32,7 +32,12 @@ safetensors.workspace = true
 regex.workspace = true
 
 [features]
-cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda"]
+cuda = [
+    "candle-core/cuda",
+    "candle-nn/cuda",
+    "dep:bindgen_cuda"
+]
+nccl = ["cuda", "candle-core/nccl"]
 metal = ["candle-core/metal", "candle-nn/metal", "dep:metal"]
 accelerate = ["candle-core/accelerate", "candle-nn/accelerate"]
 

diff --git a/mistralrs-quant/src/distributed/mod.rs b/mistralrs-quant/src/distributed/mod.rs
@@ -16,7 +16,7 @@ impl BarrierLike for Barrier {
     }
 }
 
-#[cfg(feature = "cuda")]
+#[cfg(all(feature = "cuda", feature = "nccl"))]
 mod ops {
     use std::{fmt::Debug, ops::Deref, sync::Arc};
 
@@ -165,7 +165,7 @@ mod ops {
     }
 }
 
-#[cfg(not(feature = "cuda"))]
+#[cfg(not(all(feature = "cuda", feature = "nccl")))]
 mod ops {
     use std::sync::Arc;
 

diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml
@@ -45,3 +45,4 @@ metal = ["mistralrs-core/metal"]
 flash-attn = ["cuda", "mistralrs-core/flash-attn"]
 accelerate = ["mistralrs-core/accelerate"]
 mkl = ["mistralrs-core/mkl"]
+nccl = ["mistralrs-core/nccl"]
diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml
@@ -34,6 +34,7 @@ metal = ["mistralrs-core/metal"]
 flash-attn = ["cuda", "mistralrs-core/flash-attn"]
 accelerate = ["mistralrs-core/accelerate"]
 mkl = ["mistralrs-core/mkl"]
+nccl = ["mistralrs-core/nccl"]
 
 [[example]]
 name = "simple"