EricLBuehler · EricLBuehler · Sep 3, 2024 · Sep 3, 2024
diff --git a/docs/PHI3.5MOE.md b/docs/PHI3.5MOE.md
@@ -83,6 +83,7 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> {
             use_flash_attn: false,
             prompt_batchsize: None,
             topology: None,
+            organization: Default::default(),
         },
         None,
         None,

diff --git a/examples/python/mixture_of_quant_experts.py b/examples/python/mixture_of_quant_experts.py
@@ -0,0 +1,25 @@
+from mistralrs import Runner, Which, ChatCompletionRequest, Architecture
+
+runner = Runner(
+    which=Which.Plain(
+        model_id="microsoft/Phi-3.5-MoE-instruct",
+        arch=Architecture.Mistral,
+        organization="moqe",
+    ),
+    in_situ_quant="Q4K",
+)
+
+res = runner.send_chat_completion_request(
+    ChatCompletionRequest(
+        model="mistral",
+        messages=[
+            {"role": "user", "content": "Tell me a story about the Rust type system."}
+        ],
+        max_tokens=256,
+        presence_penalty=1.0,
+        top_p=0.1,
+        temperature=0.1,
+    )
+)
+print(res.choices[0].message.content)
+print(res.usage)
diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs
@@ -68,8 +68,8 @@ pub use paged_attention::{MemoryGpuConfig, PagedAttentionConfig};
 pub use pipeline::{
     chat_template::ChatTemplate, parse_isq_value, AnyMoeLoader, AnyMoePipeline, GGMLLoader,
     GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig,
-    GemmaLoader, Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader,
-    LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader,
+    GemmaLoader, Idefics2Loader, IsqOrganization, LLaVALoader, LLaVANextLoader, LlamaLoader,
+    Loader, LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader,
     NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader,
     Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline,
     Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType,

diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs
@@ -121,11 +121,13 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
             arch,
             dtype: _,
             topology,
+            organization,
         } => NormalLoaderBuilder::new(
             NormalSpecificConfig {
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: organization.unwrap_or_default(),
             },
             args.chat_template,
             tokenizer_json,
@@ -146,6 +148,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: Default::default(),
             },
             args.chat_template,
             tokenizer_json,
@@ -174,6 +177,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: Default::default(),
             },
             args.chat_template,
             tokenizer_json,

diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs
@@ -1,7 +1,7 @@
 use clap::Subcommand;
 
 use crate::{
-    pipeline::{NormalLoaderType, VisionLoaderType},
+    pipeline::{IsqOrganization, NormalLoaderType, VisionLoaderType},
     ModelDType,
 };
 
@@ -47,6 +47,10 @@
         /// Path to a topology YAML file.
         #[arg(long)]
         topology: Option<String>,
+
+        /// ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
+        #[arg(short, long)]
+        organization: Option<IsqOrganization>,
     },
 
     /// Select an X-LoRA architecture

diff --git a/mistralrs-core/src/models/phi3_5_moe.rs b/mistralrs-core/src/models/phi3_5_moe.rs
@@ -692,6 +692,23 @@ impl IsqModel for Model {
         }
         (tensors, &*self.mapper)
     }
+    /// MoQE tensors: `lm_head` (no layer index) and each expert's `w1`/`w2`/`w3` per layer.
+    fn get_layers_moe_experts_only(
+        &mut self,
+    ) -> (
+        Vec<(&mut Arc<dyn QuantMethod>, Option<usize>)>,
+        &dyn DeviceMapper,
+    ) {
+        let mut tensors = vec![(&mut self.lm_head, None)];
+        for (layer_idx, layer) in self.layers.iter_mut().enumerate() {
+            for expert in &mut layer.mlp.experts {
+                tensors.push((&mut expert.w1, Some(layer_idx)));
+                tensors.push((&mut expert.w2, Some(layer_idx)));
+                tensors.push((&mut expert.w3, Some(layer_idx)));
+            }
+        }
+        (tensors, &*self.mapper)
+    }
 }
 
 impl NormalModel for Model {

diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs
@@ -1,12 +1,14 @@
 use std::{
     collections::HashSet,
+    str::FromStr,
     sync::{atomic::AtomicUsize, Arc},
     time::Instant,
 };
 
 use candle_core::Device;
 use indicatif::{ProgressBar, ProgressStyle};
 use mistralrs_quant::{IsqType, QuantMethod};
+use serde::Deserialize;
 use tracing::info;
 
 use crate::{device_map::DeviceMapper, topology::LayerTopology, Topology};
@@ -75,24 +77,66 @@
     Ok(tp)
 }
 
+#[derive(Clone, Debug, Copy, Default, Deserialize)]
+pub enum IsqOrganization {
+    #[default]
+    #[serde(rename = "default")]
+    Default,
+    /// Only quantize MoE experts, if applicable. This enables MoQE (Mixture of Quantized Experts):
+    /// <https://arxiv.org/abs/2310.02410>
+    #[serde(rename = "moqe")]
+    MoeExpertsOnly,
+}
+
+impl FromStr for IsqOrganization {
+    type Err = String; // human-readable message listing the accepted spellings
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "default" => Ok(Self::Default), // same spelling as the serde rename
+            "moqe" => Ok(Self::MoeExpertsOnly), // same spelling as the serde rename
+            other => Err(format!(
+                "Expected ISQ organization `default` or `moqe`, got `{other}`"
+            )),
+        }
+    }
+}
+
 pub trait IsqModel {
+    /// Layers selected for `IsqOrganization::Default`, returned together with the device mapper.
     #[allow(clippy::type_complexity)]
     fn get_layers(
         &mut self,
     ) -> (
         Vec<(&mut Arc<dyn QuantMethod>, Option<usize>)>,
         &dyn DeviceMapper,
     );
+
+    /// `IsqOrganization::MoeExpertsOnly` layers; defaults to `get_layers` unless overridden.
+    /// <https://arxiv.org/abs/2310.02410>
+    #[allow(clippy::type_complexity)]
+    fn get_layers_moe_experts_only(
+        &mut self,
+    ) -> (
+        Vec<(&mut Arc<dyn QuantMethod>, Option<usize>)>,
+        &dyn DeviceMapper,
+    ) {
+        self.get_layers()
+    }
+
     /// Quantize the model in-situ.
     fn quantize(
         &mut self,
         dtype: Option<IsqType>,
         device: Device,
         topology: Option<&Topology>,
         silent: bool,
+        organization: IsqOrganization,
     ) -> candle_core::Result<()> {
         {
-            let (tensors, mapper) = self.get_layers();
+            let (tensors, mapper) = match organization {
+                IsqOrganization::Default => self.get_layers(),
+                IsqOrganization::MoeExpertsOnly => self.get_layers_moe_experts_only(),
+            };
             let total_tensors = tensors.len();
             let n_quantized = AtomicUsize::new(0);
             if let Some(topology) = topology {

diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
@@ -22,7 +22,7 @@ use chat_template::ChatTemplate;
 pub use ggml::{GGMLLoader, GGMLLoaderBuilder, GGMLSpecificConfig};
 pub use gguf::{GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig};
 pub use inputs_processor::InputProcessorOutput;
-pub use isq::{parse_isq_value, IsqModel};
+pub use isq::{parse_isq_value, IsqModel, IsqOrganization};
 pub use loaders::{
     AdapterKind, AutoLoader, Gemma2Loader, GemmaLoader, Idefics2Loader, LLaVALoader,
     LLaVANextLoader, LlamaLoader, Loader, LocalModelPaths, MistralLoader, MixtralLoader, ModelKind,

diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs
@@ -6,7 +6,7 @@ use super::{
 };
 use super::{
     AdapterActivationMixin, AnyMoePipelineMixin, CacheManagerMixin, ForwardInputsResult,
-    IsqPipelineMixin, MetadataMixin, ModelCategory, PreProcessingMixin,
+    IsqOrganization, IsqPipelineMixin, MetadataMixin, ModelCategory, PreProcessingMixin,
 };
 use super::{
     AutoLoader, Gemma2Loader, GemmaLoader, LlamaLoader, MistralLoader, MixtralLoader,
@@ -58,6 +58,7 @@ pub struct NormalPipeline {
     metadata: Arc<GeneralMetadata>,
     topology: Option<Topology>,
     silent: bool,
+    organization: IsqOrganization,
 }
 
 /// A loader for a "normal" (non-quantized) model.
@@ -94,6 +95,7 @@ pub struct NormalSpecificConfig {
     pub use_flash_attn: bool,
     pub prompt_batchsize: Option<NonZeroUsize>,
     pub topology: Option<Topology>,
+    pub organization: IsqOrganization,
 }
 
 impl NormalLoaderBuilder {
@@ -348,6 +350,7 @@ impl Loader for NormalLoader {
                 device.clone(),
                 self.config.topology.as_ref(),
                 silent,
+                self.config.organization,
             )?;
         }
 
@@ -406,6 +409,7 @@ impl Loader for NormalLoader {
             }),
             topology: self.config.topology.clone(),
             silent,
+            organization: self.config.organization,
         })))
     }
 
@@ -434,7 +438,13 @@ impl IsqPipelineMixin for NormalPipeline {
     fn re_isq_model(&mut self, dtype: IsqType) -> Result<()> {
         let device = self.device().clone();
         self.model
-            .quantize(Some(dtype), device, self.topology.as_ref(), self.silent)
+            .quantize(
+                Some(dtype),
+                device,
+                self.topology.as_ref(),
+                self.silent,
+                self.organization,
+            )
             .map_err(anyhow::Error::msg)
     }
 }

diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs
@@ -11,7 +11,7 @@ use crate::aici::toktree::TokTrie;
 use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
 use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
 use crate::pipeline::sampling::sample_and_add_toks;
-use crate::pipeline::{get_chat_template, ChatTemplate, LocalModelPaths};
+use crate::pipeline::{get_chat_template, ChatTemplate, IsqOrganization, LocalModelPaths};
 use crate::prefix_cacher::PrefixCacheManager;
 use crate::sequence::Sequence;
 use crate::utils::debug::DeviceRepr;
@@ -267,6 +267,7 @@ impl Loader for VisionLoader {
                 device.clone(),
                 self.config.topology.as_ref(),
                 silent,
+                IsqOrganization::Default,
             )?;
         }
 
@@ -345,7 +346,13 @@ impl IsqPipelineMixin for VisionPipeline {
     fn re_isq_model(&mut self, dtype: IsqType) -> Result<()> {
         let device = self.device().clone();
         self.model
-            .quantize(Some(dtype), device, self.topology.as_ref(), self.silent)
+            .quantize(
+                Some(dtype),
+                device,
+                self.topology.as_ref(),
+                self.silent,
+                IsqOrganization::Default,
+            )
             .map_err(anyhow::Error::msg)
     }
 }

diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs
@@ -3,10 +3,11 @@ use std::{fs::File, num::NonZeroUsize};
 use serde::Deserialize;
 
 use crate::{
-    amoe::AnyMoeConfig, AnyMoeLoader, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder,
-    GGUFSpecificConfig, Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType,
-    NormalSpecificConfig, SpeculativeConfig, SpeculativeLoader, Topology, VisionLoaderBuilder,
-    VisionLoaderType, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER,
+    amoe::AnyMoeConfig, pipeline::IsqOrganization, AnyMoeLoader, GGMLLoaderBuilder,
+    GGMLSpecificConfig, GGUFLoaderBuilder, GGUFSpecificConfig, Loader, ModelDType,
+    NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, SpeculativeConfig,
+    SpeculativeLoader, Topology, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
+    GGUF_MULTI_FILE_DELIMITER,
 };
 
 fn default_one() -> usize {
@@ -38,6 +39,9 @@ pub enum TomlModelSelected {
 
         /// Path to a topology YAML file.
         topology: Option<String>,
+
+        /// ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
+        organization: Option<IsqOrganization>,
     },
 
     /// Select an X-LoRA architecture
@@ -344,11 +348,13 @@ fn loader_from_selected(
             arch,
             dtype: _,
             topology,
+            organization,
         } => NormalLoaderBuilder::new(
             NormalSpecificConfig {
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: organization.unwrap_or_default(),
             },
             args.chat_template,
             args.tokenizer_json,
@@ -368,6 +374,7 @@ fn loader_from_selected(
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: Default::default(),
             },
             args.chat_template,
             args.tokenizer_json,
@@ -395,6 +402,7 @@ fn loader_from_selected(
                 use_flash_attn,
                 prompt_batchsize: args.prompt_batchsize,
                 topology: Topology::from_option_path(topology)?,
+                organization: Default::default(),
             },
             args.chat_template,
             args.tokenizer_json,

diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md
@@ -46,6 +46,8 @@ class Which(Enum):
         arch: Architecture | None = None
         tokenizer_json: str | None = None
         topology: str | None = None
+        # ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
+        organization: str | None = None
 
     @dataclass
     class XLora:

diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi
@@ -101,6 +101,7 @@ class Which(Enum):
         arch: Architecture | None = None
         tokenizer_json: str | None = None
         topology: str | None = None
+        organization: str | None = None
 
     @dataclass
     class XLora: