Tweak handling when PA cannot allocate #632

Merged · 3 commits · Jul 26, 2024
5 changes: 5 additions & 0 deletions mistralrs-bench/Cargo.toml
@@ -26,3 +26,8 @@ cli-table = "0.4.7"
 
 [features]
 cuda = ["mistralrs-core/cuda"]
+cudnn = ["mistralrs-core/cudnn"]
+metal = ["mistralrs-core/metal"]
+flash-attn = ["cuda", "mistralrs-core/flash-attn"]
+accelerate = ["mistralrs-core/accelerate"]
+mkl = ["mistralrs-core/mkl"]
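
These feature flags just forward to the corresponding mistralrs-core features, so mistralrs-bench can now be compiled against the same backends as the core crate. For example (assuming a standard workspace checkout), a flash-attention build of the bench binary would look something like: cargo build --release -p mistralrs-bench --features flash-attn.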
23 changes: 12 additions & 11 deletions mistralrs-core/src/dummy_paged_attention/block_engine.rs
@@ -231,18 +231,19 @@ impl BlockEngine {
     }
 
     pub fn free_sequence(&mut self, id: usize) {
-        let block_table = self.block_tables.get(&id).unwrap();
-
-        // Free from block table
-        for block in block_table {
-            if block.deref_mut().is_gpu {
-                self.gpu_allocator.free_block(block.clone())
-            } else {
-                self.cpu_allocator.free_block(block.clone())
+        // Handle double free if run out of tokens
+        if let Some(block_table) = self.block_tables.get(&id) {
+            // Free from block table
+            for block in block_table {
+                if block.deref_mut().is_gpu {
+                    self.gpu_allocator.free_block(block.clone())
+                } else {
+                    self.cpu_allocator.free_block(block.clone())
+                }
             }
         }
 
-        self.block_tables.remove(&id);
+        self.block_tables.remove(&id);
     }
 }
 
 #[allow(dead_code)]
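
The free_sequence change swaps an unconditional unwrap() for an if let, so freeing a sequence whose block table is already gone becomes a no-op instead of a panic. A minimal, self-contained sketch of that idempotent-free pattern, using a plain Vec<u32> where the real code stores shared block handles (the types here are stand-ins, not the crate's actual definitions):

    use std::collections::HashMap;

    struct BlockEngine {
        // Stand-in for the real block table; the crate stores shared block handles.
        block_tables: HashMap<usize, Vec<u32>>,
    }

    impl BlockEngine {
        fn free_sequence(&mut self, id: usize) {
            // `get` returns an Option, so a second free of the same id falls through.
            if let Some(blocks) = self.block_tables.get(&id) {
                for _block in blocks {
                    // Return each block to its CPU/GPU allocator in the real code.
                }
            }
            self.block_tables.remove(&id);
        }
    }

    fn main() {
        let mut engine = BlockEngine {
            block_tables: HashMap::from([(0, vec![1, 2, 3])]),
        };
        engine.free_sequence(0);
        engine.free_sequence(0); // double free: previously a panic on unwrap(), now a no-op
    }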
@@ -300,7 +301,7 @@ impl BlockEngine {
         &mut self,
         sequence: &impl BlockEngineSequence,
     ) -> Option<(usize, usize)> {
-        let table = self.block_tables.get_mut(&sequence.get_id()).unwrap();
+        let table = self.block_tables.get_mut(&sequence.get_id())?;
 
         match sequence.blocks_to_add_new_tok() {
             1 => {
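
The second hunk applies the same idea one level up: the enclosing method returns Option<(usize, usize)>, so replacing unwrap() with the ? operator propagates a missing block table to the caller as None rather than panicking mid-schedule. A tiny illustration of ? on Option, with a hypothetical table_len helper:

    use std::collections::HashMap;

    // Hypothetical helper: `?` early-returns None when `id` has no table.
    fn table_len(tables: &HashMap<usize, Vec<u32>>, id: usize) -> Option<usize> {
        let table = tables.get(&id)?;
        Some(table.len())
    }

    fn main() {
        let tables = HashMap::from([(7, vec![1, 2])]);
        assert_eq!(table_len(&tables, 7), Some(2));
        assert_eq!(table_len(&tables, 8), None); // unwrap() here would have panicked
    }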
3 changes: 3 additions & 0 deletions mistralrs-core/src/dummy_paged_attention/mod.rs
@@ -95,6 +95,9 @@ pub fn calculate_cache_config(
 
     let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
     let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
+    if num_gpu_blocks == 0 {
+        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory allocated or reduce the memory utilization.");
+    }
     info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
     Ok(CacheConfig {
         block_size,
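
The new guard in calculate_cache_config matters because the MB-to-blocks conversion is truncating integer division: a memory budget smaller than one KV-cache block silently became zero blocks, i.e. zero available context, before this change. A rough sketch of the arithmetic, using anyhow as the diff itself does (bytes_per_block is illustrative; the real mb_to_blocks! macro derives it from the model config):

    const SIZE_IN_MB: usize = 1024 * 1024;

    // Illustrative: the real macro computes the per-block byte count from
    // block size, layer count, KV heads, head dim, and dtype size.
    fn num_gpu_blocks(mem_gpu_mb: usize, bytes_per_block: usize) -> anyhow::Result<usize> {
        let blocks = (mem_gpu_mb * SIZE_IN_MB) / bytes_per_block; // truncates toward zero
        if blocks == 0 {
            anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory.");
        }
        Ok(blocks)
    }

    fn main() -> anyhow::Result<()> {
        assert_eq!(num_gpu_blocks(512, 2 * 1024 * 1024)?, 256); // 512 MB / 2 MB blocks
        assert!(num_gpu_blocks(1, 2 * 1024 * 1024).is_err()); // under one block: bail
        Ok(())
    }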
17 changes: 11 additions & 6 deletions mistralrs-core/src/dummy_paged_attention/scheduler.rs
@@ -82,23 +82,28 @@ impl PagedAttentionScheduler {
             match can_allocate {
                 AllocStatus::Later => break, // If we can only allocate later, do not bother iterating over the rest.
                 AllocStatus::Impossible => {
+                    let id = *get_mut_arcmutex!(seq).id();
+                    let len = get_mut_arcmutex!(seq).get_toks().len();
                     warn!(
-                        "Input prompt with length of {} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
-                        get_mut_arcmutex!(seq).prompt_tokens()
+                        "Sequence {id} with length of {len} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
                     );
                     get_mut_arcmutex!(seq).set_state(SequenceState::FinishedIgnored);
                     did_ignore = true;
                 }
                 _ => {}
             }
 
-            get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
-            let seq_handle = get_mut_arcmutex!(seq);
-            self._allocate(&seq_handle);
+            if !did_ignore {
+                get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
+                let seq_handle = get_mut_arcmutex!(seq);
+                self._allocate(&seq_handle);
+            }
 
             let seq = self.waiting.pop_front().unwrap();
             self.running.push_back(seq.clone());
-            scheduled.push_back(seq);
+            if !did_ignore {
+                scheduled.push_back(seq);
+            }
         }
 
         // If we did schedule, or we ignored sequences.
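
The scheduler fix hinges on doing three things to an Impossible sequence: mark it FinishedIgnored, still pop it off the waiting queue so the loop makes progress, but give it neither a block allocation nor a slot in the scheduled output. A stripped-down sketch of that control flow, with plain types instead of Arc/Mutex sequence handles and the ignore flag handled per iteration:

    use std::collections::VecDeque;

    enum AllocStatus {
        Ok,
        Later,
        Impossible,
    }

    fn schedule_prompts(
        waiting: &mut VecDeque<u64>,
        running: &mut Vec<u64>,
        can_allocate: impl Fn(u64) -> AllocStatus,
    ) -> Vec<u64> {
        let mut scheduled = Vec::new();
        while let Some(&id) = waiting.front() {
            let mut did_ignore = false;
            match can_allocate(id) {
                AllocStatus::Later => break, // not enough free blocks yet; retry next pass
                AllocStatus::Impossible => did_ignore = true, // FinishedIgnored in the real code
                AllocStatus::Ok => {}
            }
            if !did_ignore {
                // Allocate KV blocks and mark the sequence RunningPrompt here.
            }
            let id = waiting.pop_front().unwrap(); // always consume the queue entry
            running.push(id);
            if !did_ignore {
                scheduled.push(id);
            }
        }
        scheduled
    }

    fn main() {
        let mut waiting = VecDeque::from([1u64, 2, 3]);
        let mut running = Vec::new();
        // Pretend sequence 2 is too long to ever fit.
        let out = schedule_prompts(&mut waiting, &mut running, |id| {
            if id == 2 { AllocStatus::Impossible } else { AllocStatus::Ok }
        });
        assert_eq!(out, vec![1, 3]);
    }

For context, dummy_paged_attention appears to be the stub counterpart of paged_attention for builds where PagedAttention is compiled out, which is why the same three files repeat below with identical changes.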
23 changes: 12 additions & 11 deletions mistralrs-core/src/paged_attention/block_engine.rs
@@ -231,18 +231,19 @@ impl BlockEngine {
     }
 
     pub fn free_sequence(&mut self, id: usize) {
-        let block_table = self.block_tables.get(&id).unwrap();
-
-        // Free from block table
-        for block in block_table {
-            if block.deref_mut().is_gpu {
-                self.gpu_allocator.free_block(block.clone())
-            } else {
-                self.cpu_allocator.free_block(block.clone())
+        // Handle double free if run out of tokens
+        if let Some(block_table) = self.block_tables.get(&id) {
+            // Free from block table
+            for block in block_table {
+                if block.deref_mut().is_gpu {
+                    self.gpu_allocator.free_block(block.clone())
+                } else {
+                    self.cpu_allocator.free_block(block.clone())
+                }
             }
         }
 
-        self.block_tables.remove(&id);
+        self.block_tables.remove(&id);
     }
 }
 
 #[allow(dead_code)]
@@ -300,7 +301,7 @@ impl BlockEngine {
         &mut self,
         sequence: &impl BlockEngineSequence,
     ) -> Option<(usize, usize)> {
-        let table = self.block_tables.get_mut(&sequence.get_id()).unwrap();
+        let table = self.block_tables.get_mut(&sequence.get_id())?;
 
         match sequence.blocks_to_add_new_tok() {
             1 => {
3 changes: 3 additions & 0 deletions mistralrs-core/src/paged_attention/mod.rs
@@ -99,6 +99,9 @@ pub fn calculate_cache_config(
 
     let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
     let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
+    if num_gpu_blocks == 0 {
+        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory allocated or reduce the memory utilization.");
+    }
     info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
     Ok(CacheConfig {
         block_size,
17 changes: 11 additions & 6 deletions mistralrs-core/src/paged_attention/scheduler.rs
@@ -82,23 +82,28 @@ impl PagedAttentionScheduler {
             match can_allocate {
                 AllocStatus::Later => break, // If we can only allocate later, do not bother iterating over the rest.
                 AllocStatus::Impossible => {
+                    let id = *get_mut_arcmutex!(seq).id();
+                    let len = get_mut_arcmutex!(seq).get_toks().len();
                     warn!(
-                        "Input prompt with length of {} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
-                        get_mut_arcmutex!(seq).prompt_tokens()
+                        "Sequence {id} with length of {len} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
                     );
                     get_mut_arcmutex!(seq).set_state(SequenceState::FinishedIgnored);
                     did_ignore = true;
                 }
                 _ => {}
             }
 
-            get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
-            let seq_handle = get_mut_arcmutex!(seq);
-            self._allocate(&seq_handle);
+            if !did_ignore {
+                get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
+                let seq_handle = get_mut_arcmutex!(seq);
+                self._allocate(&seq_handle);
+            }
 
             let seq = self.waiting.pop_front().unwrap();
             self.running.push_back(seq.clone());
-            scheduled.push_back(seq);
+            if !did_ignore {
+                scheduled.push_back(seq);
+            }
         }
 
         // If we did schedule, or we ignored sequences.
2 changes: 1 addition & 1 deletion mistralrs-core/src/vision_models/clip.rs
@@ -258,7 +258,7 @@ impl ClipEncoder {
         let vs = vs.pp("layers");
         let mut layers: Vec<ClipEncoderLayer> = Vec::new();
         for index in 0..c.num_hidden_layers {
-            let layer = ClipEncoderLayer::new(vs.pp(&index.to_string()), c)?;
+            let layer = ClipEncoderLayer::new(vs.pp(index.to_string()), c)?;
             layers.push(layer)
        }
         Ok(ClipEncoder { layers })
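
This last hunk is a lint cleanup rather than a behavior change: VarBuilder::pp in candle is, as far as I can tell, generic over ToString, so borrowing the freshly built String is redundant and trips clippy's needless_borrow. A tiny illustration with a hypothetical takes_name function:

    // Hypothetical function generic over ToString, in the style of VarBuilder::pp.
    fn takes_name(s: impl ToString) -> String {
        s.to_string()
    }

    fn main() {
        let index = 3;
        let borrowed = takes_name(&index.to_string()); // compiles, but clippy flags the borrow
        let owned = takes_name(index.to_string()); // preferred: pass the String by value
        assert_eq!(borrowed, owned);
    }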