Tweak handling when PA cannot allocate #632

Merged · 3 commits · Jul 26, 2024
5 changes: 5 additions & 0 deletions mistralrs-bench/Cargo.toml
@@ -26,3 +26,8 @@ cli-table = "0.4.7"
 
 [features]
 cuda = ["mistralrs-core/cuda"]
+cudnn = ["mistralrs-core/cudnn"]
+metal = ["mistralrs-core/metal"]
+flash-attn = ["cuda", "mistralrs-core/flash-attn"]
+accelerate = ["mistralrs-core/accelerate"]
+mkl = ["mistralrs-core/mkl"]
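
These feature flags just forward to the corresponding mistralrs-core features, so mistralrs-bench can now be compiled against the same backends as the core crate. For example (assuming a standard workspace checkout), a flash-attention build of the bench binary would look something like: cargo build --release -p mistralrs-bench --features flash-attn.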
23 changes: 12 additions & 11 deletions mistralrs-core/src/dummy_paged_attention/block_engine.rs
@@ -231,18 +231,19 @@ impl BlockEngine {
     }
 
     pub fn free_sequence(&mut self, id: usize) {
-        let block_table = self.block_tables.get(&id).unwrap();
-
-        // Free from block table
-        for block in block_table {
-            if block.deref_mut().is_gpu {
-                self.gpu_allocator.free_block(block.clone())
-            } else {
-                self.cpu_allocator.free_block(block.clone())
+        // Handle double free if run out of tokens
+        if let Some(block_table) = self.block_tables.get(&id) {
+            // Free from block table
+            for block in block_table {
+                if block.deref_mut().is_gpu {
+                    self.gpu_allocator.free_block(block.clone())
+                } else {
+                    self.cpu_allocator.free_block(block.clone())
+                }
             }
         }
 
-        self.block_tables.remove(&id);
+        self.block_tables.remove(&id);
     }
 }
 
 #[allow(dead_code)]
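
The free_sequence change swaps an unconditional unwrap() for an if let, so freeing a sequence whose block table is already gone becomes a no-op instead of a panic. A minimal, self-contained sketch of that idempotent-free pattern, using a plain Vec<u32> where the real code stores shared block handles (the types here are stand-ins, not the crate's actual definitions):

    use std::collections::HashMap;

    struct BlockEngine {
        // Stand-in for the real block table; the crate stores shared block handles.
        block_tables: HashMap<usize, Vec<u32>>,
    }

    impl BlockEngine {
        fn free_sequence(&mut self, id: usize) {
            // `get` returns an Option, so a second free of the same id falls through.
            if let Some(blocks) = self.block_tables.get(&id) {
                for _block in blocks {
                    // Return each block to its CPU/GPU allocator in the real code.
                }
            }
            self.block_tables.remove(&id);
        }
    }

    fn main() {
        let mut engine = BlockEngine {
            block_tables: HashMap::from([(0, vec![1, 2, 3])]),
        };
        engine.free_sequence(0);
        engine.free_sequence(0); // double free: previously a panic on unwrap(), now a no-op
    }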
@@ -300,7 +301,7 @@ impl BlockEngine {
         &mut self,
         sequence: &impl BlockEngineSequence,
     ) -> Option<(usize, usize)> {
-        let table = self.block_tables.get_mut(&sequence.get_id()).unwrap();
+        let table = self.block_tables.get_mut(&sequence.get_id())?;
 
         match sequence.blocks_to_add_new_tok() {
             1 => {
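
The second hunk applies the same idea one level up: the enclosing method returns Option<(usize, usize)>, so replacing unwrap() with the ? operator propagates a missing block table to the caller as None rather than panicking mid-schedule. A tiny illustration of ? on Option, with a hypothetical table_len helper:

    use std::collections::HashMap;

    // Hypothetical helper: `?` early-returns None when `id` has no table.
    fn table_len(tables: &HashMap<usize, Vec<u32>>, id: usize) -> Option<usize> {
        let table = tables.get(&id)?;
        Some(table.len())
    }

    fn main() {
        let tables = HashMap::from([(7, vec![1, 2])]);
        assert_eq!(table_len(&tables, 7), Some(2));
        assert_eq!(table_len(&tables, 8), None); // unwrap() here would have panicked
    }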
3 changes: 3 additions & 0 deletions mistralrs-core/src/dummy_paged_attention/mod.rs
@@ -95,6 +95,9 @@ pub fn calculate_cache_config(
 
     let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
     let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
+    if num_gpu_blocks == 0 {
+        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory allocated or reduce the memory utilization.");
+    }
     info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
     Ok(CacheConfig {
         block_size,
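
The new guard in calculate_cache_config matters because the MB-to-blocks conversion is truncating integer division: a memory budget smaller than one KV-cache block silently became zero blocks, i.e. zero available context, before this change. A rough sketch of the arithmetic, using anyhow as the diff itself does (bytes_per_block is illustrative; the real mb_to_blocks! macro derives it from the model config):

    const SIZE_IN_MB: usize = 1024 * 1024;

    // Illustrative: the real macro computes the per-block byte count from
    // block size, layer count, KV heads, head dim, and dtype size.
    fn num_gpu_blocks(mem_gpu_mb: usize, bytes_per_block: usize) -> anyhow::Result<usize> {
        let blocks = (mem_gpu_mb * SIZE_IN_MB) / bytes_per_block; // truncates toward zero
        if blocks == 0 {
            anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory.");
        }
        Ok(blocks)
    }

    fn main() -> anyhow::Result<()> {
        assert_eq!(num_gpu_blocks(512, 2 * 1024 * 1024)?, 256); // 512 MB / 2 MB blocks
        assert!(num_gpu_blocks(1, 2 * 1024 * 1024).is_err()); // under one block: bail
        Ok(())
    }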
17 changes: 11 additions & 6 deletions mistralrs-core/src/dummy_paged_attention/scheduler.rs
@@ -82,23 +82,28 @@ impl PagedAttentionScheduler {
             match can_allocate {
                 AllocStatus::Later => break, // If we can only allocate later, do not bother iterating over the rest.
                 AllocStatus::Impossible => {
+                    let id = *get_mut_arcmutex!(seq).id();
+                    let len = get_mut_arcmutex!(seq).get_toks().len();
                     warn!(
-                        "Input prompt with length of {} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
-                        get_mut_arcmutex!(seq).prompt_tokens()
+                        "Sequence {id} with length of {len} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
                     );
                     get_mut_arcmutex!(seq).set_state(SequenceState::FinishedIgnored);
                     did_ignore = true;
                 }
                 _ => {}
             }
 
-            get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
-            let seq_handle = get_mut_arcmutex!(seq);
-            self._allocate(&seq_handle);
+            if !did_ignore {
+                get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
+                let seq_handle = get_mut_arcmutex!(seq);
+                self._allocate(&seq_handle);
+            }
 
             let seq = self.waiting.pop_front().unwrap();
             self.running.push_back(seq.clone());
-            scheduled.push_back(seq);
+            if !did_ignore {
+                scheduled.push_back(seq);
+            }
         }
 
         // If we did schedule, or we ignored sequences.
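
The scheduler fix hinges on doing three things to an Impossible sequence: mark it FinishedIgnored, still pop it off the waiting queue so the loop makes progress, but give it neither a block allocation nor a slot in the scheduled output. A stripped-down sketch of that control flow, with plain types instead of Arc/Mutex sequence handles and the ignore flag handled per iteration:

    use std::collections::VecDeque;

    enum AllocStatus {
        Ok,
        Later,
        Impossible,
    }

    fn schedule_prompts(
        waiting: &mut VecDeque<u64>,
        running: &mut Vec<u64>,
        can_allocate: impl Fn(u64) -> AllocStatus,
    ) -> Vec<u64> {
        let mut scheduled = Vec::new();
        while let Some(&id) = waiting.front() {
            let mut did_ignore = false;
            match can_allocate(id) {
                AllocStatus::Later => break, // not enough free blocks yet; retry next pass
                AllocStatus::Impossible => did_ignore = true, // FinishedIgnored in the real code
                AllocStatus::Ok => {}
            }
            if !did_ignore {
                // Allocate KV blocks and mark the sequence RunningPrompt here.
            }
            let id = waiting.pop_front().unwrap(); // always consume the queue entry
            running.push(id);
            if !did_ignore {
                scheduled.push(id);
            }
        }
        scheduled
    }

    fn main() {
        let mut waiting = VecDeque::from([1u64, 2, 3]);
        let mut running = Vec::new();
        // Pretend sequence 2 is too long to ever fit.
        let out = schedule_prompts(&mut waiting, &mut running, |id| {
            if id == 2 { AllocStatus::Impossible } else { AllocStatus::Ok }
        });
        assert_eq!(out, vec![1, 3]);
    }

For context, dummy_paged_attention appears to be the stub counterpart of paged_attention for builds where PagedAttention is compiled out, which is why the same three files repeat below with identical changes.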
23 changes: 12 additions & 11 deletions mistralrs-core/src/paged_attention/block_engine.rs
@@ -231,18 +231,19 @@ impl BlockEngine {
     }
 
     pub fn free_sequence(&mut self, id: usize) {
-        let block_table = self.block_tables.get(&id).unwrap();
-
-        // Free from block table
-        for block in block_table {
-            if block.deref_mut().is_gpu {
-                self.gpu_allocator.free_block(block.clone())
-            } else {
-                self.cpu_allocator.free_block(block.clone())
+        // Handle double free if run out of tokens
+        if let Some(block_table) = self.block_tables.get(&id) {
+            // Free from block table
+            for block in block_table {
+                if block.deref_mut().is_gpu {
+                    self.gpu_allocator.free_block(block.clone())
+                } else {
+                    self.cpu_allocator.free_block(block.clone())
+                }
             }
         }
 
-        self.block_tables.remove(&id);
+        self.block_tables.remove(&id);
     }
 }
 
 #[allow(dead_code)]
@@ -300,7 +301,7 @@ impl BlockEngine {
         &mut self,
         sequence: &impl BlockEngineSequence,
     ) -> Option<(usize, usize)> {
-        let table = self.block_tables.get_mut(&sequence.get_id()).unwrap();
+        let table = self.block_tables.get_mut(&sequence.get_id())?;
 
         match sequence.blocks_to_add_new_tok() {
             1 => {
3 changes: 3 additions & 0 deletions mistralrs-core/src/paged_attention/mod.rs
@@ -99,6 +99,9 @@ pub fn calculate_cache_config(
 
     let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
     let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
+    if num_gpu_blocks == 0 {
+        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory allocated or reduce the memory utilization.");
+    }
     info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
     Ok(CacheConfig {
         block_size,
17 changes: 11 additions & 6 deletions mistralrs-core/src/paged_attention/scheduler.rs
@@ -82,23 +82,28 @@ impl PagedAttentionScheduler {
             match can_allocate {
                 AllocStatus::Later => break, // If we can only allocate later, do not bother iterating over the rest.
                 AllocStatus::Impossible => {
+                    let id = *get_mut_arcmutex!(seq).id();
+                    let len = get_mut_arcmutex!(seq).get_toks().len();
                     warn!(
-                        "Input prompt with length of {} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
-                        get_mut_arcmutex!(seq).prompt_tokens()
+                        "Sequence {id} with length of {len} tokens is too long and exceeds capacity of block engine. Sequence will be ignored.",
                     );
                     get_mut_arcmutex!(seq).set_state(SequenceState::FinishedIgnored);
                     did_ignore = true;
                 }
                 _ => {}
             }
 
-            get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
-            let seq_handle = get_mut_arcmutex!(seq);
-            self._allocate(&seq_handle);
+            if !did_ignore {
+                get_mut_arcmutex!(seq).set_state(SequenceState::RunningPrompt);
+                let seq_handle = get_mut_arcmutex!(seq);
+                self._allocate(&seq_handle);
+            }
 
             let seq = self.waiting.pop_front().unwrap();
             self.running.push_back(seq.clone());
-            scheduled.push_back(seq);
+            if !did_ignore {
+                scheduled.push_back(seq);
+            }
         }
 
         // If we did schedule, or we ignored sequences.
2 changes: 1 addition & 1 deletion mistralrs-core/src/vision_models/clip.rs
@@ -258,7 +258,7 @@ impl ClipEncoder {
         let vs = vs.pp("layers");
         let mut layers: Vec<ClipEncoderLayer> = Vec::new();
         for index in 0..c.num_hidden_layers {
-            let layer = ClipEncoderLayer::new(vs.pp(&index.to_string()), c)?;
+            let layer = ClipEncoderLayer::new(vs.pp(index.to_string()), c)?;
             layers.push(layer)
        }
         Ok(ClipEncoder { layers })
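
This last hunk is a lint cleanup rather than a behavior change: VarBuilder::pp in candle is, as far as I can tell, generic over ToString, so borrowing the freshly built String is redundant and trips clippy's needless_borrow. A tiny illustration with a hypothetical takes_name function:

    // Hypothetical function generic over ToString, in the style of VarBuilder::pp.
    fn takes_name(s: impl ToString) -> String {
        s.to_string()
    }

    fn main() {
        let index = 3;
        let borrowed = takes_name(&index.to_string()); // compiles, but clippy flags the borrow
        let owned = takes_name(index.to_string()); // preferred: pass the String by value
        assert_eq!(borrowed, owned);
    }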