Skip to content

Commit

Permalink
Prompt prefix caching for multi-LoRA (#655)
Browse files Browse the repository at this point in the history
  • Loading branch information
tgaddair authored Oct 23, 2024
1 parent 71ca771 commit 373c3e6
Show file tree
Hide file tree
Showing 7 changed files with 427 additions and 219 deletions.
45 changes: 26 additions & 19 deletions docs/reference/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,7 @@
"example": 0.5,
"nullable": false,
"minimum": 0.0,
"maximim": 1.0
"maximum": 1.0
},
"majority_sign_method": {
"type": "string",
Expand All @@ -727,7 +727,7 @@
"example": 1,
"nullable": true,
"minimum": 0.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"decoder_input_details": {
"type": "boolean",
Expand All @@ -748,7 +748,7 @@
"default": "null",
"nullable": true,
"minimum": 0.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"ignore_eos_token": {
"type": "boolean",
Expand All @@ -761,7 +761,8 @@
"default": "null",
"example": 1.03,
"nullable": true,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": true
},
"return_full_text": {
"type": "boolean",
Expand All @@ -776,7 +777,7 @@
"example": "null",
"nullable": true,
"minimum": 0.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"stop": {
"type": "array",
Expand All @@ -794,15 +795,17 @@
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": false
},
"top_k": {
"type": "integer",
"format": "int32",
"default": "null",
"example": 10,
"nullable": true,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": true
},
"top_p": {
"type": "number",
Expand All @@ -811,7 +814,7 @@
"example": 0.95,
"nullable": true,
"maximum": 1.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"truncate": {
"type": "integer",
Expand All @@ -827,7 +830,7 @@
"example": 0.95,
"nullable": true,
"maximum": 1.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"watermark": {
"type": "boolean",
Expand Down Expand Up @@ -930,7 +933,8 @@
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": false
},
"top_p": {
"type": "number",
Expand All @@ -939,22 +943,23 @@
"example": 0.95,
"nullable": true,
"maximum": 1.0,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": true
},
"n": {
"type": "integer",
"default": "null",
"example": 3,
"nullable": true,
"exclusiveMinimum": 0
"minimum": 0,
"exclusiveMinimum": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"default": "20",
"minimum": 0.0,
"exclusiveMaximum": 512.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"stop": {
"type": "array",
Expand Down Expand Up @@ -997,7 +1002,8 @@
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": false
},
"top_p": {
"type": "number",
Expand All @@ -1006,22 +1012,23 @@
"example": 0.95,
"nullable": true,
"maximum": 1.0,
"exclusiveMinimum": 0.0
"minimum": 0.0,
"exclusiveMinimum": true
},
"n": {
"type": "integer",
"default": "null",
"example": 3,
"nullable": true,
"exclusiveMinimum": 0
"minimum": 0,
"exclusiveMinimum": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"default": "20",
"minimum": 0.0,
"exclusiveMaximum": 512.0,
"exclusiveMinimum": 0.0
"exclusiveMinimum": true
},
"stop": {
"type": "array",
Expand Down
10 changes: 7 additions & 3 deletions router/src/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ pub(crate) trait BatchEntries: Sync + Send + Debug {
blocks: Vec<u32>,
slots: Vec<u32>,
prefix_len: u32,
chunk_len: Option<u32>,
);
fn extend(&mut self, entries: Box<dyn BatchEntries>);
fn drain(&mut self) -> Vec<(Adapter, u64, Entry)>;
Expand Down Expand Up @@ -323,6 +324,7 @@ impl BatchEntries for GenerateBatchEntries {
blocks: Vec<u32>,
slots: Vec<u32>,
prefix_len: u32,
chunk_len: Option<u32>,
) {
let valid_request = entry
.request
Expand All @@ -343,7 +345,7 @@ impl BatchEntries for GenerateBatchEntries {
blocks,
slots,
cache_len: prefix_len,
chunk_len: None,
chunk_len: chunk_len,
};

self.state.add(id, entry, adapter, request_proto);
Expand Down Expand Up @@ -455,6 +457,7 @@ impl BatchEntries for EmbedBatchEntries {
blocks: Vec<u32>,
slots: Vec<u32>,
prefix_len: u32,
chunk_len: Option<u32>,
) {
let valid_request = entry
.request
Expand All @@ -475,7 +478,7 @@ impl BatchEntries for EmbedBatchEntries {
blocks,
slots,
cache_len: prefix_len,
chunk_len: None,
chunk_len: chunk_len,
};

self.state.add(id, entry, adapter, request_proto);
Expand Down Expand Up @@ -580,6 +583,7 @@ impl BatchEntries for ClassifyBatchEntries {
blocks: Vec<u32>,
slots: Vec<u32>,
prefix_len: u32,
chunk_len: Option<u32>,
) {
let valid_request = entry
.request
Expand All @@ -600,7 +604,7 @@ impl BatchEntries for ClassifyBatchEntries {
blocks,
slots,
cache_len: prefix_len,
chunk_len: None,
chunk_len: chunk_len,
};

self.state.add(id, entry, adapter, request_proto);
Expand Down
12 changes: 9 additions & 3 deletions router/src/block_allocator.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{cmp::min, sync::Arc};
use std::sync::Arc;
use tokio::sync::{mpsc, oneshot};

use crate::radix::RadixAllocator;
Expand Down Expand Up @@ -56,12 +56,14 @@ impl BlockAllocator {

pub(crate) async fn allocate(
&self,
adapter_index: u32,
tokens: u32,
prefill_tokens: Option<Arc<Vec<u32>>>,
) -> Option<BlockAllocation> {
let (response_sender, response_receiver) = oneshot::channel();
self.block_allocator
.send(BlockAllocatorCommand::Allocate {
adapter_index,
tokens,
prefill_tokens,
response_sender,
Expand Down Expand Up @@ -103,12 +105,13 @@ async fn block_allocator_task(
allocation_id,
} => allocator.free(blocks, allocation_id),
BlockAllocatorCommand::Allocate {
adapter_index,
tokens,
prefill_tokens,
response_sender,
} => {
response_sender
.send(allocator.allocate(tokens, prefill_tokens))
.send(allocator.allocate(adapter_index, tokens, prefill_tokens))
.unwrap();
}
}
Expand All @@ -122,6 +125,7 @@ enum BlockAllocatorCommand {
allocation_id: u64,
},
Allocate {
adapter_index: u32,
tokens: u32,
prefill_tokens: Option<Arc<Vec<u32>>>,
response_sender: oneshot::Sender<Option<BlockAllocation>>,
Expand All @@ -131,6 +135,7 @@ enum BlockAllocatorCommand {
pub(crate) trait Allocator {
fn allocate(
&mut self,
adapter_index: u32,
tokens: u32,
prefill_tokens: Option<Arc<Vec<u32>>>,
) -> Option<BlockAllocation>;
Expand Down Expand Up @@ -158,6 +163,7 @@ impl SimpleAllocator {
impl Allocator for SimpleAllocator {
fn allocate(
&mut self,
_adapter_index: u32,
tokens: u32,
_prefill_tokens: Option<Arc<Vec<u32>>>,
) -> Option<BlockAllocation> {
Expand All @@ -167,7 +173,7 @@ impl Allocator for SimpleAllocator {
None => (tokens, 1),
Some(window_size) => {
let repeats = (tokens + window_size - 1) / window_size;
let tokens = min(tokens, window_size);
let tokens = core::cmp::min(tokens, window_size);
(tokens, repeats as usize)
}
};
Expand Down
1 change: 1 addition & 0 deletions router/src/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ impl Infer {
speculate,
max_batch_total_tokens,
prefix_caching,
chunked_prefill,
is_causal_lm,
);

Expand Down
Loading

0 comments on commit 373c3e6

Please sign in to comment.