More elegant way for handling non-streaming finish signal.
guoqingbao committed Jul 24, 2024
1 parent 4d9c864 commit b9db828
Showing 5 changed files with 20 additions and 3 deletions.
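At its core, the commit replaces a fixed 100 ms sleep in the non-streaming path with a tokio::sync::Notify handshake: the generation loop signals when a completion record is ready, and the HTTP handler waits for that signal. A minimal sketch of the pattern, separate from the repository code (all names here are illustrative):

    use std::sync::Arc;
    use tokio::sync::Notify;

    #[tokio::main]
    async fn main() {
        // Shared signal: the engine side notifies, the handler side waits.
        let finish_notify = Arc::new(Notify::new());
        let notifier = finish_notify.clone();

        // Stand-in for the generation loop: do the work, then signal.
        tokio::spawn(async move {
            // ... run the model and store the completion record ...
            notifier.notify_one(); // stores a permit if no task is waiting yet
        });

        // Stand-in for the request handler: park until the signal arrives,
        // instead of polling with a sleep.
        finish_notify.notified().await;
        println!("generation finished; response can be assembled");
    }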
4 changes: 3 additions & 1 deletion src/main.rs
@@ -137,21 +137,23 @@ async fn main() -> Result<(), APIError> {
         dtype: config.kv_cache_dtype,
     };
     println!("Cache config {:?}", cache_config);
-
+    let finish_notify = Arc::new(Notify::new());
     let llm_engine = LLMEngine::new(
         model.0,
         SchedulerConfig {
             max_num_seqs: args.max_num_seqs,
         },
         cache_config,
         Arc::new(Notify::new()),
+        finish_notify.clone(),
     )?;

     let server_data = OpenAIServerData {
         pipeline_config: model.1,
         model: llm_engine,
         record_conversation: args.record_conversation,
         device: Device::Cpu,
+        finish_notify: finish_notify.clone(),
     };

     println!("Server started at http://127.0.0.1:{}.", args.port);
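Note that main.rs clones the same Arc<Notify> into both LLMEngine::new and OpenAIServerData, so the engine's generation loop and the HTTP handlers share a single completion signal.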
3 changes: 2 additions & 1 deletion src/openai/mod.rs
@@ -1,7 +1,7 @@
 use candle_core::Device;
 use std::sync::Arc;
 use tokenizers::{EncodeInput, Encoding, Tokenizer};
-use tokio::sync::Mutex;
+use tokio::sync::{Mutex, Notify};

 use self::{pipelines::llm_engine::LLMEngine, responses::APIError};

@@ -45,6 +45,7 @@ pub struct OpenAIServerData {
     pub pipeline_config: PipelineConfig,
     pub record_conversation: bool,
     pub device: Device,
+    pub finish_notify: Arc<Notify>,
 }

 pub mod conversation;
9 changes: 8 additions & 1 deletion src/openai/openai_server.rs
@@ -219,8 +219,15 @@ pub async fn chat_completions(
         )
     } else {
         // wait until current response finished
-        tokio::time::sleep(Duration::from_millis(100)).await; //permits generation thread to work
+        data.finish_notify.notified().await;
         let model = data.model.lock().await;
+        if !model.completion_records.contains_key(&request_id) {
+            return ChatResponder::ModelError(APIError::from(format!(
+                "Unable to generate response for request {}",
+                request_id
+            )));
+        }
+
         let choices = &model.completion_records[&request_id].0;
         let usage = &model.completion_records[&request_id].1;

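The handler-side effect of this hunk: rather than sleeping 100 ms and assuming the generation thread has caught up, the non-streaming path blocks on finish_notify.notified().await and, once woken, checks that a completion record for this request actually exists before indexing into the map, returning a model error otherwise.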
4 changes: 4 additions & 0 deletions src/openai/pipelines/llm_engine.rs
@@ -49,6 +49,7 @@ pub struct LLMEngine {
     cache_engine: CacheEngine,
     sliding_window: Option<usize>,
     pub notify: Arc<Notify>,
+    pub finish_notify: Arc<Notify>,
     pub completion_records: HashMap<String, (Vec<ChatChoice>, ChatCompletionUsageResponse)>,
 }

@@ -58,6 +59,7 @@ impl LLMEngine {
         scheduler_config: SchedulerConfig,
         cache_config: CacheConfig,
         notify: Arc<Notify>,
+        finish_notify: Arc<Notify>,
     ) -> Result<Arc<Mutex<Self>>, APIError> {
         let cache_engine = CacheEngine::new(
             pipeline.get_model_config(),
@@ -76,6 +78,7 @@
             cache_engine,
             sliding_window,
             notify: notify.clone(),
+            finish_notify: finish_notify.clone(),
             completion_records: HashMap::new(),
         }));
         let engine_clone = engine.clone();
@@ -133,6 +136,7 @@ impl LLMEngine {
                 );
                 e.completion_records
                     .insert(request_id.clone(), (choices, usage));
+                finish_notify.notify_one();
             }
         });
     });
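A detail of Notify that makes this ordering safe: notify_one() stores a permit when no task is currently waiting, so if the engine inserts the record and signals before the handler reaches notified().await, the handler's wait still completes immediately instead of hanging.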
3 changes: 3 additions & 0 deletions tests/tests.rs
@@ -34,6 +34,7 @@ async fn test_llama() -> Result<(), APIError> {
         None,
     )?;
     let model = loader.load_model(paths, DType::F16, Device::Cpu)?;
+    let finish_notify = Arc::new(Notify::new());
     let llm_engine = LLMEngine::new(
         model.0,
         SchedulerConfig { max_num_seqs: 256 },
@@ -45,13 +46,15 @@
             dtype: DType::F16,
         },
         Arc::new(Notify::new()),
+        finish_notify.clone(),
     )?;

     let server_data = OpenAIServerData {
         pipeline_config: model.1,
         model: llm_engine,
         device: Device::Cpu,
         record_conversation: false,
+        finish_notify: finish_notify.clone(),
     };

     let allow_origin = AllowOrigin::any();