feat(cyclotron): batch job updates (#24998)

Co-authored-by: Ben White <[email protected]>
PostHog · Sep 18, 2024 · 6ed9485 · 6ed9485
1 parent dfbd23b
commit 6ed9485
Show file tree

Hide file tree

Showing 24 changed files with 595 additions and 410 deletions.
diff --git a/plugin-server/package.json b/plugin-server/package.json
@@ -147,6 +147,6 @@
     },
     "cyclotron": {
         "//This is a short term workaround to ensure that cyclotron changes trigger a rebuild": true,
-        "version": "0.1.1"
+        "version": "0.1.2"
     }
 }
diff --git a/plugin-server/src/cdp/cdp-consumers.ts b/plugin-server/src/cdp/cdp-consumers.ts
@@ -760,7 +760,7 @@ export class CdpCyclotronWorker extends CdpConsumerBase {
 
     private async updateJobs(invocations: HogFunctionInvocationResult[]) {
         await Promise.all(
-            invocations.map(async (item) => {
+            invocations.map((item) => {
                 const id = item.invocation.id
                 if (item.error) {
                     status.debug('⚡️', 'Updating job to failed', id)
@@ -775,15 +775,19 @@ export class CdpCyclotronWorker extends CdpConsumerBase {
 
                     this.cyclotronWorker?.updateJob(id, 'available', updates)
                 }
-                await this.cyclotronWorker?.flushJob(id)
+                return this.cyclotronWorker?.releaseJob(id)
             })
         )
     }
 
     private async handleJobBatch(jobs: CyclotronJob[]) {
         gaugeBatchUtilization.labels({ queue: this.queue }).set(jobs.length / this.hub.CDP_CYCLOTRON_BATCH_SIZE)
+        if (!this.cyclotronWorker) {
+            throw new Error('No cyclotron worker when trying to handle batch')
+        }
         const invocations: HogFunctionInvocation[] = []
-
+        // A list of all the promises related to job releasing that we need to await
+        const failReleases: Promise<void>[] = []
         for (const job of jobs) {
             // NOTE: This is all a bit messy and might be better to refactor into a helper
             if (!job.functionId) {
@@ -797,8 +801,8 @@ export class CdpCyclotronWorker extends CdpConsumerBase {
                 status.error('Error finding hog function', {
                     id: job.functionId,
                 })
-                this.cyclotronWorker?.updateJob(job.id, 'failed')
-                await this.cyclotronWorker?.flushJob(job.id)
+                this.cyclotronWorker.updateJob(job.id, 'failed')
+                failReleases.push(this.cyclotronWorker.releaseJob(job.id))
                 continue
             }
 
@@ -807,6 +811,7 @@ export class CdpCyclotronWorker extends CdpConsumerBase {
         }
 
         await this.processBatch(invocations)
+        await Promise.all(failReleases)
         counterJobsProcessed.inc({ queue: this.queue }, jobs.length)
     }
 

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
diff --git a/rust/cyclotron-core/Cargo.toml b/rust/cyclotron-core/Cargo.toml
@@ -13,5 +13,5 @@ chrono = { workspace = true }
 tokio = { workspace = true }
 thiserror = { workspace = true }
 uuid = { workspace = true }
-rand = { workspace = true }
 futures = { workspace = true }
+tracing = { workspace = true }
diff --git a/rust/cyclotron-core/src/bin/create_test_data.rs b/rust/cyclotron-core/src/bin/create_test_data.rs
diff --git a/rust/cyclotron-core/src/bin/load_test.rs b/rust/cyclotron-core/src/bin/load_test.rs
diff --git a/rust/cyclotron-core/src/config.rs b/rust/cyclotron-core/src/config.rs
@@ -40,3 +40,39 @@ pub struct ManagerConfig {
     pub shard_depth_limit: Option<u64>, // Defaults to 10_000 available jobs per shard
     pub shard_depth_check_interval_seconds: Option<u64>, // Defaults to 10 seconds - checking shard capacity
 }
+
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub struct WorkerConfig {
+    #[serde(alias = "heartbeatWindowSeconds")]
+    pub heartbeat_window_seconds: Option<u64>, // Defaults to 5 seconds
+    #[serde(alias = "lingerTimeMs")]
+    pub linger_time_ms: Option<u64>, // Defaults to 500 milliseconds
+    #[serde(alias = "maxUpdatesBuffered")]
+    pub max_updates_buffered: Option<usize>, // Defaults to 100 updates
+    #[serde(alias = "maxBytesBuffered")]
+    pub max_bytes_buffered: Option<usize>, // Defaults to 10MB (10_000_000 bytes)
+    #[serde(alias = "flushLoopIntervalMs")]
+    pub flush_loop_interval_ms: Option<u64>, // Defaults to 10 milliseconds
+}
+
+impl WorkerConfig {
+    pub fn heartbeat_window(&self) -> chrono::Duration {
+        chrono::Duration::seconds(self.heartbeat_window_seconds.unwrap_or(5) as i64)
+    }
+
+    pub fn linger_time(&self) -> chrono::Duration {
+        chrono::Duration::milliseconds(self.linger_time_ms.unwrap_or(500) as i64)
+    }
+
+    pub fn flush_loop_interval(&self) -> chrono::Duration {
+        chrono::Duration::milliseconds(self.flush_loop_interval_ms.unwrap_or(10) as i64)
+    }
+
+    pub fn max_updates_buffered(&self) -> usize {
+        self.max_updates_buffered.unwrap_or(100)
+    }
+
+    pub fn max_bytes_buffered(&self) -> usize {
+        self.max_bytes_buffered.unwrap_or(10_000_000)
+    }
+}
diff --git a/rust/cyclotron-core/src/error.rs b/rust/cyclotron-core/src/error.rs
@@ -4,14 +4,24 @@ use uuid::Uuid;
 pub enum QueueError {
     #[error("sqlx error: {0}")]
     SqlxError(#[from] sqlx::Error),
-    #[error("Unknown job id: {0}")]
-    UnknownJobId(Uuid),
-    #[error("Job {0} flushed without a new state, which would leave it in a running state forever (or until reaped)")]
-    FlushWithoutNextState(Uuid),
-    #[error("Invalid lock {0} used to update job {1}. This usually means a job has been reaped from under a worker - did you forget to set the heartbeat?")]
-    InvalidLock(Uuid, Uuid),
     #[error("Shard over capacity {0} for this manager, insert aborted")]
     ShardFull(u64),
     #[error("Timed waiting for shard to have capacity")]
     TimedOutWaitingForCapacity,
+    #[error(transparent)]
+    JobError(#[from] JobError),
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum JobError {
+    #[error("Unknown job id: {0}")]
+    UnknownJobId(Uuid),
+    #[error("Invalid lock id: {0} for job {1}")]
+    InvalidLock(Uuid, Uuid),
+    #[error("Cannot flush job {0} without a next state")]
+    FlushWithoutNextState(Uuid),
+    #[error("Deadline to flush update for job {0} exceeded")]
+    DeadlineExceeded(Uuid),
+    #[error("Update dropped before being flushed.")]
+    UpdateDropped,
 }
diff --git a/rust/cyclotron-core/src/lib.rs b/rust/cyclotron-core/src/lib.rs
@@ -14,6 +14,9 @@ pub use types::JobUpdate;
 
 // Errors
 mod error;
+// Errors about some job operation - locks being lost, invalid states, flush deadlines exceeded etc
+pub use error::JobError;
+// Errors about the queue itself - full shards, timeouts, postgres/network errors
 pub use error::QueueError;
 
 // Manager
@@ -22,6 +25,8 @@ pub use manager::QueueManager;
 
 // Worker
 mod worker;
+// A handle to a released job update, which can be awaited to block until the flush completes
+pub use worker::FlushHandle;
 pub use worker::Worker;
 
 // Janitor
@@ -32,6 +37,7 @@ pub use janitor::Janitor;
 mod config;
 pub use config::ManagerConfig;
 pub use config::PoolConfig;
+pub use config::WorkerConfig;
 
 // The shard id is a fixed value that is set by the janitor when it starts up.
 // Workers may use this value when reporting metrics. The `Worker` struct provides