author | Carl Lerche <me@carllerche.com> | 2020-03-28 13:55:12 -0700
---|---|---
committer | GitHub <noreply@github.com> | 2020-03-28 13:55:12 -0700
commit | caa7e180e43fdf914774de86f01f88e6b41f4a32 (patch) |
tree | acd63c2a01e11f2c728f2d7527efafbc99c66132 /tokio/src/runtime/thread_pool |
parent | 7b2438e7441e98b2a3f72eb239b1c51489b7d9b8 (diff) |
rt: cap fifo scheduler slot to avoid starvation (#2349)
The work-stealing scheduler includes an optimization where each worker
includes a single slot to store the **last** scheduled task. The task in the
scheduler's LIFO slot is executed next. This speeds up message passing
patterns and reduces their latency.
Previously, this optimization was susceptible to starving other tasks in
certain cases. If two tasks ping-pong between each other without ever
yielding, the worker would never execute other tasks.
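To make the pattern concrete, here is a minimal, hypothetical sketch of such a ping-pong workload (not part of this patch; it assumes tokio 1.x with the `macros`, `rt-multi-thread`, and `sync` features enabled). Each `send` wakes the peer task, which lands in the sending worker's LIFO slot and runs next; before this patch, a pair like this that never hit another yield point could monopolize its worker.

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Two bounded channels form a ping-pong loop between the main task and a peer.
    let (tx_a, mut rx_a) = mpsc::channel::<u32>(1);
    let (tx_b, mut rx_b) = mpsc::channel::<u32>(1);

    // Peer task: echo every message back, incremented by one.
    tokio::spawn(async move {
        while let Some(n) = rx_b.recv().await {
            if tx_a.send(n + 1).await.is_err() {
                break;
            }
        }
    });

    // Main task: each `send` schedules the peer, which the worker picks up
    // from its LIFO slot without going through the run queue.
    let mut n = 0;
    while n < 10_000 {
        tx_b.send(n).await.unwrap();
        n = rx_a.recv().await.unwrap();
    }
    println!("completed {} round trips", n);
}
```

Keeping each round trip on one worker and off the run queue is where the latency win described above comes from; the starvation risk is that, without a cap, other tasks queued on that same worker never get a turn.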
An earlier PR (#2160) introduced a form of pre-emption. Each task is
allocated a per-poll operation budget. Tokio resources return `Ready` until
the budget is depleted, at which point they always return `Pending`.
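For intuition, here is a rough, self-contained sketch of that budget mechanism (this is not tokio's internal `coop` module; `with_budget`, `consume_one`, and the budget of 128 are invented here purely for illustration). Each top-level poll starts with a fixed budget, every resource operation spends one unit, and once the budget hits zero the resource must report `Pending` even if it could make progress:

```rust
use std::cell::Cell;

thread_local! {
    // Remaining operation budget for the task currently being polled.
    static BUDGET: Cell<u32> = Cell::new(0);
}

/// Run `f` with a fresh budget, mirroring what a runtime would do around a
/// top-level `task.run()`.
fn with_budget<R>(f: impl FnOnce() -> R) -> R {
    BUDGET.with(|b| b.set(128)); // arbitrary budget for this sketch
    f()
}

/// Returns `true` if the operation may proceed, or `false` if the caller
/// should stop and return `Pending` because the budget is spent.
fn consume_one() -> bool {
    BUDGET.with(|b| {
        let remaining = b.get();
        if remaining == 0 {
            false
        } else {
            b.set(remaining - 1);
            true
        }
    })
}

fn main() {
    with_budget(|| {
        let mut ops = 0;
        while consume_one() {
            ops += 1;
        }
        // After 128 operations the budget is depleted; a real resource would
        // now report `Pending` and give other tasks a chance to run.
        println!("performed {} operations before yielding", ops);
    });
}
```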
This patch leverages the operation budget to limit the LIFO scheduler
optimization. When executing tasks from the LIFO slot, the budget is
**not** reset. Once the budget reaches zero, the task in the LIFO slot is
pushed to the back of the worker's run queue instead of being run.
Diffstat (limited to 'tokio/src/runtime/thread_pool')
-rw-r--r-- | tokio/src/runtime/thread_pool/worker.rs | 72 |
1 file changed, 59 insertions(+), 13 deletions(-)
diff --git a/tokio/src/runtime/thread_pool/worker.rs b/tokio/src/runtime/thread_pool/worker.rs
index c07aa054..400e2a93 100644
--- a/tokio/src/runtime/thread_pool/worker.rs
+++ b/tokio/src/runtime/thread_pool/worker.rs
@@ -34,6 +34,13 @@ struct Core {
     /// Used to schedule bookkeeping tasks every so often.
     tick: u8,
 
+    /// When a task is scheduled from a worker, it is stored in this slot. The
+    /// worker will check this slot for a task **before** checking the run
+    /// queue. This effectively results in the **last** scheduled task to be run
+    /// next (LIFO). This is an optimization for message passing patterns and
+    /// helps to reduce latency.
+    lifo_slot: Option<Notified>,
+
     /// The worker-local run queue.
     run_queue: queue::Local<Arc<Worker>>,
@@ -128,6 +135,7 @@ pub(super) fn create(size: usize, park: Parker) -> (Arc<Shared>, Launch) {
         cores.push(Box::new(Core {
             tick: 0,
+            lifo_slot: None,
             run_queue,
             is_searching: false,
             is_shutdown: false,
@@ -296,13 +304,37 @@ impl Context {
         *self.core.borrow_mut() = Some(core);
 
         // Run the task
-        task.run();
-
-        // Try to take the core back
-        match self.core.borrow_mut().take() {
-            Some(core) => Ok(core),
-            None => Err(()),
-        }
+        crate::coop::budget(|| {
+            task.run();
+
+            // As long as there is budget remaining and a task exists in the
+            // `lifo_slot`, then keep running.
+            loop {
+                // Check if we still have the core. If not, the core was stolen
+                // by another worker.
+                let mut core = match self.core.borrow_mut().take() {
+                    Some(core) => core,
+                    None => return Err(()),
+                };
+
+                // Check for a task in the LIFO slot
+                let task = match core.lifo_slot.take() {
+                    Some(task) => task,
+                    None => return Ok(core),
+                };
+
+                if crate::coop::has_budget_remaining() {
+                    // Run the LIFO task, then loop
+                    *self.core.borrow_mut() = Some(core);
+                    task.run();
+                } else {
+                    // Not enough budget left to run the LIFO task, push it to
+                    // the back of the queue and return.
+                    core.run_queue.push_back(task, self.worker.inject());
+                    return Ok(core);
+                }
+            }
+        })
     }
 
     fn maintenance(&self, mut core: Box<Core>) -> Box<Core> {
@@ -373,12 +405,16 @@ impl Core {
     /// Return the next notified task available to this worker.
     fn next_task(&mut self, worker: &Worker) -> Option<Notified> {
         if self.tick % GLOBAL_POLL_INTERVAL == 0 {
-            worker.inject().pop().or_else(|| self.run_queue.pop())
+            worker.inject().pop().or_else(|| self.next_local_task())
         } else {
-            self.run_queue.pop().or_else(|| worker.inject().pop())
+            self.next_local_task().or_else(|| worker.inject().pop())
        }
     }
 
+    fn next_local_task(&mut self) -> Option<Notified> {
+        self.lifo_slot.take().or_else(|| self.run_queue.pop())
+    }
+
     fn steal_work(&mut self, worker: &Worker) -> Option<Notified> {
         if !self.transition_to_searching(worker) {
             return None;
@@ -444,9 +480,9 @@ impl Core {
     /// Returns `true` if the transition happened.
     fn transition_from_parked(&mut self, worker: &Worker) -> bool {
-        // If there is a non-stealable task, then we must unpark regardless of
+        // If a task is in the lifo slot, then we must unpark regardless of
         // being notified
-        if self.run_queue.has_unstealable() {
+        if self.lifo_slot.is_some() {
             worker.shared.idle.unpark_worker_by_id(worker.index);
             self.is_searching = true;
             return true;
@@ -494,7 +530,7 @@ impl Core {
         }
 
         // Drain the queue
-        while let Some(_) = self.run_queue.pop() {}
+        while let Some(_) = self.next_local_task() {}
     }
 
     fn drain_pending_drop(&mut self, worker: &Worker) {
@@ -639,7 +675,17 @@ impl Shared {
             core.run_queue.push_back(task, &self.inject);
             true
         } else {
-            core.run_queue.push(task, &self.inject)
+            // Push to the LIFO slot
+            let prev = core.lifo_slot.take();
+            let ret = prev.is_some();
+
+            if let Some(prev) = prev {
+                core.run_queue.push_back(prev, &self.inject);
+            }
+
+            core.lifo_slot = Some(task);
+
+            ret
         };
 
         // Only notify if not currently parked. If `park` is `None`, then the