-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Core] Allow spillback while hitting scheduling class cap #28804
Changes from 5 commits
1bb7f4e
746e853
76c9c48
5138392
f311fb2
f51a40f
7a08a1e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -153,6 +153,15 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { | |
int64_t target_time = get_time_ms_() + wait_time; | ||
sched_cls_info.next_update_time = | ||
std::min(target_time, sched_cls_info.next_update_time); | ||
|
||
// While we're over capacity and cannot run the task, | ||
// try to spill to a node that can run it. | ||
bool did_spill = TrySpillback(work, is_infeasible); | ||
if (did_spill) { | ||
work_it = dispatch_queue.erase(work_it); | ||
continue; | ||
} | ||
|
||
break; | ||
} | ||
} | ||
|
@@ -228,11 +237,6 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() { | |
internal::UnscheduledWorkCause::WAITING_FOR_RESOURCES_AVAILABLE); | ||
break; | ||
} | ||
num_unschedulable_task_spilled_++; | ||
if (!spec.GetDependencies().empty()) { | ||
task_dependency_manager_.RemoveTaskDependencies( | ||
task.GetTaskSpecification().TaskId()); | ||
} | ||
work_it = dispatch_queue.erase(work_it); | ||
} else { | ||
// Force us to recalculate the next update time the next time a task | ||
|
@@ -370,7 +374,7 @@ bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work, | |
bool &is_infeasible) { | ||
auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode( | ||
work->task.GetTaskSpecification(), | ||
work->PrioritizeLocalNode(), | ||
/*prioritize_local_node*/ true, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why was this change needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, when a node is selected by the cluster task manager and the task is added to that node's local task manager queue. Then we should only spillback if local has no resource (in the other word, we should prefer staying local if possible). |
||
/*exclude_local_node*/ false, | ||
/*requires_object_store_memory*/ false, | ||
&is_infeasible); | ||
|
@@ -382,6 +386,11 @@ bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work, | |
|
||
NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary()); | ||
Spillback(node_id, work); | ||
num_unschedulable_task_spilled_++; | ||
if (!work->task.GetTaskSpecification().GetDependencies().empty()) { | ||
task_dependency_manager_.RemoveTaskDependencies( | ||
work->task.GetTaskSpecification().TaskId()); | ||
} | ||
return true; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we move this code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is moved into
TrySpillback()
since this needs to be called whenever spill happens. By moving it inside TrySpillback() we make sure it always happens when spill also we avoid the code duplication since TrySpillback() now is called in two places.