Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support bundle checkpoint / preemptible workers #3882

Merged
merged 78 commits into master from checkpoint
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
29f0a05
Bundle checkpoint / preemptible changes
epicfaace Nov 10, 2021
f784441
worker preemptible checkin
epicfaace Nov 10, 2021
be45063
update
epicfaace Nov 10, 2021
6425c3c
add migration
epicfaace Nov 11, 2021
17615b7
fix
epicfaace Nov 11, 2021
d4c787c
fix
epicfaace Nov 11, 2021
d1454e9
fix
epicfaace Nov 11, 2021
48f4ec5
update
epicfaace Nov 17, 2021
d28d669
Merge branch 'master' of github.com:codalab/codalab-worksheets into c…
epicfaace Dec 2, 2021
a40a19a
Update codalab/model/bundle_model.py
epicfaace Dec 2, 2021
d346f3e
Merge branch 'checkpoint' of github.com:codalab/codalab-worksheets in…
epicfaace Dec 2, 2021
9a7a16b
updates
epicfaace Dec 2, 2021
f674228
add documentation
epicfaace Dec 2, 2021
4ea0f52
add test
epicfaace Dec 2, 2021
ae1004c
fix flake8
epicfaace Dec 2, 2021
c55877f
Update Checkpoints.md
epicfaace Feb 8, 2022
dbbb756
Update codalab/model/bundle_model.py
epicfaace Feb 8, 2022
1180e6f
Update docs/Checkpoints.md
epicfaace Feb 8, 2022
3c2c3ec
Merge branch 'master' into checkpoint
epicfaace Feb 15, 2022
47eef0a
Fixes
epicfaace Feb 15, 2022
f226631
fix
epicfaace Feb 15, 2022
20db8a3
Merge branch 'worker2' into checkpoint
epicfaace Feb 15, 2022
79aab01
rename again
epicfaace Feb 15, 2022
4124e34
Revert "rename again"
epicfaace Feb 15, 2022
877269f
fixes, tests
epicfaace Feb 15, 2022
1ca7900
.dev.yml
epicfaace Feb 16, 2022
47f253c
fixes
epicfaace Feb 16, 2022
59828ea
fix test
epicfaace Feb 16, 2022
c3d1a21
fix doc
epicfaace Feb 16, 2022
030256f
fix bug
epicfaace Feb 16, 2022
311e34d
update doc
epicfaace Feb 16, 2022
5a87afc
fix
epicfaace Feb 21, 2022
f5223a0
fix
epicfaace Feb 21, 2022
7abe85c
fix
epicfaace Feb 21, 2022
58932d1
fix
epicfaace Feb 21, 2022
a39bb2c
fix
epicfaace Feb 21, 2022
d6a1398
fix
epicfaace Feb 21, 2022
4c184d3
comment for now
epicfaace Feb 21, 2022
8596d05
Merge branch 'master' of github.com:codalab/codalab-worksheets into c…
epicfaace Mar 16, 2022
a0dadc3
update test
epicfaace Mar 16, 2022
f0d1530
fix
epicfaace Mar 16, 2022
2dda725
ci tmp
epicfaace Mar 16, 2022
0753818
Update test-setup-preemptible.sh
epicfaace Mar 16, 2022
314638e
fix
epicfaace Mar 16, 2022
dcf960b
Merge branch 'checkpoint' of github.com:codalab/codalab-worksheets in…
epicfaace Mar 16, 2022
743f507
sleep
epicfaace Mar 16, 2022
0c6008c
update test
epicfaace Mar 17, 2022
8f9fd41
format / comments
epicfaace Mar 17, 2022
dea755e
fix command
epicfaace Mar 17, 2022
184e405
fix
epicfaace Mar 17, 2022
289b1a2
Update test.yml
epicfaace Mar 21, 2022
78acc06
Update test_cli.py
epicfaace Mar 21, 2022
47aa494
Update test.yml
epicfaace Mar 21, 2022
5eac2d3
Merge branch 'checkpoint' of github.com:codalab/codalab-worksheets in…
epicfaace Mar 21, 2022
115b8b5
fix
epicfaace Mar 21, 2022
ca03ad1
fix
epicfaace Mar 21, 2022
03da441
update
epicfaace Mar 21, 2022
4ad23e6
fix
epicfaace Mar 21, 2022
4061733
fixes
epicfaace Mar 21, 2022
853d4bc
comment
epicfaace Mar 21, 2022
f978cb8
Update test-setup-preemptible.sh
epicfaace Mar 21, 2022
3e9c0c8
Update test_cli.py
epicfaace Mar 21, 2022
663d35b
Update test.yml
epicfaace Mar 21, 2022
0778d58
Update test.yml
epicfaace Mar 21, 2022
6a0a99d
Update test_cli.py
epicfaace Mar 21, 2022
962017f
Update docker-compose.dev.yml
epicfaace Mar 21, 2022
71fb046
update
epicfaace Mar 28, 2022
9dde912
Merge branch 'master' of github.com:codalab/codalab-worksheets into c…
epicfaace Apr 12, 2022
6d45d83
update
epicfaace Apr 12, 2022
6f6c44a
fix
epicfaace Apr 12, 2022
f397936
revert
epicfaace Apr 12, 2022
b77ec32
fix test
epicfaace Apr 12, 2022
6423cc3
update doc
epicfaace Apr 12, 2022
8ff8a4f
tests
epicfaace Apr 12, 2022
5c315a4
update
epicfaace Apr 12, 2022
398bc81
Update codalab/worker/worker_run_state.py
epicfaace Apr 14, 2022
3f5def8
Update test_cli.py
epicfaace Apr 14, 2022
6b03702
Update worker_run_state.py
epicfaace Apr 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion codalab/bundles/run_bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ class RunBundle(DerivedBundle):
METADATA_SPECS.append(MetadataSpec('exitcode', int, 'Exitcode of the process.', generated=True))
METADATA_SPECS.append(MetadataSpec('job_handle', str, 'Identifies the job handle (internal).', generated=True, hide_when_anonymous=True))
METADATA_SPECS.append(MetadataSpec('remote', str, 'Where this job is/was run (internal).', generated=True, hide_when_anonymous=True))
METADATA_SPECS.append(MetadataSpec('remotes', list, 'Previous workers where this job was run (internal); multiple values indicate that the bundle was preempted and moved to a different worker.', generated=True, hide_when_anonymous=True))
METADATA_SPECS.append(MetadataSpec('remotes', list, 'Previous workers where this job was run (internal); multiple values indicate that the bundle was preempted and moved to a different worker.',
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
generated=True, hide_when_anonymous=True))
METADATA_SPECS.append(MetadataSpec('preemptible', bool, 'Whether the bundle is currently running / finished on a preemptible worker.', generated=True, hide_when_anonymous=True, default=False))
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
# fmt: on

@classmethod
Expand Down
17 changes: 11 additions & 6 deletions codalab/model/bundle_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,7 +915,15 @@ def transition_bundle_preparing(self, bundle, user_id, worker_id, start_time, re

bundle_update = {
'state': State.PREPARING,
'metadata': {'started': start_time, 'last_updated': start_time, 'remote': remote},
'metadata': {
'started': start_time,
'last_updated': start_time,
'remote': remote,
'remotes': (bundle.metadata.remotes or [])
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
+ [
remote
], # Store the history of which workers ran this bundle before in the bundle metadata.
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
},
}
self.update_bundle(bundle, bundle_update, connection)

Expand Down Expand Up @@ -987,7 +995,7 @@ def transition_bundle_worker_offline(self, bundle):
Transitions bundle to WORKER_OFFLINE state:
Updates the last_updated metadata.
Removes the corresponding row from worker_run if it exists.

If the worker is preemptible, move the bundle to the STAGED state instead.
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
"""
with self.engine.begin() as connection:
Expand All @@ -1002,14 +1010,11 @@ def transition_bundle_worker_offline(self, bundle):
# The user deleted the bundle or the bundle finished
return False

worker = self.get_bundle_worker(bundle.uuid)
if worker['preemptible']:
if bundle.metadata.preemptible:
bundle_update = {
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
epicfaace marked this conversation as resolved.
Show resolved Hide resolved
'state': State.STAGED,
'metadata': {'last_updated': int(time.time())},
'remotes': worker.id
}
# TODO: we should store the history of which workers ran this bundle before in the bundle metadata.
else:
bundle_update = {
'state': State.WORKER_OFFLINE,
Expand Down