diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ee7d7e399..158c8cb5f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -355,7 +355,62 @@ jobs: if: always() uses: actions/upload-artifact@v1 with: - name: logs-test-protectedmode-${{ matrix.test }} + name: logs-test-${{ matrix.test }} + path: /tmp/logs + + test_backend_preemptible_worker: + name: Test backend - preemptible workers + runs-on: ubuntu-latest + needs: [build] + strategy: + matrix: + test: + - preemptible + steps: + - name: Clear free space + run: | + sudo rm -rf /opt/ghc + df -h + - uses: actions/checkout@v2 + - uses: actions/setup-python@v1 + with: + python-version: 3.6 + - uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: pip-${{ hashFiles('requirements.txt') }} + restore-keys: | + pip- + - run: pip install -r requirements.txt + - run: pip install -e . + - name: Setup tests + run: | + sudo service mysql stop + python3 codalab_service.py build services --version ${VERSION} --pull + env: + VERSION: ${{ github.head_ref || 'master' }} + - name: Run tests + run: | + python3 codalab_service.py start --services default no-worker worker-preemptible --version ${VERSION} + sleep 20 + python3 codalab_service.py start --services worker-preemptible2 --version ${VERSION} + ./tests/test-setup-preemptible.sh + python3 test_runner.py --version ${VERSION} ${TEST} + env: + TEST: ${{ matrix.test }} + VERSION: ${{ github.head_ref || 'master' }} + CODALAB_USERNAME: codalab + CODALAB_PASSWORD: codalab + - name: Save logs + if: always() + run: | + mkdir /tmp/logs + for c in $(docker ps -a --format="{{.Names}}"); do docker logs $c > /tmp/logs/$c.log 2> /tmp/logs/$c.err.log; done + - name: Upload logs + if: always() + uses: actions/upload-artifact@v1 + with: + name: logs-test-${{ matrix.test }} path: /tmp/logs test_backend_azure_blob: @@ -371,7 +426,7 @@ jobs: - run time - run2 - search read kill write mimic workers edit_user sharing_workers -# - search link read kill write mimic workers edit_user sharing_workers + # - search link read kill write mimic workers edit_user sharing_workers - resources - memoize - copy netcat netcurl diff --git a/alembic/versions/2022021522_add_preemptible_column_to__f720aaefd0b2.py b/alembic/versions/2022021522_add_preemptible_column_to__f720aaefd0b2.py new file mode 100644 index 000000000..fd92189b7 --- /dev/null +++ b/alembic/versions/2022021522_add_preemptible_column_to__f720aaefd0b2.py @@ -0,0 +1,26 @@ +"""add preemptible column to worker + +Revision ID: f720aaefd0b2 +Revises: a325a0756797 +Create Date: 2022-02-15 22:26:06.466720 + +""" + +# revision identifiers, used by Alembic. +revision = 'f720aaefd0b2' +down_revision = 'a325a0756797' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('worker', sa.Column('preemptible', sa.Boolean(), nullable=False)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('worker', 'preemptible') + # ### end Alembic commands ### diff --git a/codalab/bundles/run_bundle.py b/codalab/bundles/run_bundle.py index cda51d017..bcbfa8491 100644 --- a/codalab/bundles/run_bundle.py +++ b/codalab/bundles/run_bundle.py @@ -72,6 +72,11 @@ class RunBundle(DerivedBundle): METADATA_SPECS.append(MetadataSpec('exitcode', int, 'Exitcode of the process.', generated=True)) METADATA_SPECS.append(MetadataSpec('job_handle', str, 'Identifies the job handle (internal).', generated=True, hide_when_anonymous=True)) METADATA_SPECS.append(MetadataSpec('remote', str, 'Where this job is/was run (internal).', generated=True, hide_when_anonymous=True)) + METADATA_SPECS.append(MetadataSpec('remote_history', list, 'All workers where this job has been run (internal); multiple values indicate ' 'that the bundle was preempted and moved to a different worker.', + generated=True, hide_when_anonymous=True)) + METADATA_SPECS.append(MetadataSpec('on_preemptible_worker', bool, 'Whether the bundle is currently running / finished on a preemptible worker.', generated=True, hide_when_anonymous=True, + default=False)) # fmt: on @classmethod diff --git a/codalab/model/bundle_model.py b/codalab/model/bundle_model.py index a00e0dfc0..a121c91cc 100644 --- a/codalab/model/bundle_model.py +++ b/codalab/model/bundle_model.py @@ -914,10 +914,22 @@ def transition_bundle_preparing(self, bundle, user_id, worker_id, start_time, re ).fetchone() if not run_row or run_row.user_id != user_id or run_row.worker_id != worker_id: return False + worker_row = connection.execute( + cl_worker.select().where(and_(cl_worker.c.worker_id == worker_id)) + ).fetchone() bundle_update = { 'state': State.PREPARING, - 'metadata': {'started': start_time, 'last_updated': start_time, 'remote': remote}, + 'metadata': { + 'started': start_time, + 'last_updated': start_time, + 'remote': remote, + 'on_preemptible_worker': worker_row.preemptible, + 'remote_history': getattr(bundle.metadata, "remote_history", []) + + [ + remote + ], # Store the history of which workers ran this bundle before in the bundle metadata. + }, } self.update_bundle(bundle, bundle_update, connection) @@ -989,6 +1001,8 @@ def transition_bundle_worker_offline(self, bundle): Transitions bundle to WORKER_OFFLINE state: Updates the last_updated metadata. Removes the corresponding row from worker_run if it exists. + + If the bundle was running on a preemptible worker, move the bundle to the STAGED state instead. """ with self.engine.begin() as connection: # Check that it still exists and is running @@ -1004,15 +1018,21 @@ def transition_bundle_worker_offline(self, bundle): # The user deleted the bundle or the bundle finished return False + if getattr(bundle.metadata, "on_preemptible_worker", False): + # If the bundle is running on a preemptible worker, move the bundle to the STAGED state instead.
+ bundle_update = { + 'state': State.STAGED, + 'metadata': {'last_updated': int(time.time())}, + } + else: + bundle_update = { + 'state': State.WORKER_OFFLINE, + 'metadata': {'last_updated': int(time.time())}, + } # Delete row in worker_run connection.execute( cl_worker_run.delete().where(cl_worker_run.c.run_uuid == bundle.uuid) ) - - bundle_update = { - 'state': State.WORKER_OFFLINE, - 'metadata': {'last_updated': int(time.time())}, - } self.update_bundle(bundle, bundle_update, connection) return True diff --git a/codalab/model/tables.py b/codalab/model/tables.py index 5f76327b7..a4ce5f40a 100644 --- a/codalab/model/tables.py +++ b/codalab/model/tables.py @@ -521,6 +521,7 @@ 'exit_after_num_runs', Integer, nullable=False ), # Number of jobs allowed to run on worker. Column('is_terminating', Boolean, nullable=False), + Column('preemptible', Boolean, nullable=False), # Whether worker is preemptible. mysql_charset=TABLE_DEFAULT_CHARSET, ) diff --git a/codalab/model/worker_model.py b/codalab/model/worker_model.py index ccbaae754..468b42b9d 100644 --- a/codalab/model/worker_model.py +++ b/codalab/model/worker_model.py @@ -52,6 +52,7 @@ def worker_checkin( tag_exclusive, exit_after_num_runs, is_terminating, + preemptible, ): """ Adds the worker to the database, if not yet there. Returns the socket ID @@ -69,6 +70,7 @@ def worker_checkin( 'tag_exclusive': tag_exclusive, 'exit_after_num_runs': exit_after_num_runs, 'is_terminating': is_terminating, + 'preemptible': preemptible, } # Populate the group for this worker, if group_name is valid @@ -206,6 +208,7 @@ def get_workers(self): 'tag_exclusive': row.tag_exclusive, 'exit_after_num_runs': row.exit_after_num_runs, 'is_terminating': row.is_terminating, + 'preemptible': row.preemptible, } for row in worker_rows } diff --git a/codalab/rest/workers.py b/codalab/rest/workers.py index bf5415d4d..57917d6d3 100644 --- a/codalab/rest/workers.py +++ b/codalab/rest/workers.py @@ -39,6 +39,7 @@ def checkin(worker_id): request.json.get("tag_exclusive", False), request.json.get("exit_after_num_runs", DEFAULT_EXIT_AFTER_NUM_RUNS), request.json.get("is_terminating", False), + request.json.get("preemptible", False), ) for run in request.json["runs"]: diff --git a/codalab/worker/main.py b/codalab/worker/main.py index eb9fc3b00..939bf3865 100644 --- a/codalab/worker/main.py +++ b/codalab/worker/main.py @@ -191,6 +191,9 @@ def parse_args(): default=1, help='The shared memory size of the run container in GB (defaults to 1).', ) + parser.add_argument( + '--preemptible', action='store_true', help='Whether the worker is preemptible.', + ) return parser.parse_args() @@ -328,6 +331,7 @@ def main(): delete_work_dir_on_exit=args.delete_work_dir_on_exit, exit_on_exception=args.exit_on_exception, shared_memory_size_gb=args.shared_memory_size_gb, + preemptible=args.preemptible, ) # Register a signal handler to ensure safe shutdown. 
diff --git a/codalab/worker/worker.py b/codalab/worker/worker.py index e564c27f4..85e276856 100644 --- a/codalab/worker/worker.py +++ b/codalab/worker/worker.py @@ -79,6 +79,7 @@ def __init__( # A flag indicating if the worker will exit if it encounters an exception exit_on_exception=False, # type: bool shared_memory_size_gb=1, # type: int + preemptible=False, # type: bool ): self.image_manager = image_manager self.dependency_manager = dependency_manager @@ -114,6 +115,7 @@ def __init__( self.terminate_and_restage = False self.pass_down_termination = pass_down_termination self.exit_on_exception = exit_on_exception + self.preemptible = preemptible self.checkin_frequency_seconds = checkin_frequency_seconds self.last_checkin_successful = False @@ -428,6 +430,7 @@ def checkin(self): 'tag_exclusive': self.tag_exclusive, 'exit_after_num_runs': self.exit_after_num_runs - self.num_runs, 'is_terminating': self.terminate or self.terminate_and_restage, + 'preemptible': self.preemptible, } try: response = self.bundle_service.checkin(self.id, request) diff --git a/codalab/worker/worker_run_state.py b/codalab/worker/worker_run_state.py index 008c8e2c1..5a19df0e6 100644 --- a/codalab/worker/worker_run_state.py +++ b/codalab/worker/worker_run_state.py @@ -344,8 +344,7 @@ def mount_dependency(dependency, shared_file_system): bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries, ) else: - remove_path(run_state.bundle_path) - os.makedirs(run_state.bundle_path) + os.makedirs(run_state.bundle_path, exist_ok=True) # 2) Set up symlinks docker_dependencies = [] diff --git a/codalab/worker_manager/main.py b/codalab/worker_manager/main.py index c696e4c23..20056f886 100644 --- a/codalab/worker_manager/main.py +++ b/codalab/worker_manager/main.py @@ -114,6 +114,11 @@ def main(): type=int, help="The shared memory size in GB of the run container started by the CodaLab Workers.", ) + parser.add_argument( + '--worker-preemptible', + action='store_true', + help='Whether the CodaLab workers are preemptible.', + ) subparsers = parser.add_subparsers( title='Worker Manager to run', description='Which worker manager to run (AWS Batch etc.)', diff --git a/codalab/worker_manager/worker_manager.py b/codalab/worker_manager/worker_manager.py index 7d9cf23fa..1ec7a0455 100644 --- a/codalab/worker_manager/worker_manager.py +++ b/codalab/worker_manager/worker_manager.py @@ -151,6 +151,8 @@ def build_command(self, worker_id: str, work_dir: str) -> List[str]: ) if self.args.worker_shared_memory_size_gb: command.extend(['--shared-memory-size-gb', str(self.args.worker_shared_memory_size_gb)]) + if self.args.worker_preemptible: + command.extend(['--preemptible']) return command diff --git a/codalab_service.py b/codalab_service.py index 347859e57..ec16ba077 100755 --- a/codalab_service.py +++ b/codalab_service.py @@ -41,6 +41,8 @@ 'worker-manager-cpu', 'worker-manager-gpu', 'worker2', + 'worker-preemptible', + 'worker-preemptible2', ] ALL_NO_SERVICES = [ @@ -59,6 +61,8 @@ 'monitor': 'server', 'worker': 'worker', 'worker2': 'worker', + 'worker-preemptible': 'worker', + 'worker-preemptible2': 'worker', } # Max timeout in seconds to wait for request to a service to get through @@ -953,6 +957,8 @@ def start_services(self): else: self.bring_up_service('worker') self.bring_up_service('worker2') + self.bring_up_service('worker-preemptible') + self.bring_up_service('worker-preemptible2') self.bring_up_service('monitor') diff --git a/docker_config/compose_files/docker-compose.yml b/docker_config/compose_files/docker-compose.yml index
b8df63118..3396a2ac5 100644 --- a/docker_config/compose_files/docker-compose.yml +++ b/docker_config/compose_files/docker-compose.yml @@ -407,6 +407,54 @@ services: - rest-server shm_size: '500mb' + worker-preemptible: + image: codalab/worker:${CODALAB_VERSION} + command: | + cl-worker + --server http://rest-server:${CODALAB_REST_PORT} + --work-dir ${CODALAB_WORKER_DIR} + --network-prefix ${CODALAB_WORKER_NETWORK_PREFIX} + --id ${HOSTNAME}-worker-preemptible + --verbose + --preemptible + --tag preemptible + <<: *codalab-base + <<: *codalab-root # Not ideal since worker files saved as root, but without it, can't use docker + depends_on: + - rest-server + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ${CODALAB_WORKER_DIR}:${CODALAB_WORKER_DIR} + networks: + - service + - worker + - rest-server + shm_size: '500mb' + + worker-preemptible2: + image: codalab/worker:${CODALAB_VERSION} + command: | + cl-worker + --server http://rest-server:${CODALAB_REST_PORT} + --work-dir ${CODALAB_WORKER_DIR} + --network-prefix ${CODALAB_WORKER_NETWORK_PREFIX} + --id ${HOSTNAME}-worker-preemptible2 + --verbose + --preemptible + --tag preemptible + <<: *codalab-base + <<: *codalab-root # Not ideal since worker files saved as root, but without it, can't use docker + depends_on: + - rest-server + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ${CODALAB_WORKER_DIR}:${CODALAB_WORKER_DIR} + networks: + - service + - worker + - rest-server + shm_size: '500mb' + worker-shared-file-system: image: codalab/worker:${CODALAB_VERSION} command: | diff --git a/docs/Checkpoints.md b/docs/Checkpoints.md new file mode 100644 index 000000000..55102cee7 --- /dev/null +++ b/docs/Checkpoints.md @@ -0,0 +1,14 @@ +# Checkpoints + +!!! warning "Unreleased feature" + This feature is still in development. It does not fully work yet and will only work once [#3710](https://github.com/codalab/codalab-worksheets/issues/3710) is resolved. + +CodaLab supports the concept of bundle checkpoints, which allow bundles to be stopped and then resumed on different workers. This is especially useful if you want to use preemptible compute nodes, such as [EC2 Spot](https://aws.amazon.com/ec2/spot/) instances. When a worker is preempted, the bundle should be resumed on another worker. + +## Workflow steps + +This section describes a sample workflow for running bundles on a pool of preemptible workers. The workflow uses the tag `preemptiblepool1` to organize compute, but you can change this tag name as needed. + +1. Start a worker manager that runs workers on preemptible compute nodes, using the following flags: `--worker-preemptible --worker-tag-exclusive --worker-tag="preemptiblepool1"`. Ensure that all the workers started by the worker manager are configured to share the same network disk for their bundle run working directory / dependency cache. +1. The user runs a bundle with `cl run --request-queue preemptiblepool1`. +1. The bundle manager will assign the bundle to one of the preemptible workers with tag `preemptiblepool1`. If the worker gets preempted, the bundle transitions back to the `STAGED` state and will then be assigned to another preemptible worker with tag `preemptiblepool1`. The history of workers where a bundle has run is stored in the `remote_history` bundle metadata field. diff --git a/docs/Execution.md b/docs/Execution.md index a5ad62498..eb9f91bec 100644 --- a/docs/Execution.md +++ b/docs/Execution.md @@ -97,7 +97,7 @@ You can tag workers and run jobs on workers with those tags.
To tag a worker, s To run a job, simply pass the tag in: - cl run date --request-queue tag= + cl run date --request-queue **Other flags**. Run `cl-worker --help` for information on all the supported flags. Aside from the `--server`, other important flags include `--work-dir` diff --git a/docs/Server-Setup.md b/docs/Server-Setup.md index 9b7b2b811..dc6dc0de6 100644 --- a/docs/Server-Setup.md +++ b/docs/Server-Setup.md @@ -125,6 +125,14 @@ Sometimes you may want to run two workers locally. In that case, you should run: ./codalab_service.py start -bds default worker2 +## Run a preemptible worker (for development / testing) + +If you want to run a preemptible worker locally (for development / testing), you should run: + + ./codalab_service.py start -bds worker-preemptible + +This worker will automatically run bundles with tag `preemptible`. + ## Azure Blob Storage To start the server in dev mode with Azurite (an Azure Blob Storage emulator) enabled, run: diff --git a/mkdocs.yml b/mkdocs.yml index c18ba9396..affdde4a6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -37,6 +37,7 @@ nav: - Execution: Execution.md - Competitions: Competitions.md - Storage Externalization: Externalization.md + - Checkpoints: Checkpoints.md - Reference: - CLI Reference: CLI-Reference.md - REST API Reference: REST-API-Reference.md diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index 9f61cf20d..275b99dba 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -141,7 +141,7 @@ def create_worker(context, user_id, worker_id, tag=None, group_name=None): # Creating a worker through cl-worker on the same instance can cause conflicts with existing workers, so instead # mimic the behavior of cl-worker --id [worker_id] --group [group_name], by leveraging the worker check-in. worker_model.worker_checkin( - user_id, worker_id, tag, group_name, 1, 0, 1000, 1000, {}, False, False, 100, False + user_id, worker_id, tag, group_name, 1, 0, 1000, 1000, {}, False, False, 100, False, False ) context.collect_worker(user_id, worker_id) @@ -1053,6 +1053,44 @@ def fetch_contents_blob_from_web_browser(uuid): assert response.headers['Location'].startswith("http://localhost") +@TestModule.register('preemptible') +def test_preemptible(ctx): + """Tests preemptible workers to ensure they are functioning properly. A bundle + that is preemptible should be run on a preemptible worker, and when that worker is killed, + should go back to staged and transfer to another worker. + + This test should only be called when the "worker-preemptible" and "worker-preemptible2" services are + running locally, and test-setup-preemptible.sh should be run first. See the GitHub Actions test file + "preemptible" test for an example of how to set up this test. + """ + uuid = _run_command( + [ + cl, + 'run', + 'bash -c "(mkdir checkpoint1 || mkdir checkpoint2) && sleep 120"', + '--request-queue', + 'preemptible', + ] + ) + # We run (mkdir checkpoint1 || mkdir checkpoint2) to ensure that the working directory is shared between + # worker runs for a preemptible bundle. The first worker this runs on, the directory "checkpoint1" should be created. + # The second worker should create the directory "checkpoint2" because "checkpoint1" should already exist in + # the working directory (so mkdir checkpoint1 will fail and thus mkdir checkpoint2 will run). 
+ + wait_until_state(uuid, State.RUNNING) + remote_preemptible_worker = get_info(uuid, 'remote') + check_equals("True", get_info(uuid, 'on_preemptible_worker')) + # Bundle should be killed by the test-setup-preemptible.sh script now. + # Wait for bundle to be re-assigned + + wait_until_state(uuid, State.READY) + # Bundle should have resumed on the other worker + check_not_equals(remote_preemptible_worker, get_info(uuid, 'remote')) + check_equals("True", get_info(uuid, 'on_preemptible_worker')) + check_contains("checkpoint1", _run_command([cl, 'cat', uuid])) + check_contains("checkpoint2", _run_command([cl, 'cat', uuid])) + + @TestModule.register('default_bundle_store') def test_upload_default_bundle_store(ctx): """Tests the CODALAB_DEFAULT_BUNDLE_STORE_NAME environment diff --git a/tests/test-setup-preemptible-background.sh b/tests/test-setup-preemptible-background.sh new file mode 100755 index 000000000..03e64c30d --- /dev/null +++ b/tests/test-setup-preemptible-background.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Called by test-setup-preemptible.sh and runs in the background. +# Waits until preemptible bundle is running, then stops the worker, +# and finally starts another preemptible worker where the bundle can finish +# running on. + +set -e + +cl work localhost:: + +while : ; do + echo ">> cl search request_queue=preemptible state=running .count" + num_bundles=$(cl search request_queue=preemptible state=running .count) + echo $num_bundles + if [[ $num_bundles -ne 0 ]]; then + echo "Bundle is running! Stopping the worker and bundle container now." + echo ">> docker kill codalab_worker-preemptible_1" + docker kill codalab_worker-preemptible_1 + run_container=$(docker ps -f name=codalab_run -q) + echo ">> docker kill $run_container && docker rm $run_container" + docker kill $run_container && docker rm $run_container + echo "Worker stopped successfully. Starting another preemptible worker in 1 minute..." + sleep 60 + echo ">> docker start codalab_worker-preemptible2_1" + docker start codalab_worker-preemptible2_1 + break + else + echo "No bundles running" + fi + sleep 1 +done \ No newline at end of file diff --git a/tests/test-setup-preemptible.sh b/tests/test-setup-preemptible.sh new file mode 100755 index 000000000..5e9f810a3 --- /dev/null +++ b/tests/test-setup-preemptible.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Performs setup for preemptible test (used in preemptible test in test_cli.py). 
+ +set -e + +cl work localhost:: + +if [[ $(cl search request_queue=preemptible .count) -ne 0 ]]; then + echo "Cleaning up old bundles" + cl rm --force $(cl search request_queue=preemptible -u) +else + echo "No bundles to clean up" +fi + +echo ">> docker kill codalab_worker-preemptible2_1" +docker kill codalab_worker-preemptible2_1 + +__dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CODALAB_USERNAME=$CODALAB_USERNAME CODALAB_PASSWORD=$CODALAB_PASSWORD timeout 5m ${__dir}/test-setup-preemptible-background.sh & diff --git a/tests/unit/rest/bundles_test.py b/tests/unit/rest/bundles_test.py index cf83c8840..e361a88d0 100644 --- a/tests/unit/rest/bundles_test.py +++ b/tests/unit/rest/bundles_test.py @@ -15,6 +15,7 @@ def test_create(self): 'command': 'echo TEST', 'metadata': { 'name': 'run-echo', + 'remote_history': [], 'description': '', 'tags': [''], 'allow_failed_dependencies': False, @@ -60,6 +61,7 @@ def test_create(self): "actions": [], "created": 1326499200, "name": "run-echo", + "remote_history": [], "request_time": "", "request_cpus": 1, "request_gpus": 0, diff --git a/tests/unit/rest/workers_test.py b/tests/unit/rest/workers_test.py index 02588be75..50e54daeb 100644 --- a/tests/unit/rest/workers_test.py +++ b/tests/unit/rest/workers_test.py @@ -17,6 +17,7 @@ def test_checkin(self): 'tag_exclusive': False, 'exit_after_num_runs': 999999999, 'is_terminating': False, + 'preemptible': True, } response = self.app.post_json('/rest/workers/test_worker/checkin', body) self.assertEqual(response.status_int, 200) diff --git a/tests/unit/server/bundle_manager/__init__.py b/tests/unit/server/bundle_manager/__init__.py index d9cb61bd8..d524cab26 100644 --- a/tests/unit/server/bundle_manager/__init__.py +++ b/tests/unit/server/bundle_manager/__init__.py @@ -260,6 +260,7 @@ def mock_worker_checkin( tag_exclusive=False, exit_after_num_runs=999999999, is_terminating=False, + preemptible=False, ) # Mock a reply from the worker self.bundle_manager._worker_model.send_json_message = Mock(return_value=True) diff --git a/tests/unit/server/bundle_manager/schedule_run_bundles_test.py b/tests/unit/server/bundle_manager/schedule_run_bundles_test.py index 271f7132e..c9f1defc8 100644 --- a/tests/unit/server/bundle_manager/schedule_run_bundles_test.py +++ b/tests/unit/server/bundle_manager/schedule_run_bundles_test.py @@ -71,6 +71,19 @@ def test_finalizing_bundle_goes_offline_if_no_worker_claims(self): bundle = self.bundle_manager._model.get_bundle(bundle.uuid) self.assertEqual(bundle.state, State.WORKER_OFFLINE) + def test_reassign_stuck_running_preemptible_bundles(self): + """If no workers exist to claim a bundle, and the bundle is running on a preemptible worker, it should go to the STAGED state in preparation for being reassigned to another worker.""" + bundle = self.create_run_bundle( + State.RUNNING, + {"on_preemptible_worker": True, "remote_history": ["remote1"], "remote": "remote1"}, + ) + self.save_bundle(bundle) + self.bundle_manager._schedule_run_bundles() + + bundle = self.bundle_manager._model.get_bundle(bundle.uuid) + self.assertEqual(bundle.state, State.STAGED) + self.assertEqual(bundle.metadata.remote_history, ["remote1"]) + def test_finalizing_bundle_gets_finished(self): """If a worker checks in with a "finalizing" message, the bundle should transition to the FINALIZING and then FINISHED state.""" diff --git a/tests/unit/worker_manager/slurm_batch_worker_manager_test.py b/tests/unit/worker_manager/slurm_batch_worker_manager_test.py index 7b4d40ff9..ceba92595 100644 --- 
a/tests/unit/worker_manager/slurm_batch_worker_manager_test.py +++ b/tests/unit/worker_manager/slurm_batch_worker_manager_test.py @@ -30,6 +30,7 @@ def test_base_command(self): slurm_work_dir=None, exit_after_num_failed=None, worker_shared_memory_size_gb=10, + worker_preemptible=False, ) worker_manager: SlurmBatchWorkerManager = SlurmBatchWorkerManager(args) @@ -72,6 +73,7 @@ def test_filter_bundles(self): cpus=3, gpus=1, worker_shared_memory_size_gb=None, + worker_preemptible=False, ) worker_manager: SlurmBatchWorkerManager = SlurmBatchWorkerManager(args)