Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Generic Tasks #8724

Merged
merged 90 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from 82 commits
Commits
Show all changes
90 commits
Select commit Hold shift + click to select a range
7294fa7
Create Generic Task
Jan 18, 2024
39069cd
Get Task Config
Jan 18, 2024
8897866
Fork Task
Jan 18, 2024
0b28abb
Create Child Task
Jan 18, 2024
b361a0a
small fix
Jan 19, 2024
5ea39bb
fix fork tree merge
Jan 22, 2024
fe11347
Kill task
Jan 22, 2024
623f8fc
fix kill merge
Jan 22, 2024
0952d88
remove duplicates
Jan 23, 2024
d1f7295
fix slots
Jan 23, 2024
321cac6
error for non-generic tasks
Jan 23, 2024
6856d57
fix bindings and kill call
Jan 24, 2024
5ace3d1
Pause & Resume Tasks
Jan 25, 2024
dca9970
pause/resume merge fixes
Jan 25, 2024
cdebb73
bindings clean build
Jan 25, 2024
7794cfa
Apply suggestions from code review
AmanuelAaron Jan 26, 2024
2c81f01
remove duplicate test
Jan 26, 2024
48aaa5c
unify migrations
Jan 26, 2024
cdf8eff
Merge branch 'generic-task-final' of github.com:determined-ai/determi…
Jan 26, 2024
ce9e809
formatting
Jan 29, 2024
6ed6749
hide from -h
Jan 29, 2024
59aa76d
pause -> unpause
Jan 29, 2024
9269b1e
Merge branch 'main' into generic-task-final
Jan 29, 2024
1050216
Create Generic Task
Jan 18, 2024
6cdcadb
Get Task Config
Jan 18, 2024
e2dea62
Fork Task
Jan 18, 2024
6a3a88c
Create Child Task
Jan 18, 2024
89d7390
small fix
Jan 19, 2024
ca719f7
fix fork tree merge
Jan 22, 2024
eee37b2
Kill task
Jan 22, 2024
6f22886
fix kill merge
Jan 22, 2024
5943408
remove duplicates
Jan 23, 2024
018f7f2
fix slots
Jan 23, 2024
8904900
error for non-generic tasks
Jan 23, 2024
2f35942
fix bindings and kill call
Jan 24, 2024
9792cab
Pause & Resume Tasks
Jan 25, 2024
ffcb7bf
pause/resume merge fixes
Jan 25, 2024
2509e5c
bindings clean build
Jan 25, 2024
bfa0ac7
remove duplicate test
Jan 26, 2024
0b928d8
unify migrations
Jan 26, 2024
336391b
Apply suggestions from code review
AmanuelAaron Jan 26, 2024
7b982c9
formatting
Jan 29, 2024
99a8cb6
hide from -h
Jan 29, 2024
48ae5e9
pause -> unpause
Jan 29, 2024
816bdc0
Merge branch 'generic-task-final' of github.com:determined-ai/determi…
Jan 29, 2024
2d8f64b
Merge branch 'main' into generic-task-final
AmanuelAaron Jan 29, 2024
c7fcd1b
fix single node check and naming
Jan 29, 2024
3d947b0
fix injection
Jan 29, 2024
75aacb2
lint + test fixes
Jan 29, 2024
8c60a09
fixes
Jan 29, 2024
11055fd
no pause in task.py
Jan 29, 2024
81dcb1a
go lint
Jan 29, 2024
21f105c
Merge branch 'main' into generic-task-final
Jan 29, 2024
005c302
add e2e test for generic tasks
Jan 30, 2024
e4f5967
isort test
Jan 30, 2024
d03ca94
fix test
Jan 30, 2024
4608633
shorten test task run time
Jan 31, 2024
b2a6b1e
single node default true in api_command
Jan 31, 2024
c41ca37
move to Bun
Jan 31, 2024
b17fb78
Merge branch 'main' into generic-task-final
AmanuelAaron Jan 31, 2024
94e3fc5
fix pb
Jan 31, 2024
8f490f4
lint migrations
Feb 1, 2024
81910af
Merge branch 'main' into generic-task-final
Feb 1, 2024
86d5077
fix missing order by
Feb 1, 2024
84c513d
fix intg test
Feb 1, 2024
6c940de
Apply suggestions from code review
AmanuelAaron Feb 2, 2024
714c6d2
Merge branch 'main' into generic-task-final
Feb 2, 2024
81e47a6
fix merge
Feb 2, 2024
7fce67f
fix style
Feb 2, 2024
9b2876d
add task state check
Feb 2, 2024
6c36667
fix comment
Feb 5, 2024
be9d7c4
update migrations
Feb 5, 2024
c0cf141
fix todo
Feb 5, 2024
ce8f7b4
remove action for get task config
Feb 5, 2024
f0395e5
update e2e test
Feb 8, 2024
4104029
reduce test fixture time
Feb 8, 2024
3baef06
remove run python file
Feb 8, 2024
b9917df
add completion tests
Feb 8, 2024
4492c65
fix imports
Feb 8, 2024
5cfd6c3
test lint
Feb 8, 2024
b5a63f4
Merge branch 'main' into generic-task-final
Feb 9, 2024
3916141
update tests
Feb 9, 2024
0bf0bf1
Merge branch 'main' into generic-task-final
Feb 9, 2024
6f19bd2
fix api.ts conflict
Feb 9, 2024
2a4676a
send dummy job for generic tasks
Feb 9, 2024
23f7ad4
set weight
Feb 9, 2024
904c486
Merge branch 'main' into generic-task-final
AmanuelAaron Feb 9, 2024
1c77dd7
lint migrations
Feb 9, 2024
b7263a1
Merge branch 'generic-task-final' of github.com:determined-ai/determi…
Feb 9, 2024
aae7fee
implement v1Job
Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions e2e_tests/tests/fixtures/generic_task/test_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
entrypoint: ["echo", "task ran"]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
entrypoint: ["exit", "1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
entrypoint: ["echo", "forked"]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
entrypoint: ["sleep", "5"]
Empty file.
256 changes: 256 additions & 0 deletions e2e_tests/tests/task/test_generic_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
import subprocess
import time

import pytest

from determined.cli import ntsc
from determined.common import api, util
from determined.common.api import bindings
from tests import api_utils
from tests import config as conf


def wait_for_task_state(
    test_session: api.Session,
    task_id: str,
    expected_state: bindings.v1GenericTaskState,
    timeout: int,
) -> bool:
    """
    Poll a task until it reports the expected state or the timeout elapses.

    Args:
        test_session: session used to issue the GetTask requests.
        task_id: id of the task to poll.
        expected_state: task state to wait for.
        timeout: maximum time to keep polling, in seconds.

    Returns:
        True if the task was observed in expected_state before the deadline, False otherwise.
    """
    # A monotonic clock keeps the timeout correct even if the wall clock is adjusted mid-wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = bindings.get_GetTask(test_session, taskId=task_id)
        if resp.task.taskState == expected_state:
            return True
        # Short pause between polls so the master is not hammered with requests.
        time.sleep(0.1)
    return False


@pytest.mark.e2e_cpu
def test_create_generic_task() -> None:
    """
    Create a task through the ``det task create`` CLI, passing a context directory.

    Only the CLI's exit status is checked (``check=True``); the state the task ends up in
    is not inspected here.
    """
    master_url = conf.make_master_url()
    config_path = conf.fixtures_path("generic_task/test_config.yaml")
    context_dir = conf.fixtures_path("generic_task")

    cli_args = ["det", "-m", master_url, "task", "create", config_path]
    cli_args += ["--context", context_dir]

    subprocess.run(cli_args, universal_newlines=True, stdout=subprocess.PIPE, check=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it guaranteed that det task create will fail if the task crashes?

That's not guaranteed for det cmd run, where the exit code of det cmd run reflects whether there was a failure in the CLI itself, not whether there was a failure in the command that ran.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(question applies throughout)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

det task create will only fail if the CLI returns an error. Tasks that crash or error out will have their task state be ERROR. The same with det task fork.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, that's good, because it's consistent with how other CLI commands run in our system.

Should this test then make sure that the task completed successfully? If it was properly created, it should not fail, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests added to check task completion and failure



@pytest.mark.e2e_cpu
def test_generic_task_completion() -> None:
    """
    Submit a simple task through the API and verify it reaches the COMPLETED state.
    """
    sess = api_utils.determined_test_session()

    # The fixture config is sent to the master verbatim as text.
    config_path = conf.fixtures_path("generic_task/test_config.yaml")
    with open(config_path, "r") as fixture:
        raw_config = fixture.read()

    create_req = bindings.v1CreateGenericTaskRequest(
        config=raw_config,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    created = bindings.post_CreateGenericTask(sess, body=create_req)

    # Poll for up to 30 seconds; bail out early on success.
    reached_completed = wait_for_task_state(
        sess, created.taskId, bindings.v1GenericTaskState.COMPLETED, timeout=30
    )
    if reached_completed:
        return
    pytest.fail("task failed to complete after 30 seconds")


@pytest.mark.e2e_cpu
def test_create_generic_task_error() -> None:
    """
    Start a simple task that fails and check that it ends up in the ERROR task state
    """
    test_session = api_utils.determined_test_session()

    with open(conf.fixtures_path("generic_task/test_config_error.yaml"), "r") as config_file:
        # Create task
        config_text = config_file.read()

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Check for error state
    is_valid_state = wait_for_task_state(
        test_session, task_resp.taskId, bindings.v1GenericTaskState.ERROR, timeout=30
    )
    if not is_valid_state:
        # This test waits for ERROR, not completion, so the message must say so.
        pytest.fail("task failed to reach ERROR state after 30 seconds")


@pytest.mark.e2e_cpu
def test_generic_task_config() -> None:
    """
    Start a simple task without a context directory and grab its config

    The config is fetched with ``det task config`` and compared against the contents of the
    fixture that was submitted.
    """
    test_session = api_utils.determined_test_session()

    with open(conf.fixtures_path("generic_task/test_config.yaml"), "r") as config_file:
        # Create task
        config_text = config_file.read()

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Get config
    command = ["det", "-m", conf.make_master_url(), "task", "config", task_resp.taskId]

    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)

    # The CLI prints the config as YAML on stdout; parse it back for a structural comparison.
    result_config = util.yaml_safe_load(res.stdout)
    expected_config = {"entrypoint": ["echo", "task ran"]}
    assert result_config == expected_config


@pytest.mark.e2e_cpu
def test_generic_task_create_with_fork() -> None:
    """
    Create a task, fork it with a different config, and check the fork's config

    The fork is created by passing the first task's id as ``forkedFrom``; the config reported
    by ``det task config`` for the fork must match the fork fixture.
    """
    test_session = api_utils.determined_test_session()

    with open(conf.fixtures_path("generic_task/test_config.yaml"), "r") as config_file:
        # Create initial task
        config = ntsc.parse_config(config_file, None, [], [])
        config_text = util.yaml_safe_dump(config)

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Create fork task
    with open(conf.fixtures_path("generic_task/test_config_fork.yaml"), "r") as fork_config_file:
        config = ntsc.parse_config(fork_config_file, None, [], [])
        config_text = util.yaml_safe_dump(config)

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=task_resp.taskId,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    fork_task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Get fork task Config
    command = ["det", "-m", conf.make_master_url(), "task", "config", fork_task_resp.taskId]

    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)
    result_config = util.yaml_safe_load(res.stdout)
    expected_config = {"entrypoint": ["echo", "forked"]}
    assert result_config == expected_config


@pytest.mark.e2e_cpu
def test_kill_generic_task() -> None:
    """
    Start a simple task, kill it with ``det task kill``, and check that it is CANCELED
    """
    test_session = api_utils.determined_test_session()

    with open(conf.fixtures_path("generic_task/test_config.yaml"), "r") as config_file:
        # Create task
        config = ntsc.parse_config(config_file, None, [], [])
        config_text = util.yaml_safe_dump(config)

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Kill task
    command = ["det", "-m", conf.make_master_url(), "task", "kill", task_resp.taskId]

    subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)

    # NOTE(review): test_config.yaml runs a short `echo`; confirm the task cannot finish
    # before the kill is processed, otherwise the reported state may not be CANCELED.
    kill_resp = bindings.get_GetTask(test_session, taskId=task_resp.taskId)
    assert kill_resp.task.taskState == bindings.v1GenericTaskState.CANCELED


@pytest.mark.e2e_cpu
def test_pause_and_unpause_generic_task() -> None:
    """
    Start a task, pause it and then unpause it via the CLI, checking the state after each step

    The task must report PAUSED after ``det task pause`` and ACTIVE after ``det task unpause``.
    """
    test_session = api_utils.determined_test_session()

    # NOTE(review): presumably this fixture runs long enough to be paused mid-run; confirm.
    with open(conf.fixtures_path("generic_task/test_config_pause.yaml"), "r") as config_file:
        # Create task
        config = ntsc.parse_config(config_file, None, [], [])
        config_text = util.yaml_safe_dump(config)

    req = bindings.v1CreateGenericTaskRequest(
        config=config_text,
        contextDirectory=[],
        projectId=None,
        forkedFrom=None,
        parentId=None,
        inheritContext=False,
        noPause=False,
    )
    task_resp = bindings.post_CreateGenericTask(test_session, body=req)

    # Pause task
    command = ["det", "-m", conf.make_master_url(), "task", "pause", task_resp.taskId]

    subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)

    pause_resp = bindings.get_GetTask(test_session, taskId=task_resp.taskId)
    assert pause_resp.task.taskState == bindings.v1GenericTaskState.PAUSED

    # Unpause task
    command = ["det", "-m", conf.make_master_url(), "task", "unpause", task_resp.taskId]

    subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)

    unpause_resp = bindings.get_GetTask(test_session, taskId=task_resp.taskId)
    assert unpause_resp.task.taskState == bindings.v1GenericTaskState.ACTIVE
Loading
Loading