Skip to content

Commit

Permalink
ci: wart removal
Browse files Browse the repository at this point in the history
We run some CLI list and describe commands after every single
experiment, as a "to sanity check that basic CLI commands don't raise
errors".  That has made e2e test logs unreadable since the day it
was added.  For now, we can keep the tests, but don't dump stdout and
stderr into the test logs.  Some day, we should figure out what
codepaths are being tested passively, and write proper tests for them.

Also, the test_task_logs has been failing intermittently with a timeout
but no error message for weeks.  Instead of using pytest.mark.timeout(),
which can't be caught, implement our own timeout logic, and only dump
stdout and stderr if the cli crashes.
  • Loading branch information
rb-determined-ai committed Oct 26, 2023
1 parent 3a24611 commit df9f0d3
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 18 deletions.
43 changes: 31 additions & 12 deletions e2e_tests/tests/cluster/test_logging.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import functools
import re
import socket
import sys
import threading
from typing import Any, Callable, Dict, Iterable, Optional, Union

import pytest
Expand Down Expand Up @@ -59,7 +60,6 @@ def test_trial_logs() -> None:
@pytest.mark.e2e_cpu_elastic
@pytest.mark.e2e_cpu_cross_version
@pytest.mark.e2e_gpu # Note, e2e_gpu and not gpu_required hits k8s cpu tests.
@pytest.mark.timeout(5 * 60)
@pytest.mark.parametrize(
"task_type,task_config,log_regex",
[
Expand Down Expand Up @@ -105,13 +105,34 @@ def task_log_fields(follow: Optional[bool] = None) -> Iterable[LogFields]:
return bindings.get_TaskLogsFields(session, taskId=task_id, follow=follow)

try:
check_logs(
log_regex,
functools.partial(api.task_logs, session, task_id),
functools.partial(bindings.get_TaskLogsFields, session, taskId=task_id),
)
except socket.timeout:
raise TimeoutError(f"timed out waiting for {task_type} with id {task_id}")
result: Optional[Exception] = None

def do_check_logs() -> None:
nonlocal result
try:
check_logs(
log_regex,
functools.partial(api.task_logs, session, task_id),
functools.partial(bindings.get_TaskLogsFields, session, taskId=task_id),
)
except Exception as e:
result = e

thread = threading.Thread(target=do_check_logs, daemon=True)
thread.start()
thread.join(timeout=5 * 60)
if thread.is_alive():
# The thread did not exit
raise ValueError("do_check_logs timed out")
elif isinstance(result, Exception):
# There was a failure on the thread.
raise result
except Exception:
print("============= test_task_logs_failed, logs from task =============")
for log in task_logs():
print(log.log, end="", file=sys.stderr)
print("============= end of task logs =============")
raise

finally:
command._kill(master_url, task_type, task_id)
Expand All @@ -127,9 +148,7 @@ def check_logs(
if log_regex.match(log.message):
break
else:
for log in log_fn(follow=True):
print(log.message)
pytest.fail("ran out of logs without a match")
raise ValueError("ran out of logs without a match")

# Just make sure these calls 200 and return some logs.
assert any(log_fn(tail=10)), "tail returned no logs"
Expand Down
29 changes: 23 additions & 6 deletions e2e_tests/tests/experiment/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,24 @@ def assert_performed_final_checkpoint(exp_id: int) -> None:
last_workload_matches_last_checkpoint(trials[0].workloads)


def run_cmd_and_print_on_error(cmd: List[str]) -> None:
"""
We run some commands to make sure they work, but we don't need their output polluting the logs.
"""
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
ret = p.wait()
if ret != 0:
print(f"cmd failed: {cmd} exited {ret}", file=sys.stderr)
print("====== stdout from failed command ======", file=sys.stderr)
print(out.decode("utf8"), file=sys.stderr)
print("====== end of stdout ======", file=sys.stderr)
print("====== stderr from failed command ======", file=sys.stderr)
print(err.decode("utf8"), file=sys.stderr)
print("====== end of stderr ======", file=sys.stderr)
raise ValueError(f"cmd failed: {cmd} exited {ret}")


def run_describe_cli_tests(experiment_id: int) -> None:
"""
Runs `det experiment describe` CLI command on a finished
Expand All @@ -627,7 +645,7 @@ def run_describe_cli_tests(experiment_id: int) -> None:
"""
# "det experiment describe" without metrics.
with tempfile.TemporaryDirectory() as tmpdir:
subprocess.check_call(
run_cmd_and_print_on_error(
[
"det",
"-m",
Expand All @@ -646,7 +664,7 @@ def run_describe_cli_tests(experiment_id: int) -> None:

# "det experiment describe" with metrics.
with tempfile.TemporaryDirectory() as tmpdir:
subprocess.check_call(
run_cmd_and_print_on_error(
[
"det",
"-m",
Expand All @@ -671,14 +689,13 @@ def run_list_cli_tests(experiment_id: int) -> None:
exception if the CLI command encounters a traceback failure.
"""

subprocess.check_call(
run_cmd_and_print_on_error(
["det", "-m", conf.make_master_url(), "experiment", "list-trials", str(experiment_id)]
)

subprocess.check_call(
run_cmd_and_print_on_error(
["det", "-m", conf.make_master_url(), "experiment", "list-checkpoints", str(experiment_id)]
)
subprocess.check_call(
run_cmd_and_print_on_error(
[
"det",
"-m",
Expand Down

0 comments on commit df9f0d3

Please sign in to comment.