Skip to content

Commit

Permalink
Merge pull request #42 from troycomi/issue-41
Browse files Browse the repository at this point in the history
fix: memory efficiency for multi-task jobs
  • Loading branch information
troycomi authored Sep 14, 2023
2 parents 48acc96 + 346c806 commit 60d80b8
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 309 deletions.
88 changes: 88 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def to_sacct_dict(sacct_line: str) -> dict:
"State",
"Timelimit",
"TotalCPU",
"NTasks",
)
return dict(zip(columns, sacct_line.split("|")))

Expand Down Expand Up @@ -316,3 +317,90 @@ def multinode_job():
"|36|12-14:16:39|6196869.batch|6196869.batch|33824748K|1|191846Mn|COMPLETED||451-06:00:24"
),
]


@pytest.fixture
def issue_41():
    """Sacct records for job 131042, the multi-task job reported in issue 41.

    One record is produced per sacct line: the job allocation followed by
    its ``.batch``, ``.extern`` and ``.0`` steps.
    """
    # NOTE(review): the trailing "8" on each line is presumably the NTasks
    # column appended to to_sacct_dict's column list -- confirm against it.
    sacct_lines = (
        "|8|00:00:53|131042|131042||1|16000M|COMPLETED|00:01:00|06:57.815|8",
        "|8|00:00:53|131042.batch|131042.batch|20264K|1||COMPLETED||00:00.034|8",
        "|8|00:00:53|131042.extern|131042.extern|1052K|1||COMPLETED||00:00.001|8",
        "|8|00:00:53|131042.0|131042.0|1947276K|1||COMPLETED||06:57.779|8",
    )
    return [to_sacct_dict(sacct_line) for sacct_line in sacct_lines]


@pytest.fixture
def console_jobs():
    """Return raw sacct output strings for test_reportseff, keyed by job id.

    Each value is one string holding one line per job record (the job itself
    and its ``.batch`` / ``.extern`` / numbered steps), with fields separated
    by ``^|^`` and every line terminated by a newline.
    """

    # indexed on job id; keys ending in "_notime" carry one field fewer than
    # the plain-id variant of the same job (presumably the Timelimit column
    # is omitted -- confirm against the queries issued in test_reportseff).
    return {
        "25569410_notime": (
            "^|^1^|^21:14:48^|^25569410^|^25569410^|^^|^1^|^1^|^4000Mc^|^"
            "COMPLETED^|^19:28:36\n"
            "^|^1^|^21:14:49^|^25569410.extern^|^25569410.extern^|^1548K^|^"
            "1^|^1^|^4000Mc^|^COMPLETED^|^00:00:00\n"
            "^|^1^|^21:14:43^|^25569410.0^|^25569410.0^|^62328K"
            "^|^1^|^1^|^4000Mc^|^COMPLETED^|^19:28:36\n"
        ),
        "24418435_notime": (
            "^|^1^|^01:27:42^|^24418435^|^24418435^|^^|^1^|^1^|^1Gn^|^"
            "COMPLETED^|^01:27:29\n"
            "^|^1^|^01:27:42^|^24418435.batch^|^24418435.batch^|^499092K^|^"
            "1^|^1^|^1Gn^|^COMPLETED^|^01:27:29\n"
            "^|^1^|^01:27:42^|^24418435.extern^|^24418435.extern^|^1376K^|^"
            "1^|^1^|^1Gn^|^COMPLETED^|^00:00:00\n"
        ),
        "24418435": (
            "^|^1^|^01:27:42^|^24418435^|^24418435^|^^|^1^|^1^|^1Gn^|^"
            "COMPLETED^|^03:00:00^|^01:27:29\n"
            "^|^1^|^01:27:42^|^24418435.batch^|^24418435.batch^|^499092K^|^"
            "1^|^1^|^1Gn^|^COMPLETED^|^^|^01:27:29\n"
            "^|^1^|^01:27:42^|^24418435.extern^|^24418435.extern^|^1376K^|^"
            "1^|^1^|^1Gn^|^COMPLETED^|^^|^00:00:00\n"
        ),
        # single-record job that was cancelled
        "23000233": (
            "^|^16^|^00:00:00^|^23000233^|^23000233^|^^|^1^|^1^|^4000Mc^|^"
            "CANCELLED by 129319^|^6-00:00:00^|^00:00:00\n"
        ),
        # the next two entries use an underscore-suffixed id in the first id
        # field (24220929_NNN) and the dictionary key in the second id field
        "24221219": (
            "^|^1^|^00:09:34^|^24220929_421^|^24221219^|^^|^1^|^1^|^16000Mn^|^"
            "COMPLETED^|^09:28.052\n"
            "^|^1^|^00:09:34^|^24220929_421.batch^|^24221219.batch"
            "^|^5664932K^|^1^|^1^|^16000Mn^|^COMPLETED^|^09:28.051\n"
            "^|^1^|^00:09:34^|^24220929_421.extern^|^24221219.extern"
            "^|^1404K^|^1^|^1^|^16000Mn^|^COMPLETED^|^00:00:00\n"
        ),
        "24221220": (
            "^|^1^|^00:09:33^|^24220929_431^|^24221220^|^^|^1^|^1^|^16000Mn^|^"
            "PENDING^|^09:27.460\n"
            "^|^1^|^00:09:33^|^24220929_431.batch^|^24221220.batch"
            "^|^5518572K^|^1^|^1^|^16000Mn^|^PENDING^|^09:27.459\n"
            "^|^1^|^00:09:33^|^24220929_431.extern^|^24221220.extern"
            "^|^1400K^|^1^|^1^|^16000Mn^|^PENDING^|^00:00:00\n"
        ),
        # failed jobs: the job and .batch records are FAILED while the
        # .extern record is COMPLETED
        "23000381": (
            "^|^8^|^00:00:12^|^23000381^|^23000381^|^^|^1^|^1^|^4000Mc^|^FAILED^|^00:00:00\n"
            "^|^8^|^00:00:12^|^23000381.batch^|^23000381.batch^|^^|^1^|^1^|^4000Mc^|^"
            "FAILED^|^00:00:00\n"
            "^|^8^|^00:00:12^|^23000381.extern^|^23000381.extern^|^1592K^|^1^|^1^|^4000Mc^|^"
            "COMPLETED^|^00:00:00\n"
        ),
        "23000210": (
            "^|^8^|^00:00:00^|^23000210^|^23000210^|^^|^1^|^1^|^20000Mn^|^"
            "FAILED^|^00:00.007\n"
            "^|^8^|^00:00:00^|^23000210.batch^|^23000210.batch^|^1988K^|^1^|^1^|^20000Mn^|^"
            "FAILED^|^00:00.006\n"
            "^|^8^|^00:00:00^|^23000210.extern^|^23000210.extern^|^1556K^|^1^|^1^|^20000Mn^|^"
            "COMPLETED^|^00:00:00\n"
        ),
    }
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "reportseff"
version = "2.7.5"
version = "2.7.6"
description= "Tablular seff output"
authors = ["Troy Comi <[email protected]>"]
license = "MIT"
Expand Down
4 changes: 2 additions & 2 deletions src/reportseff/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def __init__(self, job: str, jobid: str, filename: Optional[str]) -> None:
self.time: Optional[str] = "---"
self.time_eff: Union[str, float] = "---"
self.cpu: Optional[Union[str, float]] = "---"
self.mem: Union[str, float] = "---"
self.state: Optional[str] = None
self.mem_eff: Optional[float] = None
self.gpu: Optional[float] = None
Expand Down Expand Up @@ -116,7 +115,8 @@ def update(self, entry: Dict) -> None:
if k not in self.other_entries or not self.other_entries[k]:
self.other_entries[k] = value
mem = parsemem(entry["MaxRSS"]) if "MaxRSS" in entry else 0
self.stepmem = max(self.stepmem, mem)
tasks = int(entry.get("NTasks", 1))
self.stepmem = max(self.stepmem, mem * tasks)

if "TRESUsageOutAve" in entry:
self.energy = max(
Expand Down
2 changes: 1 addition & 1 deletion src/reportseff/output_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(
# values derived from other values, list includes all dependent values
self.derived: Dict[str, List] = {
"CPUEff": ["TotalCPU", "AllocCPUS", "Elapsed"],
"MemEff": ["REQMEM", "NNodes", "AllocCPUS", "MaxRSS"],
"MemEff": ["REQMEM", "NNodes", "AllocCPUS", "MaxRSS", "NTasks"],
"TimeEff": ["Elapsed", "Timelimit"],
"GPU": [],
"GPUMem": [],
Expand Down
16 changes: 14 additions & 2 deletions tests/test_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def test_job_init(job):
assert job.totalmem is None
assert job.time == "---"
assert job.cpu == "---"
assert job.mem == "---"
assert job.state is None


Expand Down Expand Up @@ -250,7 +249,7 @@ def test_update_part_job():
"Elapsed": "00:10:00",
"MaxRSS": "495644K",
"NNodes": "1",
"NTasks": "",
"NTasks": "1",
}
)
assert job.state is None
Expand Down Expand Up @@ -765,3 +764,16 @@ def test_multinode_job(multinode_job):
job.update(line)

assert job.cpu == 5.0


def test_multinode_job_issue_41(issue_41):
    """Regression test for issue 41, a job that runs multiple tasks.

    Memory efficiency used to be reported incorrectly for such jobs; after
    feeding every sacct record to the job, both the CPU efficiency and the
    "MemEff" entry must match the expected percentages.
    """
    job = job_module.Job("131042", "131042", None)
    for sacct_record in issue_41:
        job.update(sacct_record)

    assert job.cpu == 98.3
    assert job.get_entry("MemEff") == 95.1
4 changes: 2 additions & 2 deletions tests/test_output_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def test_renderer_init(renderer):
assert sorted(renderer.query_columns) == sorted(
(
"JobID JobIDRaw State Elapsed TotalCPU "
"AllocCPUS REQMEM NNodes MaxRSS AdminComment"
"AllocCPUS REQMEM NNodes NTasks MaxRSS AdminComment"
).split()
)

Expand Down Expand Up @@ -277,7 +277,7 @@ def test_renderer_correct_columns(renderer):
(
"JobID TotalCPU Elapsed REQMEM"
" JobIDRaw State AdminComment"
" NNodes AllocCPUS MaxRSS Timelimit"
" NNodes NTasks AllocCPUS MaxRSS Timelimit"
).split()
)

Expand Down
Loading

0 comments on commit 60d80b8

Please sign in to comment.