Skip to content

Commit

Permalink
fix: profile import should not overwrite existing profile by default
Browse files Browse the repository at this point in the history
This PR also simplifies the torchx run to use a smaller base image, one that we can pin. (We had been using the torchx 0.5.0dev0 stream, which is highly unstable and was causing test stability issues.)
  • Loading branch information
starpit committed Mar 16, 2023
1 parent a4eb426 commit 632837f
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 47 deletions.
34 changes: 17 additions & 17 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions plugins/plugin-codeflare/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@
"@types/split2": "^3.2.1"
},
"dependencies": {
"@guidebooks/store": "^6.1.4",
"@guidebooks/store": "^6.1.6",
"@logdna/tail-file": "^3.0.1",
"@patternfly/react-charts": "^6.94.18",
"@patternfly/react-core": "^4.276.6",
"asciinema-player": "^3.1.0",
"chokidar": "^3.5.3",
"madwizard": "^6.3.1",
"madwizard": "^6.3.2",
"needle": "^3.2.0",
"open": "^8.4.2",
"pretty-bytes": "^6.1.0",
Expand Down
24 changes: 5 additions & 19 deletions tests/kind/inputs/torchx/compute_world_size/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,16 @@
to ensure that the stack has been set up properly for more serious distributed training jobs.
"""

import hydra
from omegaconf import DictConfig, OmegaConf
from torch.distributed.elastic.multiprocessing.errors import record
from module.util import compute_world_size


@record
def run(cfg: DictConfig) -> None:
print(OmegaConf.to_yaml(cfg))

if cfg.main.throws:
raise RuntimeError(f"raising error because cfg.main.throws={cfg.main.throws}")
compute_world_size(cfg)
def run() -> None:
compute_world_size()


if __name__ == "__main__":
# use compose API to make this compatible with ipython notebooks
# need to initialize the config directory as a module so that it
# does not depend on rel path (PWD) or abs path (torchx install dir)
# see: https://hydra.cc/docs/advanced/jupyter_notebooks/
with hydra.initialize_config_module(
config_module="compute_world_size.config"
):
cfg: DictConfig = hydra.compose(config_name="defaults")
run(cfg)

print("SUCCEEDED") # @starpit 20230312 for testing
run()

print("SUCCEEDED") # @starpit 20230312 for testing
13 changes: 6 additions & 7 deletions tests/kind/inputs/torchx/compute_world_size/module/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@
import torch
import torch.distributed as dist
import torch.nn.functional as F
from omegaconf import DictConfig


def compute_world_size(cfg: DictConfig) -> int:
def compute_world_size() -> int:

rank = int(os.getenv("RANK", cfg.main.rank))
world_size = int(os.getenv("WORLD_SIZE", cfg.main.world_size))
master_addr = os.getenv("MASTER_ADDR", cfg.main.master_addr)
master_port = int(os.getenv("MASTER_PORT", cfg.main.master_port))
backend = cfg.main.backend
rank = int(os.getenv("RANK", 0))
world_size = int(os.getenv("WORLD_SIZE", 1))
master_addr = os.getenv("MASTER_ADDR", "localhost")
master_port = int(os.getenv("MASTER_PORT", 29500))
backend = "gloo"

print(f"initializing `{backend}` process group")
dist.init_process_group(
Expand Down
4 changes: 2 additions & 2 deletions tests/kind/profiles/non-gpu6/mcad-default
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "mcad-default",
"creationTime": 1660657756574,
"lastModifiedTime": 1678638052528,
"lastUsedTime": 1678715380516,
"lastUsedTime": 1678980273687,
"choices": {
"madwizard/apriori/use-gpu": "don't use gpus",
"madwizard/apriori/arch": "x64",
Expand All @@ -11,7 +11,7 @@
"madwizard/apriori/in-terminal": "HTML",
"ml/codeflare": "Submit a new Run",
"ml/codeflare/run": "Bring Your Own Torch Native Code",
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"bitnami/pytorch:1.13.1\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
Expand Down

0 comments on commit 632837f

Please sign in to comment.