fix: profile import should not overwrite existing profile by default

This PR also simplifies the torchx run to use a smaller base image, one whose version we can pin. (We had been using the torchx 0.5.0dev0 stream, which is highly unstable and was causing test stability issues.)
project-codeflare · Mar 16, 2023 · 632837f · 632837f
1 parent a4eb426
commit 632837f
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 47 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/plugins/plugin-codeflare/package.json b/plugins/plugin-codeflare/package.json
@@ -30,13 +30,13 @@
     "@types/split2": "^3.2.1"
   },
   "dependencies": {
-    "@guidebooks/store": "^6.1.4",
+    "@guidebooks/store": "^6.1.6",
     "@logdna/tail-file": "^3.0.1",
     "@patternfly/react-charts": "^6.94.18",
     "@patternfly/react-core": "^4.276.6",
     "asciinema-player": "^3.1.0",
     "chokidar": "^3.5.3",
-    "madwizard": "^6.3.1",
+    "madwizard": "^6.3.2",
     "needle": "^3.2.0",
     "open": "^8.4.2",
     "pretty-bytes": "^6.1.0",

diff --git a/tests/kind/inputs/torchx/compute_world_size/main.py b/tests/kind/inputs/torchx/compute_world_size/main.py
@@ -26,30 +26,16 @@
 to ensure that the stack has been setup properly for more serious distributed training jobs.
 """
 
-import hydra
-from omegaconf import DictConfig, OmegaConf
 from torch.distributed.elastic.multiprocessing.errors import record
 from module.util import compute_world_size
 
 
 @record
-def run(cfg: DictConfig) -> None:
-    print(OmegaConf.to_yaml(cfg))
-
-    if cfg.main.throws:
-        raise RuntimeError(f"raising error because cfg.main.throws={cfg.main.throws}")
-    compute_world_size(cfg)
+def run() -> None:
+    compute_world_size()
 
 
 if __name__ == "__main__":
-    # use compose API to make this compatible with ipython notebooks
-    # need to initialize the config directory as a module to make it
-    # not depends on rel path (PWD) or abs path (torchx install dir)
-    # see: https://hydra.cc/docs/advanced/jupyter_notebooks/
-    with hydra.initialize_config_module(
-        config_module="compute_world_size.config"
-    ):
-        cfg: DictConfig = hydra.compose(config_name="defaults")
-        run(cfg)
-
-        print("SUCCEEDED") # @starpit 20230312 for testing
+    run()
+
+    print("SUCCEEDED") # @starpit 20230312 for testing
diff --git a/tests/kind/inputs/torchx/compute_world_size/module/util.py b/tests/kind/inputs/torchx/compute_world_size/module/util.py
@@ -10,16 +10,15 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from omegaconf import DictConfig
 
 
-def compute_world_size(cfg: DictConfig) -> int:
+def compute_world_size() -> int:
 
-    rank = int(os.getenv("RANK", cfg.main.rank))
-    world_size = int(os.getenv("WORLD_SIZE", cfg.main.world_size))
-    master_addr = os.getenv("MASTER_ADDR", cfg.main.master_addr)
-    master_port = int(os.getenv("MASTER_PORT", cfg.main.master_port))
-    backend = cfg.main.backend
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    master_addr = os.getenv("MASTER_ADDR", "localhost")
+    master_port = int(os.getenv("MASTER_PORT", 29500))
+    backend = "gloo"
 
     print(f"initializing `{backend}` process group")
     dist.init_process_group(

diff --git a/tests/kind/profiles/non-gpu6/mcad-default b/tests/kind/profiles/non-gpu6/mcad-default
@@ -2,7 +2,7 @@
   "name": "mcad-default",
   "creationTime": 1660657756574,
   "lastModifiedTime": 1678638052528,
-  "lastUsedTime": 1678715380516,
+  "lastUsedTime": 1678980273687,
   "choices": {
     "madwizard/apriori/use-gpu": "don't use gpus",
     "madwizard/apriori/arch": "x64",
@@ -11,7 +11,7 @@
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
     "ml/codeflare/run": "Bring Your Own Torch Native Code",
-    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
+    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"bitnami/pytorch:1.13.1\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",
     "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",