Skip to content

Commit

Permalink
Goofys memory optimizations (#726)
Browse files Browse the repository at this point in the history
* comments

* yapf
  • Loading branch information
romilbhardwaj authored Apr 9, 2022
1 parent 564f70b commit 1aa27de
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 1 deletion.
2 changes: 1 addition & 1 deletion sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ def mount_command(self, mount_path: str) -> str:
# Install goofys if not already installed
if ! [ -x "$(command -v goofys)" ]; then
echo "Installing goofys..."
sudo wget -nc https://github.com/kahing/goofys/releases/latest/download/goofys -O /usr/local/bin/goofys
sudo wget -nc https://github.com/romilbhardwaj/goofys/releases/download/0.24.0-romilb-upstream/goofys -O /usr/local/bin/goofys
sudo chmod +x /usr/local/bin/goofys
else
echo "Goofys already installed. Proceeding..."
Expand Down
34 changes: 34 additions & 0 deletions tests/stress/mountedstorage/mount_stress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Stress test for measuring memory utilization of mounted storage
#
# Lists or reads the files in the mounted storage (depending on --list flag)
# Manually watch memory util using top to benchmark
#
# Usage:
# cd tests/stress/mountedstorage
# sky launch -c stress mount_stress.yaml

name: stress

resources:
  cloud: aws

workdir: .

file_mounts:
  /covid:
    source: s3://fah-public-data-covid19-cryptic-pockets
    mode: MOUNT

setup: |
  # Install jupyter for playing around with
  pip install --upgrade pip
  conda init bash
  # Create the env before activating it; the original activated
  # 'jupyter' before the env existed, which fails on a fresh machine.
  conda create -n jupyter python=3.9 -y
  conda activate jupyter
  pip install jupyter

run: |
  python -u read_parallel.py /covid/ --list
  # conda activate jupyter
  # jupyter notebook --port 8888
74 changes: 74 additions & 0 deletions tests/stress/mountedstorage/read_parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Read all files in a directory recursively in parallel

import os
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed


def parse_args():
    """Parse command-line arguments for the parallel reader.

    Returns:
        argparse.Namespace with:
            path: directory whose files are listed/read.
            list: True to only list files instead of reading them.
    """
    import argparse
    argparser = argparse.ArgumentParser(
        description='Recursively read in parallel')
    argparser.add_argument('path', help='Path to directory to read files;')
    argparser.add_argument('--list',
                           action='store_true',
                           help='List files before reading')
    return argparser.parse_args()


def sizeof_fmt(num, suffix="B"):
    """Format *num* bytes as a human-readable size using binary prefixes.

    Args:
        num: Byte count (may be negative).
        suffix: Unit suffix appended after the prefix (default "B").

    Returns:
        String such as "512.0B", "1.5KiB", or "1.0YiB" for values
        beyond the zebi range.
    """
    value = num
    for prefix in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefix}{suffix}"
        value /= 1024.0
    # Anything >= 1024 ZiB falls through to yobibytes.
    return f"{value:.1f}Yi{suffix}"


def read_file(file):
    """Print *file*'s path, then read its full contents into memory.

    The data is intentionally discarded — the point is to generate
    read traffic against the mounted storage for the memory benchmark.
    """
    print(file)
    with open(file, 'rb') as handle:
        handle.read()


def list_files(path):
    """Recursively list all files under *path*, counting ls operations.

    Walks the directory tree with os.walk, counting one "ls" per
    directory visited and accumulating the number of files seen.
    Prints progress every 1000 directories.

    Args:
        path: Root directory to walk.

    Returns:
        Tuple (ls_done, files_lsed): number of directories listed and
        total number of files found.
    """
    ls_done = 0
    files_lsed = 0
    for _root, _dirs, files in os.walk(path):
        ls_done += 1
        files_lsed += len(files)
        if ls_done % 1000 == 0:
            print("ls done: ", ls_done)
            print("files ls'd: ", files_lsed)
    # Report the running total. The original printed len(files) here,
    # which only counted the LAST directory visited and under-reported
    # the result; it also built a joined-path list that was never used.
    print("Got files: ", files_lsed)
    return ls_done, files_lsed


def read_parallel(path):
    """Read every file under *path* in parallel with a thread pool.

    Walks the tree directory by directory (os.walk is a generator, so
    only one directory's listing is held at a time) and reads each
    directory's files through a shared 64-worker pool. The original
    created and tore down a fresh ThreadPoolExecutor for every
    directory, paying thread startup/shutdown per directory; a single
    pool is reused here while keeping the per-directory batching.

    Args:
        path: Root directory whose files are read.
    """
    print(f"Reading files in parallel {path}")
    done = 0
    with ThreadPoolExecutor(max_workers=64) as executor:
        for root, dirs, files in os.walk(path):  # Uses a generator
            futures = [
                executor.submit(read_file, os.path.join(root, file))
                for file in files
            ]
            for future in as_completed(futures):
                done += 1
                if done % 10 == 0:
                    print("Reads done: ", done)
                    # read_file returns None; .result() also re-raises
                    # any exception from the worker thread.
                    print(future.result())


if __name__ == '__main__':
    # Entry point: either list the tree or read every file in parallel.
    # Example mount: s3://fah-public-data-covid19-cryptic-pockets at /covid
    cli_args = parse_args()
    if cli_args.list:
        num_ls, num_files = list_files(cli_args.path)
        print("ls done: ", num_ls)
        print("files ls'd: ", num_files)
    else:
        read_parallel(cli_args.path)

0 comments on commit 1aa27de

Please sign in to comment.