Skip to content

Commit

Permalink
Goofys memory optimizations (#726)
Browse files Browse the repository at this point in the history
* comments

* yapf
  • Loading branch information
romilbhardwaj authored Apr 9, 2022
1 parent 564f70b commit 1aa27de
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 1 deletion.
2 changes: 1 addition & 1 deletion sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ def mount_command(self, mount_path: str) -> str:
# Install goofys if not already installed
if ! [ -x "$(command -v goofys)" ]; then
echo "Installing goofys..."
sudo wget -nc https://github.com/kahing/goofys/releases/latest/download/goofys -O /usr/local/bin/goofys
sudo wget -nc https://github.com/romilbhardwaj/goofys/releases/download/0.24.0-romilb-upstream/goofys -O /usr/local/bin/goofys
sudo chmod +x /usr/local/bin/goofys
else
echo "Goofys already installed. Proceeding..."
Expand Down
34 changes: 34 additions & 0 deletions tests/stress/mountedstorage/mount_stress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Stress test for measuring memory utilization of mounted storage
#
# Lists or reads the files in the mounted storage (depending on --list flag)
# Manually watch memory util using top to benchmark
#
# Usage:
# cd tests/stress/mountedstorage
# sky launch -c stress mount_stress.yaml

name: stress

resources:
  cloud: aws

workdir: .

file_mounts:
  /covid:
    source: s3://fah-public-data-covid19-cryptic-pockets
    mode: MOUNT

setup: |
  # Install jupyter for playing around with
  pip install --upgrade pip
  conda init bash
  # Create the env before activating it; the original activated
  # 'jupyter' before the env existed, which fails on a fresh machine.
  conda create -n jupyter python=3.9 -y
  conda activate jupyter
  pip install jupyter

run: |
  python -u read_parallel.py /covid/ --list
  # conda activate jupyter
  # jupyter notebook --port 8888
74 changes: 74 additions & 0 deletions tests/stress/mountedstorage/read_parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Read all files in a directory recursively in parallel

import os
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed


def parse_args():
    """Parse command-line arguments for the parallel reader.

    Returns:
        argparse.Namespace with:
            path: directory whose files are listed/read.
            list: True to only list files instead of reading them.
    """
    import argparse
    argparser = argparse.ArgumentParser(
        description='Recursively read in parallel')
    argparser.add_argument('path', help='Path to directory to read files;')
    argparser.add_argument('--list',
                           action='store_true',
                           help='List files before reading')
    return argparser.parse_args()


def sizeof_fmt(num, suffix="B"):
    """Format *num* bytes as a human-readable size using binary prefixes.

    Args:
        num: Byte count (may be negative).
        suffix: Unit suffix appended after the prefix (default "B").

    Returns:
        String such as "512.0B", "1.5KiB", or "1.0YiB" for values
        beyond the zebi range.
    """
    value = num
    for prefix in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefix}{suffix}"
        value /= 1024.0
    # Anything >= 1024 ZiB falls through to yobibytes.
    return f"{value:.1f}Yi{suffix}"


def read_file(file):
    """Print *file*'s path, then read its full contents into memory.

    The data is intentionally discarded — the point is to generate
    read traffic against the mounted storage for the memory benchmark.
    """
    print(file)
    with open(file, 'rb') as handle:
        handle.read()


def list_files(path):
    """Recursively list all files under *path*, counting ls operations.

    Walks the directory tree with os.walk, counting one "ls" per
    directory visited and accumulating the number of files seen.
    Prints progress every 1000 directories.

    Args:
        path: Root directory to walk.

    Returns:
        Tuple (ls_done, files_lsed): number of directories listed and
        total number of files found.
    """
    ls_done = 0
    files_lsed = 0
    for _root, _dirs, files in os.walk(path):
        ls_done += 1
        files_lsed += len(files)
        if ls_done % 1000 == 0:
            print("ls done: ", ls_done)
            print("files ls'd: ", files_lsed)
    # Report the running total. The original printed len(files) here,
    # which only counted the LAST directory visited and under-reported
    # the result; it also built a joined-path list that was never used.
    print("Got files: ", files_lsed)
    return ls_done, files_lsed


def read_parallel(path):
    """Read every file under *path* in parallel with a thread pool.

    Walks the tree directory by directory (os.walk is a generator, so
    only one directory's listing is held at a time) and reads each
    directory's files through a shared 64-worker pool. The original
    created and tore down a fresh ThreadPoolExecutor for every
    directory, paying thread startup/shutdown per directory; a single
    pool is reused here while keeping the per-directory batching.

    Args:
        path: Root directory whose files are read.
    """
    print(f"Reading files in parallel {path}")
    done = 0
    with ThreadPoolExecutor(max_workers=64) as executor:
        for root, dirs, files in os.walk(path):  # Uses a generator
            futures = [
                executor.submit(read_file, os.path.join(root, file))
                for file in files
            ]
            for future in as_completed(futures):
                done += 1
                if done % 10 == 0:
                    print("Reads done: ", done)
                    # read_file returns None; .result() also re-raises
                    # any exception from the worker thread.
                    print(future.result())


if __name__ == '__main__':
    # Entry point: either list the tree or read every file in parallel.
    # Example mount: s3://fah-public-data-covid19-cryptic-pockets at /covid
    cli_args = parse_args()
    if cli_args.list:
        num_ls, num_files = list_files(cli_args.path)
        print("ls done: ", num_ls)
        print("files ls'd: ", num_files)
    else:
        read_parallel(cli_args.path)

0 comments on commit 1aa27de

Please sign in to comment.