From 2bd17bfa4de47a7a13fd65c1035311b7fc161904 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 19 May 2022 01:04:13 +0000 Subject: [PATCH 01/24] data/benchmarks --- benchmarks/README.md | 34 ++++++++++ benchmarks/requirements.txt | 4 ++ benchmarks/run.py | 28 ++++++++ benchmarks/run_benchmark.py | 124 ++++++++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/requirements.txt create mode 100644 benchmarks/run.py create mode 100644 benchmarks/run_benchmark.py diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..6e60e38af --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,34 @@ +# Install dependencies + +``` +pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113 +python setup.py develop +``` + +# Usage instructions + + +``` +usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] + [--num_epochs NUM_EPOCHS] [--report_location REPORT_LOCATION] + [--num_workers NUM_WORKERS] [--shuffle] [--dataloaderv DATALOADERV] +``` + +## Available metrics +* [x] Total time +* [x] Time per batch +* [x] Time per epoch +* [x] Precision over time +* [x] CPU Load +* [x] GPU Load +* [x] Memory usage + +## Additional profiling + +``` +pip install scalene +pip install torch-tb-profiler +``` + + +`scalene run_benchmark.py` \ No newline at end of file diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 000000000..2c59965bf --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1,4 @@ +pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113 +pip install git+https://github.com/pytorch/data@main +pip install scalene +pip install torch-tb-profiler \ No newline at end of file diff --git a/benchmarks/run.py b/benchmarks/run.py new file mode 100644 index 000000000..2aa2d3909 --- /dev/null +++ b/benchmarks/run.py @@ -0,0 +1,28 @@ +import argparse +from torchvision.prototype.datasets import load +from torch.utils.data import DataLoader + + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") +parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") +parser.add_argument("--batch_size", type=int, default=32, help="") +parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") + +args = parser.parse_args() +dataset = args.dataset +batch_size = args.batch_size + +# setup data pipe +dp = load("gtsrb", split="train") +print(f"batch size {batch_size}") +print(f"Dataset name {dp}") +print(f"Dataset length {len(dp)}") + +# Setup data loader +# Shuffle won't work in distributed +dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True) + +# Training loop +for elem in dl: + print(i) \ No newline at end of file diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py new file mode 100644 index 000000000..13c2a2e09 --- /dev/null +++ b/benchmarks/run_benchmark.py @@ -0,0 +1,124 @@ +import argparse +import torchvision +import torch +try: + import transformers +except: + pass +from torchvision.prototype.datasets import load +import torch.nn.functional as F +from torchvision import transforms +import time +from statistics import mean +import torch.optim as optim + + + +parser = 
argparse.ArgumentParser() +parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") +parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") +parser.add_argument("--batch_size", type=int, default=1, help="") +parser.add_argument("--num_epochs", type=int, default=2) +parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") +parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") +parser.add_argument("--shuffle", action="store_true") +parser.add_argument("--dataloaderv", type=int, default=1) + +args = parser.parse_args() +dataset = args.dataset +model_name = args.model_name +batch_size = args.batch_size +num_epochs = args.num_epochs +report_location = args.report_location +num_workers = args.num_workers +shuffle = args.shuffle +dataloaderv = args.dataloaderv + +if dataloaderv == 1: + from torch.utils.data import DataLoader +elif dataloaderv == 2: + from torch.utils.data.dataloader_experimental import DataLoader2 as DataLoader +else: + raise(f"dataloaderv{dataloaderv} is not a valid option") + +# Util function for multiprocessing +def init_fn(worker_id): + info = torch.utils.data.get_worker_info() + num_workers = info.num_workers + datapipe = info.dataset + torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) + +# Download model +model_map = { + "resnext50_32x4d": torchvision.models.resnext50_32x4d, + "mobilenet_v3_large" : torchvision.models.mobilenet_v3_large, + "transformerencoder" : torch.nn.TransformerEncoder, + # "bert-base" : transformers.BertModel, + +} + +model = model_map[model_name]() + +# setup data pipe +dp = load(dataset, split="train") +print(f"batch size {batch_size}") +print(f"Dataset name {dp}") +print(f"Dataset length {len(dp)}") + +# Datapipe format +print(f"data format is {next(iter(dp))}") + +# Setup data loader +if num_workers == 1: + dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=shuffle) + +# Shuffle won't work in distributed yet +else: + dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") + + +# TODO: Add measurements time per batch, per epoch and total time here + +total_start = time.time() +per_epoch_durations = [] +batch_durations = [] + +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) +for epoch in range(num_epochs): + epoch_start = time.time() + running_loss = 0 + for i, elem in enumerate(dl): + batch_start = time.time() + # Should image preprocessing be done online or offline? + # This is all image specific, need to refactor this out or create a training loop per model/dataset combo + input_image = torch.unsqueeze(elem["image"], 0) + input_image = transforms.Resize(size=(96,98))(input_image) + input_image = input_image.reshape(64,3,7,7) / 255 + + labels = elem["label"] + optimizer.zero_grad() + + outputs = model(input_image) + + # TODO: ValueError: Expected input batch_size (64) to match target batch_size (1). 
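+        # The reshape above forces a batch dimension of 64 onto the input while
+        # `labels` still comes from a single sample, so the criterion call below
+        # sees 64 predictions against 1 target, which is the mismatch in the TODO.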
+ loss = criterion(outputs,labels) + loss.backward() + optimizer.step() + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + running_loss = 0.0 + + batch_end = time.time() + batch_duration = batch_end - batch_start + batch_durations.append(batch_duration) + epoch_end = time.time() + epoch_duration = epoch_end - epoch_start + per_epoch_durations.append(epoch_duration) +total_end = time.time() +total_duration = total_end - total_start + +print(f"Total duration is {total_duration}") +print(f"Per epoch duration {mean(per_epoch_durations)}") +print(f"Per batch duration {mean(batch_durations)}") \ No newline at end of file From 4e8f41cbf8b781609fcdfa0c6477751e80f65a4c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 19 May 2022 01:06:23 +0000 Subject: [PATCH 02/24] removed extra files --- benchmarks/requirements.txt | 4 ---- benchmarks/run.py | 28 ---------------------------- 2 files changed, 32 deletions(-) delete mode 100644 benchmarks/requirements.txt delete mode 100644 benchmarks/run.py diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt deleted file mode 100644 index 2c59965bf..000000000 --- a/benchmarks/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113 -pip install git+https://github.com/pytorch/data@main -pip install scalene -pip install torch-tb-profiler \ No newline at end of file diff --git a/benchmarks/run.py b/benchmarks/run.py deleted file mode 100644 index 2aa2d3909..000000000 --- a/benchmarks/run.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -from torchvision.prototype.datasets import load -from torch.utils.data import DataLoader - - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") -parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") -parser.add_argument("--batch_size", type=int, default=32, help="") -parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") - -args = parser.parse_args() -dataset = args.dataset -batch_size = args.batch_size - -# setup data pipe -dp = load("gtsrb", split="train") -print(f"batch size {batch_size}") -print(f"Dataset name {dp}") -print(f"Dataset length {len(dp)}") - -# Setup data loader -# Shuffle won't work in distributed -dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True) - -# Training loop -for elem in dl: - print(i) \ No newline at end of file From 0818eb6afcaba9149c84e7c0b90b669158cc7dc8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 19 May 2022 01:45:06 +0000 Subject: [PATCH 03/24] nananannana --- benchmarks/run_benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 13c2a2e09..cbdb6780b 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -97,18 +97,21 @@ def init_fn(worker_id): input_image = input_image.reshape(64,3,7,7) / 255 labels = elem["label"] + + # TODO: remove this is wrong + labels = labels.repeat(64) optimizer.zero_grad() outputs = model(input_image) - - # TODO: ValueError: Expected input batch_size (64) to match target batch_size (1). + + # ValueError: Expected input batch_size (64) to match target batch_size (1). 
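+        # Repeating the single label 64 times makes the target shape match the 64
+        # reshaped inputs so the loss can be computed, but every pseudo-sample now
+        # trains against the same class, which is why the TODO above flags it as wrong.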
loss = criterion(outputs,labels) loss.backward() optimizer.step() running_loss += loss.item() - if i % 2000 == 1999: # print every 2000 mini-batches - print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') - running_loss = 0.0 + # if i % 2000 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + running_loss = 0.0 batch_end = time.time() batch_duration = batch_end - batch_start From 590a20bae4facc96e8cc33cc27c07f38947df300 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 19 May 2022 01:49:45 +0000 Subject: [PATCH 04/24] added gpu support --- benchmarks/run_benchmark.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index cbdb6780b..d8bafd755 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -18,6 +18,7 @@ parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") parser.add_argument("--batch_size", type=int, default=1, help="") +parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") parser.add_argument("--num_epochs", type=int, default=2) parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") @@ -57,7 +58,7 @@ def init_fn(worker_id): } -model = model_map[model_name]() +model = model_map[model_name]().to(torch.device("cuda:0")) # setup data pipe dp = load(dataset, split="train") @@ -92,11 +93,11 @@ def init_fn(worker_id): batch_start = time.time() # Should image preprocessing be done online or offline? 
# This is all image specific, need to refactor this out or create a training loop per model/dataset combo - input_image = torch.unsqueeze(elem["image"], 0) + input_image = torch.unsqueeze(elem["image"], 0).to(torch.device("cuda:0")) input_image = transforms.Resize(size=(96,98))(input_image) input_image = input_image.reshape(64,3,7,7) / 255 - labels = elem["label"] + labels = elem["label"].to(torch.device("cuda:0")) # TODO: remove this is wrong labels = labels.repeat(64) From 411167bbca3b800b3f54d37674e8751e40a80e29 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 19 May 2022 18:22:26 +0000 Subject: [PATCH 05/24] [skip ci] enable profiler --- benchmarks/run_benchmark.py | 91 ++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 37 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index d8bafd755..c373afb8d 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -11,9 +11,10 @@ import time from statistics import mean import torch.optim as optim +from torch.profiler import profile, record_function, ProfilerActivity - +## Arg parsing parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") @@ -49,6 +50,12 @@ def init_fn(worker_id): datapipe = info.dataset torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) +def trace_handler(p): + output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) + print(output) + p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") + + # Download model model_map = { "resnext50_32x4d": torchvision.models.resnext50_32x4d, @@ -86,42 +93,52 @@ def init_fn(worker_id): criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) -for epoch in range(num_epochs): - epoch_start = time.time() - running_loss = 0 - for i, elem in enumerate(dl): - batch_start = time.time() - # Should image preprocessing be done online or offline? - # This is all image specific, need to refactor this out or create a training loop per model/dataset combo - input_image = torch.unsqueeze(elem["image"], 0).to(torch.device("cuda:0")) - input_image = transforms.Resize(size=(96,98))(input_image) - input_image = input_image.reshape(64,3,7,7) / 255 - - labels = elem["label"].to(torch.device("cuda:0")) - - # TODO: remove this is wrong - labels = labels.repeat(64) - optimizer.zero_grad() - - outputs = model(input_image) - - # ValueError: Expected input batch_size (64) to match target batch_size (1). 
- loss = criterion(outputs,labels) - loss.backward() - optimizer.step() - running_loss += loss.item() - # if i % 2000 == 1999: # print every 2000 mini-batches - print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') - running_loss = 0.0 - - batch_end = time.time() - batch_duration = batch_end - batch_start - batch_durations.append(batch_duration) - epoch_end = time.time() - epoch_duration = epoch_end - epoch_start - per_epoch_durations.append(epoch_duration) -total_end = time.time() -total_duration = total_end - total_start + +with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=1, + warmup=1, + active=2), + on_trace_ready=trace_handler +) as p: + + for epoch in range(num_epochs): + epoch_start = time.time() + running_loss = 0 + for i, elem in enumerate(dl): + batch_start = time.time() + # Should image preprocessing be done online or offline? + # This is all image specific, need to refactor this out or create a training loop per model/dataset combo + input_image = torch.unsqueeze(elem["image"], 0).to(torch.device("cuda:0")) + input_image = transforms.Resize(size=(96,98))(input_image) + input_image = input_image.reshape(64,3,7,7) / 255 + + labels = elem["label"].to(torch.device("cuda:0")) + + # TODO: remove this is wrong + labels = labels.repeat(64) + optimizer.zero_grad() + + outputs = model(input_image) + + # ValueError: Expected input batch_size (64) to match target batch_size (1). + loss = criterion(outputs,labels) + loss.backward() + optimizer.step() + running_loss += loss.item() + # if i % 2000 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + running_loss = 0.0 + + batch_end = time.time() + batch_duration = batch_end - batch_start + batch_durations.append(batch_duration) + epoch_end = time.time() + epoch_duration = epoch_end - epoch_start + per_epoch_durations.append(epoch_duration) + total_end = time.time() + total_duration = total_end - total_start print(f"Total duration is {total_duration}") print(f"Per epoch duration {mean(per_epoch_durations)}") From 4fffddf50ac0b9864c16ed31e02eaa5b20962672 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 19:57:28 +0000 Subject: [PATCH 06/24] fixed preprocessing pipeline --- benchmarks/run_benchmark.py | 64 ++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index c373afb8d..b638104a4 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,10 +1,9 @@ import argparse +import sys import torchvision import torch -try: - import transformers -except: - pass +import transformers + from torchvision.prototype.datasets import load import torch.nn.functional as F from torchvision import transforms @@ -13,7 +12,6 @@ import torch.optim as optim from torch.profiler import profile, record_function, ProfilerActivity - ## Arg parsing parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") @@ -30,6 +28,7 @@ dataset = args.dataset model_name = args.model_name batch_size = args.batch_size +device = args.device num_epochs = args.num_epochs report_location = args.report_location num_workers = args.num_workers @@ -61,20 +60,45 @@ def trace_handler(p): "resnext50_32x4d": torchvision.models.resnext50_32x4d, "mobilenet_v3_large" : torchvision.models.mobilenet_v3_large, "transformerencoder" : 
torch.nn.TransformerEncoder, - # "bert-base" : transformers.BertModel, + "bert-base" : transformers.BertModel, } -model = model_map[model_name]().to(torch.device("cuda:0")) +model = model_map[model_name]().to(torch.device(device)) # setup data pipe -dp = load(dataset, split="train") +if model_name in ["resnext50_32x4d", "mobilenet_v3_large"]: + dp = load(dataset, split="train") + +else: + print(f"{model} not supported yet") + print(f"batch size {batch_size}") print(f"Dataset name {dp}") print(f"Dataset length {len(dp)}") # Datapipe format -print(f"data format is {next(iter(dp))}") +print(f"data format before preprocessing is {next(iter(dp))}") + +if dataset == "gtsrb": + def transform(img): + t= transforms.Compose([ + transforms.ToPILImage(), + transforms.Resize(size=(96,98)), + # transforms.reshape(64,3,7,7), + transforms.ToTensor()] + ) + return t(img) + + dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) + dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") + dp = dp.map(lambda sample : sample.to_categories(), input_col="label") + dp = dp.batch(batch_size) + +# dp_batches.map(lambda batch : {"images" : [sample["image"]]}) + +# Datapipe format after preprocessing +print(f"data format after preprocessing is \n {next(iter(dp))}\n") # Setup data loader if num_workers == 1: @@ -85,8 +109,6 @@ def trace_handler(p): dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") -# TODO: Add measurements time per batch, per epoch and total time here - total_start = time.time() per_epoch_durations = [] batch_durations = [] @@ -108,27 +130,17 @@ def trace_handler(p): running_loss = 0 for i, elem in enumerate(dl): batch_start = time.time() - # Should image preprocessing be done online or offline? - # This is all image specific, need to refactor this out or create a training loop per model/dataset combo - input_image = torch.unsqueeze(elem["image"], 0).to(torch.device("cuda:0")) - input_image = transforms.Resize(size=(96,98))(input_image) - input_image = input_image.reshape(64,3,7,7) / 255 - - labels = elem["label"].to(torch.device("cuda:0")) - # TODO: remove this is wrong - labels = labels.repeat(64) + labels = elem["label"].to(torch.device("cuda:0")) optimizer.zero_grad() - - outputs = model(input_image) - - # ValueError: Expected input batch_size (64) to match target batch_size (1). 
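+            # With a schedule of wait=1/warmup=1/active=2, the profiler only moves
+            # through its phases when p.step() is called once per iteration; this loop
+            # does not call it yet, so trace_handler never fires (a later commit in
+            # this series adds p.step() inside the loop).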
+ outputs = model(elem["image"]) loss = criterion(outputs,labels) loss.backward() optimizer.step() + running_loss += loss.item() - # if i % 2000 == 1999: # print every 2000 mini-batches - print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + if i % 200 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') running_loss = 0.0 batch_end = time.time() From 07739b21d306b00d18f648cded250d006057567b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 19:57:44 +0000 Subject: [PATCH 07/24] [skip ci] From fea3322bf40c62e4b8fc24cb6f0437eae72f6aed Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 20:07:02 +0000 Subject: [PATCH 08/24] update --- benchmarks/run_benchmark.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index b638104a4..66973896e 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -88,15 +88,20 @@ def transform(img): # transforms.reshape(64,3,7,7), transforms.ToTensor()] ) - return t(img) + return t(img).to(torch.device(device)) + # Filter out bounding box and path to image dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) + + # Apply image preprocessing dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") dp = dp.map(lambda sample : sample.to_categories(), input_col="label") + + # TODO: Missing a collation + + # Batch dp = dp.batch(batch_size) -# dp_batches.map(lambda batch : {"images" : [sample["image"]]}) - # Datapipe format after preprocessing print(f"data format after preprocessing is \n {next(iter(dp))}\n") @@ -131,7 +136,7 @@ def transform(img): for i, elem in enumerate(dl): batch_start = time.time() - labels = elem["label"].to(torch.device("cuda:0")) + labels = elem["label"].to(torch.device(device)) optimizer.zero_grad() outputs = model(elem["image"]) loss = criterion(outputs,labels) From aec3a6fcfdf768c264c5b03e70da65f7a1092471 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 21:10:15 +0000 Subject: [PATCH 09/24] [skip ci] lunch --- benchmarks/run_benchmark.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 66973896e..a9adaee47 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -90,12 +90,18 @@ def transform(img): ) return t(img).to(torch.device(device)) + def str_to_list(str): + l = [] + for char in str: + l.append(int(char)) + return l + # Filter out bounding box and path to image dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) # Apply image preprocessing dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") - dp = dp.map(lambda sample : sample.to_categories(), input_col="label") + dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") # TODO: Missing a collation @@ -135,10 +141,10 @@ def transform(img): running_loss = 0 for i, elem in enumerate(dl): batch_start = time.time() - - labels = elem["label"].to(torch.device(device)) + # print(f"elem is {elem}") + labels = elem[0]["label"] optimizer.zero_grad() - outputs = model(elem["image"]) + outputs = model(elem[0]["image"]) loss = criterion(outputs,labels) loss.backward() optimizer.step() From 621ca0b941339c05f7ef226a03fca04c57a87e32 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 
25 May 2022 22:18:45 +0000 Subject: [PATCH 10/24] [skip ci] it runs --- benchmarks/run_benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index a9adaee47..40153dce2 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -78,7 +78,7 @@ def trace_handler(p): print(f"Dataset length {len(dp)}") # Datapipe format -print(f"data format before preprocessing is {next(iter(dp))}") +# print(f"data format before preprocessing is {next(iter(dp))}") if dataset == "gtsrb": def transform(img): @@ -109,7 +109,7 @@ def str_to_list(str): dp = dp.batch(batch_size) # Datapipe format after preprocessing -print(f"data format after preprocessing is \n {next(iter(dp))}\n") +# print(f"data format after preprocessing is \n {next(iter(dp))}\n") # Setup data loader if num_workers == 1: @@ -141,8 +141,8 @@ def str_to_list(str): running_loss = 0 for i, elem in enumerate(dl): batch_start = time.time() - # print(f"elem is {elem}") - labels = elem[0]["label"] + + labels = torch.argmax(elem[0]["label"], dim=1) optimizer.zero_grad() outputs = model(elem[0]["image"]) loss = criterion(outputs,labels) From 1c06bae6d851175c28543e216aa5b66d1dd562ef Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 22:55:08 +0000 Subject: [PATCH 11/24] [skip ci] fix profile settings --- benchmarks/run_benchmark.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 40153dce2..b6a9a49a6 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -18,7 +18,7 @@ parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") parser.add_argument("--batch_size", type=int, default=1, help="") parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") -parser.add_argument("--num_epochs", type=int, default=2) +parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") parser.add_argument("--shuffle", action="store_true") @@ -49,10 +49,10 @@ def init_fn(worker_id): datapipe = info.dataset torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) -def trace_handler(p): - output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) - print(output) - p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") +# def trace_handler(p): +# output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) +# print(output) +# p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") # Download model @@ -129,11 +129,12 @@ def str_to_list(str): with profile( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - schedule=torch.profiler.schedule( - wait=1, - warmup=1, - active=2), - on_trace_ready=trace_handler + on_trace_ready=torch.profiler.tensorboard_trace_handler, + record_shapes=True, + profile_memory=True, + with_flops=True, + with_stack=True, + with_modules=True ) as p: for epoch in range(num_epochs): @@ -150,8 +151,8 @@ def str_to_list(str): optimizer.step() running_loss += loss.item() - if i % 200 == 1999: # print every 2000 mini-batches - print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + # if i % 200 == 1999: # 
print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.10f}') running_loss = 0.0 batch_end = time.time() From 9001b005c2c2106cc2fb345c4a9fedd2ce0da1b7 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 25 May 2022 23:35:17 +0000 Subject: [PATCH 12/24] [skip ci] added logging and fixed profiler --- benchmarks/run_benchmark.py | 41 +++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index b6a9a49a6..2c2da5a6a 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,16 +1,20 @@ import argparse import sys +import logging + import torchvision import torch import transformers - from torchvision.prototype.datasets import load import torch.nn.functional as F from torchvision import transforms import time from statistics import mean import torch.optim as optim -from torch.profiler import profile, record_function, ProfilerActivity +import torch.profiler + +logging.basicConfig(filename='example.log', level=logging.DEBUG) + ## Arg parsing parser = argparse.ArgumentParser() @@ -49,12 +53,6 @@ def init_fn(worker_id): datapipe = info.dataset torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) -# def trace_handler(p): -# output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) -# print(output) -# p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") - - # Download model model_map = { "resnext50_32x4d": torchvision.models.resnext50_32x4d, @@ -78,7 +76,7 @@ def init_fn(worker_id): print(f"Dataset length {len(dp)}") # Datapipe format -# print(f"data format before preprocessing is {next(iter(dp))}") +logging.debug(f"data format before preprocessing is {next(iter(dp))}") if dataset == "gtsrb": def transform(img): @@ -103,13 +101,11 @@ def str_to_list(str): dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") - # TODO: Missing a collation - # Batch dp = dp.batch(batch_size) # Datapipe format after preprocessing -# print(f"data format after preprocessing is \n {next(iter(dp))}\n") +logging.debug(f"data format after preprocessing is \n {next(iter(dp))}\n") # Setup data loader if num_workers == 1: @@ -127,9 +123,10 @@ def str_to_list(str): criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) -with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - on_trace_ready=torch.profiler.tensorboard_trace_handler, +with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], + on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='datapipe0'), + schedule=torch.profiler.schedule(wait=1,warmup=1,active=2), record_shapes=True, profile_memory=True, with_flops=True, @@ -141,6 +138,8 @@ def str_to_list(str): epoch_start = time.time() running_loss = 0 for i, elem in enumerate(dl): + p.step() + batch_start = time.time() labels = torch.argmax(elem[0]["label"], dim=1) @@ -158,12 +157,18 @@ def str_to_list(str): batch_end = time.time() batch_duration = batch_end - batch_start batch_durations.append(batch_duration) + p.step() + p.step() epoch_end = time.time() epoch_duration = epoch_end - epoch_start per_epoch_durations.append(epoch_duration) total_end = time.time() total_duration = total_end - 
total_start -print(f"Total duration is {total_duration}") -print(f"Per epoch duration {mean(per_epoch_durations)}") -print(f"Per batch duration {mean(batch_durations)}") \ No newline at end of file +# TODO: Make this output some human readable markdown file +def create_report(per_epoch_durations, batch_durations, total_duration): + print(f"Total duration is {total_duration}") + print(f"Per epoch duration {mean(per_epoch_durations)}") + print(f"Per batch duration {mean(batch_durations)}") + +create_report(per_epoch_durations, batch_durations, total_duration) \ No newline at end of file From 95d48aedd444dfc69b2532be8303a750e9cb9ff8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 26 May 2022 00:24:54 +0000 Subject: [PATCH 13/24] update --- benchmarks/README.md | 15 +++++++++------ benchmarks/run_benchmark.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 6e60e38af..ad5ab431e 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,7 +1,7 @@ # Install dependencies ``` -pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113 +pip3 install --pre torch torchvision torchaudio torchtext --extra-index-url https://download.pytorch.org/whl/nightly/cu113 python setup.py develop ``` @@ -9,9 +9,8 @@ python setup.py develop ``` -usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] - [--num_epochs NUM_EPOCHS] [--report_location REPORT_LOCATION] - [--num_workers NUM_WORKERS] [--shuffle] [--dataloaderv DATALOADERV] +usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] [--device DEVICE] [--num_epochs NUM_EPOCHS] + [--report_location REPORT_LOCATION] [--num_workers NUM_WORKERS] [--shuffle] [--dataloaderv DATALOADERV] ``` ## Available metrics @@ -22,13 +21,17 @@ usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--ba * [x] CPU Load * [x] GPU Load * [x] Memory usage +* [x] PyTorch profiler ## Additional profiling ``` pip install scalene -pip install torch-tb-profiler ``` +`scalene run_benchmark.py` -`scalene run_benchmark.py` \ No newline at end of file +## Other benchmarks in the wild +* https://github.com/pytorch/kineto/blob/main/tb_plugin/examples/datapipe_example.py +* https://github.com/pytorch/text/tree/main/test/datasets +* https://github.com/pytorch/vision/tree/main/torchvision/prototype/datasets \ No newline at end of file diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 2c2da5a6a..aaa941efe 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -82,7 +82,7 @@ def init_fn(worker_id): def transform(img): t= transforms.Compose([ transforms.ToPILImage(), - transforms.Resize(size=(96,98)), + transforms.Resize(size=(100,100)), # transforms.reshape(64,3,7,7), transforms.ToTensor()] ) From 4b8da09bf516d093ee55cf8fd8bc388438ae5df0 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 26 May 2022 00:25:37 +0000 Subject: [PATCH 14/24] [skip ci] push --- benchmarks/run_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index aaa941efe..e943c64dc 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -83,7 +83,7 @@ def transform(img): t= transforms.Compose([ transforms.ToPILImage(), transforms.Resize(size=(100,100)), - # transforms.reshape(64,3,7,7), + # transforms.reshape(64,3,7,7), TODO: 
Figure out collation transforms.ToTensor()] ) return t(img).to(torch.device(device)) From 58101fe0a313f9fb0263dbcb2d3fcb053ff9328d Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 31 May 2022 22:47:50 +0000 Subject: [PATCH 15/24] [skip ci] added nvidia smi and lscpu --- benchmarks/run_benchmark.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index e943c64dc..cbe207973 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,6 +1,7 @@ import argparse import sys import logging +import subprocess import torchvision import torch @@ -71,6 +72,13 @@ def init_fn(worker_id): else: print(f"{model} not supported yet") +if device.startswith("cuda"): + nvidiasmi = subprocess.check_output("nvidia-smi", shell=True, text=True) + print(nvidiasmi) + +lscpu = subprocess.check_output("lscpu", shell=True, text=True) +print(lscpu) + print(f"batch size {batch_size}") print(f"Dataset name {dp}") print(f"Dataset length {len(dp)}") @@ -138,8 +146,6 @@ def str_to_list(str): epoch_start = time.time() running_loss = 0 for i, elem in enumerate(dl): - p.step() - batch_start = time.time() labels = torch.argmax(elem[0]["label"], dim=1) @@ -158,10 +164,11 @@ def str_to_list(str): batch_duration = batch_end - batch_start batch_durations.append(batch_duration) p.step() - p.step() + epoch_end = time.time() epoch_duration = epoch_end - epoch_start per_epoch_durations.append(epoch_duration) + total_end = time.time() total_duration = total_end - total_start From 18d6a2e52762dccaae11ecb2231cf041eee906b1 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 1 Jun 2022 00:16:10 +0000 Subject: [PATCH 16/24] [ski ci] refactor --- benchmarks/__init__.py | 0 benchmarks/args.py | 26 +++++++++ benchmarks/datasets.py | 29 ++++++++++ benchmarks/report.py | 7 +++ benchmarks/run_benchmark.py | 106 +++++++----------------------------- benchmarks/trainers.py | 30 ++++++++++ benchmarks/utils.py | 6 ++ 7 files changed, 117 insertions(+), 87 deletions(-) create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/args.py create mode 100644 benchmarks/datasets.py create mode 100644 benchmarks/report.py create mode 100644 benchmarks/trainers.py create mode 100644 benchmarks/utils.py diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/args.py b/benchmarks/args.py new file mode 100644 index 000000000..a399f6572 --- /dev/null +++ b/benchmarks/args.py @@ -0,0 +1,26 @@ +import argparse + +## Arg parsing +def arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") + parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") + parser.add_argument("--batch_size", type=int, default=1, help="") + parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") + parser.add_argument("--num_epochs", type=int, default=1) + parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") + parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") + parser.add_argument("--shuffle", action="store_true") + parser.add_argument("--dataloaderv", type=int, default=1) + + args = parser.parse_args() + dataset = args.dataset + model_name = args.model_name + batch_size = args.batch_size 
+ device = args.device + num_epochs = args.num_epochs + report_location = args.report_location + num_workers = args.num_workers + shuffle = args.shuffle + dataloaderv = args.dataloaderv + return dataset,model_name,batch_size,device,num_epochs,num_workers,shuffle,dataloaderv \ No newline at end of file diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py new file mode 100644 index 000000000..1ea50bc5b --- /dev/null +++ b/benchmarks/datasets.py @@ -0,0 +1,29 @@ +from torchvision import transforms +import torch + +def prepare_gtsrb(batch_size, device, dp): + def transform(img): + t= transforms.Compose([ + transforms.ToPILImage(), + transforms.Resize(size=(100,100)), + # transforms.reshape(64,3,7,7), TODO: Figure out collation + transforms.ToTensor()] + ) + return t(img).to(torch.device(device)) + + def str_to_list(str): + l = [] + for char in str: + l.append(int(char)) + return l + + # Filter out bounding box and path to image + dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) + + # Apply image preprocessing + dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") + dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") + + # Batch + dp = dp.batch(batch_size) + return dp \ No newline at end of file diff --git a/benchmarks/report.py b/benchmarks/report.py new file mode 100644 index 000000000..a5bbf8d91 --- /dev/null +++ b/benchmarks/report.py @@ -0,0 +1,7 @@ +from statistics import mean + + +def create_report(per_epoch_durations, batch_durations, total_duration): + print(f"Total duration is {total_duration}") + print(f"Per epoch duration {mean(per_epoch_durations)}") + print(f"Per batch duration {mean(batch_durations)}") diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index cbe207973..cd62dec67 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,4 +1,3 @@ -import argparse import sys import logging import subprocess @@ -10,35 +9,20 @@ import torch.nn.functional as F from torchvision import transforms import time -from statistics import mean import torch.optim as optim import torch.profiler +# Relative imports +from args import arg_parser +from utils import init_fn +from datasets import prepare_gtsrb +from trainers import train +from report import create_report + logging.basicConfig(filename='example.log', level=logging.DEBUG) -## Arg parsing -parser = argparse.ArgumentParser() -parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") -parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") -parser.add_argument("--batch_size", type=int, default=1, help="") -parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") -parser.add_argument("--num_epochs", type=int, default=1) -parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") -parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") -parser.add_argument("--shuffle", action="store_true") -parser.add_argument("--dataloaderv", type=int, default=1) - -args = parser.parse_args() -dataset = args.dataset -model_name = args.model_name -batch_size = args.batch_size -device = args.device -num_epochs = args.num_epochs -report_location = args.report_location -num_workers = args.num_workers -shuffle = args.shuffle -dataloaderv = 
args.dataloaderv +dataset, model_name, batch_size, device, num_epochs, num_workers, shuffle, dataloaderv = arg_parser() if dataloaderv == 1: from torch.utils.data import DataLoader @@ -47,12 +31,7 @@ else: raise(f"dataloaderv{dataloaderv} is not a valid option") -# Util function for multiprocessing -def init_fn(worker_id): - info = torch.utils.data.get_worker_info() - num_workers = info.num_workers - datapipe = info.dataset - torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) + # Download model model_map = { @@ -87,30 +66,7 @@ def init_fn(worker_id): logging.debug(f"data format before preprocessing is {next(iter(dp))}") if dataset == "gtsrb": - def transform(img): - t= transforms.Compose([ - transforms.ToPILImage(), - transforms.Resize(size=(100,100)), - # transforms.reshape(64,3,7,7), TODO: Figure out collation - transforms.ToTensor()] - ) - return t(img).to(torch.device(device)) - - def str_to_list(str): - l = [] - for char in str: - l.append(int(char)) - return l - - # Filter out bounding box and path to image - dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) - - # Apply image preprocessing - dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") - dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") - - # Batch - dp = dp.batch(batch_size) + dp = prepare_gtsrb(batch_size, device, dp) # Datapipe format after preprocessing logging.debug(f"data format after preprocessing is \n {next(iter(dp))}\n") @@ -123,13 +79,16 @@ def str_to_list(str): else: dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + + total_start = time.time() per_epoch_durations = [] batch_durations = [] -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + with torch.profiler.profile( activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], @@ -142,40 +101,13 @@ def str_to_list(str): with_modules=True ) as p: - for epoch in range(num_epochs): - epoch_start = time.time() - running_loss = 0 - for i, elem in enumerate(dl): - batch_start = time.time() - - labels = torch.argmax(elem[0]["label"], dim=1) - optimizer.zero_grad() - outputs = model(elem[0]["image"]) - loss = criterion(outputs,labels) - loss.backward() - optimizer.step() - - running_loss += loss.item() - # if i % 200 == 1999: # print every 2000 mini-batches - print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.10f}') - running_loss = 0.0 - - batch_end = time.time() - batch_duration = batch_end - batch_start - batch_durations.append(batch_duration) - p.step() - - epoch_end = time.time() - epoch_duration = epoch_end - epoch_start - per_epoch_durations.append(epoch_duration) + train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion, optimizer, p) total_end = time.time() total_duration = total_end - total_start # TODO: Make this output some human readable markdown file -def create_report(per_epoch_durations, batch_durations, total_duration): - print(f"Total duration is {total_duration}") - print(f"Per epoch duration {mean(per_epoch_durations)}") - print(f"Per batch duration {mean(batch_durations)}") -create_report(per_epoch_durations, batch_durations, total_duration) \ No newline at end of 
file +create_report(per_epoch_durations, batch_durations, total_duration) + + diff --git a/benchmarks/trainers.py b/benchmarks/trainers.py new file mode 100644 index 000000000..674faf7c4 --- /dev/null +++ b/benchmarks/trainers.py @@ -0,0 +1,30 @@ +import time +import torch + +def train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion, optimizer, p): + for epoch in range(num_epochs): + epoch_start = time.time() + running_loss = 0 + for i, elem in enumerate(dl): + batch_start = time.time() + + labels = torch.argmax(elem[0]["label"], dim=1) + optimizer.zero_grad() + outputs = model(elem[0]["image"]) + loss = criterion(outputs,labels) + loss.backward() + optimizer.step() + + running_loss += loss.item() + # if i % 200 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.10f}') + running_loss = 0.0 + + batch_end = time.time() + batch_duration = batch_end - batch_start + batch_durations.append(batch_duration) + p.step() + + epoch_end = time.time() + epoch_duration = epoch_end - epoch_start + per_epoch_durations.append(epoch_duration) \ No newline at end of file diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 000000000..f149b3ce1 --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,6 @@ +# Util function for multiprocessing +def init_fn(worker_id): + info = torch.utils.data.get_worker_info() + num_workers = info.num_workers + datapipe = info.dataset + torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) \ No newline at end of file From e6638c7dce93ec0649bf4655d81a6dc99be787ca Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 1 Jun 2022 00:27:03 +0000 Subject: [PATCH 17/24] [ski ci] refactor --- benchmarks/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index f149b3ce1..d53ec796a 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,4 +1,6 @@ # Util function for multiprocessing +import torch + def init_fn(worker_id): info = torch.utils.data.get_worker_info() num_workers = info.num_workers From 6cae2a3559b7623a0aded8f3de19825ac11a2a92 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 1 Jun 2022 00:46:45 +0000 Subject: [PATCH 18/24] [skip ci] collation works --- benchmarks/README.md | 1 + benchmarks/datasets.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index ad5ab431e..a098fd2a9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,6 +14,7 @@ usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--ba ``` ## Available metrics +* [ ] PyTorch profiler * [x] Total time * [x] Time per batch * [x] Time per epoch diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py index 1ea50bc5b..7b6fb78d7 100644 --- a/benchmarks/datasets.py +++ b/benchmarks/datasets.py @@ -6,7 +6,6 @@ def transform(img): t= transforms.Compose([ transforms.ToPILImage(), transforms.Resize(size=(100,100)), - # transforms.reshape(64,3,7,7), TODO: Figure out collation transforms.ToTensor()] ) return t(img).to(torch.device(device)) From cfa49aa161403402d6b9b6a68fe1427ae2c340ec Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 1 Jun 2022 01:51:32 +0000 Subject: [PATCH 19/24] [skip ci] push --- benchmarks/args.py | 5 +++- benchmarks/datasets.py | 32 ++++++++++----------- benchmarks/report.py | 1 - benchmarks/run_benchmark.py | 55 ++++++++++++++++++++----------------- benchmarks/trainers.py | 14 ++++++---- requirements.txt 
| 1 + 6 files changed, 60 insertions(+), 48 deletions(-) diff --git a/benchmarks/args.py b/benchmarks/args.py index a399f6572..6576a78d4 100644 --- a/benchmarks/args.py +++ b/benchmarks/args.py @@ -4,6 +4,7 @@ def arg_parser(): parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") + parser.add_argument("--ispipe", action="store_true", help="is datapipe or dataset?") parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") parser.add_argument("--batch_size", type=int, default=1, help="") parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") @@ -14,7 +15,9 @@ def arg_parser(): parser.add_argument("--dataloaderv", type=int, default=1) args = parser.parse_args() + print(args) dataset = args.dataset + ispipe = args.ispipe model_name = args.model_name batch_size = args.batch_size device = args.device @@ -23,4 +26,4 @@ def arg_parser(): num_workers = args.num_workers shuffle = args.shuffle dataloaderv = args.dataloaderv - return dataset,model_name,batch_size,device,num_epochs,num_workers,shuffle,dataloaderv \ No newline at end of file + return dataset, ispipe, model_name,batch_size,device,num_epochs,num_workers,shuffle,dataloaderv \ No newline at end of file diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py index 7b6fb78d7..8dc93cc18 100644 --- a/benchmarks/datasets.py +++ b/benchmarks/datasets.py @@ -1,28 +1,28 @@ -from torchvision import transforms +from torchvision import transforms, datasets import torch -def prepare_gtsrb(batch_size, device, dp): - def transform(img): - t= transforms.Compose([ - transforms.ToPILImage(), - transforms.Resize(size=(100,100)), - transforms.ToTensor()] - ) - return t(img).to(torch.device(device)) +def transform(img): + t= transforms.Compose([ + transforms.ToPILImage(), + transforms.Resize(size=(100,100)), + transforms.ToTensor()] + ) + return t(img) - def str_to_list(str): - l = [] - for char in str: - l.append(int(char)) - return l +def str_to_list(str): + l = [] + for char in str: + l.append(int(char)) + return l +def prepare_gtsrb_datapipe(batch_size, device, dp): # Filter out bounding box and path to image dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) # Apply image preprocessing - dp = dp.map(lambda sample : transform(sample.decode()), input_col="image") + dp = dp.map(lambda sample : transform(sample.decode().to(torch.device(device))), input_col="image") dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") # Batch dp = dp.batch(batch_size) - return dp \ No newline at end of file + return dp \ No newline at end of file diff --git a/benchmarks/report.py b/benchmarks/report.py index a5bbf8d91..5d2378e69 100644 --- a/benchmarks/report.py +++ b/benchmarks/report.py @@ -1,6 +1,5 @@ from statistics import mean - def create_report(per_epoch_durations, batch_durations, total_duration): print(f"Total duration is {total_duration}") print(f"Per epoch duration {mean(per_epoch_durations)}") diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index cd62dec67..eeece0c2c 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -5,7 +5,7 @@ import torchvision import torch import transformers -from torchvision.prototype.datasets import load +from torchvision.prototype.datasets import load as loadpipe import torch.nn.functional as F from torchvision import 
transforms import time @@ -14,15 +14,24 @@ # Relative imports from args import arg_parser +from benchmarks.datasets import prepare_gtsrb_dataset from utils import init_fn -from datasets import prepare_gtsrb +from datasets import prepare_gtsrb_datapipe from trainers import train from report import create_report logging.basicConfig(filename='example.log', level=logging.DEBUG) -dataset, model_name, batch_size, device, num_epochs, num_workers, shuffle, dataloaderv = arg_parser() +dataset, ispipe, model_name, batch_size, device, num_epochs, num_workers, shuffle, dataloaderv = arg_parser() + +if device.startswith("cuda"): + nvidiasmi = subprocess.check_output("nvidia-smi", shell=True, text=True) + logging.debug(nvidiasmi) + +lscpu = subprocess.check_output("lscpu", shell=True, text=True) +logging.debug(lscpu) + if dataloaderv == 1: from torch.utils.data import DataLoader @@ -31,8 +40,6 @@ else: raise(f"dataloaderv{dataloaderv} is not a valid option") - - # Download model model_map = { "resnext50_32x4d": torchvision.models.resnext50_32x4d, @@ -45,39 +52,38 @@ model = model_map[model_name]().to(torch.device(device)) # setup data pipe -if model_name in ["resnext50_32x4d", "mobilenet_v3_large"]: - dp = load(dataset, split="train") +if dataset == "gtsrb": + if ispipe: + dp = loadpipe(dataset, split="train") + logging.debug(f"data format before preprocessing is {next(iter(dp))}") -else: - print(f"{model} not supported yet") + dp = prepare_gtsrb_datapipe(batch_size, device, dp) + logging.debug(f"data format after preprocessing is \n {next(iter(dp))}\n") -if device.startswith("cuda"): - nvidiasmi = subprocess.check_output("nvidia-smi", shell=True, text=True) - print(nvidiasmi) + else: + # No further preprocessing needed this returns a tuple of Images and labels as ints + # Do I need to do batching and collation manually? 
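+        # A map-style dataset like this lets DataLoader handle batching and default
+        # collation, but GTSRB yields (PIL image, int) pairs of varying sizes, so a
+        # transform is still needed before default_collate can stack a batch, e.g.
+        # transform=transforms.Compose([transforms.Resize((100, 100)), transforms.ToTensor()])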
+ ds = torchvision.datasets.GTSRB(root=".",split="train", download=True) + + +else: + print(f"{dataset} not supported yet") -lscpu = subprocess.check_output("lscpu", shell=True, text=True) -print(lscpu) print(f"batch size {batch_size}") print(f"Dataset name {dp}") print(f"Dataset length {len(dp)}") -# Datapipe format -logging.debug(f"data format before preprocessing is {next(iter(dp))}") +# Setup data loader -if dataset == "gtsrb": - dp = prepare_gtsrb(batch_size, device, dp) - -# Datapipe format after preprocessing -logging.debug(f"data format after preprocessing is \n {next(iter(dp))}\n") +data = dp if dp else ds -# Setup data loader if num_workers == 1: - dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=shuffle) + dl = DataLoader(dataset=data, batch_size=batch_size, shuffle=shuffle) # Shuffle won't work in distributed yet else: - dl = DataLoader(dataset=dp, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") + dl = DataLoader(dataset=data, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) @@ -107,7 +113,6 @@ total_duration = total_end - total_start # TODO: Make this output some human readable markdown file - create_report(per_epoch_durations, batch_durations, total_duration) diff --git a/benchmarks/trainers.py b/benchmarks/trainers.py index 674faf7c4..5f5a33f4e 100644 --- a/benchmarks/trainers.py +++ b/benchmarks/trainers.py @@ -8,10 +8,7 @@ def train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion for i, elem in enumerate(dl): batch_start = time.time() - labels = torch.argmax(elem[0]["label"], dim=1) - optimizer.zero_grad() - outputs = model(elem[0]["image"]) - loss = criterion(outputs,labels) + loss = process(model, criterion, optimizer, elem) loss.backward() optimizer.step() @@ -27,4 +24,11 @@ def train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion epoch_end = time.time() epoch_duration = epoch_end - epoch_start - per_epoch_durations.append(epoch_duration) \ No newline at end of file + per_epoch_durations.append(epoch_duration) + +def process(model, criterion, optimizer, elem): + labels = torch.argmax(elem[0]["label"], dim=1) + optimizer.zero_grad() + outputs = model(elem[0]["image"]) + loss = criterion(outputs,labels) + return loss \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 14a4b8fa8..f509913dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ urllib3 >= 1.25 requests +portalocker >= 2.0.0 \ No newline at end of file From eed5203324bf4b70ebf4f6e48a589c56dfa047e8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 28 Jun 2022 15:47:45 -0700 Subject: [PATCH 20/24] [skip ci] push --- benchmarks/README.md | 2 +- benchmarks/cloud/README.md | 33 +++++++++++++++ benchmarks/cloud/ec2.yml | 84 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 benchmarks/cloud/README.md create mode 100644 benchmarks/cloud/ec2.yml diff --git a/benchmarks/README.md b/benchmarks/README.md index a098fd2a9..b9ff7136d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,7 +14,7 @@ usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--ba ``` ## Available metrics -* [ ] PyTorch profiler +* [ ] PyTorch profiler - won't be possible until H2 2022 * [x] Total time 
* [x] Time per batch * [x] Time per epoch diff --git a/benchmarks/cloud/README.md b/benchmarks/cloud/README.md new file mode 100644 index 000000000..59d8bbbc9 --- /dev/null +++ b/benchmarks/cloud/README.md @@ -0,0 +1,33 @@ +This folder contains templates that are useful for cloud setups + +Idea would be to provision a machine by configuring it in a YAML file and then running a benchmark script on it automatically. This is critical both for ad hoc benchmarking that are reproducible but also including real world benchmarks in a release. + +We've provided some useful `yml` templates for you to get started + +https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/using-cfn-cli-creating-stack.html + +## Setup aws cli +`aws configure` and enter your credentials + +## Setup stack (machine configuration) + +```sh + aws cloudformation create-stack \ + --stack-name myteststack \ + --template-body ec2.yml \ +``` + +## Ssh into machine and run job +``` +ssh elastic_ip +git clone https://github.com/pytorch/data +cd data/benchmarks +python run_benchmark.py +``` + +Visually inspect logs + +## Shut down stack + +`aws cloudformation delete-stack --stack-name myteststack` + diff --git a/benchmarks/cloud/ec2.yml b/benchmarks/cloud/ec2.yml new file mode 100644 index 000000000..8b6537cc1 --- /dev/null +++ b/benchmarks/cloud/ec2.yml @@ -0,0 +1,84 @@ +# This script sets up an Ec2 instance with elastic IP and a disk volume +Parameters: + InstanceTypeParameter: + Type: String + Default: c5n.large + AllowedValues: + - c5n.large + - p2.2xlarge + - p3.2xlarge + - p3.8xlarge + Description: Instance type CPU, GPU + DiskSize: + Type: Number + Default: 100 + Description: Disk size in GB + DiskType: + Type: String + Default: gp2 + AllowedValues: + - gp2 + - gp3 + - io1 + - io2 + - sc1 + - st1 + - standard + Description: Enter Disk type SSD, HDD + +Resources: + MyInstance: + Type: AWS::EC2::Instance + Properties: + AvailabilityZone: us-west-2a + ImageId: ami-0306d46d05aaf8663 # Deep Learning AMI + InstanceType: + Ref: InstanceTypeParameter + SecurityGroups: + - !Ref SSHSecurityGroup + + # Elastic IP so I can easily ssh into the machine + MyEIP: + Type: AWS::EC2::EIP + Properties: + InstanceId: !Ref MyInstance + + # Open security group for SSH + SSHSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Enable SSH access via port 22 + SSHSecurityGroupIngress: + - CidrIp: 0.0.0.0/0 + FromPort: 22 + IpProtocol: tcp + ToPort: 22 + + + NewVolume: + Type: AWS::EC2::Volume + Properties: + Size: + Ref: DiskSize + VolumeType: + Ref: DiskType + AvailabilityZone: !GetAtt MyInstance.AvailabilityZone + Tags: + - Key: MyTag + Value: TagValue + DeletionPolicy: Snapshot + + MountPoint: + Type: AWS::EC2::VolumeAttachment + Properties: + InstanceId: !Ref MyInstance + VolumeId: !Ref NewVolume + Device: /dev/sdh + +# # Volume +# SSD: +# Type: AWS::EC2::VolumeAttachment +# Properties: +# InstanceId: !Ref MyInstance + +# HDD: \ No newline at end of file From cf54e201ea66af78e51794099fa7abf6ca1db7a9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 28 Jun 2022 16:00:39 -0700 Subject: [PATCH 21/24] [skip ci] push --- benchmarks/cloud/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/cloud/README.md b/benchmarks/cloud/README.md index 59d8bbbc9..14aab3963 100644 --- a/benchmarks/cloud/README.md +++ b/benchmarks/cloud/README.md @@ -13,8 +13,9 @@ https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/using-cfn-cli-cre ```sh aws cloudformation create-stack \ - 
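The stack setup documented above can also be driven from Python instead of the AWS CLI. A minimal boto3 sketch, assuming the template sits next to the script and the region matches the `us-west-2a` zone hard-coded in `ec2.yml`; note that the CLI form needs `--template-body file://ec2.yml` for a local template, so the template text is read from disk here as well. The stack name and parameter values are examples drawn from the template's allowed values:

```python
import boto3

cfn = boto3.client("cloudformation", region_name="us-west-2")

# CloudFormation expects the raw template text when the file is local.
with open("ec2.yml") as f:
    template_body = f.read()

cfn.create_stack(
    StackName="myteststack",
    TemplateBody=template_body,
    Parameters=[
        {"ParameterKey": "InstanceTypeParameter", "ParameterValue": "p3.2xlarge"},
        {"ParameterKey": "DiskType", "ParameterValue": "gp3"},
    ],
)

# Block until the instance, EIP and volume are up, then run the benchmark over SSH.
cfn.get_waiter("stack_create_complete").wait(StackName="myteststack")
```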
--stack-name myteststack \ + --stack-name torchdatabenchmark \ --template-body ec2.yml \ + --parameters ParameterKey=InstanceTypeParameter,ParameterValue=p3.2xlarge ParameterKey=DiskType,ParameterValue=gp3 ``` ## Ssh into machine and run job From 8ace4ff972e5463afa8fd2c6d46968137b977511 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 28 Jun 2022 16:12:41 -0700 Subject: [PATCH 22/24] [skip ci] update --- benchmarks/cloud/ec2.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/cloud/ec2.yml b/benchmarks/cloud/ec2.yml index 8b6537cc1..fc894506a 100644 --- a/benchmarks/cloud/ec2.yml +++ b/benchmarks/cloud/ec2.yml @@ -48,11 +48,11 @@ Resources: Type: AWS::EC2::SecurityGroup Properties: GroupDescription: Enable SSH access via port 22 - SSHSecurityGroupIngress: + SecurityGroupIngress: - CidrIp: 0.0.0.0/0 - FromPort: 22 - IpProtocol: tcp - ToPort: 22 + FromPort: 22 + IpProtocol: tcp + ToPort: 22 NewVolume: From 493323965b37599ef32f89eab02048e4af6a0929 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 21 Jul 2022 00:20:13 +0000 Subject: [PATCH 23/24] git commit --- benchmarks/README.md | 36 ++++++++++++------------- benchmarks/args.py | 51 ++++++++++++++++++------------------ benchmarks/report.py | 52 ++++++++++++++++++++++++++++++++++--- benchmarks/requirements.txt | 9 +++++++ benchmarks/run_benchmark.py | 44 +++++++++++++++---------------- benchmarks/utils.py | 8 ------ 6 files changed, 120 insertions(+), 80 deletions(-) create mode 100644 benchmarks/requirements.txt delete mode 100644 benchmarks/utils.py diff --git a/benchmarks/README.md b/benchmarks/README.md index b9ff7136d..0c2cfc077 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,38 +1,34 @@ # Install dependencies ``` -pip3 install --pre torch torchvision torchaudio torchtext --extra-index-url https://download.pytorch.org/whl/nightly/cu113 +pip install -r benchmarks/requirements.txt python setup.py develop ``` # Usage instructions - ``` -usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] [--device DEVICE] [--num_epochs NUM_EPOCHS] +usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] [--device DEVICE] [--num_epochs NUM_EPOCHS] [--report_location REPORT_LOCATION] [--num_workers NUM_WORKERS] [--shuffle] [--dataloaderv DATALOADERV] ``` ## Available metrics -* [ ] PyTorch profiler - won't be possible until H2 2022 -* [x] Total time -* [x] Time per batch -* [x] Time per epoch -* [x] Precision over time -* [x] CPU Load -* [x] GPU Load -* [x] Memory usage -* [x] PyTorch profiler -## Additional profiling +- [x] Total time +- [x] Time per batch +- [x] Time per epoch +- [x] Precision over time +- [x] CPU Load +- [x] GPU Load +- [x] Memory usage -``` -pip install scalene -``` -`scalene run_benchmark.py` +## Additional profiling +The PyTorch profiler doesn't work quite well with `torchdata` for now https://github.com/pytorch/kineto/issues/609 but +there are other good options like `py-spy` or `scalene` which could be used like so `profiler_name run_benchmark.py` ## Other benchmarks in the wild -* https://github.com/pytorch/kineto/blob/main/tb_plugin/examples/datapipe_example.py -* https://github.com/pytorch/text/tree/main/test/datasets -* https://github.com/pytorch/vision/tree/main/torchvision/prototype/datasets \ No newline at end of file + +- https://github.com/pytorch/kineto/blob/main/tb_plugin/examples/datapipe_example.py +- https://github.com/pytorch/text/tree/main/test/datasets +- 
https://github.com/pytorch/vision/tree/main/torchvision/prototype/datasets diff --git a/benchmarks/args.py b/benchmarks/args.py index 6576a78d4..98414e244 100644 --- a/benchmarks/args.py +++ b/benchmarks/args.py @@ -1,29 +1,30 @@ -import argparse +from dataclasses import dataclass, fields +from enum import Enum + +from simple_parsing import ArgumentParser + + +@dataclass(frozen=True) +class BenchmarkConfig: + dataset: str = "gtsrb" # TODO: Integrate with HF datasets + model_name: str = "resnext50_32x4d" # TODO: torchvision models supported only + batch_size: int = 1 + device: str = "cuda:0" # Options are cpu or cuda:0 + num_epochs: int = 1 + report_location: str = "report.csv" + num_wokers: int = 1 + shuffle: bool = True + dataloader_version: int = 1 # Options are 1 or 2 + ## Arg parsing def arg_parser(): - parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, default="gtsrb", help="The name of the dataset") - parser.add_argument("--ispipe", action="store_true", help="is datapipe or dataset?") - parser.add_argument("--model_name", type=str, default="resnext50_32x4d", help="The name of the model") - parser.add_argument("--batch_size", type=int, default=1, help="") - parser.add_argument("--device", type=str, default="cuda:0", help="Options are are cpu or cuda:0") - parser.add_argument("--num_epochs", type=int, default=1) - parser.add_argument("--report_location", type=str, default="./report.md", help="The location where the generated report will be stored") - parser.add_argument("--num_workers", type=int, default=1, help="Number of dataloader workers") - parser.add_argument("--shuffle", action="store_true") - parser.add_argument("--dataloaderv", type=int, default=1) - + parser = ArgumentParser() + parser.add_arguments(BenchmarkConfig, dest="options") args = parser.parse_args() - print(args) - dataset = args.dataset - ispipe = args.ispipe - model_name = args.model_name - batch_size = args.batch_size - device = args.device - num_epochs = args.num_epochs - report_location = args.report_location - num_workers = args.num_workers - shuffle = args.shuffle - dataloaderv = args.dataloaderv - return dataset, ispipe, model_name,batch_size,device,num_epochs,num_workers,shuffle,dataloaderv \ No newline at end of file + benchmark_config = args.options + return benchmark_config + + +if __name__ == "__main__": + arg_parser() diff --git a/benchmarks/report.py b/benchmarks/report.py index 5d2378e69..b72f3d207 100644 --- a/benchmarks/report.py +++ b/benchmarks/report.py @@ -1,6 +1,50 @@ +import csv +from abc import ABC, abstractclassmethod +from dataclasses import dataclass, fields from statistics import mean +from typing import Dict, list, tuple -def create_report(per_epoch_durations, batch_durations, total_duration): - print(f"Total duration is {total_duration}") - print(f"Per epoch duration {mean(per_epoch_durations)}") - print(f"Per batch duration {mean(batch_durations)}") +import numpy as np + +duration = int + + +@dataclass +class MetricCache: + epoch_durations: list[duration] + batch_durations: list[duration] + total_duration: int = 0 + + +class MetricExporter(ABC): + @abstractclassmethod + def export(self, metric_cache: MetricCache) -> None: + return NotImplementedError + + def calculate_percentiles(self, metric_cache: MetricCache) -> Dict[str, float]: + output = {} + for field in fields(metric_cache): + duration_list = getattr(metric_cache, field.name) + percentiles = [ + np.percentile(duration_list, 0.5), + np.percentile(duration_list, 0.9), + np.percentile(duration_list, 
0.99), + ] + output[field.name] = percentiles + return output + + +class StdOutReport(MetricExporter): + def export(self, metric_cache): + percentiles_dict = metric_cache.calculate_percentiles() + for field, percentiles in percentiles_dict.items: + print(f"{field} duration is {percentiles}") + + +class CSVReport(MetricExporter): + def export(self, metric_cache: MetricCache, filepath: str): + percentiles_dict = metric_cache.calculate_percentiles() + with open(filepath, "w") as file: + writer = csv.writer(file) + for field, percentiles in percentiles_dict.items: + writer.writerow(field + percentiles) diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 000000000..cfc16922a --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1,9 @@ +--extra-index-url https://download.pytorch.org/whl/nightly/cu113 +simple-parsing +dill +numpy +torch +torchvision +torchaudio +torchtext +transformers diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index eeece0c2c..41c318c56 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,26 +1,27 @@ -import sys import logging import subprocess +import sys +import time -import torchvision import torch -import transformers -from torchvision.prototype.datasets import load as loadpipe import torch.nn.functional as F -from torchvision import transforms -import time import torch.optim as optim import torch.profiler +import torchvision +import transformers + # Relative imports from args import arg_parser from benchmarks.datasets import prepare_gtsrb_dataset -from utils import init_fn from datasets import prepare_gtsrb_datapipe -from trainers import train from report import create_report +from torchvision import transforms +from torchvision.prototype.datasets import load as loadpipe +from trainers import train +from utils import init_fn -logging.basicConfig(filename='example.log', level=logging.DEBUG) +logging.basicConfig(filename="example.log", level=logging.DEBUG) dataset, ispipe, model_name, batch_size, device, num_epochs, num_workers, shuffle, dataloaderv = arg_parser() @@ -38,15 +39,14 @@ elif dataloaderv == 2: from torch.utils.data.dataloader_experimental import DataLoader2 as DataLoader else: - raise(f"dataloaderv{dataloaderv} is not a valid option") + raise (f"dataloaderv{dataloaderv} is not a valid option") # Download model model_map = { "resnext50_32x4d": torchvision.models.resnext50_32x4d, - "mobilenet_v3_large" : torchvision.models.mobilenet_v3_large, - "transformerencoder" : torch.nn.TransformerEncoder, - "bert-base" : transformers.BertModel, - + "mobilenet_v3_large": torchvision.models.mobilenet_v3_large, + "transformerencoder": torch.nn.TransformerEncoder, + "bert-base": transformers.BertModel, } model = model_map[model_name]().to(torch.device(device)) @@ -63,7 +63,7 @@ else: # No further preprocessing needed this returns a tuple of Images and labels as ints # Do I need to do batching and collation manually? 
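One thing to watch in the `report.py` added above: `np.percentile` takes `q` on a 0-100 scale, so passing 0.5, 0.9 and 0.99 returns values near the minimum of each series rather than the median, p90 and p99; the exporters also iterate `percentiles_dict.items` without calling it, and lowercase `list`/`tuple` cannot be imported from `typing`. A corrected minimal sketch of the cache and CSV export (the scalar total duration is left out, and the percentile levels and CSV layout are assumptions):

```python
import csv
from dataclasses import dataclass, field, fields
from typing import Dict, List

import numpy as np


@dataclass
class MetricCache:
    epoch_durations: List[float] = field(default_factory=list)
    batch_durations: List[float] = field(default_factory=list)


def calculate_percentiles(cache: MetricCache) -> Dict[str, List[float]]:
    # np.percentile takes q on a 0-100 scale, so p50/p90/p99 are 50, 90, 99.
    return {
        f.name: [float(np.percentile(getattr(cache, f.name), q)) for q in (50, 90, 99)]
        for f in fields(cache)
    }


def export_csv(cache: MetricCache, filepath: str) -> None:
    percentiles = calculate_percentiles(cache)
    with open(filepath, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["metric", "p50", "p90", "p99"])
        for name, values in percentiles.items():  # .items() must be called
            writer.writerow([name, *values])


# Example usage with fake timings.
cache = MetricCache(epoch_durations=[1.2, 1.4, 1.1], batch_durations=[0.05, 0.07, 0.06])
export_csv(cache, "report.csv")
```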
- ds = torchvision.datasets.GTSRB(root=".",split="train", download=True) + ds = torchvision.datasets.GTSRB(root=".", split="train", download=True) else: @@ -83,28 +83,28 @@ # Shuffle won't work in distributed yet else: - dl = DataLoader(dataset=data, batch_size=batch_size, shuffle=True, num_workers=num_workers, worker_init_fn=init_fn, multiprocessing_context="spawn") + dl = DataLoader( + dataset=data, batch_size=batch_size, shuffle=True, num_workers=num_workers, multiprocessing_context="spawn" + ) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - total_start = time.time() per_epoch_durations = [] batch_durations = [] - with torch.profiler.profile( activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], - on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='datapipe0'), - schedule=torch.profiler.schedule(wait=1,warmup=1,active=2), + on_trace_ready=torch.profiler.tensorboard_trace_handler("./result", worker_name="datapipe0"), + schedule=torch.profiler.schedule(wait=1, warmup=1, active=2), record_shapes=True, profile_memory=True, with_flops=True, with_stack=True, - with_modules=True + with_modules=True, ) as p: train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion, optimizer, p) @@ -114,5 +114,3 @@ # TODO: Make this output some human readable markdown file create_report(per_epoch_durations, batch_durations, total_duration) - - diff --git a/benchmarks/utils.py b/benchmarks/utils.py deleted file mode 100644 index d53ec796a..000000000 --- a/benchmarks/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -# Util function for multiprocessing -import torch - -def init_fn(worker_id): - info = torch.utils.data.get_worker_info() - num_workers = info.num_workers - datapipe = info.dataset - torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) \ No newline at end of file From a09826d5b6691a37cb6f3978ec2fc4caece758cf Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 20 Jul 2022 19:28:25 -0700 Subject: [PATCH 24/24] update --- benchmarks/datasets.py | 54 ++++++++++++++++++++++--------------- benchmarks/report.py | 2 ++ benchmarks/run_benchmark.py | 16 ++++++----- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py index 8dc93cc18..3ff794422 100644 --- a/benchmarks/datasets.py +++ b/benchmarks/datasets.py @@ -1,28 +1,40 @@ from torchvision import transforms, datasets import torch +from abc import ABC, abstractmethod -def transform(img): - t= transforms.Compose([ - transforms.ToPILImage(), - transforms.Resize(size=(100,100)), - transforms.ToTensor()] - ) - return t(img) +class DataPipeReadyBenchmark(ABC): + @abstractmethod + def prepare_pipe(self, params): + return NotImplementedError -def str_to_list(str): - l = [] - for char in str: - l.append(int(char)) - return l +class GTSRBReadyBenchmark(DataPipeReadyBenchmark): + def transform(img): + t= transforms.Compose([ + transforms.ToPILImage(), + transforms.Resize(size=(100,100)), + transforms.ToTensor()] + ) + return t(img) -def prepare_gtsrb_datapipe(batch_size, device, dp): - # Filter out bounding box and path to image - dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) + def str_to_list(str): + l = [] + for char in str: + l.append(int(char)) + return l - # Apply image preprocessing - dp = dp.map(lambda sample : transform(sample.decode().to(torch.device(device))), input_col="image") - dp = 
dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") + def prepare_pipe(self, params): + batch_size, device, dp = params + # Filter out bounding box and path to image + dp = dp.map(lambda sample : {"image" : sample["image"], "label" : sample["label"]}) - # Batch - dp = dp.batch(batch_size) - return dp \ No newline at end of file + # Apply image preprocessing + dp = dp.map(lambda sample : transform(sample.decode().to(torch.device(device))), input_col="image") + dp = dp.map(lambda sample : torch.tensor(str_to_list(sample.to_categories())).to(torch.device(device)), input_col="label") + + # Batch + dp = dp.batch(batch_size) + return dp + +class HuggingFaceReadyBenchmark(DataPipeReadyBenchmark): + def prepare(self, dataset_name): + return NotImplementedError \ No newline at end of file diff --git a/benchmarks/report.py b/benchmarks/report.py index b72f3d207..7fa4e7a28 100644 --- a/benchmarks/report.py +++ b/benchmarks/report.py @@ -35,6 +35,7 @@ def calculate_percentiles(self, metric_cache: MetricCache) -> Dict[str, float]: class StdOutReport(MetricExporter): + @staticmethod def export(self, metric_cache): percentiles_dict = metric_cache.calculate_percentiles() for field, percentiles in percentiles_dict.items: @@ -42,6 +43,7 @@ def export(self, metric_cache): class CSVReport(MetricExporter): + @staticmethod def export(self, metric_cache: MetricCache, filepath: str): percentiles_dict = metric_cache.calculate_percentiles() with open(filepath, "w") as file: diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 41c318c56..cebc5f4f8 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -15,16 +15,15 @@ from args import arg_parser from benchmarks.datasets import prepare_gtsrb_dataset from datasets import prepare_gtsrb_datapipe -from report import create_report from torchvision import transforms from torchvision.prototype.datasets import load as loadpipe from trainers import train -from utils import init_fn +from report import MetricCache, CSVReport logging.basicConfig(filename="example.log", level=logging.DEBUG) -dataset, ispipe, model_name, batch_size, device, num_epochs, num_workers, shuffle, dataloaderv = arg_parser() +dataset, ispipe, model_name, batch_size, device, num_epochs, report_location, num_workers, shuffle, dataloaderv = arg_parser() if device.startswith("cuda"): nvidiasmi = subprocess.check_output("nvidia-smi", shell=True, text=True) @@ -52,6 +51,7 @@ model = model_map[model_name]().to(torch.device(device)) # setup data pipe +# TODO: How about we just make this work with any HF dataset if dataset == "gtsrb": if ispipe: dp = loadpipe(dataset, split="train") @@ -70,9 +70,9 @@ print(f"{dataset} not supported yet") -print(f"batch size {batch_size}") -print(f"Dataset name {dp}") -print(f"Dataset length {len(dp)}") +logging.info(f"batch size {batch_size}") +logging.info(f"Dataset name {dp}") +logging.info(f"Dataset length {len(dp)}") # Setup data loader @@ -107,10 +107,12 @@ with_modules=True, ) as p: + # TODO: Double check if this actually modifies the metrics in calling code train(num_epochs, model, dl, per_epoch_durations, batch_durations, criterion, optimizer, p) total_end = time.time() total_duration = total_end - total_start + metric_cache = MetricCache(batch_durations, per_epoch_durations, total_duration) # TODO: Make this output some human readable markdown file -create_report(per_epoch_durations, batch_durations, total_duration) +CSVReport(metric_cache, 
report_location).export() \ No newline at end of file
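A note on how the new `args.py` is consumed: `arg_parser()` now returns a single frozen `BenchmarkConfig`, so the tuple unpacking in `run_benchmark.py` (`dataset, ispipe, model_name, ... = arg_parser()`) raises a `TypeError`, the config has no `ispipe` field, and the worker-count field is spelled `num_wokers`. The closing `CSVReport(metric_cache, report_location).export()` call likewise does not match the exporter as declared, whose `export` is a `@staticmethod` that still lists `self` and expects the cache and file path as arguments (see the corrected exporter sketch earlier). A minimal sketch of attribute-style usage consistent with the dataclass; the normalized `num_workers` spelling is an assumption about the intent:

```python
from dataclasses import dataclass

from simple_parsing import ArgumentParser


@dataclass(frozen=True)
class BenchmarkConfig:
    dataset: str = "gtsrb"
    model_name: str = "resnext50_32x4d"
    batch_size: int = 1
    device: str = "cuda:0"
    num_epochs: int = 1
    report_location: str = "report.csv"
    num_workers: int = 1
    shuffle: bool = True
    dataloader_version: int = 1


def arg_parser() -> BenchmarkConfig:
    parser = ArgumentParser()
    parser.add_arguments(BenchmarkConfig, dest="options")
    return parser.parse_args().options


if __name__ == "__main__":
    config = arg_parser()
    # A frozen dataclass is not iterable, so it cannot be unpacked into a tuple
    # of nine names; fields are read by attribute instead.
    print(config.dataset, config.batch_size, config.num_workers)
```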
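Similarly, `GTSRBReadyBenchmark.transform` and `str_to_list` are declared without `self` or `@staticmethod` yet called as bare names inside `prepare_pipe`, which fails at runtime. A sketch that only fixes the method binding and otherwise keeps the patch's decode and device handling as-is; it still assumes the torchvision prototype GTSRB datapipe, with `decode()` on the image value and `to_categories()` on the label value:

```python
import torch
from torchvision import transforms


class GTSRBReadyBenchmark:
    @staticmethod
    def transform(img):
        t = transforms.Compose(
            [transforms.ToPILImage(), transforms.Resize(size=(100, 100)), transforms.ToTensor()]
        )
        return t(img)

    @staticmethod
    def str_to_list(s):
        return [int(ch) for ch in s]

    def prepare_pipe(self, params):
        batch_size, device, dp = params
        # Keep only the image and label, dropping bounding boxes and file paths.
        dp = dp.map(lambda sample: {"image": sample["image"], "label": sample["label"]})
        # Decode, move and resize the image, and turn the label category string into a
        # tensor, exactly as in the patch, but through the now-bound static helpers.
        dp = dp.map(lambda img: self.transform(img.decode().to(torch.device(device))), input_col="image")
        dp = dp.map(
            lambda label: torch.tensor(self.str_to_list(label.to_categories())).to(torch.device(device)),
            input_col="label",
        )
        return dp.batch(batch_size)
```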