From 9e2e5210fd08466d0d1ec4057ffc6f680d879563 Mon Sep 17 00:00:00 2001 From: tianyi1 Date: Fri, 2 Sep 2022 11:38:51 +0800 Subject: [PATCH 1/3] remove the model compression part in DLRM --- modelzoo/dlrm/dlrm.patch | 2402 +------------------------------------- 1 file changed, 21 insertions(+), 2381 deletions(-) diff --git a/modelzoo/dlrm/dlrm.patch b/modelzoo/dlrm/dlrm.patch index 67e0fdf09..e79fc9f51 100644 --- a/modelzoo/dlrm/dlrm.patch +++ b/modelzoo/dlrm/dlrm.patch @@ -406,199 +406,6 @@ index c6b5b99..0000000 -------- -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -diff --git a/analysis_model.py b/analysis_model.py -new file mode 100644 -index 0000000..2fa6d91 ---- /dev/null -+++ b/analysis_model.py -@@ -0,0 +1,187 @@ -+from __future__ import absolute_import, division, print_function, unicode_literals -+ -+import os, sys -+import matplotlib -+matplotlib.use("Agg") -+import matplotlib.pyplot -+import pandas as pd -+ -+from model_compression.hyparameters import hyparams -+# For model compression -+import distiller -+from distiller.utils import * -+ -+# miscellaneous -+import builtins -+import functools -+# import bisect -+# import shutil -+import time -+import json -+import math -+# data generation -+import dlrm_data_pytorch as dp -+ -+# numpy -+import numpy as np -+ -+# onnx -+# The onnx import causes deprecation warnings every time workers -+# are spawned during testing. So, we filter out those warnings. -+import warnings -+with warnings.catch_warnings(): -+ warnings.filterwarnings("ignore", category=DeprecationWarning) -+import onnx -+ -+# pytorch -+import torch -+import torch.nn as nn -+from torch.nn.parallel.parallel_apply import parallel_apply -+from torch.nn.parallel.replicate import replicate -+from torch.nn.parallel.scatter_gather import gather, scatter -+ -+# For distributed run -+import extend_distributed as ext_dist -+ -+try: -+ import intel_pytorch_extension as ipex -+ from intel_pytorch_extension import core -+except: -+ pass -+from lamb_bin import Lamb, log_lamb_rs -+ -+# quotient-remainder trick -+from tricks.qr_embedding_bag import QREmbeddingBag -+# mixed-dimension trick -+from tricks.md_embedding_bag import PrEmbeddingBag, md_solver -+ -+import sklearn.metrics -+import mlperf_logger -+ -+# from torchviz import make_dot -+# import torch.nn.functional as Functional -+# from torch.nn.parameter import Parameter -+ -+from torch.optim.lr_scheduler import _LRScheduler -+ -+exc = getattr(builtins, "IOError", "FileNotFoundError") -+ -+from dlrm_s_pytorch_lamb_sparselamb_test import DLRM_Net -+ -+def load_model(model_path, args): -+ ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") -+ -+ if (args.data_generation == "dataset"): -+ train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args) -+ nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) -+ nbatches_test = len(test_ld) -+ -+ ln_emb = train_data.counts -+ # enforce maximum limit on number of vectors per embedding -+ if args.max_ind_range > 0: -+ ln_emb = np.array(list(map( -+ lambda x: x if x < args.max_ind_range else args.max_ind_range, -+ ln_emb -+ ))) -+ m_den = train_data.m_den -+ ln_bot[0] = m_den -+ -+ else: -+ # input and target at random -+ ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") -+ m_den = ln_bot[0] -+ train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den) -+ nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) -+ -+ ### parse 
command line arguments ### -+ m_spa = args.arch_sparse_feature_size -+ num_fea = ln_emb.size + 1 # num sparse + num dense features -+ m_den_out = ln_bot[ln_bot.size - 1] -+ if args.arch_interaction_op == "dot": -+ # approach 1: all -+ # num_int = num_fea * num_fea + m_den_out -+ # approach 2: unique -+ if args.arch_interaction_itself: -+ num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out -+ else: -+ num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out -+ elif args.arch_interaction_op == "cat": -+ num_int = num_fea * m_den_out -+ else: -+ sys.exit( -+ "ERROR: --arch-interaction-op=" -+ + args.arch_interaction_op -+ + " is not supported" -+ ) -+ arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top -+ ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") -+ ndevices = -1 -+ -+ dlrm = DLRM_Net( -+ m_spa = args.arch_sparse_feature_size, -+ ln_emb = ln_emb, -+ ln_bot = ln_bot, -+ ln_top = ln_top, -+ arch_interaction_op = args.arch_interaction_op, -+ arch_interaction_itself = args.arch_interaction_itself, -+ sigmoid_bot = -1, -+ sigmoid_top = ln_top.size - 2, -+ sync_dense_params = args.sync_dense_params, -+ loss_threshold = args.loss_threshold, -+ ndevices = ndevices, -+ qr_flag = args.qr_flag, -+ qr_operation = args.qr_operation, -+ qr_collisions = args.qr_collisions, -+ qr_threshold = args.qr_threshold, -+ md_flag = args.md_flag, -+ md_threshold = args.md_threshold, -+ sparse_dense_boundary = args.sparse_dense_boundary, -+ bf16 = args.bf16, -+ use_ipex = args.use_ipex -+ ) -+ -+ model_dict = torch.load(os.path.join(model_path,"dlrm_s_pytorch_"+str(ext_dist.dist.get_rank())+".pkl")) -+ dlrm.load_state_dict({k.replace('module.',''):v for k,v in model_dict["state_dict"].items()}) -+ -+ return dlrm -+ -+ -+def view_elementwise_sparsity(model): -+ origin_model_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')),"model_compression/model/compress/AGP_Structure/test2/") -+ df_sparsity = distiller.weights_sparsity_summary(model=model, param_dims=[1,2,4,5]) -+ df_sparsity.to_csv(os.path.join(origin_model_dir,"dlrm_s_pytorch_new_"+str(ext_dist.dist.get_rank())+".csv")) -+ print(df_sparsity[['Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)']]) -+ return df_sparsity -+ -+def view_layer_wise_sparsity(df_sparsity): -+ origin_model_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')),"model_compression/model/compress/AGP_Structure/test2/") -+ matplotlib.rcParams.update({'font.size': 22}) -+ spec_df_sparsity = df_sparsity[~df_sparsity["Name"].str.contains("emb")] -+ spec_df_sparsity.to_csv(os.path.join(origin_model_dir,"dlrm_s_pytorch_spec_"+str(ext_dist.dist.get_rank())+".csv")) -+ spec_df_sparsity_new = spec_df_sparsity[['NNZ (dense)', 'NNZ (sparse)']] -+ ax = spec_df_sparsity_new.iloc[0:-1].plot(kind='bar', figsize=[30,30], title="Weights footprint: Sparse vs. 
Dense\n(element-wise)") -+ ax.set_xticklabels(spec_df_sparsity.Name[:-1], rotation=90) -+ ax.figure.savefig(os.path.join(os.path.dirname(os.path.abspath('__file__')),"model_compression/model/compress/AGP_Structure/test2/layer_wise_sparsity_"+str(ext_dist.dist.get_rank())+".png")) -+ -+def remove_layers(model): -+ #layers_to_remove = [param_name for param_name, param in model.named_parameters() if distiller.density(param) == 0] -+ layers_density = [(param_name , distiller.density(param)) for param_name, param in model.named_parameters()] -+ -+ print(layers_density) -+ -+def main(args): -+ origin_model_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')),"model_compression/model/compress/AGP_Structure/test2/") -+ origin_dlrm = load_model(origin_model_dir, args).type(torch.float32) -+ #for name, params in origin_dlrm.state_dict().items(): -+ # print('{}:{}:{}'.format(name, params.size(), params.dtype)) -+ #df_sparsity = view_elementwise_sparsity(origin_dlrm) -+ #view_layer_wise_sparsity(df_sparsity) -+ remove_layers(origin_dlrm) -+ -+ -+if __name__ == "__main__": -+ parser = hyparams.parser -+ args = parser.parse_args() -+ ext_dist.init_distributed(backend=args.dist_backend) -+ main(args) -+ diff --git a/cython/cython_compile.py b/cython/cython_compile.py deleted file mode 100644 index ffacf08..0000000 @@ -5105,12 +4912,12 @@ index ec3394b..db5f5c7 100644 - run() + print("Saved model to {}".format(args.save_model)) \ No newline at end of file -diff --git a/dlrm_s_pytorch_compress.py b/dlrm_s_pytorch_compress.py +diff --git a/dlrm_s_pytorch_inference.py b/dlrm_s_pytorch_inference.py new file mode 100644 -index 0000000..6e177ec +index 0000000..575dabd --- /dev/null -+++ b/dlrm_s_pytorch_compress.py -@@ -0,0 +1,1681 @@ ++++ b/dlrm_s_pytorch_inference.py +@@ -0,0 +1,1107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the @@ -5166,9 +4973,6 @@ index 0000000..6e177ec + +from __future__ import absolute_import, division, print_function, unicode_literals + -+# For model compression -+import distiller -+ +# miscellaneous +import builtins +import functools @@ -5176,9 +4980,6 @@ index 0000000..6e177ec +# import shutil +import time +import json -+import math -+ -+from numpy.core.fromnumeric import compress +# data generation +import dlrm_data_pytorch as dp + @@ -5226,46 +5027,6 @@ index 0000000..6e177ec + +exc = getattr(builtins, "IOError", "FileNotFoundError") + -+class LRPolicyScheduler(_LRScheduler): -+ def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): -+ self.num_warmup_steps = num_warmup_steps -+ self.decay_start_step = decay_start_step -+ self.decay_end_step = decay_start_step + num_decay_steps -+ self.num_decay_steps = num_decay_steps -+ -+ if self.decay_start_step < self.num_warmup_steps: -+ sys.exit("Learning rate warmup must finish before the decay starts") -+ -+ if isinstance(optimizer, tuple): -+ for opt in optimizer: -+ super(LRPolicyScheduler, self).__init__(opt) -+ else: -+ super(LRPolicyScheduler, self).__init__(optimizer) -+ -+ def get_lr(self): -+ step_count = self._step_count -+ if step_count < self.num_warmup_steps: -+ # warmup -+ scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps -+ lr = [base_lr * scale for base_lr in self.base_lrs] -+ self.last_lr = lr -+ elif self.decay_start_step <= step_count and step_count < self.decay_end_step: -+ # decay -+ decayed_steps = step_count - self.decay_start_step -+ scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 -+ min_lr = 0.0000001 -+ lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] -+ self.last_lr = lr -+ else: -+ if self.num_decay_steps > 0: -+ # freeze at last, either because we're after decay -+ # or because we're between warmup and decay -+ lr = self.last_lr -+ else: -+ # do not adjust -+ lr = self.base_lrs -+ return lr -+ + +class Cast(nn.Module): + __constants__ = ['to_dtype'] @@ -5294,7 +5055,7 @@ index 0000000..6e177ec + m = ln[i + 1] + + # construct fully connected operator -+ if self.use_ipex: #and self.bf16: ++ if self.use_ipex and self.bf16: + LL = ipex.IpexMLPLinear(int(n), int(m), bias=True, output_stays_blocked=(i < ln.size - 2), default_blocking=32) + else: + LL = nn.Linear(int(n), int(m), bias=True) @@ -5321,10 +5082,7 @@ index 0000000..6e177ec + LL.to(torch.bfloat16) + # prepack weight for IPEX Linear + if hasattr(LL, 'reset_weight_shape'): -+ if self.bf16: -+ LL.reset_weight_shape(block_for_dtype=torch.bfloat16) -+ else: -+ LL.reset_weight_shape(block_for_dtype=torch.float32) ++ LL.reset_weight_shape(block_for_dtype=torch.bfloat16) + + layers.append(LL) + @@ -5334,7 +5092,7 @@ index 0000000..6e177ec + layers.append(Cast(torch.float32)) + layers.append(nn.Sigmoid()) + else: -+ if self.use_ipex: #and self.bf16: ++ if self.use_ipex and self.bf16: + LL.set_activation_type('relu') + else: + layers.append(nn.ReLU()) @@ -5495,7 +5253,7 @@ index 0000000..6e177ec + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers -+ need_padding = self.use_ipex and x.size(0) % 2 == 1 #and self.bf16 ++ need_padding = self.use_ipex and self.bf16 and x.size(0) % 2 == 1 + if need_padding: + x = torch.nn.functional.pad(input=x, pad=(0,0,0,1), mode='constant', value=0) + ret = layers(x) @@ -5530,7 +5288,7 @@ index 0000000..6e177ec + def interact_features(self, 
x, ly): + x = x.to(ly[0].dtype) + if self.arch_interaction_op == "dot": -+ if self.bf16 or self.use_ipex: ++ if self.bf16: + T = [x] + ly + R = ipex.interaction(*T) + else: @@ -5635,7 +5393,6 @@ index 0000000..6e177ec + del ly_sparse + #ly_sparse ""= torch.cat(ly_sparse,1) + ly = ly_dense + list(ly_sparse2) -+ + # interactions + z = self.interact_features(x, ly) + # top mlp @@ -5866,10 +5623,9 @@ index 0000000..6e177ec + parser.add_argument("--use-ipex", action="store_true", default=False) + # lamb + parser.add_argument("--optimizer", type=int, default=0, help='optimizer:[0:sgd, 1:lamb/sgd, 2:adagrad, 3:sparseadam]') -+ # distiller option -+ parser.add_argument("--model-compression-type", type=str, default=None) -+ parser.add_argument("--compression-file", type=str, default="./model_compression/dlrm.schedule_agp.yaml") -+ ++ parser.add_argument("--lamblr", type=float, default=0.01, help='lr for lamb') ++ parser.add_argument("--eval-data-path", type=str, default="./data/valid.bin") ++ parser.add_argument("--day-feature-count", type=str, default="./data/day_fea_count.npz") + args = parser.parse_args() + + ext_dist.init_distributed(backend=args.dist_backend) @@ -5927,19 +5683,17 @@ index 0000000..6e177ec + mlperf_logger.barrier() + + if (args.data_generation == "dataset"): -+ train_data, train_ld, test_data, test_ld = \ -+ dp.make_criteo_data_and_loaders(args) -+ nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) ++ test_data, test_ld = \ ++ dp.make_criteo_data_and_loaders_test(args) + nbatches_test = len(test_ld) -+ -+ ln_emb = train_data.counts ++ ln_emb = test_data.counts + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array(list(map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb + ))) -+ m_den = train_data.m_den ++ m_den = test_data.m_den + ln_bot[0] = m_den + + else: @@ -6020,59 +5774,7 @@ index 0000000..6e177ec + d0=m_spa, + round_dim=args.md_round_dims + ).tolist() -+ -+ # test prints (model arch) -+ if args.debug_mode: -+ print("model arch:") -+ print( -+ "mlp top arch " -+ + str(ln_top.size - 1) -+ + " layers, with input to output dimensions:" -+ ) -+ print(ln_top) -+ print("# of interactions") -+ print(num_int) -+ print( -+ "mlp bot arch " -+ + str(ln_bot.size - 1) -+ + " layers, with input to output dimensions:" -+ ) -+ print(ln_bot) -+ print("# of features (sparse and dense)") -+ print(num_fea) -+ print("dense feature size") -+ print(m_den) -+ print("sparse feature size") -+ print(m_spa) -+ print( -+ "# of embeddings (= # of sparse features) " -+ + str(ln_emb.size) -+ + ", with dimensions " -+ + str(m_spa) -+ + "x:" -+ ) -+ print(ln_emb) -+ -+ print("data (inputs and targets):") -+ for j, (X, lS_o, lS_i, T) in enumerate(train_ld): -+ # early exit if nbatches was set by the user and has been exceeded -+ if nbatches > 0 and j >= nbatches: -+ break -+ -+ print("mini-batch: %d" % j) -+ print(X.detach().cpu().numpy()) -+ # transform offsets to lengths when printing -+ print( -+ [ -+ np.diff( -+ S_o.detach().cpu().tolist() + list(lS_i[i].shape) -+ ).tolist() -+ for i, S_o in enumerate(lS_o) -+ ] -+ ) -+ print([S_i.detach().cpu().tolist() for S_i in lS_i]) -+ print(T.detach().cpu().numpy()) -+ ++ + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + + ### construct the neural network specified above ### @@ -6107,8 +5809,8 @@ index 0000000..6e177ec + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") -+ #for param in 
dlrm.parameters(): -+ # print(param.detach().cpu().numpy()) ++ for param in dlrm.parameters(): ++ print(param.detach().cpu().numpy()) + # print(dlrm) + + if args.use_ipex: @@ -6144,59 +5846,6 @@ index 0000000..6e177ec + loss_fn = torch.nn.BCELoss(reduction="none") + else: + sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported") -+ -+ for name, params in dlrm.named_parameters(): -+ print('{}:{}:{}'.format(name, params.size(), params.dtype)) -+ compression_scheduler = None -+ -+ if not args.inference_only: -+ # specify the optimizer algorithm -+ optimizer_list = ([torch.optim.SGD, ([Lamb, False], torch.optim.SGD), -+ torch.optim.Adagrad, ([torch.optim.Adam, None], torch.optim.SparseAdam)], -+ [ipex.SplitSGD, ([Lamb, True], ipex.SplitSGD)]) -+ optimizers = optimizer_list[args.bf16 and ipex.is_available()][args.optimizer] -+ print('Chosen optimizer(s): %s' % str(optimizers)) -+ -+ if ext_dist.my_size == 1: -+ if len(optimizers) == 1: -+ optimizer = optimizers(dlrm.parameters(), lr=args.learning_rate) -+ else: -+ optimizer_dense = optimizers[0][0]([ -+ {"params": dlrm.bot_l.parameters(), "lr": args.learning_rate}, -+ {"params": dlrm.top_l.parameters(), "lr": args.learning_rate} -+ ], lr=args.learning_rate) -+ if optimizers[0][1] is not None: -+ optimizer_dense.set_bf16(optimizers[0][1]) -+ optimizer_sparse = optimizers[1]([ -+ {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr": args.learning_rate}, -+ ], lr=args.learning_rate) -+ optimizer = (optimizer_dense, optimizer_sparse) -+ else: -+ if len(optimizers) == 1: -+ optimizer = optimizers([ -+ {"params": [p for emb in dlrm.emb_sparse for p in emb.parameters()], -+ "lr": args.learning_rate / ext_dist.my_size}, -+ {"params": [p for emb in dlrm.emb_dense for p in emb.parameters()], "lr": args.learning_rate}, -+ {"params": dlrm.bot_l.parameters(), "lr": args.learning_rate}, -+ {"params": dlrm.top_l.parameters(), "lr": args.learning_rate} -+ ], lr=args.learning_rate) -+ else: -+ optimizer_dense = optimizers[0][0]([ -+ {"params": [p for emb in dlrm.emb_dense for p in emb.parameters()], "lr": args.learning_rate}, -+ {"params": dlrm.bot_l.parameters(), "lr": args.learning_rate}, -+ {"params": dlrm.top_l.parameters(), "lr": args.learning_rate} -+ ], lr=args.learning_rate, bf16=args.bf16) -+ optimizer_sparse = optimizers[1]([ -+ {"params": [p for emb in dlrm.emb_sparse for p in emb.parameters()], -+ "lr": args.learning_rate / ext_dist.my_size}, -+ ], lr=args.learning_rate) -+ optimizer = (optimizer_dense, optimizer_sparse) -+ lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step, -+ args.lr_num_decay_steps) -+ # load the model compression configuration -+ if args.model_compression_type is not None: -+ compression_scheduler = distiller.file_config(dlrm, optimizer, args.compression_file, compression_scheduler) -+ + ### main loop ### + def time_wrap(use_gpu): + if use_gpu: @@ -6250,1540 +5899,11 @@ index 0000000..6e177ec + total_samp = 0 + k = 0 + -+ mini_num_batchs = 1 -+ -+ train_ld_iter = enumerate(train_ld) -+ -+ mlperf_logger.mlperf_submission_log('dlrm') -+ mlperf_logger.log_event(key=mlperf_logger.constants.SEED, value=args.numpy_rand_seed) -+ mlperf_logger.log_event(key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, value=args.mini_batch_size) + + # Load model is specified + if not (args.load_model == ""): -+ print("Loading saved model {}".format(args.load_model)) -+ if use_gpu: -+ if dlrm.ndevices > 1: -+ # NOTE: when targeting inference on multiple GPUs, -+ # load the 
model as is on CPU or GPU, with the move -+ # to multiple GPUs to be done in parallel_forward -+ ld_model = torch.load(args.load_model) -+ else: -+ # NOTE: when targeting inference on single GPU, -+ # note that the call to .to(device) has already happened -+ ld_model = torch.load( -+ args.load_model, -+ map_location=torch.device('cuda') -+ # map_location=lambda storage, loc: storage.cuda(0) -+ ) -+ else: -+ # when targeting inference on CPU -+ ld_model = torch.load(args.load_model, map_location=torch.device('cpu')) -+ dlrm.load_state_dict(ld_model["state_dict"]) -+ ld_j = ld_model["iter"] -+ ld_k = ld_model["epoch"] -+ ld_nepochs = ld_model["nepochs"] -+ ld_nbatches = ld_model["nbatches"] -+ ld_nbatches_test = ld_model["nbatches_test"] -+ ld_gA = ld_model["train_acc"] -+ ld_gL = ld_model["train_loss"] -+ ld_total_loss = ld_model["total_loss"] -+ ld_total_accu = ld_model["total_accu"] -+ ld_gA_test = ld_model["test_acc"] -+ ld_gL_test = ld_model["test_loss"] -+ if not args.inference_only: -+ optimizer.load_state_dict(ld_model["opt_state_dict"]) -+ best_gA_test = ld_gA_test -+ total_loss = ld_total_loss -+ total_accu = ld_total_accu -+ skip_upto_epoch = ld_k # epochs -+ skip_upto_batch = ld_j # batches -+ else: -+ args.print_freq = ld_nbatches -+ args.test_freq = 0 -+ -+ print( -+ "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( -+ ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test -+ ) -+ ) -+ print( -+ "Training state: loss = {:.6f}, accuracy = {:3.3f} %".format( -+ ld_gL, ld_gA * 100 -+ ) -+ ) -+ print( -+ "Testing state: loss = {:.6f}, accuracy = {:3.3f} %".format( -+ ld_gL_test, ld_gA_test * 100 -+ ) -+ ) -+ -+ ext_dist.barrier() -+ print("time/loss/accuracy (if enabled):") -+ -+ # LR is logged twice for now because of a compliance checker bug -+ mlperf_logger.log_event(key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate) -+ mlperf_logger.log_event(key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS, -+ value=args.lr_num_warmup_steps) -+ -+ # use logging keys from the official HP table and not from the logging library -+ mlperf_logger.log_event(key='sgd_opt_base_learning_rate', value=args.learning_rate) -+ mlperf_logger.log_event(key='lr_decay_start_steps', value=args.lr_decay_start_step) -+ mlperf_logger.log_event(key='sgd_opt_learning_rate_decay_steps', value=args.lr_num_decay_steps) -+ mlperf_logger.log_event(key='sgd_opt_learning_rate_decay_poly_power', value=2) -+ -+ # record_shapes=True -+ # if hasattr(torch.autograd.profiler.profile, "resume"): -+ # prof_support_suspend_resume = True -+ # prof_arg_dict = {"start_suspended": True} -+ # else: -+ # prof_support_suspend_resume = False -+ # prof_arg_dict = { } -+ -+ # prof_start_iter = args.profiling_start_iter -+ # prof_end_iter = prof_start_iter + args.profiling_num_iters -+ train_start = time.time() -+ # with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=record_shapes, **prof_arg_dict) as prof: -+ -+ with torch.autograd.profiler.profile(args.enable_profiling, use_gpu) as prof: -+ while k < args.nepochs: -+ mlperf_logger.barrier() -+ mlperf_logger.log_start(key=mlperf_logger.constants.BLOCK_START, -+ metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1), -+ mlperf_logger.constants.EPOCH_COUNT: 1}) -+ mlperf_logger.barrier() -+ mlperf_logger.log_start(key=mlperf_logger.constants.EPOCH_START, -+ metadata={mlperf_logger.constants.EPOCH_NUM: k + 1}) -+ -+ if k < skip_upto_epoch: -+ continue -+ -+ if compression_scheduler: -+ 
compression_scheduler.on_epoch_begin(epoch=k) -+ -+ accum_time_begin = time_wrap(use_gpu) -+ -+ if args.mlperf_logging: -+ previous_iteration_time = None -+ -+ if compression_scheduler: -+ mini_num_batchs = steps_per_epoch = math.ceil(len(train_ld) / args.nepochs) -+ else: -+ mini_num_batchs = len(train_ld) -+ -+ for j, (X, lS_o, lS_i, T) in train_ld_iter: -+ mini_j = j % mini_num_batchs -+ if j == 0 and args.save_onnx: -+ (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i) -+ -+ if j < skip_upto_batch: -+ continue -+ -+ if args.mlperf_logging: -+ current_time = time_wrap(use_gpu) -+ if previous_iteration_time: -+ iteration_time = current_time - previous_iteration_time -+ else: -+ iteration_time = 0 -+ previous_iteration_time = current_time -+ # if prof and prof_support_suspend_resume and j == prof_start_iter: prof.resume() -+ # if prof and prof_support_suspend_resume and j == prof_end_iter: prof.suspend() -+ else: -+ # ext_dist.barrier() -+ # if prof and prof_support_suspend_resume and j >= prof_start_iter and j < prof_end_iter: prof.resume() -+ t1 = time_wrap(use_gpu) -+ -+ # early exit if nbatches was set by the user and has been exceeded -+ if nbatches > 0 and j >= nbatches: -+ break -+ ''' -+ # debug prints -+ print("input and targets") -+ print(X.detach().cpu().numpy()) -+ print([np.diff(S_o.detach().cpu().tolist() -+ + list(lS_i[i].shape)).tolist() for i, S_o in enumerate(lS_o)]) -+ print([S_i.detach().cpu().numpy().tolist() for S_i in lS_i]) -+ print(T.detach().cpu().numpy()) -+ ''' -+ -+ if compression_scheduler and not args.inference_only: -+ compression_scheduler.on_minibatch_begin(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, optimizer=optimizer) -+ -+ # forward pass -+ Z = dlrm_wrap(X, lS_o, lS_i, use_gpu, use_ipex, device) -+ -+ # loss -+ E = loss_fn_wrap(Z, T, use_gpu, use_ipex, device) -+ ''' -+ # debug prints -+ print("output and loss") -+ print(Z.detach().cpu().numpy()) -+ print(E.detach().cpu().numpy()) -+ ''' -+ # compute loss and accuracy -+ L = E.detach().cpu().numpy() # numpy array -+ S = Z.detach().cpu().numpy() # numpy array -+ T = T.detach().cpu().numpy() # numpy array -+ mbs = T.shape[0] # = args.mini_batch_size except maybe for last -+ A = np.sum((np.round(S, 0) == T).astype(np.uint8)) -+ -+ if not args.inference_only: -+ # scaled error gradient propagation -+ # (where we do not accumulate gradients across mini-batches) -+ if compression_scheduler: -+ if args.optimizer == 1 or args.optimizer == 3: -+ compression_scheduler.before_backward_pass(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, loss=L, optimizer=optimizer_dense) -+ compression_scheduler.before_backward_pass(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, loss=L, optimizer=optimizer_sparse) -+ else: -+ compression_scheduler.before_backward_pass(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, loss=L, optimizer=optimizer) -+ -+ if args.optimizer == 1 or args.optimizer == 3: -+ optimizer_dense.zero_grad() -+ optimizer_sparse.zero_grad() -+ else: -+ optimizer.zero_grad() -+ # backward pass -+ E.backward() -+ if compression_scheduler: -+ if args.optimizer == 1 or args.optimizer == 3: -+ compression_scheduler.before_parameter_optimization(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, optimizer=optimizer_dense) -+ compression_scheduler.before_parameter_optimization(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, optimizer=optimizer_sparse) -+ else: -+ 
compression_scheduler.before_parameter_optimization(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, optimizer=optimizer) -+ # debug prints (check gradient norm) -+ # for l in mlp.layers: -+ # if hasattr(l, 'weight'): -+ # print(l.weight.grad.norm().item()) -+ -+ # optimizer -+ if args.optimizer == 1 or args.optimizer == 3: -+ optimizer_dense.step() -+ optimizer_sparse.step() -+ else: -+ optimizer.step() -+ lr_scheduler.step() -+ -+ if compression_scheduler: -+ compression_scheduler.on_minibatch_end(epoch=k, minibatch_id=mini_j, minibatches_per_epoch=steps_per_epoch, optimizer=optimizer) -+ -+ if args.mlperf_logging: -+ total_time += iteration_time -+ else: -+ t2 = time_wrap(use_gpu) -+ total_time += t2 - t1 -+ total_accu += A -+ total_loss += L * mbs -+ total_iter += 1 -+ total_samp += mbs -+ -+ should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches) -+ should_test = ( -+ (args.test_freq > 0) -+ and (args.data_generation == "dataset") -+ and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) -+ ) -+ -+ # print time, loss and accuracy -+ if should_print or should_test: -+ gT = 1000.0 * total_time / total_iter if args.print_time else -1 -+ total_time = 0 -+ -+ gA = total_accu / total_samp -+ total_accu = 0 -+ -+ gL = total_loss / total_samp -+ total_loss = 0 -+ -+ str_run_type = "inference" if args.inference_only else "training" -+ if compression_scheduler: -+ print( -+ "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format( -+ str_run_type, mini_j + 1, mini_num_batchs, k, gT -+ ) -+ + "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100) -+ ) -+ else: -+ print( -+ "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format( -+ str_run_type, j + 1, nbatches, k, gT -+ ) -+ + "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100) -+ ) -+ # Uncomment the line below to print out the total time with overhead -+ # print("Accumulated time so far: {}" \ -+ # .format(time_wrap(use_gpu) - accum_time_begin)) -+ total_iter = 0 -+ total_samp = 0 -+ -+ # testing -+ if should_test and not args.inference_only: -+ epoch_num_float = (j + 1) / len(train_ld) + k + 1 -+ mlperf_logger.barrier() -+ mlperf_logger.log_start(key=mlperf_logger.constants.EVAL_START, -+ metadata={mlperf_logger.constants.EPOCH_NUM: epoch_num_float}) -+ -+ # don't measure training iter time in a test iteration -+ if args.mlperf_logging: -+ previous_iteration_time = None -+ -+ test_accu = 0 -+ test_loss = 0 -+ test_samp = 0 -+ -+ accum_test_time_begin = time_wrap(use_gpu) -+ if args.mlperf_logging: -+ scores = [] -+ targets = [] -+ -+ for i, (X_test, lS_o_test, lS_i_test, T_test) in enumerate(test_ld): -+ # early exit if nbatches was set by the user and was exceeded -+ if nbatches > 0 and i >= nbatches: -+ break -+ -+ t1_test = time_wrap(use_gpu) -+ -+ # forward pass -+ Z_test = dlrm_wrap( -+ X_test, lS_o_test, lS_i_test, use_gpu, use_ipex, device -+ ) -+ if args.mlperf_logging: -+ if ext_dist.my_size > 1: -+ Z_test = ext_dist.all_gather(Z_test, None) -+ T_test = ext_dist.all_gather(T_test, None) -+ S_test = Z_test.detach().cpu().numpy() # numpy array -+ T_test = T_test.detach().cpu().numpy() # numpy array -+ scores.append(S_test) -+ targets.append(T_test) -+ else: -+ # loss -+ E_test = loss_fn_wrap(Z_test, T_test, use_gpu, use_ipex, device) -+ -+ # compute loss and accuracy -+ L_test = E_test.detach().cpu().numpy() # numpy array -+ S_test = Z_test.detach().cpu().numpy() # numpy array -+ T_test = T_test.detach().cpu().numpy() # numpy array -+ mbs_test = T_test.shape[0] # = mini_batch_size except 
last -+ A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) -+ test_accu += A_test -+ test_loss += L_test * mbs_test -+ test_samp += mbs_test -+ -+ t2_test = time_wrap(use_gpu) -+ -+ if args.mlperf_logging: -+ scores = np.concatenate(scores, axis=0) -+ targets = np.concatenate(targets, axis=0) -+ -+ validation_results = {} -+ if args.use_ipex: -+ validation_results['roc_auc'], validation_results['loss'], validation_results['accuracy'] = \ -+ core.roc_auc_score(torch.from_numpy(targets).reshape(-1), torch.from_numpy(scores).reshape(-1)) -+ else: -+ metrics = { -+ 'loss' : sklearn.metrics.log_loss, -+ 'recall' : lambda y_true, y_score: -+ sklearn.metrics.recall_score( -+ y_true=y_true, -+ y_pred=np.round(y_score) -+ ), -+ 'precision' : lambda y_true, y_score: -+ sklearn.metrics.precision_score( -+ y_true=y_true, -+ y_pred=np.round(y_score) -+ ), -+ 'f1' : lambda y_true, y_score: -+ sklearn.metrics.f1_score( -+ y_true=y_true, -+ y_pred=np.round(y_score) -+ ), -+ 'ap' : sklearn.metrics.average_precision_score, -+ 'roc_auc' : sklearn.metrics.roc_auc_score, -+ 'accuracy' : lambda y_true, y_score: -+ sklearn.metrics.accuracy_score( -+ y_true=y_true, -+ y_pred=np.round(y_score) -+ ), -+ } -+ -+ # print("Compute time for validation metric : ", end="") -+ # first_it = True -+ for metric_name, metric_function in metrics.items(): -+ # if first_it: -+ # first_it = False -+ # else: -+ # print(", ", end="") -+ # metric_compute_start = time_wrap(False) -+ validation_results[metric_name] = metric_function( -+ targets, -+ scores -+ ) -+ # metric_compute_end = time_wrap(False) -+ # met_time = metric_compute_end - metric_compute_start -+ # print("{} {:.4f}".format(metric_name, 1000 * (met_time)), -+ # end="") -+ -+ # print(" ms") -+ gA_test = validation_results['accuracy'] -+ gL_test = validation_results['loss'] -+ else: -+ gA_test = test_accu / test_samp -+ gL_test = test_loss / test_samp -+ -+ is_best = gA_test > best_gA_test -+ -+ dlrm.to(torch.device("cpu")) -+ if is_best: -+ best_gA_test = gA_test -+ if not (args.save_model == ""): -+ print("Saving model to {}".format(args.save_model)) -+ torch.save( -+ { -+ "epoch": k, -+ "nepochs": args.nepochs, -+ "nbatches": nbatches, -+ "nbatches_test": nbatches_test, -+ "iter": j + 1, -+ "state_dict": dlrm.state_dict(), -+ "train_acc": gA, -+ "train_loss": gL, -+ "test_acc": gA_test, -+ "test_loss": gL_test, -+ "total_loss": total_loss, -+ "total_accu": total_accu, -+ }, -+ os.path.join(args.save_model, "dlrm_s_pytorch_" + str(dlrm.rank) + "_best.pkl") -+ ) -+ else: -+ if not (args.save_model == ""): -+ torch.save( -+ { -+ "epoch": k, -+ "nepochs": args.nepochs, -+ "nbatches": nbatches, -+ "nbatches_test": nbatches_test, -+ "iter": j + 1, -+ "state_dict": dlrm.state_dict(), -+ "train_acc": gA, -+ "train_loss": gL, -+ "test_acc": gA_test, -+ "test_loss": gL_test, -+ "total_loss": total_loss, -+ "total_accu": total_accu, -+ }, -+ os.path.join(args.save_model, "dlrm_s_pytorch_" + str(dlrm.rank) + ".pkl") -+ ) -+ dlrm.to(device) -+ -+ if args.mlperf_logging: -+ is_best = validation_results['roc_auc'] > best_auc_test -+ if is_best: -+ best_auc_test = validation_results['roc_auc'] -+ -+ mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_ACCURACY, -+ value=float(validation_results['roc_auc']), -+ metadata={mlperf_logger.constants.EPOCH_NUM: epoch_num_float}) -+ print( -+ "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) -+ + " loss {:.6f},".format( -+ validation_results['loss'] -+ ) -+ + " auc {:.4f}, best auc {:.4f},".format( -+ 
validation_results['roc_auc'], -+ best_auc_test -+ ) -+ + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( -+ validation_results['accuracy'] * 100, -+ best_gA_test * 100 -+ ) -+ ) -+ else: -+ print( -+ "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0) -+ + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format( -+ gL_test, gA_test * 100, best_gA_test * 100 -+ ) -+ ) -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.EVAL_STOP, -+ metadata={mlperf_logger.constants.EPOCH_NUM: epoch_num_float}) -+ -+ # Uncomment the line below to print out the total time with overhead -+ # print("Total test time for this group: {}" \ -+ # .format(time_wrap(use_gpu) - accum_test_time_begin)) -+ -+ if (args.mlperf_logging -+ and (args.mlperf_acc_threshold > 0) -+ and (best_gA_test > args.mlperf_acc_threshold)): -+ print("MLPerf testing accuracy threshold " -+ + str(args.mlperf_acc_threshold) -+ + " reached, stop training") -+ break -+ -+ if (args.mlperf_logging -+ and (args.mlperf_auc_threshold > 0) -+ and (best_auc_test > args.mlperf_auc_threshold)): -+ print("MLPerf testing auc threshold " -+ + str(args.mlperf_auc_threshold) -+ + " reached, stop training") -+ train_end = time.time() -+ total_time = train_end - train_start -+ print(F"Total Time:{total_time}") -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.RUN_STOP, -+ metadata={ -+ mlperf_logger.constants.STATUS: mlperf_logger.constants.SUCCESS}) -+ -+ break -+ #ext_dist.barrier() -+ if mini_j + 1 >= mini_num_batchs: -+ break -+ -+ if compression_scheduler: -+ compression_scheduler.on_epoch_end(epoch=k, optimizer=optimizer, metrics={'min': total_loss, 'max': total_accu}) -+ -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.EPOCH_STOP, -+ metadata={mlperf_logger.constants.EPOCH_NUM: k + 1}) -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.BLOCK_STOP, -+ metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: k + 1}) -+ k += 1 # nepochs -+ train_end = time.time() -+ total_time = train_end - train_start -+ print(F"Total Time:{total_time}") -+ if args.enable_profiling: -+ print(prof.key_averages().table(sort_by="cpu_time_total")) -+ -+ if args.mlperf_logging and best_auc_test <= args.mlperf_auc_threshold: -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.RUN_STOP, -+ metadata={mlperf_logger.constants.STATUS: mlperf_logger.constants.ABORTED}) -+ -+ # profiling -+ if args.enable_profiling: -+ with open("dlrm_s_pytorch.prof", "w") as prof_f: -+ prof_f.write(prof.key_averages().table(sort_by="cpu_time_total")) -+ prof.export_chrome_trace("./dlrm_s_pytorch.json") -+ # print(prof.key_averages().table(sort_by="cpu_time_total")) -+ -+ # plot compute graph -+ if args.plot_compute_graph: -+ sys.exit( -+ "ERROR: Please install pytorchviz package in order to use the" -+ + " visualization. Then, uncomment its import above as well as" -+ + " three lines below and run the code again." 
-+ ) -+ # V = Z.mean() if args.inference_only else E -+ # dot = make_dot(V, params=dict(dlrm.named_parameters())) -+ # dot.render('dlrm_s_pytorch_graph') # write .pdf file -+ -+ # test prints -+ if not args.inference_only and args.debug_mode: -+ print("updated parameters (weights and bias):") -+ for param in dlrm.parameters(): -+ print(param.detach().cpu().numpy()) -+ -+ # export the model in onnx -+ if args.save_onnx: -+ dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" -+ torch.onnx.export( -+ dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True -+ ) -+ # recover the model back -+ dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx") -+ # check the onnx model -+ onnx.checker.check_model(dlrm_pytorch_onnx) -diff --git a/dlrm_s_pytorch_inference.py b/dlrm_s_pytorch_inference.py -new file mode 100644 -index 0000000..575dabd ---- /dev/null -+++ b/dlrm_s_pytorch_inference.py -@@ -0,0 +1,1107 @@ -+# Copyright (c) Facebook, Inc. and its affiliates. -+# -+# This source code is licensed under the MIT license found in the -+# LICENSE file in the root directory of this source tree. -+# -+# Description: an implementation of a deep learning recommendation model (DLRM) -+# The model input consists of dense and sparse features. The former is a vector -+# of floating point values. The latter is a list of sparse indices into -+# embedding tables, which consist of vectors of floating point values. -+# The selected vectors are passed to mlp networks denoted by triangles, -+# in some cases the vectors are interacted through operators (Ops). -+# -+# output: -+# vector of values -+# model: | -+# /\ -+# /__\ -+# | -+# _____________________> Op <___________________ -+# / | \ -+# /\ /\ /\ -+# /__\ /__\ ... /__\ -+# | | | -+# | Op Op -+# | ____/__\_____ ____/__\____ -+# | |_Emb_|____|__| ... |_Emb_|__|___| -+# input: -+# [ dense features ] [sparse indices] , ..., [sparse indices] -+# -+# More precise definition of model layers: -+# 1) fully connected layers of an mlp -+# z = f(y) -+# y = Wx + b -+# -+# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) -+# z = Op(e1,...,ek) -+# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] -+# -+# 3) Operator Op can be one of the following -+# Sum(e1,...,ek) = e1 + ... + ek -+# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] -+# Cat(e1,...,ek) = [e1', ..., ek']' -+# where ' denotes transpose operation -+# -+# References: -+# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, -+# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, -+# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, -+# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, -+# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, -+# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and -+# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 -+ -+from __future__ import absolute_import, division, print_function, unicode_literals -+ -+# miscellaneous -+import builtins -+import functools -+# import bisect -+# import shutil -+import time -+import json -+# data generation -+import dlrm_data_pytorch as dp -+ -+# numpy -+import numpy as np -+ -+# onnx -+# The onnx import causes deprecation warnings every time workers -+# are spawned during testing. So, we filter out those warnings. 
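-+# Editorial aside (a minimal sketch, not from the original sources): for the
-+# DeprecationWarning filter to actually apply to the onnx import, the import
-+# has to happen inside the catch_warnings block, because the filter installed
-+# by filterwarnings() is popped again as soon as the block exits:
-+#   with warnings.catch_warnings():
-+#       warnings.filterwarnings("ignore", category=DeprecationWarning)
-+#       import onnx  # deprecation noise from this import is suppressed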
-+import warnings -+with warnings.catch_warnings(): -+ warnings.filterwarnings("ignore", category=DeprecationWarning) -+import onnx -+ -+# pytorch -+import torch -+import torch.nn as nn -+from torch.nn.parallel.parallel_apply import parallel_apply -+from torch.nn.parallel.replicate import replicate -+from torch.nn.parallel.scatter_gather import gather, scatter -+ -+# For distributed run -+import extend_distributed as ext_dist -+ -+try: -+ import intel_pytorch_extension as ipex -+ from intel_pytorch_extension import core -+except: -+ pass -+from lamb_bin import Lamb, log_lamb_rs -+ -+# quotient-remainder trick -+from tricks.qr_embedding_bag import QREmbeddingBag -+# mixed-dimension trick -+from tricks.md_embedding_bag import PrEmbeddingBag, md_solver -+ -+import sklearn.metrics -+import mlperf_logger -+ -+# from torchviz import make_dot -+# import torch.nn.functional as Functional -+# from torch.nn.parameter import Parameter -+ -+from torch.optim.lr_scheduler import _LRScheduler -+ -+exc = getattr(builtins, "IOError", "FileNotFoundError") -+ -+ -+class Cast(nn.Module): -+ __constants__ = ['to_dtype'] -+ -+ def __init__(self, to_dtype): -+ super(Cast, self).__init__() -+ self.to_dtype = to_dtype -+ -+ def forward(self, input): -+ if input.is_mkldnn: -+ return input.to_dense(self.to_dtype) -+ else: -+ return input.to(self.to_dtype) -+ -+ def extra_repr(self): -+ return 'to(%s)' % self.to_dtype -+ -+ -+### define dlrm in PyTorch ### -+class DLRM_Net(nn.Module): -+ def create_mlp(self, ln, sigmoid_layer): -+ # build MLP layer by layer -+ layers = nn.ModuleList() -+ for i in range(0, ln.size - 1): -+ n = ln[i] -+ m = ln[i + 1] -+ -+ # construct fully connected operator -+ if self.use_ipex and self.bf16: -+ LL = ipex.IpexMLPLinear(int(n), int(m), bias=True, output_stays_blocked=(i < ln.size - 2), default_blocking=32) -+ else: -+ LL = nn.Linear(int(n), int(m), bias=True) -+ -+ # initialize the weights -+ # with torch.no_grad(): -+ # custom Xavier input, output or two-sided fill -+ mean = 0.0 # std_dev = np.sqrt(variance) -+ std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) -+ W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) -+ std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) -+ bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) -+ # approach 1 -+ LL.weight.data = torch.tensor(W, requires_grad=True) -+ LL.bias.data = torch.tensor(bt, requires_grad=True) -+ # approach 2 -+ # LL.weight.data.copy_(torch.tensor(W)) -+ # LL.bias.data.copy_(torch.tensor(bt)) -+ # approach 3 -+ # LL.weight = Parameter(torch.tensor(W),requires_grad=True) -+ # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) -+ -+ if self.bf16 and ipex.is_available(): -+ LL.to(torch.bfloat16) -+ # prepack weight for IPEX Linear -+ if hasattr(LL, 'reset_weight_shape'): -+ LL.reset_weight_shape(block_for_dtype=torch.bfloat16) -+ -+ layers.append(LL) -+ -+ # construct sigmoid or relu operator -+ if i == sigmoid_layer: -+ if self.bf16: -+ layers.append(Cast(torch.float32)) -+ layers.append(nn.Sigmoid()) -+ else: -+ if self.use_ipex and self.bf16: -+ LL.set_activation_type('relu') -+ else: -+ layers.append(nn.ReLU()) -+ -+ # approach 1: use ModuleList -+ # return layers -+ # approach 2: use Sequential container to wrap all layers -+ return torch.nn.Sequential(*layers) -+ -+ def create_emb(self, m, ln, local_ln_emb_sparse=None, ln_emb_dense=None): -+ emb_l = nn.ModuleList() -+ # save the numpy random state -+ np_rand_state = np.random.get_state() -+ emb_dense = nn.ModuleList() -+ emb_sparse = 
nn.ModuleList() -+ embs = range(len(ln)) -+ if local_ln_emb_sparse or ln_emb_dense: -+ embs = local_ln_emb_sparse + ln_emb_dense -+ for i in embs: -+ # Use per table random seed for Embedding initialization -+ np.random.seed(self.l_emb_seeds[i]) -+ n = ln[i] -+ # construct embedding operator -+ if self.qr_flag and n > self.qr_threshold: -+ EE = QREmbeddingBag(n, m, self.qr_collisions, -+ operation=self.qr_operation, mode="sum", sparse=True) -+ elif self.md_flag: -+ base = max(m) -+ _m = m[i] if n > self.md_threshold else base -+ EE = PrEmbeddingBag(n, _m, base) -+ # use np initialization as below for consistency... -+ W = np.random.uniform( -+ low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) -+ ).astype(np.float32) -+ EE.embs.weight.data = torch.tensor(W, requires_grad=True) -+ -+ else: -+ # initialize embeddings -+ # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) -+ W = np.random.uniform( -+ low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) -+ ).astype(np.float32) -+ # approach 1 -+ if n >= self.sparse_dense_boundary: -+ #n = 39979771 -+ m_sparse = 16 -+ W = np.random.uniform( -+ low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m_sparse) -+ ).astype(np.float32) -+ EE = nn.EmbeddingBag(n, m_sparse, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True)) -+ else: -+ W = np.random.uniform( -+ low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) -+ ).astype(np.float32) -+ EE = nn.EmbeddingBag(n, m, mode="sum", sparse=False, _weight=torch.tensor(W, requires_grad=True)) -+ # approach 2 -+ # EE.weight.data.copy_(torch.tensor(W)) -+ # approach 3 -+ # EE.weight = Parameter(torch.tensor(W),requires_grad=True) -+ if self.bf16 and ipex.is_available(): -+ EE.to(torch.bfloat16) -+ -+ if ext_dist.my_size > 1: -+ if n >= self.sparse_dense_boundary: -+ emb_sparse.append(EE) -+ else: -+ emb_dense.append(EE) -+ -+ emb_l.append(EE) -+ -+ # Restore the numpy random state -+ np.random.set_state(np_rand_state) -+ return emb_l, emb_dense, emb_sparse -+ -+ def __init__( -+ self, -+ m_spa=None, -+ ln_emb=None, -+ ln_bot=None, -+ ln_top=None, -+ arch_interaction_op=None, -+ arch_interaction_itself=False, -+ sigmoid_bot=-1, -+ sigmoid_top=-1, -+ sync_dense_params=True, -+ loss_threshold=0.0, -+ ndevices=-1, -+ qr_flag=False, -+ qr_operation="mult", -+ qr_collisions=0, -+ qr_threshold=200, -+ md_flag=False, -+ md_threshold=200, -+ bf16=False, -+ use_ipex=False, -+ sparse_dense_boundary = 2048 -+ ): -+ super(DLRM_Net, self).__init__() -+ -+ if ( -+ (m_spa is not None) -+ and (ln_emb is not None) -+ and (ln_bot is not None) -+ and (ln_top is not None) -+ and (arch_interaction_op is not None) -+ ): -+ -+ # save arguments -+ self.ndevices = ndevices -+ self.output_d = 0 -+ self.parallel_model_batch_size = -1 -+ self.parallel_model_is_not_prepared = True -+ self.arch_interaction_op = arch_interaction_op -+ self.arch_interaction_itself = arch_interaction_itself -+ self.sync_dense_params = sync_dense_params -+ self.loss_threshold = loss_threshold -+ self.bf16 = bf16 -+ self.use_ipex = use_ipex -+ self.sparse_dense_boundary = sparse_dense_boundary -+ # create variables for QR embedding if applicable -+ self.qr_flag = qr_flag -+ if self.qr_flag: -+ self.qr_collisions = qr_collisions -+ self.qr_operation = qr_operation -+ self.qr_threshold = qr_threshold -+ # create variables for MD embedding if applicable -+ self.md_flag = md_flag -+ if self.md_flag: -+ self.md_threshold = md_threshold -+ -+ # generate np seeds for Emb table initialization -+ self.l_emb_seeds = 
np.random.randint(low=0, high=100000, size=len(ln_emb)) -+ -+ #If running distributed, get local slice of embedding tables -+ if ext_dist.my_size > 1: -+ n_emb = len(ln_emb) -+ self.n_global_emb = n_emb -+ self.rank = ext_dist.dist.get_rank() -+ self.ln_emb_dense = [i for i in range(n_emb) if ln_emb[i] < self.sparse_dense_boundary] -+ self.ln_emb_sparse = [i for i in range(n_emb) if ln_emb[i] >= self.sparse_dense_boundary] -+ n_emb_sparse = len(self.ln_emb_sparse) -+ self.n_local_emb_sparse, self.n_sparse_emb_per_rank = ext_dist.get_split_lengths(n_emb_sparse) -+ self.local_ln_emb_sparse_slice = ext_dist.get_my_slice(n_emb_sparse) -+ self.local_ln_emb_sparse = self.ln_emb_sparse[self.local_ln_emb_sparse_slice] -+ # create operators -+ if ndevices <= 1: -+ if ext_dist.my_size > 1: -+ _, self.emb_dense, self.emb_sparse = self.create_emb(m_spa, ln_emb, self.local_ln_emb_sparse, self.ln_emb_dense) -+ else: -+ self.emb_l, _, _ = self.create_emb(m_spa, ln_emb) -+ -+ self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) -+ self.top_l = self.create_mlp(ln_top, sigmoid_top) -+ -+ def apply_mlp(self, x, layers): -+ # approach 1: use ModuleList -+ # for layer in layers: -+ # x = layer(x) -+ # return x -+ # approach 2: use Sequential container to wrap all layers -+ need_padding = self.use_ipex and self.bf16 and x.size(0) % 2 == 1 -+ if need_padding: -+ x = torch.nn.functional.pad(input=x, pad=(0,0,0,1), mode='constant', value=0) -+ ret = layers(x) -+ return(ret[:-1,:]) -+ else: -+ return layers(x) -+ -+ def apply_emb(self, lS_o, lS_i, emb_l): -+ # WARNING: notice that we are processing the batch at once. We implicitly -+ # assume that the data is laid out such that: -+ # 1. each embedding is indexed with a group of sparse indices, -+ # corresponding to a single lookup -+ # 2. for each embedding the lookups are further organized into a batch -+ # 3. for a list of embedding tables there is a list of batched lookups -+ -+ ly = [] -+ for k, sparse_index_group_batch in enumerate(lS_i): -+ sparse_offset_group_batch = lS_o[k] -+ -+ # embedding lookup -+ # We are using EmbeddingBag, which implicitly uses sum operator. 
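-+ # (Editorial sketch, not from the original patch, of what mode="sum" does:
-+ #   bag = nn.EmbeddingBag(10, 3, mode="sum")
-+ #   out = bag(torch.tensor([1, 2, 4, 5]), torch.tensor([0, 2]))
-+ # gives out[0] = E[1] + E[2] and out[1] = E[4] + E[5], one row per bag,
-+ # where E is the 10x3 weight matrix.)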
-+ # The embeddings are represented as tall matrices, with sum -+ # happening vertically across 0 axis, resulting in a row vector -+ E = emb_l[k] -+ V = E(sparse_index_group_batch, sparse_offset_group_batch) -+ -+ ly.append(V) -+ -+ # print(ly) -+ return ly -+#if self.bf16: -+ def interact_features(self, x, ly): -+ x = x.to(ly[0].dtype) -+ if self.arch_interaction_op == "dot": -+ if self.bf16: -+ T = [x] + ly -+ R = ipex.interaction(*T) -+ else: -+ # concatenate dense and sparse features -+ (batch_size, d) = x.shape -+ T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) -+ # perform a dot product -+ Z = torch.bmm(T, torch.transpose(T, 1, 2)) -+ # append dense feature with the interactions (into a row vector) -+ # approach 1: all -+ # Zflat = Z.view((batch_size, -1)) -+ # approach 2: unique -+ _, ni, nj = Z.shape -+ # approach 1: tril_indices -+ # offset = 0 if self.arch_interaction_itself else -1 -+ # li, lj = torch.tril_indices(ni, nj, offset=offset) -+ # approach 2: custom -+ offset = 1 if self.arch_interaction_itself else 0 -+ li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) -+ lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) -+ Zflat = Z[:, li, lj] -+ # concatenate dense features and interactions -+ R = torch.cat([x] + [Zflat], dim=1) -+ elif self.arch_interaction_op == "cat": -+ # concatenation features (into a row vector) -+ R = torch.cat([x] + ly, dim=1) -+ else: -+ sys.exit( -+ "ERROR: --arch-interaction-op=" -+ + self.arch_interaction_op -+ + " is not supported" -+ ) -+ -+ return R -+ -+ def forward(self, dense_x, lS_o, lS_i): -+ if self.bf16: -+ dense_x = dense_x.bfloat16() -+ if ext_dist.my_size > 1: -+ return self.distributed_forward(dense_x, lS_o, lS_i) -+ elif self.ndevices <= 1: -+ return self.sequential_forward(dense_x, lS_o, lS_i) -+ else: -+ return self.parallel_forward(dense_x, lS_o, lS_i) -+ -+ def sequential_forward(self, dense_x, lS_o, lS_i): -+ # process dense features (using bottom mlp), resulting in a row vector -+ x = self.apply_mlp(dense_x, self.bot_l) -+ # debug prints -+ # print("intermediate") -+ # print(x.detach().cpu().numpy()) -+ -+ # process sparse features(using embeddings), resulting in a list of row vectors -+ ly = self.apply_emb(lS_o, lS_i, self.emb_l) -+ # for y in ly: -+ # print(y.detach().cpu().numpy()) -+ -+ # interact features (dense and sparse) -+ z = self.interact_features(x, ly) -+ # print(z.detach().cpu().numpy()) -+ -+ # obtain probability of a click (using top mlp) -+ p = self.apply_mlp(z, self.top_l) -+ -+ # clamp output if needed -+ if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: -+ z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) -+ else: -+ z = p -+ -+ return z -+ -+ def distributed_forward(self, dense_x, lS_o, lS_i): -+ batch_size = dense_x.size()[0] -+ # WARNING: # of ranks must be <= batch size in distributed_forward call -+ if batch_size < ext_dist.my_size: -+ sys.exit("ERROR: batch_size (%d) must be larger than number of ranks (%d)" % (batch_size, ext_dist.my_size)) -+ -+ lS_o_dense = [lS_o[i] for i in self.ln_emb_dense] -+ lS_i_dense = [lS_i[i] for i in self.ln_emb_dense] -+ lS_o_sparse = [lS_o[i] for i in self.ln_emb_sparse] # partition sparse table in one group -+ lS_i_sparse = [lS_i[i] for i in self.ln_emb_sparse] -+ -+ lS_i_sparse = ext_dist.shuffle_data(lS_i_sparse) -+ g_i_sparse = [lS_i_sparse[:, i * batch_size:(i + 1) * batch_size].reshape(-1) for i in range(len(self.local_ln_emb_sparse))] -+ offset = torch.arange(batch_size * 
ext_dist.my_size).to(device) -+ g_o_sparse = [offset for i in range(self.n_local_emb_sparse)] -+ -+ if (len(self.local_ln_emb_sparse) != len(g_o_sparse)) or (len(self.local_ln_emb_sparse) != len(g_i_sparse)): -+ sys.exit("ERROR 0 : corrupted model input detected in distributed_forward call") -+ # sparse embeddings -+ ly_sparse = self.apply_emb(g_o_sparse, g_i_sparse, self.emb_sparse) -+ a2a_req = ext_dist.alltoall(ly_sparse, self.n_sparse_emb_per_rank) -+ # bottom mlp -+ x = self.apply_mlp(dense_x, self.bot_l) -+ # dense embeddings -+ ly_dense = self.apply_emb(lS_o_dense, lS_i_dense, self.emb_dense) -+ ly_sparse = a2a_req.wait() -+ ly_sparse2 = [] -+ for i in range(len(ly_sparse)): -+ ly_sparse2.append(ly_sparse[i].repeat(1,4)) -+ del ly_sparse -+ #ly_sparse ""= torch.cat(ly_sparse,1) -+ ly = ly_dense + list(ly_sparse2) -+ # interactions -+ z = self.interact_features(x, ly) -+ # top mlp -+ p = self.apply_mlp(z, self.top_l) -+ # clamp output if needed -+ if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: -+ z = torch.clamp( -+ p, min=self.loss_threshold, max=(1.0 - self.loss_threshold) -+ ) -+ else: -+ z = p -+ -+ return z -+ -+ def parallel_forward(self, dense_x, lS_o, lS_i): -+ ### prepare model (overwrite) ### -+ # WARNING: # of devices must be >= batch size in parallel_forward call -+ batch_size = dense_x.size()[0] -+ ndevices = min(self.ndevices, batch_size, len(self.emb_l)) -+ device_ids = range(ndevices) -+ # WARNING: must redistribute the model if mini-batch size changes(this is common -+ # for last mini-batch, when # of elements in the dataset/batch size is not even -+ if self.parallel_model_batch_size != batch_size: -+ self.parallel_model_is_not_prepared = True -+ -+ if self.parallel_model_is_not_prepared or self.sync_dense_params: -+ # replicate mlp (data parallelism) -+ self.bot_l_replicas = replicate(self.bot_l, device_ids) -+ self.top_l_replicas = replicate(self.top_l, device_ids) -+ self.parallel_model_batch_size = batch_size -+ -+ if self.parallel_model_is_not_prepared: -+ # distribute embeddings (model parallelism) -+ t_list = [] -+ for k, emb in enumerate(self.emb_l): -+ d = torch.device("cuda:" + str(k % ndevices)) -+ emb.to(d) -+ t_list.append(emb.to(d)) -+ self.emb_l = nn.ModuleList(t_list) -+ self.parallel_model_is_not_prepared = False -+ -+ ### prepare input (overwrite) ### -+ # scatter dense features (data parallelism) -+ # print(dense_x.device) -+ dense_x = scatter(dense_x, device_ids, dim=0) -+ # distribute sparse features (model parallelism) -+ if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)): -+ sys.exit("ERROR: corrupted model input detected in parallel_forward call") -+ -+ t_list = [] -+ i_list = [] -+ for k, _ in enumerate(self.emb_l): -+ d = torch.device("cuda:" + str(k % ndevices)) -+ t_list.append(lS_o[k].to(d)) -+ i_list.append(lS_i[k].to(d)) -+ lS_o = t_list -+ lS_i = i_list -+ -+ ### compute results in parallel ### -+ # bottom mlp -+ # WARNING: Note that the self.bot_l is a list of bottom mlp modules -+ # that have been replicated across devices, while dense_x is a tuple of dense -+ # inputs that has been scattered across devices on the first (batch) dimension. -+ # The output is a list of tensors scattered across devices according to the -+ # distribution of dense_x. 
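-+ # Editorial sketch (hedged, not from the original patch) of the
-+ # replicate/scatter/parallel_apply pattern this method relies on:
-+ #   replicas = replicate(mlp, device_ids)       # one module copy per GPU
-+ #   chunks = scatter(batch, device_ids, dim=0)  # split batch across GPUs
-+ #   outs = parallel_apply(replicas, chunks)     # run each copy on its chunk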
-+ x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids) -+ # debug prints -+ # print(x) -+ -+ # embeddings -+ ly = self.apply_emb(lS_o, lS_i, self.emb_l) -+ # debug prints -+ # print(ly) -+ -+ # butterfly shuffle (implemented inefficiently for now) -+ # WARNING: Note that at this point we have the result of the embedding lookup -+ # for the entire batch on each device. We would like to obtain partial results -+ # corresponding to all embedding lookups, but part of the batch on each device. -+ # Therefore, matching the distribution of output of bottom mlp, so that both -+ # could be used for subsequent interactions on each device. -+ if len(self.emb_l) != len(ly): -+ sys.exit("ERROR: corrupted intermediate result in parallel_forward call") -+ -+ t_list = [] -+ for k, _ in enumerate(self.emb_l): -+ d = torch.device("cuda:" + str(k % ndevices)) -+ y = scatter(ly[k], device_ids, dim=0) -+ t_list.append(y) -+ # adjust the list to be ordered per device -+ ly = list(map(lambda y: list(y), zip(*t_list))) -+ # debug prints -+ # print(ly) -+ -+ # interactions -+ z = [] -+ for k in range(ndevices): -+ zk = self.interact_features(x[k], ly[k]) -+ z.append(zk) -+ # debug prints -+ # print(z) -+ -+ # top mlp -+ # WARNING: Note that the self.top_l is a list of top mlp modules that -+ # have been replicated across devices, while z is a list of interaction results -+ # that by construction are scattered across devices on the first (batch) dim. -+ # The output is a list of tensors scattered across devices according to the -+ # distribution of z. -+ p = parallel_apply(self.top_l_replicas, z, None, device_ids) -+ -+ ### gather the distributed results ### -+ p0 = gather(p, self.output_d, dim=0) -+ -+ # clamp output if needed -+ if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: -+ z0 = torch.clamp( -+ p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold) -+ ) -+ else: -+ z0 = p0 -+ -+ return z0 -+ -+ -+if __name__ == "__main__": -+ # the reference implementation doesn't clear the cache currently -+ # but the submissions are required to do that -+ mlperf_logger.log_event(key=mlperf_logger.constants.CACHE_CLEAR, value=True) -+ -+ mlperf_logger.log_start(key=mlperf_logger.constants.INIT_START, log_all_ranks=True) -+ -+ ### import packages ### -+ import sys -+ import os -+ import argparse -+ -+ ### parse arguments ### -+ parser = argparse.ArgumentParser( -+ description="Train Deep Learning Recommendation Model (DLRM)" -+ ) -+ # model related parameters -+ parser.add_argument("--arch-sparse-feature-size", type=int, default=2) -+ parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") -+ # j will be replaced with the table number -+ parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") -+ parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") -+ parser.add_argument("--arch-interaction-op", type=str, default="dot") -+ parser.add_argument("--arch-interaction-itself", action="store_true", default=False) -+ # embedding table options -+ parser.add_argument("--md-flag", action="store_true", default=False) -+ parser.add_argument("--md-threshold", type=int, default=200) -+ parser.add_argument("--md-temperature", type=float, default=0.3) -+ parser.add_argument("--md-round-dims", action="store_true", default=False) -+ parser.add_argument("--qr-flag", action="store_true", default=False) -+ parser.add_argument("--qr-threshold", type=int, default=200) -+ parser.add_argument("--qr-operation", type=str, default="mult") -+ parser.add_argument("--qr-collisions", 
type=int, default=4) -+ # activations and loss -+ parser.add_argument("--activation-function", type=str, default="relu") -+ parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce -+ parser.add_argument("--loss-weights", type=str, default="1.0-1.0") # for wbce -+ parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 -+ parser.add_argument("--round-targets", type=bool, default=False) -+ # data -+ parser.add_argument("--data-size", type=int, default=1) -+ parser.add_argument("--num-batches", type=int, default=0) -+ parser.add_argument( -+ "--data-generation", type=str, default="random" -+ ) # synthetic or dataset -+ parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") -+ parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte -+ parser.add_argument("--raw-data-file", type=str, default="") -+ parser.add_argument("--processed-data-file", type=str, default="") -+ parser.add_argument("--data-randomize", type=str, default="total") # or day or none -+ parser.add_argument("--data-trace-enable-padding", type=bool, default=False) -+ parser.add_argument("--max-ind-range", type=int, default=-1) -+ parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] -+ parser.add_argument("--num-indices-per-lookup", type=int, default=10) -+ parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) -+ parser.add_argument("--num-workers", type=int, default=0) -+ parser.add_argument("--memory-map", action="store_true", default=False) -+ # training -+ parser.add_argument("--mini-batch-size", type=int, default=1) -+ parser.add_argument("--nepochs", type=int, default=1) -+ parser.add_argument("--learning-rate", type=float, default=0.01) -+ parser.add_argument("--print-precision", type=int, default=5) -+ parser.add_argument("--numpy-rand-seed", type=int, default=123) -+ parser.add_argument("--sync-dense-params", type=bool, default=True) -+ # inference -+ parser.add_argument("--inference-only", action="store_true", default=False) -+ # onnx -+ parser.add_argument("--save-onnx", action="store_true", default=False) -+ # gpu -+ parser.add_argument("--use-gpu", action="store_true", default=False) -+ # distributed run -+ parser.add_argument("--dist-backend", type=str, default="") -+ # debugging and profiling -+ parser.add_argument("--print-freq", type=int, default=1) -+ parser.add_argument("--test-freq", type=int, default=-1) -+ parser.add_argument("--test-mini-batch-size", type=int, default=-1) -+ parser.add_argument("--test-num-workers", type=int, default=-1) -+ parser.add_argument("--print-time", action="store_true", default=False) -+ parser.add_argument("--debug-mode", action="store_true", default=False) -+ parser.add_argument("--enable-profiling", action="store_true", default=False) -+ parser.add_argument("--plot-compute-graph", action="store_true", default=False) -+ parser.add_argument("--profiling-start-iter", type=int, default=50) -+ parser.add_argument("--profiling-num-iters", type=int, default=100) -+ # store/load model -+ parser.add_argument("--out-dir", type=str, default=".") -+ parser.add_argument("--save-model", type=str, default="") -+ parser.add_argument("--load-model", type=str, default="") -+ # mlperf logging (disables other output and stops early) -+ parser.add_argument("--mlperf-logging", action="store_true", default=False) -+ # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 -+ parser.add_argument("--mlperf-acc-threshold", type=float, 
default=0.0) -+ # stop at target AUC Terabyte (no subsampling) 0.8025 -+ parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) -+ parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) -+ parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) -+ # LR policy -+ parser.add_argument("--lr-num-warmup-steps", type=int, default=0) -+ parser.add_argument("--lr-decay-start-step", type=int, default=0) -+ parser.add_argument("--lr-num-decay-steps", type=int, default=0) -+ # embedding table is sparse table only if sparse_dense_boundary >= 2048 -+ parser.add_argument("--sparse-dense-boundary", type=int, default=2048) -+ # bf16 option -+ parser.add_argument("--bf16", action='store_true', default=False) -+ # ipex option -+ parser.add_argument("--use-ipex", action="store_true", default=False) -+ # lamb -+ parser.add_argument("--optimizer", type=int, default=0, help='optimizer:[0:sgd, 1:lamb/sgd, 2:adagrad, 3:sparseadam]') -+ parser.add_argument("--lamblr", type=float, default=0.01, help='lr for lamb') -+ parser.add_argument("--eval-data-path", type=str, default="./data/valid.bin") -+ parser.add_argument("--day-feature-count", type=str, default="./data/day_fea_count.npz") -+ args = parser.parse_args() -+ -+ ext_dist.init_distributed(backend=args.dist_backend) -+ -+ if args.mlperf_logging: -+ print('command line args: ', json.dumps(vars(args))) -+ -+ ### some basic setup ### -+ np.random.seed(args.numpy_rand_seed) -+ np.set_printoptions(precision=args.print_precision) -+ torch.set_printoptions(precision=args.print_precision) -+ torch.manual_seed(args.numpy_rand_seed) -+ -+ if (args.test_mini_batch_size < 0): -+ # if the parameter is not set, use the training batch size -+ args.test_mini_batch_size = args.mini_batch_size -+ if (args.test_num_workers < 0): -+ # if the parameter is not set, use the same parameter for training -+ args.test_num_workers = args.num_workers -+ if (args.mini_batch_size % ext_dist.my_size !=0 or args.test_mini_batch_size % ext_dist.my_size != 0): -+ print("Either test minibatch (%d) or train minibatch (%d) does not split across %d ranks" % (args.test_mini_batch_size, args.mini_batch_size, ext_dist.my_size)) -+ sys.exit(1) -+ -+ use_gpu = args.use_gpu and torch.cuda.is_available() -+ use_ipex = args.use_ipex -+ if use_gpu: -+ torch.cuda.manual_seed_all(args.numpy_rand_seed) -+ torch.backends.cudnn.deterministic = True -+ if ext_dist.my_size > 1: -+ ngpus = torch.cuda.device_count() # 1 -+ if ext_dist.my_local_size > torch.cuda.device_count(): -+ print("Not sufficient GPUs available... 
local_size = %d, ngpus = %d" % (ext_dist.my_local_size, ngpus)) -+ sys.exit(1) -+ ngpus = 1 -+ device = torch.device("cuda", ext_dist.my_local_rank) -+ else: -+ device = torch.device("cuda", 0) -+ ngpus = torch.cuda.device_count() # 1 -+ print("Using {} GPU(s)...".format(ngpus)) -+ elif use_ipex: -+ device = torch.device("dpcpp") -+ print("Using IPEX...") -+ else: -+ device = torch.device("cpu") -+ print("Using CPU...") -+ -+ ### prepare training data ### -+ ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") -+ # input data -+ -+ mlperf_logger.barrier() -+ mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP) -+ mlperf_logger.barrier() -+ mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START) -+ mlperf_logger.barrier() -+ -+ if (args.data_generation == "dataset"): -+ test_data, test_ld = \ -+ dp.make_criteo_data_and_loaders_test(args) -+ nbatches_test = len(test_ld) -+ ln_emb = test_data.counts -+ # enforce maximum limit on number of vectors per embedding -+ if args.max_ind_range > 0: -+ ln_emb = np.array(list(map( -+ lambda x: x if x < args.max_ind_range else args.max_ind_range, -+ ln_emb -+ ))) -+ m_den = test_data.m_den -+ ln_bot[0] = m_den -+ -+ else: -+ # input and target at random -+ ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") -+ m_den = ln_bot[0] -+ train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den) -+ nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) -+ -+ ### parse command line arguments ### -+ m_spa = args.arch_sparse_feature_size -+ num_fea = ln_emb.size + 1 # num sparse + num dense features -+ m_den_out = ln_bot[ln_bot.size - 1] -+ if args.arch_interaction_op == "dot": -+ # approach 1: all -+ # num_int = num_fea * num_fea + m_den_out -+ # approach 2: unique -+ if args.arch_interaction_itself: -+ num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out -+ else: -+ num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out -+ elif args.arch_interaction_op == "cat": -+ num_int = num_fea * m_den_out -+ else: -+ sys.exit( -+ "ERROR: --arch-interaction-op=" -+ + args.arch_interaction_op -+ + " is not supported" -+ ) -+ arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top -+ ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") -+ -+ # sanity check: feature sizes and mlp dimensions must match -+ if m_den != ln_bot[0]: -+ sys.exit( -+ "ERROR: arch-dense-feature-size " -+ + str(m_den) -+ + " does not match first dim of bottom mlp " -+ + str(ln_bot[0]) -+ ) -+ if args.qr_flag: -+ if args.qr_operation == "concat" and 2 * m_spa != m_den_out: -+ sys.exit( -+ "ERROR: 2 arch-sparse-feature-size " -+ + str(2 * m_spa) -+ + " does not match last dim of bottom mlp " -+ + str(m_den_out) -+ + " (note that the last dim of bottom mlp must be 2x the embedding dim)" -+ ) -+ if args.qr_operation != "concat" and m_spa != m_den_out: -+ sys.exit( -+ "ERROR: arch-sparse-feature-size " -+ + str(m_spa) -+ + " does not match last dim of bottom mlp " -+ + str(m_den_out) -+ ) -+ else: -+ if m_spa != m_den_out: -+ sys.exit( -+ "ERROR: arch-sparse-feature-size " -+ + str(m_spa) -+ + " does not match last dim of bottom mlp " -+ + str(m_den_out) -+ ) -+ if num_int != ln_top[0]: -+ sys.exit( -+ "ERROR: # of feature interactions " -+ + str(num_int) -+ + " does not match first dimension of top mlp " -+ + str(ln_top[0]) -+ ) -+ -+ # assign mixed dimensions if applicable -+ if args.md_flag: -+ m_spa = md_solver( -+ torch.tensor(ln_emb), -+ args.md_temperature, # alpha -+ d0=m_spa, -+ round_dim=args.md_round_dims -+ 
).tolist() -+ -+ ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 -+ -+ ### construct the neural network specified above ### -+ # WARNING: to obtain exactly the same initialization for -+ # the weights we need to start from the same random seed. -+ # np.random.seed(args.numpy_rand_seed) -+ print('Creating the model...') -+ dlrm = DLRM_Net( -+ m_spa, -+ ln_emb, -+ ln_bot, -+ ln_top, -+ arch_interaction_op=args.arch_interaction_op, -+ arch_interaction_itself=args.arch_interaction_itself, -+ sigmoid_bot=-1, -+ sigmoid_top=ln_top.size - 2, -+ sync_dense_params=args.sync_dense_params, -+ loss_threshold=args.loss_threshold, -+ ndevices=ndevices, -+ qr_flag=args.qr_flag, -+ qr_operation=args.qr_operation, -+ qr_collisions=args.qr_collisions, -+ qr_threshold=args.qr_threshold, -+ md_flag=args.md_flag, -+ md_threshold=args.md_threshold, -+ sparse_dense_boundary=args.sparse_dense_boundary, -+ bf16 = args.bf16, -+ use_ipex = args.use_ipex -+ ) -+ -+ print('Model created!') -+ # test prints -+ if args.debug_mode: -+ print("initial parameters (weights and bias):") -+ for param in dlrm.parameters(): -+ print(param.detach().cpu().numpy()) -+ # print(dlrm) -+ -+ if args.use_ipex: -+ dlrm = dlrm.to(device) -+ print(dlrm, device, args.use_ipex) -+ -+ if use_gpu: -+ # Custom Model-Data Parallel -+ # the mlps are replicated and use data parallelism, while -+ # the embeddings are distributed and use model parallelism -+ dlrm = dlrm.to(device) # .cuda() -+ if dlrm.ndevices > 1: -+ dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb) -+ -+ if ext_dist.my_size > 1: -+ if use_gpu: -+ device_ids = [ext_dist.my_local_rank] -+ dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids) -+ dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids) -+ else: -+ dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) -+ dlrm.top_l = ext_dist.DDP(dlrm.top_l) -+ for i in range(len(dlrm.emb_dense)): -+ dlrm.emb_dense[i] = ext_dist.DDP(dlrm.emb_dense[i]) -+ -+ # specify the loss function -+ if args.loss_function == "mse": -+ loss_fn = torch.nn.MSELoss(reduction="mean") -+ elif args.loss_function == "bce": -+ loss_fn = torch.nn.BCELoss(reduction="mean") -+ elif args.loss_function == "wbce": -+ loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-")) -+ loss_fn = torch.nn.BCELoss(reduction="none") -+ else: -+ sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported") -+ ### main loop ### -+ def time_wrap(use_gpu): -+ if use_gpu: -+ torch.cuda.synchronize() -+ return time.time() -+ -+ def dlrm_wrap(X, lS_o, lS_i, use_gpu, use_ipex, device): -+ if use_gpu or use_ipex: # .cuda() -+ # lS_i can be either a list of tensors or a stacked tensor. 
-+ # Handle each case below: -+ lS_i = [S_i.to(device) for S_i in lS_i] if isinstance(lS_i, list) \ -+ else lS_i.to(device) -+ lS_o = [S_o.to(device) for S_o in lS_o] if isinstance(lS_o, list) \ -+ else lS_o.to(device) -+ return dlrm( -+ X.to(device), -+ lS_o, -+ lS_i -+ ) -+ else: -+ return dlrm(X, lS_o, lS_i) -+ -+ def loss_fn_wrap(Z, T, use_gpu, use_ipex, device): -+ if args.loss_function == "mse" or args.loss_function == "bce": -+ if use_gpu or use_ipex: -+ return loss_fn(Z, T.to(device)) -+ else: -+ return loss_fn(Z, T) -+ elif args.loss_function == "wbce": -+ if use_gpu: -+ loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T).to(device) -+ loss_fn_ = loss_fn(Z, T.to(device)) -+ else: -+ loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T) -+ loss_fn_ = loss_fn(Z, T.to(device)) -+ loss_sc_ = loss_ws_ * loss_fn_ -+ # debug prints -+ # print(loss_ws_) -+ # print(loss_fn_) -+ return loss_sc_.mean() -+ -+ # training or inference -+ best_gA_test = 0 -+ best_auc_test = 0 -+ skip_upto_epoch = 0 -+ skip_upto_batch = 0 -+ total_time = 0 -+ total_loss = 0 -+ total_accu = 0 -+ total_iter = 0 -+ total_samp = 0 -+ k = 0 -+ -+ -+ # Load model is specified -+ if not (args.load_model == ""): -+ print("Loading trained model {}".format(args.load_model)) -+ ld_model = torch.load(os.path.join(args.load_model, "dlrm_s_pytorch_" + str(dlrm.rank) + "_.pkl"), map_location=torch.device('cpu')) ++ print("Loading trained model {}".format(args.load_model)) ++ ld_model = torch.load(os.path.join(args.load_model, "dlrm_s_pytorch_" + str(dlrm.rank) + "_.pkl"), map_location=torch.device('cpu')) + dlrm.load_state_dict(ld_model["state_dict"]) + + if args.use_ipex: @@ -9557,462 +7677,6 @@ index efce1d3..ab6b9a3 100644 log_event( key=constants.SUBMISSION_ENTRY, -diff --git a/model_compression/AGP_Structure/dlrm.schedule_agp_1.yaml b/model_compression/AGP_Structure/dlrm.schedule_agp_1.yaml -new file mode 100755 -index 0000000..d0d50f8 ---- /dev/null -+++ b/model_compression/AGP_Structure/dlrm.schedule_agp_1.yaml -@@ -0,0 +1,68 @@ -+version: 1 -+pruners: -+ low_pruner: -+ class: L1RankedStructureParameterPruner_AGP -+ initial_sparsity : 0.01 -+ final_sparsity: 0.50 -+ group_type: Rows -+ weights: [bot_l.module.0.weight, -+ bot_l.module.2.weight, -+ bot_l.module.4.weight, -+ bot_l.module.6.weight, -+ top_l.module.0.weight, -+ top_l.module.2.weight, -+ top_l.module.4.weight, -+ top_l.module.6.weight, -+ top_l.module.8.weight] -+ -+ fine_pruner: -+ class: AutomatedGradualPruner -+ initial_sparsity : 0.01 -+ final_sparsity: 0.50 -+ weights: [bot_l.module.0.weight, -+ bot_l.module.2.weight, -+ bot_l.module.4.weight, -+ bot_l.module.6.weight, -+ top_l.module.0.weight, -+ top_l.module.2.weight, -+ top_l.module.4.weight, -+ top_l.module.6.weight, -+ top_l.module.8.weight] -+ -+#lr_schedulers: -+# pruning_lr: -+# class: StepLR -+# step_size: 50 -+# gamma: 0.10 -+ -+ -+#extensions: -+# net_thinner: -+# class: 'FilterRemover' -+# thinning_func_str: remove_filters -+# arch: 'resnet20_cifar' -+# dataset: 'cifar10' -+ -+policies: -+ - pruner: -+ instance_name : low_pruner -+ starting_epoch: 0 -+ ending_epoch: 2 -+ frequency: 1 -+ -+ - pruner: -+ instance_name : fine_pruner -+ starting_epoch: 2 -+ ending_epoch: 4 -+ frequency: 1 -+ -+# After completing the pruning, we perform network thinning and continue fine-tuning. 
-+ #- extension: -+ # instance_name: net_thinner -+ # epochs: [22] -+ -+ #- lr_scheduler: -+ # instance_name: pruning_lr -+ # starting_epoch: 0 -+ # ending_epoch: 400 -+ # frequency: 1 -diff --git a/model_compression/AGP_Structure/dlrm.schedule_agp_2.yaml b/model_compression/AGP_Structure/dlrm.schedule_agp_2.yaml -new file mode 100755 -index 0000000..af068a6 ---- /dev/null -+++ b/model_compression/AGP_Structure/dlrm.schedule_agp_2.yaml -@@ -0,0 +1,68 @@ -+version: 1 -+pruners: -+ low_pruner: -+ class: L1RankedStructureParameterPruner_AGP -+ initial_sparsity : 0.01 -+ final_sparsity: 0.99 -+ group_type: Rows -+ weights: [bot_l.module.0.weight, -+ bot_l.module.1.weight, -+ bot_l.module.2.weight, -+ bot_l.module.3.weight, -+ top_l.module.0.weight, -+ top_l.module.1.weight, -+ top_l.module.2.weight, -+ top_l.module.3.weight, -+ top_l.module.4.weight] -+ -+ fine_pruner: -+ class: AutomatedGradualPruner -+ initial_sparsity : 0.01 -+ final_sparsity: 0.99 -+ weights: [bot_l.module.0.weight, -+ bot_l.module.1.weight, -+ bot_l.module.2.weight, -+ bot_l.module.3.weight, -+ top_l.module.0.weight, -+ top_l.module.1.weight, -+ top_l.module.2.weight, -+ top_l.module.3.weight, -+ top_l.module.4.weight] -+ -+#lr_schedulers: -+# pruning_lr: -+# class: StepLR -+# step_size: 50 -+# gamma: 0.10 -+ -+ -+#extensions: -+# net_thinner: -+# class: 'FilterRemover' -+# thinning_func_str: remove_filters -+# arch: 'resnet20_cifar' -+# dataset: 'cifar10' -+ -+policies: -+ - pruner: -+ instance_name : low_pruner -+ starting_epoch: 0 -+ ending_epoch: 2 -+ frequency: 1 -+ -+ - pruner: -+ instance_name : fine_pruner -+ starting_epoch: 2 -+ ending_epoch: 4 -+ frequency: 1 -+ -+# After completing the pruning, we perform network thinning and continue fine-tuning. -+ #- extension: -+ # instance_name: net_thinner -+ # epochs: [22] -+ -+ #- lr_scheduler: -+ # instance_name: pruning_lr -+ # starting_epoch: 0 -+ # ending_epoch: 400 -+ # frequency: 1 -diff --git a/model_compression/AGP_Weight/dlrm.schedule_agp.yaml b/model_compression/AGP_Weight/dlrm.schedule_agp.yaml -new file mode 100755 -index 0000000..f175206 ---- /dev/null -+++ b/model_compression/AGP_Weight/dlrm.schedule_agp.yaml -@@ -0,0 +1,96 @@ -+# This schedule performs element-wise (fine grain) pruning, following the Automated Gradual Pruner (Zhu-Gupta) schedule. 
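For context on the schedules in these YAML files: each AGP pruner ramps a layer from initial_sparsity to final_sparsity using the cubic rule from Zhu and Gupta, s_t = s_f + (s_i - s_f) * (1 - t/n)^3 over n pruning steps. A small self-contained sketch of that ramp; the 0.01 -> 0.50 endpoints mirror the pruners above, while the step count is illustrative:

# Illustrative sketch of the Zhu-Gupta AGP sparsity ramp; not part of the patch.
def agp_target_sparsity(step: int, n_steps: int,
                        s_initial: float = 0.01, s_final: float = 0.50) -> float:
    # Clamp so the target holds at s_final once the pruning steps are exhausted.
    step = min(max(step, 0), n_steps)
    remaining = 1.0 - step / n_steps
    return s_final + (s_initial - s_final) * remaining ** 3

for step in range(5):
    print(f"step {step}: target sparsity {agp_target_sparsity(step, 4):.3f}")
# starts at 0.010 and flattens out as it approaches 0.500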
-+# -+# time python3 compress_classifier.py -a=alexnet --lr=0.005 -p=50 ../../../data.imagenet -j 22 --epochs 90 --pretrained --compress=../agp-pruning/alexnet.schedule_agp.yaml -+# -+# Parameters: -+# -+# +----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+ -+# | | Name | Shape | NNZ (dense) | NNZ (sparse) | Cols (%) | Rows (%) | Ch (%) | 2D (%) | 3D (%) | Fine (%) | Std | Mean | Abs-Mean | -+# |----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------| -+# | 0 | features.module.0.weight | (64, 3, 11, 11) | 23232 | 23232 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.15003 | -0.00002 | 0.10000 | -+# | 1 | features.module.3.weight | (192, 64, 5, 5) | 307200 | 116736 | 0.00000 | 0.00000 | 0.00000 | 2.19727 | 0.00000 | 62.00000 | 0.04665 | -0.00245 | 0.02222 | -+# | 2 | features.module.6.weight | (384, 192, 3, 3) | 663552 | 232244 | 0.00000 | 0.00000 | 0.00000 | 8.85824 | 0.00000 | 64.99988 | 0.03270 | -0.00179 | 0.01627 | -+# | 3 | features.module.8.weight | (256, 384, 3, 3) | 884736 | 504300 | 0.00000 | 0.00000 | 0.00000 | 0.44861 | 0.00000 | 42.99995 | 0.02758 | -0.00193 | 0.01720 | -+# | 4 | features.module.10.weight | (256, 256, 3, 3) | 589824 | 336200 | 0.00000 | 0.00000 | 0.00000 | 0.68512 | 0.00000 | 42.99995 | 0.02836 | -0.00284 | 0.01795 | -+# | 5 | classifier.1.weight | (4096, 9216) | 37748736 | 3397387 | 0.00000 | 0.21973 | 0.00000 | 0.21973 | 0.00000 | 91.00000 | 0.00567 | -0.00017 | 0.00151 | -+# | 6 | classifier.4.weight | (4096, 4096) | 16777216 | 1509950 | 0.21973 | 3.93066 | 0.00000 | 3.93066 | 0.00000 | 91.00000 | 0.00798 | -0.00054 | 0.00217 | -+# | 7 | classifier.6.weight | (1000, 4096) | 4096000 | 1024000 | 2.97852 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 75.00000 | 0.01688 | 0.00052 | 0.00753 | -+# | 8 | Total sparsity: | - | 61090496 | 7144049 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 88.30579 | 0.00000 | 0.00000 | 0.00000 | -+# +----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+ -+# Total sparsity: 88.31 -+# -+# --- validate (epoch=89)----------- -+# 128116 samples (256 per mini-batch) -+# Epoch: [89][ 50/ 500] Loss 2.180059 Top1 51.671875 Top5 74.250000 -+# Epoch: [89][ 100/ 500] Loss 2.174012 Top1 51.851562 Top5 74.386719 -+# Epoch: [89][ 150/ 500] Loss 2.187923 Top1 51.520833 Top5 74.184896 -+# Epoch: [89][ 200/ 500] Loss 2.192574 Top1 51.433594 Top5 74.156250 -+# Epoch: [89][ 250/ 500] Loss 2.184967 Top1 51.459375 Top5 74.279688 -+# Epoch: [89][ 300/ 500] Loss 2.184608 Top1 51.440104 Top5 74.239583 -+# Epoch: [89][ 350/ 500] Loss 2.179280 Top1 51.537946 Top5 74.357143 -+# Epoch: [89][ 400/ 500] Loss 2.182293 Top1 51.491211 Top5 74.354492 -+# Epoch: [89][ 450/ 500] Loss 2.182311 Top1 51.440104 Top5 74.333333 -+# Epoch: [89][ 500/ 500] Loss 2.183890 Top1 51.459375 Top5 74.328125 -+# ==> Top1: 51.456 Top5: 74.326 Loss: 2.185 -+# -+# Saving checkpoint -+# --- test --------------------- -+# 50000 samples (256 per mini-batch) -+# Test: [ 50/ 195] Loss 1.493478 Top1 63.070312 Top5 85.898438 -+# Test: [ 100/ 195] Loss 1.643213 Top1 60.539063 Top5 83.589844 -+# Test: [ 150/ 195] Loss 1.836551 Top1 57.466146 Top5 80.273438 -+# ==> Top1: 
56.528 Top5: 79.352 Loss: 1.897 -+# -+# -+# Log file for this run: /data/home/cvds_lab/nzmora/private-distiller/examples/classifier_compression/logs/2018.03.30-142316/2018.03.30-142316.log -+# -+# real 664m25.914s -+# user 13391m25.914s -+# sys 1569m37.119s -+ -+version: 1 -+pruners: -+ bot_l1_pruner: -+ class: 'AutomatedGradualPruner' -+ initial_sparsity : 0.01 -+ final_sparsity: 0.50 -+ weights: ['bot_l.module.1.weight', 'bot_l.module.1.bias'] -+ -+ top_l1_pruner: -+ class: 'AutomatedGradualPruner' -+ initial_sparsity: 0.01 -+ final_sparsity: 0.50 -+ weights: ['top_l.module.0.weight', 'top_l.module.0.bias'] -+ -+ top_l2_pruner: -+ class: 'AutomatedGradualPruner' -+ initial_sparsity : 0.01 -+ final_sparsity: 0.50 -+ weights: ['top_l.module.1.weight', top_l.module.1.bias] -+ -+#lr_schedulers: -+ # Learning rate decay scheduler -+# pruning_lr: -+# class: ExponentialLR -+# gamma: 0.9 -+ -+ -+policies: -+ - pruner: -+ instance_name : 'bot_l1_pruner' -+ starting_epoch: 1 -+ ending_epoch: 10 -+ frequency: 1 -+ -+ - pruner: -+ instance_name : 'top_l1_pruner' -+ starting_epoch: 1 -+ ending_epoch: 10 -+ frequency: 1 -+ -+ - pruner: -+ instance_name : 'top_l2_pruner' -+ starting_epoch: 1 -+ ending_epoch: 10 -+ frequency: 1 -+ -diff --git a/model_compression/AGP_Weight/dlrm.schedule_sensivity.yaml b/model_compression/AGP_Weight/dlrm.schedule_sensivity.yaml -new file mode 100755 -index 0000000..19a0368 ---- /dev/null -+++ b/model_compression/AGP_Weight/dlrm.schedule_sensivity.yaml -@@ -0,0 +1,89 @@ -+# -+# This schedule is an example of "Iterative Pruning" for Alexnet/Imagent, as -+# described in chapter 3 of Song Han's PhD dissertation: "EFFICIENT METHODS AND -+# HARDWARE FOR DEEP LEARNING" -+# -+# The pruning policy uses multiple pruning phases. Each pruning phase is -+# followed by a retraining phase. -+# In this particular policy, pruning is scheduled every 2 epochs. -+# After 38/2 pruning phases, pruning ends and the only retraining continues. 
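The schedule below drives Distiller's SensitivityPruner, which thresholds each layer's weights at sensitivity * std(W). A hedged, one-shot sketch of that criterion (the actual policy applies it iteratively, with retraining between pruning phases); the 0.875 value echoes the classifier entries below, and the tensor shape is an illustrative assumption:

import torch

# Hedged sketch of sensitivity-based magnitude pruning; not part of the patch.
def sensitivity_prune(weight: torch.Tensor, sensitivity: float) -> torch.Tensor:
    # Zero every weight whose magnitude falls below sensitivity * std(W).
    threshold = sensitivity * weight.std()
    return weight * (weight.abs() > threshold)

w = torch.randn(256, 512)  # illustrative shape
pruned = sensitivity_prune(w, 0.875)
print(f"one-shot sparsity: {(pruned == 0).float().mean().item():.2%}")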
-+# -+# time python3 compress_classifier.py -a alexnet --lr 0.005 -p 50 ../../../data.imagenet -j=44 --epochs=90 --pretrained --compress=../sensitivity-pruning/alexnet.schedule_sensitivity.yaml -+# -+# Parameters: -+# -+# +----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+ -+# | | Name | Shape | NNZ (dense) | NNZ (sparse) | Cols (%) | Rows (%) | Ch (%) | 2D (%) | 3D (%) | Fine (%) | Std | Mean | Abs-Mean | -+# |----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------| -+# | 0 | features.module.0.weight | (64, 3, 11, 11) | 23232 | 13373 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 42.43716 | 0.14381 | -0.00002 | 0.08794 | -+# | 1 | features.module.3.weight | (192, 64, 5, 5) | 307200 | 115322 | 0.00000 | 0.00000 | 0.00000 | 2.04264 | 0.00000 | 62.46029 | 0.04702 | -0.00248 | 0.02286 | -+# | 2 | features.module.6.weight | (384, 192, 3, 3) | 663552 | 256454 | 0.00000 | 0.00000 | 0.00000 | 6.13742 | 0.00000 | 61.35133 | 0.03354 | -0.00184 | 0.01803 | -+# | 3 | features.module.8.weight | (256, 384, 3, 3) | 884736 | 315278 | 0.00000 | 0.00000 | 0.00000 | 7.02922 | 0.00000 | 64.36474 | 0.02647 | -0.00168 | 0.01423 | -+# | 4 | features.module.10.weight | (256, 256, 3, 3) | 589824 | 186861 | 0.00000 | 0.00000 | 0.00000 | 15.72266 | 0.00000 | 68.31919 | 0.02714 | -0.00245 | 0.01408 | -+# | 5 | classifier.1.weight | (4096, 9216) | 37748736 | 3395124 | 0.00000 | 0.21973 | 0.00000 | 0.21973 | 0.00000 | 91.00599 | 0.00589 | -0.00020 | 0.00168 | -+# | 6 | classifier.4.weight | (4096, 4096) | 16777216 | 1783541 | 0.21973 | 3.49121 | 0.00000 | 3.49121 | 0.00000 | 89.36927 | 0.00849 | -0.00066 | 0.00263 | -+# | 7 | classifier.6.weight | (1000, 4096) | 4096000 | 993134 | 3.39355 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 75.75356 | 0.01718 | 0.00029 | 0.00777 | -+# | 8 | Total sparsity: | - | 61090496 | 7059087 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 88.44487 | 0.00000 | 0.00000 | 0.00000 | -+# +----+---------------------------+------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+ -+# Total sparsity: 88.44 -+# -+# --- validate (epoch=89)----------- -+# 128116 samples (256 per mini-batch) -+# Epoch: [89][ 50/ 500] Loss 2.149753 Top1 51.976562 Top5 74.859375 -+# Epoch: [89][ 100/ 500] Loss 2.154934 Top1 51.941406 Top5 74.550781 -+# Epoch: [89][ 150/ 500] Loss 2.159868 Top1 51.880208 Top5 74.513021 -+# Epoch: [89][ 200/ 500] Loss 2.158245 Top1 51.875000 Top5 74.597656 -+# Epoch: [89][ 250/ 500] Loss 2.150266 Top1 51.920313 Top5 74.667187 -+# Epoch: [89][ 300/ 500] Loss 2.152199 Top1 51.933594 Top5 74.682292 -+# Epoch: [89][ 350/ 500] Loss 2.152126 Top1 51.952009 Top5 74.684152 -+# Epoch: [89][ 400/ 500] Loss 2.153599 Top1 51.949219 Top5 74.648438 -+# Epoch: [89][ 450/ 500] Loss 2.151281 Top1 52.046875 Top5 74.703993 -+# Epoch: [89][ 500/ 500] Loss 2.149620 Top1 52.032031 Top5 74.765625 -+# ==> Top1: 52.029 Top5: 74.767 Loss: 2.150 -+# -+# Saving checkpoint -+# --- test --------------------- -+# 50000 samples (256 per mini-batch) -+# Test: [ 50/ 195] Loss 1.484814 Top1 63.328125 Top5 85.820312 -+# Test: [ 100/ 195] Loss 1.636993 Top1 60.835938 Top5 83.617188 -+# Test: [ 150/ 195] Loss 1.832027 Top1 57.713542 Top5 80.330729 
-+# ==> Top1: 56.762 Top5: 79.340 Loss: 1.892 -+# -+# -+# Log file for this run: /data/home/cvds_lab/nzmora/private-distiller/examples/classifier_compression/logs/2018.04.08-154509/2018.04.08-154509.log -+# -+# real 646m54.061s -+# user 14899m29.068s -+# sys 1901m19.958s -+ -+version: 1 -+pruners: -+ pruner1: -+ class: 'SensitivityPruner' -+ sensitivities: -+ 'features.module.0.weight': 0.25 -+ 'features.module.3.weight': 0.35 -+ 'features.module.6.weight': 0.40 -+ 'features.module.8.weight': 0.45 -+ 'features.module.10.weight': 0.55 -+ 'classifier.1.weight': 0.875 -+ 'classifier.4.weight': 0.875 -+ 'classifier.6.weight': 0.625 -+ -+lr_schedulers: -+ pruning_lr: -+ class: ExponentialLR -+ gamma: 0.9 -+ -+policies: -+ - pruner: -+ instance_name : 'pruner1' -+ starting_epoch: 0 -+ ending_epoch: 38 -+ frequency: 2 -+ -+ - lr_scheduler: -+ instance_name: pruning_lr -+ starting_epoch: 24 -+ ending_epoch: 200 -+ frequency: 1 -diff --git a/model_compression/__init__.py b/model_compression/__init__.py -new file mode 100644 -index 0000000..e69de29 -diff --git a/model_compression/hyparameters.py b/model_compression/hyparameters.py -new file mode 100644 -index 0000000..4c08876 ---- /dev/null -+++ b/model_compression/hyparameters.py -@@ -0,0 +1,101 @@ -+import argparse -+ -+class hyparams: -+ ### parse arguments ### -+ parser = argparse.ArgumentParser( -+ description="Train Deep Learning Recommendation Model (DLRM)" -+ ) -+ # model related parameters -+ parser.add_argument("--arch-sparse-feature-size", type=int, default=2) -+ parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") -+ # j will be replaced with the table number -+ parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") -+ parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") -+ parser.add_argument("--arch-interaction-op", type=str, default="dot") -+ parser.add_argument("--arch-interaction-itself", action="store_true", default=False) -+ # embedding table options -+ parser.add_argument("--md-flag", action="store_true", default=False) -+ parser.add_argument("--md-threshold", type=int, default=200) -+ parser.add_argument("--md-temperature", type=float, default=0.3) -+ parser.add_argument("--md-round-dims", action="store_true", default=False) -+ parser.add_argument("--qr-flag", action="store_true", default=False) -+ parser.add_argument("--qr-threshold", type=int, default=200) -+ parser.add_argument("--qr-operation", type=str, default="mult") -+ parser.add_argument("--qr-collisions", type=int, default=4) -+ # activations and loss -+ parser.add_argument("--activation-function", type=str, default="relu") -+ parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce -+ parser.add_argument("--loss-weights", type=str, default="1.0-1.0") # for wbce -+ parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 -+ parser.add_argument("--round-targets", type=bool, default=False) -+ # data -+ parser.add_argument("--data-size", type=int, default=1) -+ parser.add_argument("--num-batches", type=int, default=0) -+ parser.add_argument( -+ "--data-generation", type=str, default="random" -+ ) # synthetic or dataset -+ parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") -+ parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte -+ parser.add_argument("--raw-data-file", type=str, default="") -+ parser.add_argument("--processed-data-file", type=str, default="") -+ parser.add_argument("--data-randomize", type=str, default="total") # or day 
or none -+ parser.add_argument("--data-trace-enable-padding", type=bool, default=False) -+ parser.add_argument("--max-ind-range", type=int, default=-1) -+ parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] -+ parser.add_argument("--num-indices-per-lookup", type=int, default=10) -+ parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) -+ parser.add_argument("--num-workers", type=int, default=0) -+ parser.add_argument("--memory-map", action="store_true", default=False) -+ # training -+ parser.add_argument("--mini-batch-size", type=int, default=1) -+ parser.add_argument("--nepochs", type=int, default=1) -+ parser.add_argument("--learning-rate", type=float, default=0.01) -+ parser.add_argument("--print-precision", type=int, default=5) -+ parser.add_argument("--numpy-rand-seed", type=int, default=123) -+ parser.add_argument("--sync-dense-params", type=bool, default=True) -+ # inference -+ parser.add_argument("--inference-only", action="store_true", default=False) -+ # onnx -+ parser.add_argument("--save-onnx", action="store_true", default=False) -+ # gpu -+ parser.add_argument("--use-gpu", action="store_true", default=False) -+ # distributed run -+ parser.add_argument("--dist-backend", type=str, default="") -+ # debugging and profiling -+ parser.add_argument("--print-freq", type=int, default=1) -+ parser.add_argument("--test-freq", type=int, default=-1) -+ parser.add_argument("--test-mini-batch-size", type=int, default=-1) -+ parser.add_argument("--test-num-workers", type=int, default=-1) -+ parser.add_argument("--print-time", action="store_true", default=False) -+ parser.add_argument("--debug-mode", action="store_true", default=False) -+ parser.add_argument("--enable-profiling", action="store_true", default=False) -+ parser.add_argument("--plot-compute-graph", action="store_true", default=False) -+ parser.add_argument("--profiling-start-iter", type=int, default=50) -+ parser.add_argument("--profiling-num-iters", type=int, default=100) -+ # store/load model -+ parser.add_argument("--out-dir", type=str, default=".") -+ parser.add_argument("--save-model", type=str, default="") -+ parser.add_argument("--load-model", type=str, default="") -+ # mlperf logging (disables other output and stops early) -+ parser.add_argument("--mlperf-logging", action="store_true", default=False) -+ # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 -+ parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) -+ # stop at target AUC Terabyte (no subsampling) 0.8025 -+ parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) -+ parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) -+ parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) -+ # LR policy -+ parser.add_argument("--lr-num-warmup-steps", type=int, default=0) -+ parser.add_argument("--lr-decay-start-step", type=int, default=0) -+ parser.add_argument("--lr-num-decay-steps", type=int, default=0) -+ # embedding table is sparse table only if sparse_dense_boundary >= 2048 -+ parser.add_argument("--sparse-dense-boundary", type=int, default=2048) -+ # bf16 option -+ parser.add_argument("--bf16", action='store_true', default=False) -+ # ipex option -+ parser.add_argument("--use-ipex", action="store_true", default=False) -+ # lamb -+ parser.add_argument("--optimizer", type=int, default=0, help='optimizer:[0:sgd, 1:lamb/sgd, 2:adagrad, 3:sparseadam]') -+ # distiller option -+ 
parser.add_argument("--model-compression-type", type=str, default=None) -+ parser.add_argument("--compression-file", type=str, default="./model_compression/dlrm.schedule_agp.yaml") -\ No newline at end of file diff --git a/optim/rwsadagrad.py b/optim/rwsadagrad.py deleted file mode 100644 index 95381ec..0000000 @@ -10157,30 +7821,6 @@ index 83ab042..0000000 -tqdm -torchrec-nightly -torchx-nightly -diff --git a/run_compress.sh b/run_compress.sh -new file mode 100755 -index 0000000..3766f37 ---- /dev/null -+++ b/run_compress.sh -@@ -0,0 +1,18 @@ -+source ~/.local/env/setvars.sh -+seed_num=$(date +%s) -+#export CCL_ATL_SHM=0 -+export KMP_BLOCKTIME=1 -+export KMP_AFFINITY="granularity=fine,compact,1,0" -+ -+ -+#Four nodes -+#python -u ./launch.py --distributed --nproc_per_node=2 --nnodes=1 --hostfile ./hosts --master_addr="10.0.0.44" ./dlrm_s_pytorch_lamb_sparselamb_test.py --arch-sparse-feature-size=64 --arch-mlp-bot="13-128-64" --arch-mlp-top="256-128-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=$DATA_PATH/day --processed-data-file=$DATA_PATH/terabyte_processed.npz --loss-function=bce --round-targets=True --bf16 --num-workers=0 --test-num-workers=0 --use-ipex --optimizer=1 --dist-backend=ccl --learning-rate=16 --mini-batch-size=65536 --print-freq=16 --print-time --test-freq=800 --sparse-dense-boundary=403346 --test-mini-batch-size=4096 --lr-num-warmup-steps=4000 --lr-decay-start-step=5760 --lr-num-decay-steps=27000 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle --numpy-rand-seed=12345 --save-model "./model/" $dlrm_extra_option 2>&1 | tee run_${seed_num}_03_30_2021.log -+ -+# run model compression -+python -u ./launch.py --distributed --nproc_per_node=2 --nnodes=1 --hostfile ./hosts --master_addr="10.0.0.44" --master_port="29501" ./dlrm_s_pytorch_compress.py --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-128-64" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=$DATA_PATH/day --processed-data-file=$DATA_PATH/terabyte_processed.npz --loss-function=bce --round-targets=True --num-workers=0 --nepochs 4 --test-num-workers=0 --use-ipex --optimizer=1 --dist-backend=ccl --learning-rate=16 --mini-batch-size=65536 --print-freq=16 --print-time --test-freq=800 --sparse-dense-boundary=403346 --test-mini-batch-size=4096 --lr-num-warmup-steps=4000 --lr-decay-start-step=5760 --lr-num-decay-steps=27000 --memory-map --mlperf-logging --mlperf-auc-threshold=-1 --mlperf-bin-loader --mlperf-bin-shuffle --numpy-rand-seed=12345 --save-model "./model_compression/model/compress/AGP_Structure/test2" --model-compression-type "AGP" --compression-file "./model_compression/AGP_Structure/dlrm.schedule_agp_2.yaml" $dlrm_extra_option 2>&1 | tee ./model_compression/model/compress/AGP_Structure/test2/run_model_09_2021_test.log -+ -+&& -+ -+#run compression analysis -+python -u ./launch.py --distributed --nproc_per_node=2 --nnodes=1 --hostfile ./hosts --master_addr="10.0.0.44" ./analysis_model.py --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-128-64" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=$DATA_PATH/day --processed-data-file=$DATA_PATH/terabyte_processed.npz --loss-function=bce --round-targets=True --num-workers=0 --test-num-workers=0 --use-ipex --optimizer=1 --dist-backend=ccl --learning-rate=16 --nepochs 10 --mini-batch-size=65536 --print-freq=16 
--print-time --test-freq=800 --sparse-dense-boundary=403346 --test-mini-batch-size=4096 --lr-num-warmup-steps=4000 --lr-decay-start-step=5760 --lr-num-decay-steps=27000 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle --numpy-rand-seed=12345 > ./model_compression/model/compress/AGP_Structure/test2/analysis_10_19_2021.log 2>&1 &
-+
diff --git a/test/dlrm_s_test.sh b/test/dlrm_s_test.sh
deleted file mode 100755
index e504545..0000000

From 273080c6e94ac00bc27c121f6ae329129b4a5565 Mon Sep 17 00:00:00 2001
From: tianyi1
Date: Wed, 7 Sep 2022 14:58:26 +0800
Subject: [PATCH 2/3] trigger the jenkins test

---
 modelzoo/dlrm/patch_dlrm.sh | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/modelzoo/dlrm/patch_dlrm.sh b/modelzoo/dlrm/patch_dlrm.sh
index 2ada5a9cc..19773160d 100644
--- a/modelzoo/dlrm/patch_dlrm.sh
+++ b/modelzoo/dlrm/patch_dlrm.sh
@@ -5,15 +5,11 @@ get_original_model () {
 }
-
-
 apply_patch () {
   patch -p1 <../dlrm.patch
 }
-
-
 get_original_model
-apply_patch
\ No newline at end of file
+apply_patch

From 65866b538d9388f43800a7b902f04e21a10b5773 Mon Sep 17 00:00:00 2001
From: tianyi1
Date: Thu, 8 Sep 2022 11:12:25 +0800
Subject: [PATCH 3/3] re-trigger the jenkins test

---
 modelzoo/dlrm/patch_dlrm.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelzoo/dlrm/patch_dlrm.sh b/modelzoo/dlrm/patch_dlrm.sh
index 19773160d..3df17a713 100644
--- a/modelzoo/dlrm/patch_dlrm.sh
+++ b/modelzoo/dlrm/patch_dlrm.sh
@@ -5,11 +5,13 @@ get_original_model () {
 }
+
 apply_patch () {
   patch -p1 <../dlrm.patch
 }
+
 get_original_model
 apply_patch