From 6a6ce12152b5664c2bdaf7e39a82bbf398ba888e Mon Sep 17 00:00:00 2001
From: Sammy Sidhu <samster25@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:05:27 -0700
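Subject: [PATCH] fix mypy errors

Typing changes:
- daft/context.py: declare `_RunnerConfig.name` with an annotation
  (`name: ClassVar[str]`) instead of assigning `ClassVar[str]` to it.
- daft/expressions/expressions.py: assert that the exponent returned by
  `Decimal.as_tuple()` is an `int` before passing it to `_decimal_lit`.

Style cleanups bundled in the same patch:
- Replace `%`-style string formatting with f-strings in the TPC-H data
  generation scripts and in `DataFrame.transform`.
- Compare type objects with `is` instead of `==` in the tests.
- Notebooks: drop the unused `scores` binding, rename the lambda
  parameter `l` to `img`, and use zero-argument `super()`.
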
---
 benchmarking/tpch/data_generation.py                     | 2 +-
 benchmarking/tpch/pipelined_data_generation.py           | 2 +-
 daft/context.py                                          | 2 +-
 daft/dataframe/dataframe.py                              | 2 +-
 daft/expressions/expressions.py                          | 1 +
 tests/expressions/test_udf.py                            | 6 +++---
 tests/integration/io/test_list_files_s3_minio.py         | 2 +-
 tutorials/delta_lake/1-local-image-batch-inference.ipynb | 2 +-
 tutorials/delta_lake/2-distributed-batch-inference.ipynb | 2 +-
 tutorials/mnist.ipynb                                    | 4 ++--
 10 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/benchmarking/tpch/data_generation.py b/benchmarking/tpch/data_generation.py
index 1908828a40..84a7adca1f 100644
--- a/benchmarking/tpch/data_generation.py
+++ b/benchmarking/tpch/data_generation.py
@@ -253,7 +253,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
     Returns:
         str: path to folder with generated CSV files
     """
-    cachedir = os.path.join(basedir, ("%.1f" % scale_factor).replace(".", "_"), str(num_parts))
+    cachedir = os.path.join(basedir, (f"{scale_factor:.1f}").replace(".", "_"), str(num_parts))
     if not os.path.exists(cachedir):
         # If running in CI, use a scale factor of 0.2
         # Otherwise, check for SCALE_FACTOR env variable or default to 1
diff --git a/benchmarking/tpch/pipelined_data_generation.py b/benchmarking/tpch/pipelined_data_generation.py
index 36d3629c67..f28063a990 100644
--- a/benchmarking/tpch/pipelined_data_generation.py
+++ b/benchmarking/tpch/pipelined_data_generation.py
@@ -48,7 +48,7 @@ def pipelined_data_generation(
 ):
     assert num_parts > 1, "script should only be used if num_parts > 1"
 
-    cachedir = pathlib.Path(scratch_dir) / ("%.1f" % scale_factor).replace(".", "_") / str(num_parts)
+    cachedir = pathlib.Path(scratch_dir) / (f"{scale_factor:.1f}").replace(".", "_") / str(num_parts)
 
     if not cachedir.exists():
         logger.info("Cloning tpch dbgen repo")
diff --git a/daft/context.py b/daft/context.py
index f286c77c7b..38ef8545d5 100644
--- a/daft/context.py
+++ b/daft/context.py
@@ -17,7 +17,7 @@
 
 
 class _RunnerConfig:
-    name = ClassVar[str]
+    name: ClassVar[str]
 
 
 @dataclasses.dataclass(frozen=True)
diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py
index 3dd7458db4..37dea4d822 100644
--- a/daft/dataframe/dataframe.py
+++ b/daft/dataframe/dataframe.py
@@ -1984,7 +1984,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any)
         """
         result = func(self, *args, **kwargs)
         assert isinstance(result, DataFrame), (
-            "Func returned an instance of type [%s], " "should have been DataFrame." % type(result)
+            f"Func returned an instance of type [{type(result)}], " "should have been DataFrame."
         )
         return result
 
diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
index 8de584035b..9f5085ac3a 100644
--- a/daft/expressions/expressions.py
+++ b/daft/expressions/expressions.py
@@ -116,6 +116,7 @@ def lit(value: object) -> Expression:
         lit_value = _time_lit(i64_value, time_unit)
     elif isinstance(value, Decimal):
         sign, digits, exponent = value.as_tuple()
+        assert isinstance(exponent, int)
         lit_value = _decimal_lit(sign == 1, digits, exponent)
     elif isinstance(value, Series):
         lit_value = _series_lit(value._series)
diff --git a/tests/expressions/test_udf.py b/tests/expressions/test_udf.py
index 1c6c00d3aa..2572eb1adc 100644
--- a/tests/expressions/test_udf.py
+++ b/tests/expressions/test_udf.py
@@ -154,11 +154,11 @@ def test_udf_return_containers(container, batch_size):
 
     @udf(return_dtype=DataType.string(), batch_size=batch_size)
     def identity(data):
-        if container == Series:
+        if container is Series:
             return data
-        elif container == list:
+        elif container is list:
             return data.to_pylist()
-        elif container == np.ndarray:
+        elif container is np.ndarray:
             return np.array(data.to_arrow())
         else:
             raise NotImplementedError(f"Test not implemented for container type: {container}")
diff --git a/tests/integration/io/test_list_files_s3_minio.py b/tests/integration/io/test_list_files_s3_minio.py
index b98100ef5c..5cdfd59c68 100644
--- a/tests/integration/io/test_list_files_s3_minio.py
+++ b/tests/integration/io/test_list_files_s3_minio.py
@@ -213,7 +213,7 @@ def test_directory_globbing_fragment_wildcard(minio_io_config, path_expect_pair,
         for name in files:
             fs.touch(f"bucket/{name}")
 
-        if type(expect) == type and issubclass(expect, BaseException):
+        if type(expect) is type and issubclass(expect, BaseException):
             with pytest.raises(expect):
                 io_glob(globpath, io_config=minio_io_config, fanout_limit=fanout_limit)
         else:
diff --git a/tutorials/delta_lake/1-local-image-batch-inference.ipynb b/tutorials/delta_lake/1-local-image-batch-inference.ipynb
index 3d39917ba1..96d2562975 100644
--- a/tutorials/delta_lake/1-local-image-batch-inference.ipynb
+++ b/tutorials/delta_lake/1-local-image-batch-inference.ipynb
@@ -381,7 +381,7 @@
     "        batch = self.preprocess(images_array)\n",
     "        prediction = self.model(batch).softmax(0)\n",
     "        class_ids = prediction.argmax(1)\n",
-    "        scores = prediction[:, class_ids]\n",
+    "        prediction[:, class_ids]\n",
     "        return [self.category_map[class_id] for class_id in class_ids]"
    ]
   },
diff --git a/tutorials/delta_lake/2-distributed-batch-inference.ipynb b/tutorials/delta_lake/2-distributed-batch-inference.ipynb
index 41a6bb315a..8462a74e0f 100644
--- a/tutorials/delta_lake/2-distributed-batch-inference.ipynb
+++ b/tutorials/delta_lake/2-distributed-batch-inference.ipynb
@@ -337,7 +337,7 @@
     "        batch = self.preprocess(images_array)\n",
     "        prediction = self.model(batch).softmax(0)\n",
     "        class_ids = prediction.argmax(1)\n",
-    "        scores = prediction[:, class_ids]\n",
+    "        prediction[:, class_ids]\n",
     "        return [self.category_map[class_id] for class_id in class_ids]\n",
     "\n",
     "\n",
diff --git a/tutorials/mnist.ipynb b/tutorials/mnist.ipynb
index 776ca0937a..28973d1b47 100644
--- a/tutorials/mnist.ipynb
+++ b/tutorials/mnist.ipynb
@@ -235,7 +235,7 @@
     "\n",
     "images_df = images_df.with_column(\n",
     "    \"image_2d\",\n",
-    "    col(\"image\").apply(lambda l: np.array(l).reshape(28, 28), return_dtype=DataType.python()),\n",
+    "    col(\"image\").apply(lambda img: np.array(img).reshape(28, 28), return_dtype=DataType.python()),\n",
     ")"
    ]
   },
@@ -495,7 +495,7 @@
     "\n",
     "class Net(nn.Module):\n",
     "    def __init__(self):\n",
-    "        super(Net, self).__init__()\n",
+    "        super().__init__()\n",
     "        self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
     "        self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
     "        self.dropout1 = nn.Dropout(0.25)\n",