ray-project · scv119 · May 7, 2022 · May 2, 2022 · May 3, 2022 · May 5, 2022
@@ -2403,6 +2403,33 @@ def test_csv_read_no_header(shutdown_only, tmp_path):
     assert df.equals(out_df)
 
 
+def test_csv_read_with_column_type_specified(shutdown_only, tmp_path):
+    from pyarrow import csv
+
+    file_path = os.path.join(str(tmp_path), "test.csv")
+    df = pd.DataFrame({"one": [1, 2, 3e+5], "two": ["a", "b", "c"]})
+    df.to_csv(file_path)
+
+    # Incorrect to parse scientific notation in int64 as PyArrow represents
+    # it as double.
+    with pytest.raises(pa.lib.ArrowInvalid):
+        ray.data.read_csv(
+            file_path,
+            convert_options=csv.ConvertOptions(
+                column_types={"one": "int64", "two": "string"}),
+        )
+
+    # Parsing scientific notation in double shoud work.
+    ds = ray.data.read_csv(
+        file_path,
+        convert_options=csv.ConvertOptions(
+            column_types={"one": "float64", "two": "string"}),
+    )
+    expected_df = pd.DataFrame(
+            {"one": [1.0, 2.0, 3e+5], "two": ["a", "b", "c"]})
+    assert df.equals(expected_df)
+
+
 class NodeLoggerOutputDatasource(Datasource[Union[ArrowRow, int]]):
     """A writable datasource that logs node IDs of write tasks, for testing."""