From a4b0f350540257c49e245dba7b83712adf5cbc18 Mon Sep 17 00:00:00 2001
From: Eero Lihavainen
Date: Sat, 28 Oct 2023 15:23:19 +0300
Subject: [PATCH] remove redundant constructor

---
 cpp/src/arrow/dataset/file_base.h     |  5 -----
 cpp/src/arrow/filesystem/filesystem.h |  3 ---
 python/pyarrow/_dataset.pxd           |  3 +--
 python/pyarrow/_dataset.pyx           |  7 +++++--
 python/pyarrow/tests/test_dataset.py  | 17 ++++++++++++++---
 5 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h
index 0cf283a06d61a..2a17cbf8f884d 100644
--- a/cpp/src/arrow/dataset/file_base.h
+++ b/cpp/src/arrow/dataset/file_base.h
@@ -54,11 +54,6 @@ class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
       : file_info_(std::move(path)),
         filesystem_(std::move(filesystem)),
         compression_(compression) {}
-  FileSource(std::string path, int64_t size, std::shared_ptr<fs::FileSystem> filesystem,
-             Compression::type compression = Compression::UNCOMPRESSED)
-      : file_info_(std::move(path), std::move(size)),
-        filesystem_(std::move(filesystem)),
-        compression_(compression) {}
   FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
              Compression::type compression = Compression::UNCOMPRESSED)
       : file_info_(std::move(info)),
diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h
index 3f233c74d5a1f..baf05cadbb2d3 100644
--- a/cpp/src/arrow/filesystem/filesystem.h
+++ b/cpp/src/arrow/filesystem/filesystem.h
@@ -60,9 +60,6 @@ struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
   explicit FileInfo(std::string path, FileType type = FileType::Unknown)
       : path_(std::move(path)), type_(type) {}
 
-  explicit FileInfo(std::string path, int64_t size, FileType type = FileType::Unknown)
-      : path_(std::move(path)), type_(type), size_(size) {}
-
   /// The file type
   FileType type() const { return type_; }
   void set_type(FileType type) { type_ = type; }
diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd
index 71b2a58c35abc..b9e3aac68ee1f 100644
--- a/python/pyarrow/_dataset.pxd
+++ b/python/pyarrow/_dataset.pxd
@@ -22,12 +22,11 @@
 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow_dataset cimport *
 from pyarrow.lib cimport *
-from pyarrow._fs cimport FileSystem
+from pyarrow._fs cimport FileSystem, FileInfo
 
 
 cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, int64_t file_size=*)
-
 
 cdef class DatasetFactory(_Weakrefable):
 
     cdef:
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 445d86d85387b..b99197cb37332 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -32,7 +32,7 @@
 from pyarrow._acero cimport ExecNodeOptions
 from pyarrow._compute cimport Expression, _bind
 from pyarrow._compute import _forbid_instantiation
-from pyarrow._fs cimport FileSystem, FileSelector
+from pyarrow._fs cimport FileSystem, FileSelector, FileInfo
 from pyarrow._csv cimport (
     ConvertOptions, ParseOptions, ReadOptions, WriteOptions)
 from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
@@ -101,6 +101,7 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6
     cdef:
         CFileSource c_source
        shared_ptr[CFileSystem] c_filesystem
+        CFileInfo c_info
         c_string c_path
         shared_ptr[CRandomAccessFile] c_file
         shared_ptr[CBuffer] c_buffer
@@ -117,7 +118,9 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6
 
         if file_size >= 0:
             c_size = file_size
-            c_source = CFileSource(move(c_path), move(c_size), move(c_filesystem))
+            info = FileInfo(c_path, size=c_size)
+            c_info = info.unwrap()
+            c_source = CFileSource(move(c_info), move(c_filesystem))
         else:
             c_source = CFileSource(move(c_path), move(c_filesystem))
     elif hasattr(file, 'read'):
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 07474dadd7652..55e3d7442714f 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -991,7 +991,6 @@ def test_make_fragment_with_size(s3_example_simple):
 
     fragments = [file_format.make_fragment(path, fs)
                  for path in paths]
-
     dataset = ds.FileSystemDataset(
         fragments, format=file_format, schema=table.schema, filesystem=fs
     )
@@ -999,7 +998,18 @@ def test_make_fragment_with_size(s3_example_simple):
     tbl = dataset.to_table()
     assert tbl.equals(table)
 
-    sizes_toosmall = [1]
+    # true sizes -> works
+    sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files]
+    fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
+                           for path, size in zip(paths, sizes_true)]
+    dataset_with_size = ds.FileSystemDataset(
+        fragments_with_size, format=file_format, schema=table.schema, filesystem=fs
+    )
+    tbl = dataset_with_size.to_table()
+    assert tbl.equals(table)
+
+    # too small sizes -> error
+    sizes_toosmall = [1 for path in paths]
     fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
                            for path, size in zip(paths, sizes_toosmall)]
 
@@ -1010,7 +1020,8 @@ def test_make_fragment_with_size(s3_example_simple):
     with pytest.raises(pyarrow.lib.ArrowInvalid,
                        match='Parquet file size is 1 bytes'):
         table = dataset_with_size.to_table()
 
-    sizes_toolarge = [1000000]
+    # too large sizes -> error
+    sizes_toolarge = [1000000 for path in paths]
     fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
                            for path, size in zip(paths, sizes_toolarge)]
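
For context, a minimal sketch (not part of the commit) of the user-facing path this refactor keeps intact: pyarrow callers never used the removed C++ overload directly; they pass a size hint via make_fragment(..., file_size=...), and _make_file_source now forwards it through a FileInfo carrying the size instead of the removed FileSource(path, size, filesystem) constructor. The scratch path /tmp/example.parquet and the tiny table are hypothetical, and the file_size keyword assumes a pyarrow build that already includes the change introducing it.

# Illustrative only -- not part of the patch above.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pyarrow.fs import LocalFileSystem

path = "/tmp/example.parquet"       # hypothetical scratch file
table = pa.table({"x": [1, 2, 3]})
pq.write_table(table, path)

fs = LocalFileSystem()
# The size that now travels to the scanner wrapped in an arrow::fs::FileInfo.
size = fs.get_file_info(path).size

fmt = ds.ParquetFileFormat()
# file_size lets the reader skip a stat/HEAD call; the keyword assumes a
# pyarrow build containing the feature this commit refactors.
fragment = fmt.make_fragment(path, fs, file_size=size)
dataset = ds.FileSystemDataset([fragment], format=fmt,
                               schema=table.schema, filesystem=fs)
assert dataset.to_table().equals(table)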