diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 0b388a4ba3814..8a1c3722d4a0f 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -872,12 +873,13 @@ garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader,
   }
 }
 
-typedef struct GArrowCSVReadOptionsPrivate_
+struct GArrowCSVReadOptionsPrivate
 {
   arrow::csv::ReadOptions read_options;
   arrow::csv::ParseOptions parse_options;
   arrow::csv::ConvertOptions convert_options;
-} GArrowCSVReadOptionsPrivate;
+  /* List of GArrowTimestampParser objects this instance holds references
+   * on; the references are dropped in garrow_csv_read_options_dispose(). */
+  GList *timestamp_parsers;
+};
 
 enum {
   PROP_USE_THREADS = 1,
@@ -902,6 +904,17 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions, garrow_csv_read_options, G_TYPE
   static_cast(                                                                     \
     garrow_csv_read_options_get_instance_private(GARROW_CSV_READ_OPTIONS(object)))
 
+/* GObject dispose handler for GArrowCSVReadOptions: unrefs every element of
+ * priv->timestamp_parsers, frees the list, then chains up to the parent
+ * class's dispose. */
+static void
+garrow_csv_read_options_dispose(GObject *object)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(object);
+
+  g_list_free_full(priv->timestamp_parsers, g_object_unref);
+  /* Clear the pointer so the freed list is never released a second time. */
+  priv->timestamp_parsers = nullptr;
+
+  G_OBJECT_CLASS(garrow_csv_read_options_parent_class)->dispose(object);
+}
+
 static void
 garrow_csv_read_options_set_property(GObject *object,
                                      guint prop_id,
@@ -1032,6 +1045,7 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
 
   auto gobject_class = G_OBJECT_CLASS(klass);
 
+  /* Install the handler that releases the referenced timestamp parsers. */
+  gobject_class->dispose = garrow_csv_read_options_dispose;
   gobject_class->set_property = garrow_csv_read_options_set_property;
   gobject_class->get_property = garrow_csv_read_options_get_property;
 
@@ -1623,6 +1637,71 @@ garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
   priv->read_options.column_names.push_back(column_name);
 }
 
+/**
+ * garrow_csv_read_options_set_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ * @parsers: (element-type GArrowTimestampParser): The list of
+ *   #GArrowTimestampParser to be added.
+ *
+ * Replaces the parsers currently registered on @options. %NULL elements
+ * of @parsers are skipped; every other element gains a reference that is
+ * held by @options. The list itself is copied, so @parsers stays owned
+ * by the caller.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+                                              GList *parsers)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+
+  /* Build and reference the new list before releasing the old one, so
+   * that passing back the list returned by
+   * garrow_csv_read_options_get_timestamp_parsers() (transfer none) does
+   * not walk freed nodes or unref objects that are still wanted. */
+  GList *new_parsers = nullptr;
+  for (auto node = parsers; node; node = g_list_next(node)) {
+    if (!node->data) {
+      continue;
+    }
+    auto parser = GARROW_TIMESTAMP_PARSER(node->data);
+    g_object_ref(parser);
+    new_parsers = g_list_prepend(new_parsers, parser);
+  }
+  /* Prepending built the list backwards; restore the caller's order. */
+  new_parsers = g_list_reverse(new_parsers);
+
+  /* g_list_free_full() leaves priv->timestamp_parsers pointing at freed
+   * nodes, so overwrite it immediately. Without this, a second call would
+   * prepend onto (or dispose would free again) a dangling list. */
+  g_list_free_full(priv->timestamp_parsers, g_object_unref);
+  priv->timestamp_parsers = new_parsers;
+
+  /* Mirror the new list into the C++ convert options, in the same order. */
+  priv->convert_options.timestamp_parsers.clear();
+  for (auto node = new_parsers; node; node = g_list_next(node)) {
+    auto parser = GARROW_TIMESTAMP_PARSER(node->data);
+    priv->convert_options.timestamp_parsers.push_back(
+      garrow_timestamp_parser_get_raw(parser));
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (element-type GArrowTimestampParser) (transfer none):
+ *   The list of #GArrowTimestampParsers to be used. The list is the one
+ *   stored in @options and must not be modified or freed by the caller.
+ *
+ * Since: 16.0.0
+ */
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  return priv->timestamp_parsers;
+}
+
+/**
+ * garrow_csv_read_options_add_timestamp_parser:
+ * @options: A #GArrowCSVReadOptions.
+ * @parser: The #GArrowTimestampParser to be added.
+ *
+ * Appends @parser to the parsers of @options and takes a reference on
+ * it. Nothing happens when @parser is %NULL.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+                                             GArrowTimestampParser *parser)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  if (parser) {
+    /* Hold a reference for as long as the parser sits in the list, and
+     * hand the value returned by garrow_timestamp_parser_get_raw() to the
+     * C++ convert options so both containers stay in the same order. */
+    g_object_ref(parser);
+    priv->timestamp_parsers = g_list_append(priv->timestamp_parsers, parser);
+    priv->convert_options.timestamp_parsers.push_back(
+      garrow_timestamp_parser_get_raw(parser));
+  }
+}
+
 typedef struct GArrowCSVReaderPrivate_
 {
   std::shared_ptr reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 08faf86cd09f7..96e4c5bbb5890 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -19,13 +19,12 @@
 
 #pragma once
 
+#include
+#include
 #include
 #include
 #include
-
-#include
-
-#include
+#include
 
 G_BEGIN_DECLS
 
@@ -239,6 +238,17 @@ GARROW_AVAILABLE_IN_0_15
 void
 garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
                                         const gchar *column_name);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+                                              GList *parsers);
+GARROW_AVAILABLE_IN_16_0
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+                                             GArrowTimestampParser *parser);
 
 #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader, garrow_csv_reader, GARROW, CSV_READER, GObject)
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 018f062ac3b99..cc102553b121e 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -236,6 +236,21 @@ def test_generate_column_names
         assert_equal(build_table(columns),
                      table.read)
       end
+
+      # Checks that timestamp_parsers starts empty, that assigning a list
+      # replaces it, and that add_timestamp_parser appends to it.
+      def test_timestamp_parsers
+        options = Arrow::CSVReadOptions.new
+        assert_equal([], options.timestamp_parsers)
+
+        iso8601_timestamp_parser = Arrow::ISO8601TimestampParser.new
+        options.timestamp_parsers = [iso8601_timestamp_parser]
+        assert_equal([iso8601_timestamp_parser],
+                     options.timestamp_parsers)
+
+        date_timestamp_parser = Arrow::StrptimeTimestampParser.new("%Y-%m-%d")
+        options.add_timestamp_parser(date_timestamp_parser)
+        assert_equal([iso8601_timestamp_parser, date_timestamp_parser],
+                     options.timestamp_parsers)
+      end
     end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb
index 9d1432bbfbabb..bd0d03930885c 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -138,6 +138,7 @@ def require_libraries
       require "arrow/timestamp-array"
       require "arrow/timestamp-array-builder"
       require "arrow/timestamp-data-type"
+      require "arrow/timestamp-parser"
       require "arrow/union-array-builder"
       require "arrow/writable"
     end
diff --git a/ruby/red-arrow/lib/arrow/timestamp-parser.rb b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
new file mode 100644
index 0000000000000..d50ac5846efb1
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  class TimestampParser
+    class << self
+      # Builds a timestamp parser from a shorthand value.
+      #
+      # * +:iso8601+ gives a new ISO8601TimestampParser.
+      # * A String is passed to StrptimeTimestampParser.new as its argument.
+      # * Anything else (including other Symbols) gives +nil+.
+      def try_convert(value)
+        case value
+        when :iso8601
+          ISO8601TimestampParser.new
+        when String
+          StrptimeTimestampParser.new(value)
+        else
+          nil
+        end
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb
index 7f7f23498d0fd..72bae2fcaba70 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -246,5 +246,42 @@ def load_csv(data, **options)
                                   encoding: encoding,
                                   compression: :gzip))
     end
+
+    sub_test_case(":timestamp_parsers") do
+      # Loads ISO 8601 strings ending in "Z" with the :iso8601 shorthand and
+      # compares the whole table against a UTC second-resolution column.
+      test(":iso8601") do
+        data_type = Arrow::TimestampDataType.new(:second,
+                                                 GLib::TimeZone.new("UTC"))
+        timestamps = [
+          Time.iso8601("2024-03-16T23:54:12Z"),
+          Time.iso8601("2024-03-16T23:54:13Z"),
+          Time.iso8601("2024-03-16T23:54:14Z"),
+        ]
+        values = Arrow::TimestampArray.new(data_type, timestamps)
+        assert_equal(Arrow::Table.new(value: values),
+                     load_csv(<<-CSV, headers: true, timestamp_parsers: [:iso8601]))
+value
+#{timestamps[0].iso8601}
+#{timestamps[1].iso8601}
+#{timestamps[2].iso8601}
+        CSV
+      end
+
+      # Passes a strptime format String as the parser. The input has the
+      # trailing "Z" removed, and only the schemas are compared here.
+      test("String") do
+        timestamps = [
+          Time.iso8601("2024-03-16T23:54:12Z"),
+          Time.iso8601("2024-03-16T23:54:13Z"),
+          Time.iso8601("2024-03-16T23:54:14Z"),
+        ]
+        values = Arrow::TimestampArray.new(:second, timestamps)
+        format = "%Y-%m-%dT%H:%M:%S"
+        assert_equal(Arrow::Table.new(value: values).schema,
+                     load_csv(<<-CSV, headers: true, timestamp_parsers: [format]).schema)
+value
+#{timestamps[0].iso8601.chomp("Z")}
+#{timestamps[1].iso8601.chomp("Z")}
+#{timestamps[2].iso8601.chomp("Z")}
+        CSV
+      end
+    end
   end
 end