Skip to content

Commit

Permalink
apacheGH-40573: [GLib][Ruby][CSV] Add support for customizing timesta…
Browse files Browse the repository at this point in the history
…mp parsers (apache#40590)

### Rationale for this change

ISO8601 timestamp values in CSV can be parsed by default but non-ISO8601 timestamp values can't.

### What changes are included in this PR?

* Add `garrow_csv_read_options_set_timestamp_parsers()`
* Add `garrow_csv_read_options_get_timestamp_parsers()`
* Add `garrow_csv_read_options_add_timestamp_parser()`
* Add `Arrow::TimestampParser.try_convert` for implicit cast

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* GitHub Issue: apache#40573

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
kou authored Mar 18, 2024
1 parent 4569d6e commit 7d3f7b3
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 6 deletions.
83 changes: 81 additions & 2 deletions c_glib/arrow-glib/reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <arrow-glib/record-batch.hpp>
#include <arrow-glib/schema.hpp>
#include <arrow-glib/table.hpp>
#include <arrow-glib/timestamp-parser.hpp>

#include <arrow/c/bridge.h>

Expand Down Expand Up @@ -872,12 +873,13 @@ garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader,
}
}

typedef struct GArrowCSVReadOptionsPrivate_
struct GArrowCSVReadOptionsPrivate
{
arrow::csv::ReadOptions read_options;
arrow::csv::ParseOptions parse_options;
arrow::csv::ConvertOptions convert_options;
} GArrowCSVReadOptionsPrivate;
GList *timestamp_parsers;
};

enum {
PROP_USE_THREADS = 1,
Expand All @@ -902,6 +904,17 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions, garrow_csv_read_options, G_TYPE
static_cast<GArrowCSVReadOptionsPrivate *>( \
garrow_csv_read_options_get_instance_private(GARROW_CSV_READ_OPTIONS(object)))

/* GObject dispose handler: releases the references held on the
 * timestamp parser wrappers, then chains up to the parent class. */
static void
garrow_csv_read_options_dispose(GObject *object)
{
  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(object);
  auto &parsers = priv->timestamp_parsers;

  /* Unref every element and free the list nodes, then reset the field
   * so it no longer points at freed nodes. */
  g_list_free_full(parsers, g_object_unref);
  parsers = nullptr;

  auto parent_class = G_OBJECT_CLASS(garrow_csv_read_options_parent_class);
  parent_class->dispose(object);
}

static void
garrow_csv_read_options_set_property(GObject *object,
guint prop_id,
Expand Down Expand Up @@ -1032,6 +1045,7 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)

auto gobject_class = G_OBJECT_CLASS(klass);

gobject_class->dispose = garrow_csv_read_options_dispose;
gobject_class->set_property = garrow_csv_read_options_set_property;
gobject_class->get_property = garrow_csv_read_options_get_property;

Expand Down Expand Up @@ -1623,6 +1637,71 @@ garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
priv->read_options.column_names.push_back(column_name);
}

/**
 * garrow_csv_read_options_set_timestamp_parsers:
 * @options: A #GArrowCSVReadOptions.
 * @parsers: (element-type GArrowTimestampParser): The list of
 *   #GArrowTimestampParser to be set. %NULL elements are ignored.
 *
 * Replaces the timestamp parsers of @options with @parsers. @options
 * takes its own reference on each parser; the list itself is not
 * adopted, so the caller keeps ownership of @parsers.
 *
 * Since: 16.0.0
 */
void
garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
                                              GList *parsers)
{
  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);

  /* Build the replacement list before releasing the old one. This
   * keeps priv->timestamp_parsers from ever pointing at freed nodes
   * (the old code prepended onto the just-freed list) and also makes
   * it safe to pass the list returned by
   * garrow_csv_read_options_get_timestamp_parsers() back in. */
  GList *new_parsers = nullptr;
  priv->convert_options.timestamp_parsers.clear();
  for (auto node = parsers; node; node = g_list_next(node)) {
    if (!node->data) {
      continue;
    }
    auto parser = GARROW_TIMESTAMP_PARSER(node->data);
    g_object_ref(parser);
    /* Prepend + a single reverse below keeps list building O(n). */
    new_parsers = g_list_prepend(new_parsers, parser);
    priv->convert_options.timestamp_parsers.push_back(
      garrow_timestamp_parser_get_raw(parser));
  }
  new_parsers = g_list_reverse(new_parsers);

  /* The new references are already taken, so dropping the old ones
   * cannot destroy a parser that appears in both lists. */
  g_list_free_full(priv->timestamp_parsers, g_object_unref);
  priv->timestamp_parsers = new_parsers;
}

/**
 * garrow_csv_read_options_get_timestamp_parsers:
 * @options: A #GArrowCSVReadOptions.
 *
 * Returns: (element-type GArrowTimestampParser) (transfer none):
 *   The list of #GArrowTimestampParser objects to be used. The list
 *   is the one stored in @options and must not be freed by the caller.
 *
 * Since: 16.0.0
 */
GList *
garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options)
{
  return GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options)->timestamp_parsers;
}

/**
 * garrow_csv_read_options_add_timestamp_parser:
 * @options: A #GArrowCSVReadOptions.
 * @parser: The #GArrowTimestampParser to be added.
 *
 * Appends @parser after the parsers already registered on @options.
 * Nothing happens when @parser is %NULL.
 *
 * Since: 16.0.0
 */
void
garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
                                             GArrowTimestampParser *parser)
{
  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
  if (!parser) {
    return;
  }

  /* Keep the GObject wrapper alive for as long as it is in the list. */
  g_object_ref(parser);
  priv->timestamp_parsers = g_list_append(priv->timestamp_parsers, parser);

  /* Mirror the addition into the C++ convert options. */
  auto raw_parser = garrow_timestamp_parser_get_raw(parser);
  priv->convert_options.timestamp_parsers.push_back(raw_parser);
}

typedef struct GArrowCSVReaderPrivate_
{
std::shared_ptr<arrow::csv::TableReader> reader;
Expand Down
18 changes: 14 additions & 4 deletions c_glib/arrow-glib/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,12 @@

#pragma once

#include <arrow-glib/input-stream.h>
#include <arrow-glib/metadata-version.h>
#include <arrow-glib/record-batch.h>
#include <arrow-glib/schema.h>
#include <arrow-glib/table.h>

#include <arrow-glib/input-stream.h>

#include <arrow-glib/metadata-version.h>
#include <arrow-glib/timestamp-parser.h>

G_BEGIN_DECLS

Expand Down Expand Up @@ -239,6 +238,17 @@ GARROW_AVAILABLE_IN_0_15
void
garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
const gchar *column_name);
/* Replaces the timestamp parsers of @options with the non-NULL
 * elements of @parsers. */
GARROW_AVAILABLE_IN_16_0
void
garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
GList *parsers);
/* Returns the list of timestamp parsers stored in @options
 * (transfer none). */
GARROW_AVAILABLE_IN_16_0
GList *
garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options);
/* Appends @parser to the timestamp parsers of @options; a NULL
 * @parser is ignored. */
GARROW_AVAILABLE_IN_16_0
void
garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
GArrowTimestampParser *parser);

#define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader, garrow_csv_reader, GARROW, CSV_READER, GObject)
Expand Down
15 changes: 15 additions & 0 deletions c_glib/test/test-csv-reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,21 @@ def test_generate_column_names
assert_equal(build_table(columns),
table.read)
end

def test_timestamp_parsers
  # Freshly created options report no timestamp parsers.
  read_options = Arrow::CSVReadOptions.new
  assert_equal([], read_options.timestamp_parsers)

  # Assigning a list makes it the reported parser list.
  iso8601_parser = Arrow::ISO8601TimestampParser.new
  read_options.timestamp_parsers = [iso8601_parser]
  assert_equal([iso8601_parser],
               read_options.timestamp_parsers)

  # add_timestamp_parser puts the new parser after the existing one.
  strptime_parser = Arrow::StrptimeTimestampParser.new("%Y-%m-%d")
  read_options.add_timestamp_parser(strptime_parser)
  assert_equal([iso8601_parser, strptime_parser],
               read_options.timestamp_parsers)
end
end
end
end
1 change: 1 addition & 0 deletions ruby/red-arrow/lib/arrow/loader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def require_libraries
require "arrow/timestamp-array"
require "arrow/timestamp-array-builder"
require "arrow/timestamp-data-type"
require "arrow/timestamp-parser"
require "arrow/union-array-builder"
require "arrow/writable"
end
Expand Down
33 changes: 33 additions & 0 deletions ruby/red-arrow/lib/arrow/timestamp-parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

module Arrow
  class TimestampParser
    # Builds a timestamp parser from a shorthand value.
    #
    # @param value [Object] +:iso8601+ for an ISO8601TimestampParser,
    #   or a String that is passed to StrptimeTimestampParser.new.
    # @return [TimestampParser, nil] the parser, or +nil+ when +value+
    #   is neither of the above.
    def self.try_convert(value)
      case value
      when :iso8601 then ISO8601TimestampParser.new
      when String then StrptimeTimestampParser.new(value)
      end
    end
  end
end
37 changes: 37 additions & 0 deletions ruby/red-arrow/test/test-csv-loader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -246,5 +246,42 @@ def load_csv(data, **options)
encoding: encoding,
compression: :gzip))
end

# Tests for the :timestamp_parsers option passed through load_csv.
sub_test_case(":timestamp_parsers") do
# The :iso8601 shorthand: the CSV rows are the ISO 8601 strings of
# the expected timestamps (with the trailing "Z"), and the loaded
# table is compared against a second-resolution UTC timestamp column.
test(":iso8601") do
data_type = Arrow::TimestampDataType.new(:second,
GLib::TimeZone.new("UTC"))
timestamps = [
Time.iso8601("2024-03-16T23:54:12Z"),
Time.iso8601("2024-03-16T23:54:13Z"),
Time.iso8601("2024-03-16T23:54:14Z"),
]
values = Arrow::TimestampArray.new(data_type, timestamps)
assert_equal(Arrow::Table.new(value: values),
load_csv(<<-CSV, headers: true, timestamp_parsers: [:iso8601]))
value
#{timestamps[0].iso8601}
#{timestamps[1].iso8601}
#{timestamps[2].iso8601}
CSV
end

# A String entry is used as a strptime-style format. The CSV rows
# are the same timestamps with the trailing "Z" removed. Only the
# schemas of the expected and loaded tables are compared here, not
# the column values.
test("String") do
timestamps = [
Time.iso8601("2024-03-16T23:54:12Z"),
Time.iso8601("2024-03-16T23:54:13Z"),
Time.iso8601("2024-03-16T23:54:14Z"),
]
values = Arrow::TimestampArray.new(:second, timestamps)
format = "%Y-%m-%dT%H:%M:%S"
assert_equal(Arrow::Table.new(value: values).schema,
load_csv(<<-CSV, headers: true, timestamp_parsers: [format]).schema)
value
#{timestamps[0].iso8601.chomp("Z")}
#{timestamps[1].iso8601.chomp("Z")}
#{timestamps[2].iso8601.chomp("Z")}
CSV
end
end
end
end

0 comments on commit 7d3f7b3

Please sign in to comment.