Skip to content

Commit

Permalink
ORC-1151: [C++] Fix ColumnWriter for non-UTC Timestamp columns (#1088)
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Fix the conversion of non-UTC timestamps when computing column statistics.

### Why are the changes needed?
Currently, the statistics for timestamp columns are incorrect when the writer's time zone is not UTC.

### How was this patch tested?
Ran the existing test cases.

(cherry picked from commit 9042421)
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
noirello authored and dongjoon-hyun committed Apr 22, 2022
1 parent 24ab20e commit 1740c82
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 1 deletion.
2 changes: 1 addition & 1 deletion c++/src/ColumnWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1809,7 +1809,7 @@ namespace orc {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
if (!isUTC) {
millsUTC = timezone.convertToUTC(millsUTC);
millsUTC = timezone.convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000;
}
++count;
if (enableBloomFilter) {
Expand Down
97 changes: 97 additions & 0 deletions c++/test/TestTimestampStatistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@

#include "Adaptor.hh"

#include "MemoryInputStream.hh"
#include "MemoryOutputStream.hh"

#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"

namespace orc {

// Size argument passed to the MemoryOutputStream constructor by the writer tests below.
static const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M

TEST(TestTimestampStatistics, testOldFile) {

std::stringstream ss;
Expand Down Expand Up @@ -57,4 +62,96 @@ namespace orc {
EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum: 1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString());
}

// Writes two timestamps one second apart with default writer options (no timezone is set
// explicitly; presumably the default is UTC/GMT, as the test name suggests), reads the file
// back from memory and checks that the stripe-level timestamp statistics report the written
// values unchanged.
TEST(TestTimestampStatistics, testTimezoneUTC) {
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  MemoryPool *pool = getDefaultPool();
  std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col:timestamp>"));
  WriterOptions wOptions;
  wOptions.setMemoryPool(pool);
  std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
  std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);

  // Abort the test cleanly instead of dereferencing a null pointer if a cast fails.
  StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
  ASSERT_TRUE(root != nullptr);
  TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch *>(root->fields[0]);
  ASSERT_TRUE(col != nullptr);

  int64_t expectedMinMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
  int64_t expectedMaxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00

  // Split each millisecond value into whole seconds (data) and the sub-second
  // remainder expressed in nanoseconds.
  col->data[0] = expectedMinMillis / 1000;
  col->nanoseconds[0] = expectedMinMillis % 1000 * 1000000;
  col->data[1] = expectedMaxMillis / 1000;
  col->nanoseconds[1] = expectedMaxMillis % 1000 * 1000000;
  col->numElements = 2;
  root->numElements = 2;

  writer->add(*batch);
  writer->close();

  // Read back what was just written to the in-memory stream.
  std::unique_ptr<InputStream> inStream(new MemoryInputStream(
      memStream.getData(), memStream.getLength()));
  ReaderOptions rOptions;
  rOptions.setMemoryPool(*pool);
  std::unique_ptr<Reader> reader = createReader(std::move(inStream), rOptions);

  // Column 0 is the root struct; column 1 is the timestamp field "col".
  std::unique_ptr<StripeStatistics> stripeStats = reader->getStripeStatistics(0);
  const TimestampColumnStatistics* stripeColStats =
      dynamic_cast<const TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
  ASSERT_TRUE(stripeColStats != nullptr);

  EXPECT_TRUE(stripeColStats->hasLowerBound());
  EXPECT_TRUE(stripeColStats->hasUpperBound());
  EXPECT_TRUE(stripeColStats->hasMinimum());
  EXPECT_TRUE(stripeColStats->hasMaximum());
  EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
  EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
  EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
  // The upper bound is expected to be one millisecond above the maximum.
  EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
}

// Writes two timestamps one second apart with the writer timezone set to
// "America/Los_Angeles", reads the file back from memory and checks the stripe-level
// timestamp statistics. The expected statistics are the written values shifted by
// -25,200,000 ms (7 hours), i.e. the result of the writer's timezone conversion for
// this date.
TEST(TestTimestampStatistics, testTimezoneNonUTC) {
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  MemoryPool *pool = getDefaultPool();
  std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col:timestamp>"));
  WriterOptions wOptions;
  wOptions.setMemoryPool(pool);
  wOptions.setTimezoneName("America/Los_Angeles");
  std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
  std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);

  // Abort the test cleanly instead of dereferencing a null pointer if a cast fails.
  StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
  ASSERT_TRUE(root != nullptr);
  TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch *>(root->fields[0]);
  ASSERT_TRUE(col != nullptr);

  int64_t minMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
  int64_t maxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00

  // Split each millisecond value into whole seconds (data) and the sub-second
  // remainder expressed in nanoseconds.
  col->data[0] = minMillis / 1000;
  col->nanoseconds[0] = minMillis % 1000 * 1000000;
  col->data[1] = maxMillis / 1000;
  col->nanoseconds[1] = maxMillis % 1000 * 1000000;
  col->numElements = 2;
  root->numElements = 2;

  writer->add(*batch);
  writer->close();

  // Read back what was just written to the in-memory stream.
  std::unique_ptr<InputStream> inStream(new MemoryInputStream(
      memStream.getData(), memStream.getLength()));
  ReaderOptions rOptions;
  rOptions.setMemoryPool(*pool);
  std::unique_ptr<Reader> reader = createReader(std::move(inStream), rOptions);

  // Column 0 is the root struct; column 1 is the timestamp field "col".
  std::unique_ptr<StripeStatistics> stripeStats = reader->getStripeStatistics(0);
  const TimestampColumnStatistics* stripeColStats =
      dynamic_cast<const TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
  ASSERT_TRUE(stripeColStats != nullptr);

  // Written values minus 7 hours.
  int64_t expectedMaxMillis = 1650108764321; // 2022-04-16T11:32:44.321+00:00
  int64_t expectedMinMillis = 1650108763321; // 2022-04-16T11:32:43.321+00:00

  EXPECT_TRUE(stripeColStats->hasLowerBound());
  EXPECT_TRUE(stripeColStats->hasUpperBound());
  EXPECT_TRUE(stripeColStats->hasMinimum());
  EXPECT_TRUE(stripeColStats->hasMaximum());
  EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
  EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
  EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
  // The upper bound is expected to be one millisecond above the maximum.
  EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
}

} // namespace

0 comments on commit 1740c82

Please sign in to comment.