Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions c++/include/orc/Reader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,18 @@ namespace orc {
*/
const std::string& getTimezoneName() const;

/**
 * Set whether date and timestamp values returned by this row reader should use the
 * proleptic Gregorian calendar.
 * @param useProlepticGregorian true to request proleptic Gregorian values
 * @return a RowReaderOptions reference (presumably this object, so that setters
 *         can be chained; the implementation is not visible here)
 */
RowReaderOptions& setUseProlepticGregorian(bool useProlepticGregorian);

/**
 * Get whether date and timestamp values returned by this row reader should use the
 * proleptic Gregorian calendar.
 * @return true if proleptic Gregorian values were requested for this row reader
 */
bool getUseProlepticGregorian() const;

/**
* Get the IdReadIntentMap map that was supplied by client.
*/
Expand Down Expand Up @@ -476,6 +488,11 @@ namespace orc {
*/
virtual uint32_t getWriterIdValue() const = 0;

/**
 * Was this file written using the proleptic Gregorian calendar for date and timestamp values?
 * @return true if the writer used the proleptic Gregorian calendar, false otherwise
 */
virtual bool writerUsedProlepticGregorian() const = 0;

/**
* Get the version of the writer.
* @return the version of the writer.
Expand Down
1 change: 1 addition & 0 deletions c++/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ set(SOURCE_FILES
ColumnWriter.cc
Common.cc
Compression.cc
DateUtils.cc
Exceptions.cc
Int128.cc
LzoDecompressor.cc
Expand Down
57 changes: 54 additions & 3 deletions c++/src/ColumnReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "Adaptor.hh"
#include "ByteRLE.hh"
#include "ColumnReader.hh"
#include "DateUtils.hh"
#include "RLE.hh"
#include "orc/Exceptions.hh"

Expand Down Expand Up @@ -275,6 +276,38 @@ namespace orc {
}
};

/**
 * Column reader for DATE values. It reads values through
 * IntegerColumnReader<LongVectorBatch> and then, when the writer's calendar
 * differs from the calendar requested by the reader, rewrites each non-null
 * day number with convertDate().
 */
class DateColumnReader : public IntegerColumnReader<LongVectorBatch> {
 private:
  // Calendar flag the stripe reports for the file's writer.
  const bool writerUsedProleptic;
  // Calendar flag the stripe reports for the values to hand back.
  const bool useProleptic;

  /**
   * Convert the first numValues entries of batch.data in place.
   * Entries flagged as null are left untouched.
   */
  void convertCalendar(LongVectorBatch& batch, uint64_t numValues) {
    if (writerUsedProleptic != useProleptic) {
      // Only consult the null mask when the batch says it has nulls.
      const char* present = nullptr;
      if (batch.hasNulls) {
        present = batch.notNull.data();
      }
      for (uint64_t row = 0; row < numValues; ++row) {
        const bool isNull = (present != nullptr) && !present[row];
        if (isNull) {
          continue;
        }
        const int32_t storedDay = static_cast<int32_t>(batch.data[row]);
        batch.data[row] = convertDate(storedDay, writerUsedProleptic, useProleptic);
      }
    }
  }

 public:
  /**
   * @param type the column type, forwarded to the integer reader
   * @param stripe source of the column streams and of both calendar flags
   */
  DateColumnReader(const Type& type, StripeStreams& stripe)
      : IntegerColumnReader<LongVectorBatch>(type, stripe),
        writerUsedProleptic(stripe.writerUsedProlepticGregorian()),
        useProleptic(stripe.useProlepticGregorian()) {}

  /**
   * Delegate to the integer reader, then apply the calendar conversion to
   * the same batch.
   */
  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull,
            const ReadPhase& readPhase, uint16_t* sel_rowid_idx, size_t sel_size) override {
    IntegerColumnReader<LongVectorBatch>::next(rowBatch, numValues, notNull, readPhase,
                                               sel_rowid_idx, sel_size);
    LongVectorBatch& longBatch = dynamic_cast<LongVectorBatch&>(rowBatch);
    convertCalendar(longBatch, numValues);
  }
};

class TimestampColumnReader : public ColumnReader {
private:
std::unique_ptr<orc::RleDecoder> secondsRle;
Expand All @@ -283,6 +316,8 @@ namespace orc {
const Timezone& readerTimezone;
const int64_t epochOffset;
const bool sameTimezone;
const bool writerUsedProleptic;
const bool useProleptic;

void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull,
const ReadPhase& readPhase);
Expand All @@ -302,6 +337,8 @@ namespace orc {

void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions,
const ReadPhase& readPhase) override;

void convertCalendar(int64_t& seconds, int64_t nanoseconds) const;
};

TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe,
Expand All @@ -310,7 +347,9 @@ namespace orc {
writerTimezone(isInstantType ? getTimezoneByName("GMT") : stripe.getWriterTimezone()),
readerTimezone(isInstantType ? getTimezoneByName("GMT") : stripe.getReaderTimezone()),
epochOffset(writerTimezone.getEpoch()),
sameTimezone(&writerTimezone == &readerTimezone) {
sameTimezone(&writerTimezone == &readerTimezone),
writerUsedProleptic(stripe.writerUsedProlepticGregorian()),
useProleptic(stripe.useProlepticGregorian()) {
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
Expand All @@ -325,6 +364,15 @@ namespace orc {
// PASS
}

/**
 * Shift a timestamp's seconds from the writer's calendar to the calendar
 * requested by the reader. Does nothing when both calendars agree.
 *
 * @param seconds seconds value to adjust in place
 * @param nanoseconds sub-second part; only its whole milliseconds take part
 *        in the computation and the value itself is not modified
 */
void TimestampColumnReader::convertCalendar(int64_t& seconds, int64_t nanoseconds) const {
  if (writerUsedProleptic != useProleptic) {
    // Build a millisecond timestamp, convert it, and apply the resulting
    // shift to the seconds value.
    const int64_t originalMillis = seconds * 1000 + nanoseconds / 1000000;
    const int64_t shiftMillis =
        convertTime(originalMillis, writerUsedProleptic, useProleptic) - originalMillis;
    seconds += shiftMillis / 1000;
  }
}

uint64_t TimestampColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) {
numValues = ColumnReader::skip(numValues, readPhase);
numValues = skipInternal(numValues, readPhase);
Expand Down Expand Up @@ -385,6 +433,7 @@ namespace orc {
if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) {
secsBuffer[i] -= 1;
}
convertCalendar(secsBuffer[i], nanoBuffer[i]);
}
}
}
Expand Down Expand Up @@ -427,9 +476,10 @@ namespace orc {
}
}
secsBuffer[idx] = writerTime;
if (secsBuffer[idx] < 0 && nanoBuffer[i] > 999999) {
if (secsBuffer[idx] < 0 && nanoBuffer[idx] > 999999) {
secsBuffer[idx] -= 1;
}
convertCalendar(secsBuffer[idx], nanoBuffer[idx]);
}
}
}
Expand Down Expand Up @@ -2341,8 +2391,9 @@ namespace orc {
}
}
case LONG:
case DATE:
return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe);
case DATE:
return std::make_unique<DateColumnReader>(type, stripe);
case BINARY:
case CHAR:
case STRING:
Expand Down
4 changes: 4 additions & 0 deletions c++/src/ColumnReader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ namespace orc {
*/
virtual const Timezone& getReaderTimezone() const = 0;

/**
 * Whether the file being read was written using the proleptic Gregorian
 * calendar for date and timestamp values.
 */
virtual bool writerUsedProlepticGregorian() const = 0;

/**
 * Whether date and timestamp values handed back by the column readers
 * should use the proleptic Gregorian calendar.
 */
virtual bool useProlepticGregorian() const = 0;

/**
* Get the error stream.
* @return a pointer to the stream that should get error messages
Expand Down
160 changes: 160 additions & 0 deletions c++/src/DateUtils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "DateUtils.hh"

#include <cstdio>
#include <stdexcept>

namespace orc {
namespace {
// Julian Day Number of 1970-01-01 (Gregorian); subtracting it from a JDN
// yields a day count relative to the Unix epoch.
constexpr int64_t UNIX_EPOCH_JDN = 2440588;
// Day count relative to the Unix epoch of the Gregorian cutover date. Day
// counts at or above this value are returned unchanged by the conversions.
constexpr int32_t SWITCHOVER_DAYS = -141427; // 1582-10-15
// Number of milliseconds in one 24-hour day.
constexpr int64_t MILLIS_PER_DAY = 24LL * 60 * 60 * 1000;

// A calendar date split into year, month and day numbers. The struct itself
// does not say which calendar (Julian or Gregorian) the fields refer to; that
// is decided by the function producing or consuming it.
struct CivilDate {
int32_t year;
int32_t month;
int32_t day;
};

/**
 * Compute the Julian Day Number of a (proleptic) Gregorian calendar date.
 *
 * The year-based terms use floor division. The built-in '/' truncates toward
 * zero, which is wrong once the shifted year (year + 4800, minus one for
 * January/February) becomes negative and makes adjacent days collide.
 *
 * @param year year number; may be zero or negative
 * @param month month of the year, expected in [1, 12]
 * @param day day of the month
 * @return the Julian Day Number of the date
 */
int64_t gregorianJdn(int32_t year, int32_t month, int32_t day) {
  // Floor division for a strictly positive divisor.
  const auto floorDiv = [](int64_t value, int64_t divisor) {
    const int64_t quotient = value / divisor;
    return (value % divisor < 0) ? quotient - 1 : quotient;
  };
  // Count January and February as the last months of the previous year so
  // that a leap day falls at the very end of the shifted year.
  const int32_t a = (14 - month) / 12;
  const int64_t y = static_cast<int64_t>(year) + 4800 - a;
  const int32_t m = month + 12 * a - 3;
  return day + (153 * m + 2) / 5 + 365 * y + floorDiv(y, 4) - floorDiv(y, 100) +
         floorDiv(y, 400) - 32045;
}

/**
 * Compute the Julian Day Number of a Julian calendar date.
 *
 * The leap-year term uses floor division. The built-in '/' truncates toward
 * zero, which is wrong once the shifted year (year + 4800, minus one for
 * January/February) becomes negative.
 *
 * @param year year number; may be zero or negative
 * @param month month of the year, expected in [1, 12]
 * @param day day of the month
 * @return the Julian Day Number of the date
 */
int64_t julianJdn(int32_t year, int32_t month, int32_t day) {
  // Floor division for a strictly positive divisor.
  const auto floorDiv = [](int64_t value, int64_t divisor) {
    const int64_t quotient = value / divisor;
    return (value % divisor < 0) ? quotient - 1 : quotient;
  };
  // Count January and February as the last months of the previous year so
  // that a leap day falls at the very end of the shifted year.
  const int32_t a = (14 - month) / 12;
  const int64_t y = static_cast<int64_t>(year) + 4800 - a;
  const int32_t m = month + 12 * a - 3;
  return day + (153 * m + 2) / 5 + 365 * y + floorDiv(y, 4) - 32083;
}

/**
 * Decode a Julian Day Number into a (proleptic) Gregorian calendar date.
 *
 * The two steps that can see a negative operand (the 400-year-cycle
 * computation) use floor division so that day numbers below -32044 decode
 * correctly; the built-in '/' truncates toward zero. After those steps the
 * remaining day offset is non-negative, so the plain divisions that follow
 * are unaffected.
 *
 * @param jdn Julian Day Number, may be negative
 * @return the year, month and day of that day in the Gregorian calendar
 */
CivilDate gregorianFromJdn(int64_t jdn) {
  // Floor division for a strictly positive divisor.
  const auto floorDiv = [](int64_t value, int64_t divisor) {
    const int64_t quotient = value / divisor;
    return (value % divisor < 0) ? quotient - 1 : quotient;
  };
  const int64_t a = jdn + 32044;
  const int64_t b = floorDiv(4 * a + 3, 146097);
  // With floor division c is always >= 0, whatever the sign of a.
  const int64_t c = a - floorDiv(146097 * b, 4);
  const int64_t d = (4 * c + 3) / 1461;
  const int64_t e = c - (1461 * d) / 4;
  const int64_t m = (5 * e + 2) / 153;
  return {static_cast<int32_t>(100 * b + d - 4800 + m / 10),
          static_cast<int32_t>(m + 3 - 12 * (m / 10)),
          static_cast<int32_t>(e - (153 * m + 2) / 5 + 1)};
}

/**
 * Decode a Julian Day Number into a Julian calendar date.
 *
 * The 4-year-cycle computation uses floor division so that day numbers below
 * -32082 decode correctly; the built-in '/' truncates toward zero. After that
 * step the remaining day offset is non-negative, so the plain divisions that
 * follow are unaffected.
 *
 * @param jdn Julian Day Number, may be negative
 * @return the year, month and day of that day in the Julian calendar
 */
CivilDate julianFromJdn(int64_t jdn) {
  // Floor division for a strictly positive divisor.
  const auto floorDiv = [](int64_t value, int64_t divisor) {
    const int64_t quotient = value / divisor;
    return (value % divisor < 0) ? quotient - 1 : quotient;
  };
  const int64_t c = jdn + 32082;
  const int64_t d = floorDiv(4 * c + 3, 1461);
  // With floor division e is always >= 0, whatever the sign of c.
  const int64_t e = c - floorDiv(1461 * d, 4);
  const int64_t m = (5 * e + 2) / 153;
  return {static_cast<int32_t>(d - 4800 + m / 10),
          static_cast<int32_t>(m + 3 - 12 * (m / 10)),
          static_cast<int32_t>(e - (153 * m + 2) / 5 + 1)};
}

/**
 * Tell whether a year/month/day triple is 1582-10-15 or later, comparing the
 * year first, then the month, then the day.
 */
bool isOnOrAfterGregorianCutover(const CivilDate& date) {
  if (date.year > 1582) {
    return true;
  }
  if (date.year < 1582) {
    return false;
  }
  // Same year as the cutover: decide on the month, then on the day.
  if (date.month > 10) {
    return true;
  }
  if (date.month < 10) {
    return false;
  }
  return date.day >= 15;
}

/**
 * Parse text of the form "<year>-<month>-<day>" into its three numbers.
 *
 * @param date text to parse; anything following the day number is ignored
 * @return the parsed year, month and day
 * @throws std::invalid_argument if three '-'-separated integers cannot be
 *         read, or if the month is outside [1, 12] or the day outside [1, 31]
 */
CivilDate parseDate(const std::string& date) {
  // "%d" requires int*; scan into plain ints instead of relying on int32_t
  // being the same type as int.
  int year = 0;
  int month = 0;
  int day = 0;
  if (std::sscanf(date.c_str(), "%d-%d-%d", &year, &month, &day) != 3) {
    throw std::invalid_argument("Invalid date: " + date);
  }
  // The day-number formulas fed by this parser only hold for real months and
  // days, so reject anything else instead of producing a bogus day count.
  if (month < 1 || month > 12 || day < 1 || day > 31) {
    throw std::invalid_argument("Invalid date: " + date);
  }
  return {static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day)};
}

/**
 * Integer division that rounds toward negative infinity.
 *
 * @param value the dividend
 * @param divisor the divisor
 * @return the largest integer not greater than value / divisor
 */
int64_t floorDivide(int64_t value, int64_t divisor) {
  const int64_t truncated = value / divisor;
  const int64_t rest = value % divisor;
  // '/' rounds toward zero; step one further down when the division is
  // inexact and the exact result is negative.
  const bool inexact = (rest != 0);
  const bool signsDiffer = (rest < 0) != (divisor < 0);
  return (inexact && signsDiffer) ? truncated - 1 : truncated;
}
} // namespace

int32_t parseHybridDate(const std::string& date) {
const CivilDate civilDate = parseDate(date);
const int64_t jdn = isOnOrAfterGregorianCutover(civilDate)
? gregorianJdn(civilDate.year, civilDate.month, civilDate.day)
: julianJdn(civilDate.year, civilDate.month, civilDate.day);
return static_cast<int32_t>(jdn - UNIX_EPOCH_JDN);
}

int32_t parseProlepticDate(const std::string& date) {
const CivilDate civilDate = parseDate(date);
return static_cast<int32_t>(
gregorianJdn(civilDate.year, civilDate.month, civilDate.day) - UNIX_EPOCH_JDN);
}

int32_t convertDateToProleptic(int32_t hybrid) {
if (hybrid >= SWITCHOVER_DAYS) {
return hybrid;
}
const CivilDate hybridDate = julianFromJdn(static_cast<int64_t>(hybrid) + UNIX_EPOCH_JDN);
return static_cast<int32_t>(
gregorianJdn(hybridDate.year, hybridDate.month, hybridDate.day) - UNIX_EPOCH_JDN);
}

int32_t convertDateToHybrid(int32_t proleptic) {
if (proleptic >= SWITCHOVER_DAYS) {
return proleptic;
}
const CivilDate prolepticDate =
gregorianFromJdn(static_cast<int64_t>(proleptic) + UNIX_EPOCH_JDN);
return static_cast<int32_t>(
julianJdn(prolepticDate.year, prolepticDate.month, prolepticDate.day) - UNIX_EPOCH_JDN);
}

/**
 * Convert a day count between the hybrid and proleptic Gregorian calendars.
 *
 * @param original day count relative to 1970-01-01
 * @param fromProleptic whether original is in the proleptic calendar
 * @param toProleptic whether the result should be in the proleptic calendar
 * @return original when both flags agree, otherwise the converted day count
 */
int32_t convertDate(int32_t original, bool fromProleptic, bool toProleptic) {
  if (fromProleptic != toProleptic) {
    if (toProleptic) {
      return convertDateToProleptic(original);
    }
    return convertDateToHybrid(original);
  }
  return original;
}

int64_t convertTimeToProleptic(int64_t hybridMillis) {
const int64_t hybridDay = floorDivide(hybridMillis, MILLIS_PER_DAY);
const int64_t millisOfDay = hybridMillis - hybridDay * MILLIS_PER_DAY;
return static_cast<int64_t>(convertDateToProleptic(static_cast<int32_t>(hybridDay))) *
MILLIS_PER_DAY +
millisOfDay;
}

int64_t convertTimeToHybrid(int64_t prolepticMillis) {
const int64_t prolepticDay = floorDivide(prolepticMillis, MILLIS_PER_DAY);
const int64_t millisOfDay = prolepticMillis - prolepticDay * MILLIS_PER_DAY;
return static_cast<int64_t>(convertDateToHybrid(static_cast<int32_t>(prolepticDay))) *
MILLIS_PER_DAY +
millisOfDay;
}

/**
 * Convert a millisecond timestamp between the hybrid and proleptic Gregorian
 * calendars.
 *
 * @param originalMillis milliseconds relative to the Unix epoch
 * @param fromProleptic whether originalMillis is in the proleptic calendar
 * @param toProleptic whether the result should be in the proleptic calendar
 * @return originalMillis when both flags agree, otherwise the converted value
 */
int64_t convertTime(int64_t originalMillis, bool fromProleptic, bool toProleptic) {
  if (fromProleptic != toProleptic) {
    if (toProleptic) {
      return convertTimeToProleptic(originalMillis);
    }
    return convertTimeToHybrid(originalMillis);
  }
  return originalMillis;
}

} // namespace orc
40 changes: 40 additions & 0 deletions c++/src/DateUtils.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef ORC_DATE_UTILS_HH
#define ORC_DATE_UTILS_HH

#include <stdint.h>
#include <string>

namespace orc {

// Helpers for moving date and timestamp values between the hybrid
// Julian/Gregorian calendar (Julian before 1582-10-15, Gregorian from then on)
// and the proleptic Gregorian calendar (Gregorian rules for every date).
// Dates are day counts and timestamps are millisecond counts, both relative
// to 1970-01-01.

/**
 * Parse "<year>-<month>-<day>" as a hybrid-calendar date.
 * @return the day count relative to 1970-01-01
 * @throws std::invalid_argument when the text is rejected as a date (at
 *         minimum, when it does not begin with three '-'-separated integers)
 */
int32_t parseHybridDate(const std::string& date);
/**
 * Parse "<year>-<month>-<day>" as a proleptic Gregorian date.
 * @return the day count relative to 1970-01-01
 * @throws std::invalid_argument when the text is rejected as a date (at
 *         minimum, when it does not begin with three '-'-separated integers)
 */
int32_t parseProlepticDate(const std::string& date);

/**
 * Convert a hybrid-calendar day count to the proleptic Gregorian day count
 * with the same year/month/day. Days on or after 1582-10-15 are unchanged.
 */
int32_t convertDateToProleptic(int32_t hybrid);
/**
 * Convert a proleptic Gregorian day count to the hybrid-calendar day count
 * with the same year/month/day. Days on or after 1582-10-15 are unchanged.
 */
int32_t convertDateToHybrid(int32_t proleptic);
/**
 * Convert a day count between the two calendars. Returns original unchanged
 * when fromProleptic equals toProleptic; otherwise toProleptic selects the
 * direction of the conversion.
 */
int32_t convertDate(int32_t original, bool fromProleptic, bool toProleptic);

/**
 * Convert a hybrid-calendar millisecond timestamp to the proleptic Gregorian
 * calendar: the day part is converted, the time of day is kept.
 */
int64_t convertTimeToProleptic(int64_t hybridMillis);
/**
 * Convert a proleptic Gregorian millisecond timestamp to the hybrid calendar:
 * the day part is converted, the time of day is kept.
 */
int64_t convertTimeToHybrid(int64_t prolepticMillis);
/**
 * Convert a millisecond timestamp between the two calendars. Returns
 * originalMillis unchanged when fromProleptic equals toProleptic; otherwise
 * toProleptic selects the direction of the conversion.
 */
int64_t convertTime(int64_t originalMillis, bool fromProleptic, bool toProleptic);

} // namespace orc

#endif // ORC_DATE_UTILS_HH
Loading