diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 1ab60b1d2c5..cc3f08a6349 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -367,6 +367,18 @@ namespace orc { */ const std::string& getTimezoneName() const; + /** + * Set whether date and timestamp values returned by this row reader should use the + * proleptic Gregorian calendar. + */ + RowReaderOptions& setUseProlepticGregorian(bool useProlepticGregorian); + + /** + * Get whether date and timestamp values returned by this row reader should use the + * proleptic Gregorian calendar. + */ + bool getUseProlepticGregorian() const; + /** * Get the IdReadIntentMap map that was supplied by client. */ @@ -476,6 +488,11 @@ namespace orc { */ virtual uint32_t getWriterIdValue() const = 0; + /** + * Was this file written using the proleptic Gregorian calendar for date and timestamp values? + */ + virtual bool writerUsedProlepticGregorian() const = 0; + /** * Get the version of the writer. * @return the version of the writer. diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index b9904160b83..c19ca87b8f8 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -172,6 +172,7 @@ set(SOURCE_FILES ColumnWriter.cc Common.cc Compression.cc + DateUtils.cc Exceptions.cc Int128.cc LzoDecompressor.cc diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index c8b03fc435e..3f9439d5d9f 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -21,6 +21,7 @@ #include "Adaptor.hh" #include "ByteRLE.hh" #include "ColumnReader.hh" +#include "DateUtils.hh" #include "RLE.hh" #include "orc/Exceptions.hh" @@ -275,6 +276,38 @@ namespace orc { } }; + class DateColumnReader : public IntegerColumnReader { + private: + const bool writerUsedProleptic; + const bool useProleptic; + + void convertCalendar(LongVectorBatch& batch, uint64_t numValues) { + if (writerUsedProleptic == useProleptic) { + return; + } + const char* notNull = batch.hasNulls ? 
batch.notNull.data() : nullptr; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + batch.data[i] = + convertDate(static_cast(batch.data[i]), writerUsedProleptic, useProleptic); + } + } + } + + public: + DateColumnReader(const Type& type, StripeStreams& stripe) + : IntegerColumnReader(type, stripe), + writerUsedProleptic(stripe.writerUsedProlepticGregorian()), + useProleptic(stripe.useProlepticGregorian()) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull, + const ReadPhase& readPhase, uint16_t* sel_rowid_idx, size_t sel_size) override { + IntegerColumnReader::next(rowBatch, numValues, notNull, readPhase, + sel_rowid_idx, sel_size); + convertCalendar(dynamic_cast(rowBatch), numValues); + } + }; + class TimestampColumnReader : public ColumnReader { private: std::unique_ptr secondsRle; @@ -283,6 +316,8 @@ namespace orc { const Timezone& readerTimezone; const int64_t epochOffset; const bool sameTimezone; + const bool writerUsedProleptic; + const bool useProleptic; void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull, const ReadPhase& readPhase); @@ -302,6 +337,8 @@ namespace orc { void seekToRowGroup(std::unordered_map& positions, const ReadPhase& readPhase) override; + + void convertCalendar(int64_t& seconds, int64_t nanoseconds) const; }; TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe, @@ -310,7 +347,9 @@ namespace orc { writerTimezone(isInstantType ? getTimezoneByName("GMT") : stripe.getWriterTimezone()), readerTimezone(isInstantType ? 
getTimezoneByName("GMT") : stripe.getReaderTimezone()), epochOffset(writerTimezone.getEpoch()), - sameTimezone(&writerTimezone == &readerTimezone) { + sameTimezone(&writerTimezone == &readerTimezone), + writerUsedProleptic(stripe.writerUsedProlepticGregorian()), + useProleptic(stripe.useProlepticGregorian()) { RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); @@ -325,6 +364,15 @@ namespace orc { // PASS } + void TimestampColumnReader::convertCalendar(int64_t& seconds, int64_t nanoseconds) const { + if (writerUsedProleptic == useProleptic) { + return; + } + const int64_t millis = seconds * 1000 + nanoseconds / 1000000; + const int64_t convertedMillis = convertTime(millis, writerUsedProleptic, useProleptic); + seconds += (convertedMillis - millis) / 1000; + } + uint64_t TimestampColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) { numValues = ColumnReader::skip(numValues, readPhase); numValues = skipInternal(numValues, readPhase); @@ -385,6 +433,7 @@ namespace orc { if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) { secsBuffer[i] -= 1; } + convertCalendar(secsBuffer[i], nanoBuffer[i]); } } } @@ -427,9 +476,10 @@ namespace orc { } } secsBuffer[idx] = writerTime; - if (secsBuffer[idx] < 0 && nanoBuffer[i] > 999999) { + if (secsBuffer[idx] < 0 && nanoBuffer[idx] > 999999) { secsBuffer[idx] -= 1; } + convertCalendar(secsBuffer[idx], nanoBuffer[idx]); } } } @@ -2341,8 +2391,9 @@ namespace orc { } } case LONG: - case DATE: return std::make_unique>(type, stripe); + case DATE: + return std::make_unique(type, stripe); case BINARY: case CHAR: case STRING: diff --git a/c++/src/ColumnReader.hh b/c++/src/ColumnReader.hh index b22bbc9617c..62485b4ee2c 100644 --- a/c++/src/ColumnReader.hh +++ b/c++/src/ColumnReader.hh @@ -77,6 +77,10 @@ namespace orc { */ virtual const Timezone& getReaderTimezone() const = 0; + virtual bool writerUsedProlepticGregorian() const 
= 0; + + virtual bool useProlepticGregorian() const = 0; + /** * Get the error stream. * @return a pointer to the stream that should get error messages diff --git a/c++/src/DateUtils.cc b/c++/src/DateUtils.cc new file mode 100644 index 00000000000..4cdee789bf5 --- /dev/null +++ b/c++/src/DateUtils.cc @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DateUtils.hh" + +#include +#include + +namespace orc { + namespace { + constexpr int64_t UNIX_EPOCH_JDN = 2440588; + constexpr int32_t SWITCHOVER_DAYS = -141427; // 1582-10-15 + constexpr int64_t MILLIS_PER_DAY = 24LL * 60 * 60 * 1000; + + struct CivilDate { + int32_t year; + int32_t month; + int32_t day; + }; + + int64_t gregorianJdn(int32_t year, int32_t month, int32_t day) { + const int32_t a = (14 - month) / 12; + const int64_t y = static_cast(year) + 4800 - a; + const int32_t m = month + 12 * a - 3; + return day + (153 * m + 2) / 5 + 365 * y + y / 4 - y / 100 + y / 400 - 32045; + } + + int64_t julianJdn(int32_t year, int32_t month, int32_t day) { + const int32_t a = (14 - month) / 12; + const int64_t y = static_cast(year) + 4800 - a; + const int32_t m = month + 12 * a - 3; + return day + (153 * m + 2) / 5 + 365 * y + y / 4 - 32083; + } + + CivilDate gregorianFromJdn(int64_t jdn) { + const int64_t a = jdn + 32044; + const int64_t b = (4 * a + 3) / 146097; + const int64_t c = a - (146097 * b) / 4; + const int64_t d = (4 * c + 3) / 1461; + const int64_t e = c - (1461 * d) / 4; + const int64_t m = (5 * e + 2) / 153; + return {static_cast(100 * b + d - 4800 + m / 10), + static_cast(m + 3 - 12 * (m / 10)), + static_cast(e - (153 * m + 2) / 5 + 1)}; + } + + CivilDate julianFromJdn(int64_t jdn) { + const int64_t c = jdn + 32082; + const int64_t d = (4 * c + 3) / 1461; + const int64_t e = c - (1461 * d) / 4; + const int64_t m = (5 * e + 2) / 153; + return {static_cast(d - 4800 + m / 10), + static_cast(m + 3 - 12 * (m / 10)), + static_cast(e - (153 * m + 2) / 5 + 1)}; + } + + bool isOnOrAfterGregorianCutover(const CivilDate& date) { + if (date.year != 1582) return date.year > 1582; + if (date.month != 10) return date.month > 10; + return date.day >= 15; + } + + CivilDate parseDate(const std::string& date) { + CivilDate result {0, 0, 0}; + if (std::sscanf(date.c_str(), "%d-%d-%d", &result.year, &result.month, &result.day) != 3) { + throw 
std::invalid_argument("Invalid date: " + date); + } + return result; + } + + int64_t floorDivide(int64_t value, int64_t divisor) { + int64_t quotient = value / divisor; + const int64_t remainder = value % divisor; + if (remainder != 0 && ((remainder < 0) != (divisor < 0))) { + --quotient; + } + return quotient; + } + } // namespace + + int32_t parseHybridDate(const std::string& date) { + const CivilDate civilDate = parseDate(date); + const int64_t jdn = isOnOrAfterGregorianCutover(civilDate) + ? gregorianJdn(civilDate.year, civilDate.month, civilDate.day) + : julianJdn(civilDate.year, civilDate.month, civilDate.day); + return static_cast(jdn - UNIX_EPOCH_JDN); + } + + int32_t parseProlepticDate(const std::string& date) { + const CivilDate civilDate = parseDate(date); + return static_cast( + gregorianJdn(civilDate.year, civilDate.month, civilDate.day) - UNIX_EPOCH_JDN); + } + + int32_t convertDateToProleptic(int32_t hybrid) { + if (hybrid >= SWITCHOVER_DAYS) { + return hybrid; + } + const CivilDate hybridDate = julianFromJdn(static_cast(hybrid) + UNIX_EPOCH_JDN); + return static_cast( + gregorianJdn(hybridDate.year, hybridDate.month, hybridDate.day) - UNIX_EPOCH_JDN); + } + + int32_t convertDateToHybrid(int32_t proleptic) { + if (proleptic >= SWITCHOVER_DAYS) { + return proleptic; + } + const CivilDate prolepticDate = + gregorianFromJdn(static_cast(proleptic) + UNIX_EPOCH_JDN); + return static_cast( + julianJdn(prolepticDate.year, prolepticDate.month, prolepticDate.day) - UNIX_EPOCH_JDN); + } + + int32_t convertDate(int32_t original, bool fromProleptic, bool toProleptic) { + if (fromProleptic == toProleptic) { + return original; + } + return toProleptic ? 
convertDateToProleptic(original) : convertDateToHybrid(original); + } + + int64_t convertTimeToProleptic(int64_t hybridMillis) { + const int64_t hybridDay = floorDivide(hybridMillis, MILLIS_PER_DAY); + const int64_t millisOfDay = hybridMillis - hybridDay * MILLIS_PER_DAY; + return static_cast(convertDateToProleptic(static_cast(hybridDay))) * + MILLIS_PER_DAY + + millisOfDay; + } + + int64_t convertTimeToHybrid(int64_t prolepticMillis) { + const int64_t prolepticDay = floorDivide(prolepticMillis, MILLIS_PER_DAY); + const int64_t millisOfDay = prolepticMillis - prolepticDay * MILLIS_PER_DAY; + return static_cast(convertDateToHybrid(static_cast(prolepticDay))) * + MILLIS_PER_DAY + + millisOfDay; + } + + int64_t convertTime(int64_t originalMillis, bool fromProleptic, bool toProleptic) { + if (fromProleptic == toProleptic) { + return originalMillis; + } + return toProleptic ? convertTimeToProleptic(originalMillis) + : convertTimeToHybrid(originalMillis); + } + +} // namespace orc diff --git a/c++/src/DateUtils.hh b/c++/src/DateUtils.hh new file mode 100644 index 00000000000..75ef3d04376 --- /dev/null +++ b/c++/src/DateUtils.hh @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_DATE_UTILS_HH +#define ORC_DATE_UTILS_HH + +#include +#include + +namespace orc { + + int32_t parseHybridDate(const std::string& date); + int32_t parseProlepticDate(const std::string& date); + + int32_t convertDateToProleptic(int32_t hybrid); + int32_t convertDateToHybrid(int32_t proleptic); + int32_t convertDate(int32_t original, bool fromProleptic, bool toProleptic); + + int64_t convertTimeToProleptic(int64_t hybridMillis); + int64_t convertTimeToHybrid(int64_t prolepticMillis); + int64_t convertTime(int64_t originalMillis, bool fromProleptic, bool toProleptic); + +} // namespace orc + +#endif // ORC_DATE_UTILS_HH diff --git a/c++/src/Options.hh b/c++/src/Options.hh index 014d0ef0e3b..6ea0e455947 100644 --- a/c++/src/Options.hh +++ b/c++/src/Options.hh @@ -149,6 +149,7 @@ namespace orc { std::string readerTimezone; RowReaderOptions::IdReadIntentMap idReadIntentMap; bool useTightNumericVector; + bool useProlepticGregorian; RowReaderOptionsPrivate() { selection = ColumnSelection_NONE; @@ -160,6 +161,7 @@ namespace orc { enableLazyDecoding = false; readerTimezone = "GMT"; useTightNumericVector = false; + useProlepticGregorian = true; } }; @@ -337,6 +339,15 @@ namespace orc { return privateBits->readerTimezone; } + RowReaderOptions& RowReaderOptions::setUseProlepticGregorian(bool useProlepticGregorian) { + privateBits->useProlepticGregorian = useProlepticGregorian; + return *this; + } + + bool RowReaderOptions::getUseProlepticGregorian() const { + return privateBits->useProlepticGregorian; + } + const RowReaderOptions::IdReadIntentMap RowReaderOptions::getIdReadIntentMap() const { return privateBits->idReadIntentMap; } diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index d5f83fc3590..97e1903ad0d 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -36,6 +36,18 @@ #include namespace orc { + namespace { + bool getWriterUsedProlepticGregorian(const proto::Footer& footer) { + if (footer.has_calendar()) { + return footer.calendar() == 
proto::PROLEPTIC_GREGORIAN; + } + if (!footer.has_writer()) { + return false; + } + return footer.writer() != WriterId::ORC_JAVA_WRITER; + } + } // namespace + // ORC files writen by these versions of cpp writers have inconsistent bloom filter // hashing. Bloom filters of them should not be used. static const char* BAD_CPP_BLOOM_FILTER_VERSIONS[] = { @@ -271,6 +283,7 @@ namespace orc { firstRowOfStripe(*contents->pool, 0), enableEncodedBlock(opts.getEnableLazyDecoding()), readerTimezone(getTimezoneByName(opts.getTimezoneName())), + useProlepticGregorianValue(opts.getUseProlepticGregorian()), filter(_filter), stringDictFilter(_stringDictFilter) { uint64_t numberOfStripes; @@ -325,7 +338,9 @@ namespace orc { sargs = opts.getSearchArgument(); sargsApplier.reset(new SargsApplier(*contents->schema, sargs.get(), footer->rowindexstride(), getWriterVersionImpl(_contents.get()), - contents->readerMetrics)); + contents->readerMetrics, nullptr, + contents->writerUsedProlepticGregorian, + opts.getUseProlepticGregorian())); } skipBloomFilters = hasBadBloomFilters(); @@ -738,6 +753,7 @@ namespace orc { contents->schema = convertType(footer->types(0), *footer); contents->blockSize = getCompressionBlockSize(*contents->postscript); contents->compression = convertCompressionKind(*contents->postscript); + contents->writerUsedProlepticGregorian = getWriterUsedProlepticGregorian(*footer); } std::string ReaderImpl::getSerializedFileTail() const { @@ -823,6 +839,10 @@ namespace orc { } } + bool ReaderImpl::writerUsedProlepticGregorian() const { + return contents->writerUsedProlepticGregorian; + } + std::string ReaderImpl::getSoftwareVersion() const { std::ostringstream buffer; buffer << writerIdToString(getWriterIdValue()); @@ -1040,7 +1060,9 @@ namespace orc { auto sargs = opts.getSearchArgument(); sargsApplier.reset(new SargsApplier(*contents->schema, sargs.get(), footer->rowindexstride(), getWriterVersionImpl(contents.get()), - contents->readerMetrics)); + contents->readerMetrics, 
nullptr, + contents->writerUsedProlepticGregorian, + opts.getUseProlepticGregorian())); if (sargsApplier == nullptr || contents->metadata == nullptr) { return allStripesNeeded; diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 05990851a74..e20f5eab929 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -101,6 +101,7 @@ namespace orc { std::unique_ptr metadata; ReaderMetrics* readerMetrics; std::unique_ptr sargsApplier; + bool writerUsedProlepticGregorian; }; proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -206,6 +207,7 @@ namespace orc { // desired timezone to return data of timestamp types. const Timezone& readerTimezone; + bool useProlepticGregorianValue; std::unique_ptr readerContext; const ORCFilter* filter; @@ -301,6 +303,9 @@ namespace orc { bool getThrowOnHive11DecimalOverflow() const; bool getIsDecimalAsLong() const; int32_t getForcedScaleOnHive11Decimal() const; + bool useProlepticGregorian() const { + return useProlepticGregorianValue; + } }; class ReaderImpl : public Reader { @@ -351,6 +356,8 @@ namespace orc { uint32_t getWriterIdValue() const override; + bool writerUsedProlepticGregorian() const override; + std::string getSoftwareVersion() const override; WriterVersion getWriterVersion() const override; diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc index 56cde6bfe8d..046719953b5 100644 --- a/c++/src/StripeStream.cc +++ b/c++/src/StripeStream.cc @@ -74,6 +74,14 @@ namespace orc { return readerTimezone; } + bool StripeStreamsImpl::writerUsedProlepticGregorian() const { + return reader.getFileContents().writerUsedProlepticGregorian; + } + + bool StripeStreamsImpl::useProlepticGregorian() const { + return reader.useProlepticGregorian(); + } + std::ostream* StripeStreamsImpl::getErrorStream() const { return reader.getFileContents().errorStream; } diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh index 5e190b4b44a..d6a82304fac 100644 --- a/c++/src/StripeStream.hh +++ 
b/c++/src/StripeStream.hh @@ -72,6 +72,10 @@ namespace orc { const Timezone& getReaderTimezone() const override; + bool writerUsedProlepticGregorian() const override; + + bool useProlepticGregorian() const override; + std::ostream* getErrorStream() const override; bool getThrowOnHive11DecimalOverflow() const override; diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index 14ee68f7179..65e22dbc3a2 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -422,6 +422,7 @@ namespace orc { fileFooter.set_rowindexstride(static_cast(options.getRowIndexStride())); fileFooter.set_writer(writerId); fileFooter.set_softwareversion(ORC_VERSION); + fileFooter.set_calendar(proto::PROLEPTIC_GREGORIAN); uint32_t index = 0; buildFooterType(type, fileFooter, index); diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc index 9ab4cfb1e2c..9c7f0861bc1 100644 --- a/c++/src/sargs/PredicateLeaf.cc +++ b/c++/src/sargs/PredicateLeaf.cc @@ -17,6 +17,7 @@ */ #include "PredicateLeaf.hh" +#include "DateUtils.hh" #include "orc/BloomFilter.hh" #include "orc/Common.hh" #include "orc/Type.hh" @@ -510,7 +511,9 @@ namespace orc { return result; } - TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const { + TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats, + bool writerUsedProlepticGregorian, + bool useProlepticGregorian) const { TruthValue result = TruthValue::YES_NO_NULL; switch (mType) { case PredicateDataType::LONG: { @@ -549,8 +552,12 @@ namespace orc { if (colStats.has_datestatistics() && colStats.datestatistics().has_minimum() && colStats.datestatistics().has_maximum()) { const auto& stats = colStats.datestatistics(); - result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(), - stats.maximum(), col_stats_hasnull(colStats)); + const int32_t minimum = + convertDate(stats.minimum(), writerUsedProlepticGregorian, useProlepticGregorian); + const int32_t maximum = + 
convertDate(stats.maximum(), writerUsedProlepticGregorian, useProlepticGregorian); + result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), minimum, maximum, + col_stats_hasnull(colStats)); } break; } @@ -562,11 +569,15 @@ namespace orc { constexpr int32_t DEFAULT_MAX_NANOS = 999999; int32_t minNano = stats.has_minimumnanos() ? stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS; int32_t maxNano = stats.has_maximumnanos() ? stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS; + const int64_t minimum = + convertTime(stats.minimumutc(), writerUsedProlepticGregorian, useProlepticGregorian); + const int64_t maximum = + convertTime(stats.maximumutc(), writerUsedProlepticGregorian, useProlepticGregorian); Literal::Timestamp minTimestamp( - stats.minimumutc() / 1000, + minimum / 1000, static_cast((stats.minimumutc() % 1000) * 1000000) + minNano); Literal::Timestamp maxTimestamp( - stats.maximumutc() / 1000, + maximum / 1000, static_cast((stats.maximumutc() % 1000) * 1000000) + maxNano); result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp, maxTimestamp, col_stats_hasnull(colStats)); @@ -694,7 +705,9 @@ namespace orc { TruthValue PredicateLeaf::evaluate(const WriterVersion writerVersion, const proto::ColumnStatistics& colStats, - const BloomFilter* bloomFilter) const { + const BloomFilter* bloomFilter, + bool writerUsedProlepticGregorian, + bool useProlepticGregorian) const { // files written before ORC-135 stores timestamp wrt to local timezone // causing issues with PPD. 
disable PPD for timestamp for all old files if (mType == PredicateDataType::TIMESTAMP) { @@ -715,8 +728,12 @@ namespace orc { return TruthValue::IS_NULL; } - TruthValue result = evaluatePredicateMinMax(colStats); - if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { + TruthValue result = + evaluatePredicateMinMax(colStats, writerUsedProlepticGregorian, useProlepticGregorian); + const bool calendarRebasedPredicate = + writerUsedProlepticGregorian != useProlepticGregorian && + (mType == PredicateDataType::DATE || mType == PredicateDataType::TIMESTAMP); + if (!calendarRebasedPredicate && shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { return evaluatePredicateBloomFiter(bloomFilter, col_stats_hasnull(colStats)); } else { return result; diff --git a/c++/src/sargs/PredicateLeaf.hh b/c++/src/sargs/PredicateLeaf.hh index 21ed4561558..430e359c984 100644 --- a/c++/src/sargs/PredicateLeaf.hh +++ b/c++/src/sargs/PredicateLeaf.hh @@ -120,7 +120,9 @@ namespace orc { * Evaluate current PredicateLeaf based on ColumnStatistics and BloomFilter */ TruthValue evaluate(const WriterVersion writerVersion, const proto::ColumnStatistics& colStats, - const BloomFilter* bloomFilter) const; + const BloomFilter* bloomFilter, + bool writerUsedProlepticGregorian = true, + bool useProlepticGregorian = true) const; std::string toString() const; @@ -138,7 +140,9 @@ namespace orc { std::string columnDebugString() const; - TruthValue evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const; + TruthValue evaluatePredicateMinMax(const proto::ColumnStatistics& colStats, + bool writerUsedProlepticGregorian, + bool useProlepticGregorian) const; TruthValue evaluatePredicateBloomFiter(const BloomFilter* bloomFilter, bool hasNull) const; diff --git a/c++/src/sargs/SargsApplier.cc b/c++/src/sargs/SargsApplier.cc index 7bce1a61aa8..cc9e97b1cfc 100644 --- a/c++/src/sargs/SargsApplier.cc +++ b/c++/src/sargs/SargsApplier.cc @@ -39,12 +39,15 @@ namespace orc { 
SargsApplier::SargsApplier(const Type& type, const SearchArgument* searchArgument, uint64_t rowIndexStride, WriterVersion writerVersion, - ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution) + ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution, + bool writerUsedProlepticGregorian, bool useProlepticGregorian) : mType(type), mSearchArgument(searchArgument), mSchemaEvolution(schemaEvolution), mRowIndexStride(rowIndexStride), mWriterVersion(writerVersion), + mWriterUsedProlepticGregorian(writerUsedProlepticGregorian), + mUseProlepticGregorian(useProlepticGregorian), mHasEvaluatedFileStats(false), mFileStatsEvalResult(true), mMetrics(metrics) { @@ -104,7 +107,9 @@ namespace orc { bloomFilter = iter->second.entries.at(rowGroup); } - leafValues[pred] = leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get()); + leafValues[pred] = + leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get(), + mWriterUsedProlepticGregorian, mUseProlepticGregorian); } } @@ -155,8 +160,9 @@ namespace orc { for (size_t pred = 0; pred != leaves.size(); ++pred) { uint64_t columnId = mFilterColumns[pred]; if (columnId != INVALID_COLUMN_ID && colStats.size() > static_cast(columnId)) { - leafValues[pred] = leaves[pred].evaluate(mWriterVersion, - colStats.Get(static_cast(columnId)), nullptr); + leafValues[pred] = + leaves[pred].evaluate(mWriterVersion, colStats.Get(static_cast(columnId)), + nullptr, mWriterUsedProlepticGregorian, mUseProlepticGregorian); } } diff --git a/c++/src/sargs/SargsApplier.hh b/c++/src/sargs/SargsApplier.hh index 73703dcf6b8..f3ed1899d45 100644 --- a/c++/src/sargs/SargsApplier.hh +++ b/c++/src/sargs/SargsApplier.hh @@ -37,7 +37,8 @@ namespace orc { public: SargsApplier(const Type& type, const SearchArgument* searchArgument, uint64_t rowIndexStride, WriterVersion writerVersion, ReaderMetrics* metrics, - const SchemaEvolution* schemaEvolution = nullptr); + const SchemaEvolution* schemaEvolution = nullptr, + bool 
writerUsedProlepticGregorian = true, bool useProlepticGregorian = true); /** * Evaluate search argument on file statistics @@ -130,6 +131,8 @@ namespace orc { const SchemaEvolution* mSchemaEvolution; uint64_t mRowIndexStride; WriterVersion mWriterVersion; + bool mWriterUsedProlepticGregorian; + bool mUseProlepticGregorian; // column ids for each predicate leaf in the search argument std::vector mFilterColumns; diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index 387ce9dbf80..f05b265b5f0 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -35,6 +35,7 @@ add_executable (orc-test TestColumnPrinter.cc TestColumnReader.cc TestColumnStatistics.cc + TestDateUtils.cc TestCompression.cc TestDecompression.cc TestDecimal.cc diff --git a/c++/test/TestColumnReader.cc b/c++/test/TestColumnReader.cc index 6230c2e506b..5db7f18df6d 100644 --- a/c++/test/TestColumnReader.cc +++ b/c++/test/TestColumnReader.cc @@ -75,6 +75,14 @@ namespace orc { const Timezone& getReaderTimezone() const override { return getTimezoneByName("GMT"); } + + bool writerUsedProlepticGregorian() const override { + return true; + } + + bool useProlepticGregorian() const override { + return true; + } }; MockStripeStreams::~MockStripeStreams() { diff --git a/c++/test/TestDateUtils.cc b/c++/test/TestDateUtils.cc new file mode 100644 index 00000000000..b8343858186 --- /dev/null +++ b/c++/test/TestDateUtils.cc @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DateUtils.hh" + +#include "wrap/gmock.h" +#include "wrap/gtest-wrapper.h" + +namespace orc { + + TEST(TestDateUtils, convertHybridDateToProlepticMatchesJavaDateUtils) { + EXPECT_EQ(16768, convertDate(16768, false, true)); + EXPECT_EQ(-141427, convertDate(-141427, false, true)); + EXPECT_EQ(-141438, convertDate(-141428, false, true)); + EXPECT_EQ(-499955, convertDate(-499952, false, true)); + } + + TEST(TestDateUtils, convertProlepticDateToHybridIsInverse) { + EXPECT_EQ(16768, convertDate(16768, true, false)); + EXPECT_EQ(-141427, convertDate(-141427, true, false)); + EXPECT_EQ(-141428, convertDate(-141438, true, false)); + EXPECT_EQ(-499952, convertDate(-499955, true, false)); + } + + TEST(TestDateUtils, convertHiveLegacyReproDatesToProleptic) { + EXPECT_EQ(parseProlepticDate("0002-01-01"), convertDate(parseHybridDate("0002-01-01"), + false, true)); + EXPECT_EQ(parseProlepticDate("1500-01-01"), convertDate(parseHybridDate("1500-01-01"), + false, true)); + EXPECT_EQ(parseProlepticDate("1582-10-04"), convertDate(parseHybridDate("1582-10-04"), + false, true)); + EXPECT_EQ(parseProlepticDate("1582-11-04"), convertDate(parseHybridDate("1582-11-04"), + false, true)); + EXPECT_EQ(parseProlepticDate("2000-02-29"), convertDate(parseHybridDate("2000-02-29"), + false, true)); + } + + TEST(TestDateUtils, convertHiveLegacyReproTimestampsToProleptic) { + const int64_t millisInDay = 24 * 60 * 60 * 1000; + const int64_t millis = 123; + + EXPECT_EQ(parseProlepticDate("0002-01-01") * millisInDay + millis, + convertTime(parseHybridDate("0002-01-01") * 
millisInDay + millis, false, true)); + EXPECT_EQ(parseProlepticDate("1500-01-01") * millisInDay + millis, + convertTime(parseHybridDate("1500-01-01") * millisInDay + millis, false, true)); + EXPECT_EQ(parseProlepticDate("1582-10-04") * millisInDay + millis, + convertTime(parseHybridDate("1582-10-04") * millisInDay + millis, false, true)); + EXPECT_EQ(parseProlepticDate("1582-11-04") * millisInDay + millis, + convertTime(parseHybridDate("1582-11-04") * millisInDay + millis, false, true)); + EXPECT_EQ(parseProlepticDate("2000-02-29") * millisInDay + millis, + convertTime(parseHybridDate("2000-02-29") * millisInDay + millis, false, true)); + } + +} // namespace orc diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc index e0ab293d311..df87a1930d5 100644 --- a/c++/test/TestPredicateLeaf.cc +++ b/c++/test/TestPredicateLeaf.cc @@ -17,6 +17,7 @@ */ #include "BloomFilter.hh" +#include "DateUtils.hh" #include "Statistics.hh" #include "orc/BloomFilter.hh" #include "orc/sargs/Literal.hh" @@ -173,6 +174,13 @@ namespace orc { return pred.evaluate(WriterVersion_ORC_135, pbStats, bf); } + static TruthValue evaluate(const PredicateLeaf& pred, const proto::ColumnStatistics& pbStats, + const BloomFilter* bf, bool writerUsedProlepticGregorian, + bool useProlepticGregorian) { + return pred.evaluate(WriterVersion_ORC_135, pbStats, bf, writerUsedProlepticGregorian, + useProlepticGregorian); + } + TEST(TestPredicateLeaf, testPredEvalWithColStats) { PredicateLeaf pred0(PredicateLeaf::Operator::NULL_SAFE_EQUALS, PredicateDataType::BOOLEAN, "x", Literal(true)); @@ -538,6 +546,35 @@ namespace orc { EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred, createDateStats(10.0, 100.0, true), &bf)); } + TEST(TestPredicateLeaf, testDateStatsRebasedAndBloomFilterSkipped) { + const int32_t prolepticDate = parseProlepticDate("1500-01-01"); + const int32_t hybridDate = parseHybridDate("1500-01-01"); + const int32_t hybridMin = parseHybridDate("1499-12-31"); + const int32_t hybridMax 
/**
 * Decode base64 text (standard alphabet: A-Z, a-z, 0-9, '+', '/') into raw bytes.
 *
 * Decoding stops at the first '=' padding character. Every other character that
 * is not part of the alphabet (whitespace, line breaks, ...) is skipped, so a
 * literal that was wrapped across several lines decodes unchanged. Trailing bits
 * that do not fill a whole byte are dropped.
 *
 * @param input base64 encoded text
 * @return the decoded bytes
 */
std::string decodeBase64(const std::string& input) {
  std::string output;
  output.reserve(input.size() / 4 * 3);

  // Bits collected so far. The accumulator is unsigned and trimmed to 12 bits
  // (at most 6 carried-over bits plus the 6 new ones) after every character, so
  // shifting never overflows no matter how long the input is.
  unsigned int pending = 0;
  // Index of the lowest bit of the next output byte inside `pending`;
  // negative while fewer than 8 bits are available.
  int bits = -8;

  for (const unsigned char c : input) {
    if (c == '=') {
      break;  // padding marks the end of the payload
    }

    int digit = -1;
    if (c >= 'A' && c <= 'Z') {
      digit = c - 'A';
    } else if (c >= 'a' && c <= 'z') {
      digit = c - 'a' + 26;
    } else if (c >= '0' && c <= '9') {
      digit = c - '0' + 52;
    } else if (c == '+') {
      digit = 62;
    } else if (c == '/') {
      digit = 63;
    } else {
      continue;  // not a base64 character: ignore it
    }

    pending = ((pending << 6) | static_cast<unsigned int>(digit)) & 0xfffU;
    bits += 6;
    if (bits >= 0) {
      output.push_back(static_cast<char>((pending >> bits) & 0xffU));
      bits -= 8;
    }
  }
  return output;
}
"2000-02-29"}; + for (size_t i = 0; i < expectedDates.size(); ++i) { + const int32_t expectedDay = parseProlepticDate(expectedDates[i]); + EXPECT_EQ(expectedDay, dates.data[i]) << "date row " << i; + EXPECT_EQ(static_cast(expectedDay) * 24 * 60 * 60, timestamps.data[i]) + << "timestamp row " << i; + EXPECT_EQ(123000000, timestamps.nanoseconds[i]) << "timestamp nanos row " << i; + } + + EXPECT_FALSE(rowReader->next(*batch)); + } + TEST(TestRowReader, computeBatchSize) { uint64_t rowIndexStride = 100; uint64_t rowsInCurrentStripe = 100 * 8 + 50; diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 803f14de780..99acd0153e1 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -114,6 +114,7 @@ namespace orc { WriterId writerId = WriterId::ORC_CPP_WRITER; EXPECT_EQ(writerId, reader->getWriterId()); EXPECT_EQ(1, reader->getWriterIdValue()); + EXPECT_TRUE(reader->writerUsedProlepticGregorian()); std::unique_ptr batch = rowReader->createRowBatch(1024); EXPECT_FALSE(rowReader->next(*batch));