-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support scan filter for ORC decimal reader #11067
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,16 +67,17 @@ void SelectiveDecimalColumnReader<DataT>::seekToRowGroup(uint32_t index) { | |
|
||
template <typename DataT> | ||
template <bool kDense> | ||
void SelectiveDecimalColumnReader<DataT>::readHelper(RowSet rows) { | ||
vector_size_t numRows = rows.back() + 1; | ||
void SelectiveDecimalColumnReader<DataT>::readHelper( | ||
common::Filter* filter, | ||
RowSet rows) { | ||
ExtractToReader extractValues(this); | ||
common::AlwaysTrue filter; | ||
common::AlwaysTrue alwaysTrue; | ||
DirectRleColumnVisitor< | ||
int64_t, | ||
common::AlwaysTrue, | ||
decltype(extractValues), | ||
kDense> | ||
visitor(filter, this, rows, extractValues); | ||
visitor(alwaysTrue, this, rows, extractValues); | ||
|
||
// decode scale stream | ||
if (version_ == velox::dwrf::RleVersion_1) { | ||
|
@@ -96,46 +97,175 @@ void SelectiveDecimalColumnReader<DataT>::readHelper(RowSet rows) { | |
// reset numValues_ before reading values | ||
numValues_ = 0; | ||
valueSize_ = sizeof(DataT); | ||
vector_size_t numRows = rows.back() + 1; | ||
ensureValuesCapacity<DataT>(numRows); | ||
|
||
// decode value stream | ||
facebook::velox::dwio::common:: | ||
ColumnVisitor<DataT, common::AlwaysTrue, decltype(extractValues), kDense> | ||
valueVisitor(filter, this, rows, extractValues); | ||
valueVisitor(alwaysTrue, this, rows, extractValues); | ||
decodeWithVisitor<DirectDecoder<true>>(valueDecoder_.get(), valueVisitor); | ||
readOffset_ += numRows; | ||
|
||
// Fill decimals before applying filter. | ||
fillDecimals(); | ||
|
||
const auto rawNulls = nullsInReadRange_ | ||
? (kDense ? nullsInReadRange_->as<uint64_t>() : rawResultNulls_) | ||
: nullptr; | ||
// Treat the filter as kAlwaysTrue if any of the following conditions are met: | ||
// 1) No filter found; | ||
// 2) Filter is kIsNotNull but rawNulls==NULL (no elements is null). | ||
auto filterKind = | ||
!filter || (filter->kind() == common::FilterKind::kIsNotNull && !rawNulls) | ||
? common::FilterKind::kAlwaysTrue | ||
: filter->kind(); | ||
switch (filterKind) { | ||
case common::FilterKind::kAlwaysTrue: | ||
// Simply add all rows to output. | ||
for (vector_size_t i = 0; i < numValues_; i++) { | ||
addOutputRow(rows[i]); | ||
} | ||
break; | ||
case common::FilterKind::kIsNull: | ||
processNulls(true, rows, rawNulls); | ||
break; | ||
case common::FilterKind::kIsNotNull: | ||
processNulls(false, rows, rawNulls); | ||
break; | ||
case common::FilterKind::kBigintRange: | ||
case common::FilterKind::kBigintValuesUsingHashTable: | ||
case common::FilterKind::kBigintValuesUsingBitmask: | ||
case common::FilterKind::kNegatedBigintRange: | ||
case common::FilterKind::kNegatedBigintValuesUsingHashTable: | ||
case common::FilterKind::kNegatedBigintValuesUsingBitmask: | ||
case common::FilterKind::kBigintMultiRange: { | ||
if constexpr (std::is_same_v<DataT, int64_t>) { | ||
processFilter(filter, rows, rawNulls); | ||
} else { | ||
VELOX_UNSUPPORTED("Unsupported filter: {}.", (int)filterKind); | ||
Reviewer comment on this line: "This could be the case for schema evolution. Just throw."
||
} | ||
break; | ||
} | ||
case common::FilterKind::kHugeintValuesUsingHashTable: | ||
case common::FilterKind::kHugeintRange: { | ||
if constexpr (std::is_same_v<DataT, int128_t>) { | ||
processFilter(filter, rows, rawNulls); | ||
} else { | ||
VELOX_UNSUPPORTED("Unsupported filter: {}.", (int)filterKind); | ||
} | ||
break; | ||
} | ||
default: | ||
VELOX_UNSUPPORTED("Unsupported filter: {}.", (int)filterKind); | ||
} | ||
} | ||
|
||
template <typename DataT> | ||
void SelectiveDecimalColumnReader<DataT>::processNulls( | ||
const bool isNull, | ||
const RowSet rows, | ||
const uint64_t* rawNulls) { | ||
if (!rawNulls) { | ||
return; | ||
} | ||
auto rawDecimal = values_->asMutable<DataT>(); | ||
auto rawScale = scaleBuffer_->asMutable<int64_t>(); | ||
|
||
returnReaderNulls_ = false; | ||
anyNulls_ = !isNull; | ||
allNull_ = isNull; | ||
vector_size_t idx = 0; | ||
for (vector_size_t i = 0; i < numValues_; i++) { | ||
if (isNull) { | ||
if (bits::isBitNull(rawNulls, i)) { | ||
bits::setNull(rawResultNulls_, idx); | ||
addOutputRow(rows[i]); | ||
idx++; | ||
} | ||
} else { | ||
if (!bits::isBitNull(rawNulls, i)) { | ||
bits::setNull(rawResultNulls_, idx, false); | ||
rawDecimal[idx] = rawDecimal[i]; | ||
rawScale[idx] = rawScale[i]; | ||
addOutputRow(rows[i]); | ||
idx++; | ||
} | ||
} | ||
} | ||
} | ||
|
||
template <typename DataT> | ||
void SelectiveDecimalColumnReader<DataT>::processFilter( | ||
const common::Filter* filter, | ||
const RowSet rows, | ||
const uint64_t* rawNulls) { | ||
auto rawDecimal = values_->asMutable<DataT>(); | ||
|
||
returnReaderNulls_ = false; | ||
anyNulls_ = false; | ||
allNull_ = true; | ||
vector_size_t idx = 0; | ||
for (vector_size_t i = 0; i < numValues_; i++) { | ||
if (rawNulls && bits::isBitNull(rawNulls, i)) { | ||
if (filter->testNull()) { | ||
bits::setNull(rawResultNulls_, idx); | ||
addOutputRow(rows[i]); | ||
anyNulls_ = true; | ||
idx++; | ||
} | ||
} else { | ||
bool tested; | ||
if constexpr (std::is_same_v<DataT, int64_t>) { | ||
tested = filter->testInt64(rawDecimal[i]); | ||
} else { | ||
tested = filter->testInt128(rawDecimal[i]); | ||
} | ||
|
||
if (tested) { | ||
if (rawNulls) { | ||
bits::setNull(rawResultNulls_, idx, false); | ||
} | ||
rawDecimal[idx] = rawDecimal[i]; | ||
addOutputRow(rows[i]); | ||
allNull_ = false; | ||
idx++; | ||
} | ||
} | ||
} | ||
} | ||
|
||
template <typename DataT> | ||
void SelectiveDecimalColumnReader<DataT>::read( | ||
vector_size_t offset, | ||
const RowSet& rows, | ||
const uint64_t* incomingNulls) { | ||
VELOX_CHECK(!scanSpec_->filter()); | ||
VELOX_CHECK(!scanSpec_->valueHook()); | ||
prepareRead<int64_t>(offset, rows, incomingNulls); | ||
bool isDense = rows.back() == rows.size() - 1; | ||
if (isDense) { | ||
readHelper<true>(rows); | ||
readHelper<true>(scanSpec_->filter(), rows); | ||
} else { | ||
readHelper<false>(rows); | ||
readHelper<false>(scanSpec_->filter(), rows); | ||
} | ||
} | ||
|
||
template <typename DataT> | ||
void SelectiveDecimalColumnReader<DataT>::getValues( | ||
const RowSet& rows, | ||
VectorPtr* result) { | ||
rawValues_ = values_->asMutable<char>(); | ||
getIntValues(rows, requestedType_, result); | ||
} | ||
|
||
template <typename DataT> | ||
void SelectiveDecimalColumnReader<DataT>::fillDecimals() { | ||
auto nullsPtr = | ||
resultNulls() ? resultNulls()->template as<uint64_t>() : nullptr; | ||
auto scales = scaleBuffer_->as<int64_t>(); | ||
auto values = values_->asMutable<DataT>(); | ||
|
||
DecimalUtil::fillDecimals<DataT>( | ||
values, nullsPtr, values, scales, numValues_, scale_); | ||
|
||
rawValues_ = values_->asMutable<char>(); | ||
getIntValues(rows, requestedType_, result); | ||
} | ||
|
||
template class SelectiveDecimalColumnReader<int64_t>; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -512,6 +512,13 @@ std::unique_ptr<common::Filter> leafCallToSubfieldFilter( | |
} | ||
return isNull(); | ||
} | ||
} else if (call.name() == "isnotnull") { | ||
Reviewer comment: "Isn't this redundant?" Reply: "In Spark there is an expression 'IsNotNull' and it is frequently used in filter pushdown. This issue adds the background for this change: #11093. Thanks."
||
if (toSubfield(leftSide, subfield)) { | ||
if (negated) { | ||
return isNull(); | ||
} | ||
return isNotNull(); | ||
} | ||
} | ||
return nullptr; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract this logic to a static function or trait class so that it can be later reused in parquet as well