From 5af7ec318d106f5bdc9a5eebabe9a1184e799501 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 19 Aug 2025 16:13:33 +0000 Subject: [PATCH 01/34] Extract ClpCursor abstract class and move the original ClpCursor implementation to ClpArchiveCursor --- velox/connectors/clp/ClpDataSource.cpp | 23 ++++--- .../connectors/clp/search_lib/CMakeLists.txt | 3 +- .../{ClpCursor.cpp => ClpArchiveCursor.cpp} | 36 +++++------ .../clp/search_lib/ClpArchiveCursor.h | 60 +++++++++++++++++++ velox/connectors/clp/search_lib/ClpCursor.h | 40 ++++++------- 5 files changed, 114 insertions(+), 48 deletions(-) rename velox/connectors/clp/search_lib/{ClpCursor.cpp => ClpArchiveCursor.cpp} (91%) create mode 100644 velox/connectors/clp/search_lib/ClpArchiveCursor.h diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index a5a574eb3187..d1600234fed7 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -20,7 +20,7 @@ #include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/ClpDataSource.h" #include "velox/connectors/clp/ClpTableHandle.h" -#include "velox/connectors/clp/search_lib/ClpCursor.h" +#include "velox/connectors/clp/search_lib/ClpArchiveCursor.h" #include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/vector/FlatVector.h" @@ -101,12 +101,21 @@ void ClpDataSource::addFieldsRecursively( void ClpDataSource::addSplit(std::shared_ptr split) { auto clpSplit = std::dynamic_pointer_cast(split); - if (storageType_ == ClpConfig::StorageType::kFs) { - cursor_ = std::make_unique( - clp_s::InputSource::Filesystem, clpSplit->path_); - } else if (storageType_ == ClpConfig::StorageType::kS3) { - cursor_ = std::make_unique( - clp_s::InputSource::Network, clpSplit->path_); + clp_s::InputSource inputSource; + if (ClpConfig::StorageType::kFs == storageType_) { + inputSource = clp_s::InputSource::Filesystem; + } else if (ClpConfig::StorageType::kS3 == storageType_) { + 
inputSource = clp_s::InputSource::Network; + } else { + VELOX_UNREACHABLE(); + } + + if (ClpConnectorSplit::SplitType::kArchive == clpSplit->type_) { + cursor_ = std::make_unique( + inputSource, clpSplit->path_); + } else { + VELOX_UNSUPPORTED( + "Unsupported split type: {}", static_cast(clpSplit->type_)); } auto pushDownQuery = clpSplit->kqlQuery_; diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index 55e68060fc40..80bf99471c8a 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -14,7 +14,8 @@ velox_add_library( clp-s-search STATIC - ClpCursor.cpp + ClpArchiveCursor.cpp + ClpArchiveCursor.h ClpCursor.h ClpQueryRunner.cpp ClpQueryRunner.h diff --git a/velox/connectors/clp/search_lib/ClpCursor.cpp b/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp similarity index 91% rename from velox/connectors/clp/search_lib/ClpCursor.cpp rename to velox/connectors/clp/search_lib/ClpArchiveCursor.cpp index fa6d67bf22b4..9b23c2244d33 100644 --- a/velox/connectors/clp/search_lib/ClpCursor.cpp +++ b/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp @@ -16,6 +16,8 @@ #include +#include "ClpArchiveCursor.h" + #include "clp_s/ArchiveReader.hpp" #include "clp_s/search/EvaluateTimestampIndex.hpp" #include "clp_s/search/ast/ConvertToExists.hpp" @@ -25,27 +27,25 @@ #include "clp_s/search/ast/SearchUtils.hpp" #include "clp_s/search/kql/kql.hpp" -#include "velox/connectors/clp/search_lib/ClpCursor.h" - using namespace clp_s; using namespace clp_s::search; using namespace clp_s::search::ast; namespace facebook::velox::connector::clp::search_lib { -ClpCursor::ClpCursor(InputSource inputSource, std::string archivePath) - : errorCode_(ErrorCode::QueryNotInitialized), - inputSource_(inputSource), - archivePath_(std::move(archivePath)), +ClpArchiveCursor::ClpArchiveCursor( + clp_s::InputSource inputSource, + std::string_view splitPath) + : ClpCursor(inputSource, 
splitPath), archiveReader_(std::make_shared()) {} -ClpCursor::~ClpCursor() { - if (currentArchiveLoaded_) { +ClpArchiveCursor::~ClpArchiveCursor() { + if (currentSplitLoaded_) { archiveReader_->close(); } } -void ClpCursor::executeQuery( +void ClpArchiveCursor::executeQuery( const std::string& query, const std::vector& outputColumns) { query_ = query; @@ -53,21 +53,21 @@ void ClpCursor::executeQuery( errorCode_ = preprocessQuery(); } -uint64_t ClpCursor::fetchNext( +uint64_t ClpArchiveCursor::fetchNext( uint64_t numRows, const std::shared_ptr>& filteredRowIndices) { if (ErrorCode::Success != errorCode_) { return 0; } - if (false == currentArchiveLoaded_) { - errorCode_ = loadArchive(); + if (false == currentSplitLoaded_) { + errorCode_ = loadSplit(); if (ErrorCode::Success != errorCode_) { return 0; } archiveReader_->open_packed_streams(); - currentArchiveLoaded_ = true; + currentSplitLoaded_ = true; queryRunner_ = std::make_shared( schemaMatch_, expr_, archiveReader_, false, projection_); queryRunner_->global_init(); @@ -104,8 +104,8 @@ uint64_t ClpCursor::fetchNext( return 0; } -const std::vector& ClpCursor::getProjectedColumns() - const { +const std::vector& +ClpArchiveCursor::getProjectedColumns() const { if (queryRunner_) { return queryRunner_->getProjectedColumns(); } @@ -113,7 +113,7 @@ const std::vector& ClpCursor::getProjectedColumns() return kEmpty; } -ErrorCode ClpCursor::preprocessQuery() { +ErrorCode ClpArchiveCursor::preprocessQuery() { auto queryStream = std::istringstream(query_); expr_ = kql::parse_kql_expression(queryStream); if (nullptr == expr_) { @@ -150,14 +150,14 @@ ErrorCode ClpCursor::preprocessQuery() { return ErrorCode::Success; } -ErrorCode ClpCursor::loadArchive() { +ErrorCode ClpArchiveCursor::loadSplit() { auto networkAuthOption = inputSource_ == InputSource::Filesystem ? 
NetworkAuthOption{.method = AuthMethod::None} : NetworkAuthOption{.method = AuthMethod::S3PresignedUrlV4}; try { archiveReader_->open( - get_path_object_for_raw_path(archivePath_), networkAuthOption); + get_path_object_for_raw_path(splitPath_), networkAuthOption); } catch (std::exception& e) { VLOG(2) << "Failed to open archive file: " << e.what(); return ErrorCode::InternalError; diff --git a/velox/connectors/clp/search_lib/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/ClpArchiveCursor.h new file mode 100644 index 000000000000..9466bfda0fe9 --- /dev/null +++ b/velox/connectors/clp/search_lib/ClpArchiveCursor.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "velox/connectors/clp/search_lib/ClpCursor.h" + +namespace facebook::velox::connector::clp::search_lib { + +class ClpArchiveCursor final : public ClpCursor { + public: + explicit ClpArchiveCursor( + clp_s::InputSource inputSource, + std::string_view splitPath); + ~ClpArchiveCursor() override; + + void executeQuery( + const std::string& query, + const std::vector& outputColumns) override; + + uint64_t fetchNext( + uint64_t numRows, + const std::shared_ptr>& filteredRowIndices) + override; + + const std::vector& getProjectedColumns() + const override; + + protected: + ErrorCode preprocessQuery() override; + + ErrorCode loadSplit() override; + + private: + std::vector matchedSchemas_; + size_t currentSchemaIndex_{0}; + int32_t currentSchemaId_{-1}; + bool currentSchemaTableLoaded_{false}; + + std::shared_ptr schemaMatch_; + std::shared_ptr queryRunner_; + std::shared_ptr projection_; + + std::shared_ptr archiveReader_; +}; + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpCursor.h b/velox/connectors/clp/search_lib/ClpCursor.h index f6303a2ef94b..55d547706a70 100644 --- a/velox/connectors/clp/search_lib/ClpCursor.h +++ b/velox/connectors/clp/search_lib/ClpCursor.h @@ -71,8 +71,11 @@ struct Field { /// while supporting projection and batch-oriented retrieval of filtered rows. class ClpCursor { public: - explicit ClpCursor(clp_s::InputSource inputSource, std::string archivePath); - ~ClpCursor(); + explicit ClpCursor(clp_s::InputSource inputSource, std::string_view splitPath) + : errorCode_(ErrorCode::QueryNotInitialized), + inputSource_(inputSource), + splitPath_(std::string(splitPath)) {} + virtual ~ClpCursor() = default; /// Executes a query. This function parses, validates, and prepares the given /// query for execution. @@ -80,54 +83,47 @@ class ClpCursor { /// @param query The KQL query to execute. 
/// @param outputColumns A vector specifying the columns to be included in the /// query result. - void executeQuery( + virtual void executeQuery( const std::string& query, - const std::vector& outputColumns); + const std::vector& outputColumns) = 0; - /// Fetches the next set of rows from the cursor. If the archive and schema + /// Fetches the next set of rows from the cursor. If the split and schema /// are not yet loaded, this function will perform the necessary loading. /// /// @param numRows The maximum number of rows to fetch. /// @param filteredRowIndices A vector of row indices that match the filter. /// @return The number of rows scanned. - uint64_t fetchNext( + virtual uint64_t fetchNext( uint64_t numRows, - const std::shared_ptr>& filteredRowIndices); + const std::shared_ptr>& filteredRowIndices) = 0; /// Retrieves the projected columns. /// /// @return A vector of BaseColumnReader pointers representing the projected /// columns. - const std::vector& getProjectedColumns() const; + virtual const std::vector& getProjectedColumns() + const = 0; - private: + protected: /// Preprocesses the query, performing parsing, validation, and optimization. /// /// @return The error code. - ErrorCode preprocessQuery(); + virtual ErrorCode preprocessQuery() = 0; - /// Loads the archive at the current index. /// /// @return The error code. 
- ErrorCode loadArchive(); + virtual ErrorCode loadSplit() = 0; ErrorCode errorCode_; clp_s::InputSource inputSource_{clp_s::InputSource::Filesystem}; - std::string archivePath_; + std::string splitPath_; std::string query_; std::vector outputColumns_; - std::vector matchedSchemas_; - size_t currentSchemaIndex_{0}; - int32_t currentSchemaId_{-1}; - bool currentSchemaTableLoaded_{false}; - bool currentArchiveLoaded_{false}; + + bool currentSplitLoaded_{false}; std::shared_ptr expr_; - std::shared_ptr schemaMatch_; - std::shared_ptr queryRunner_; - std::shared_ptr projection_; - std::shared_ptr archiveReader_; }; } // namespace facebook::velox::connector::clp::search_lib From cc82dd7733998185ec266951a18d91765b6243f1 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 19 Aug 2025 19:02:54 +0000 Subject: [PATCH 02/34] Rename the extracrted ClpCursor to BaseClpCursor, and add the skeleton of ClpIrCursor --- velox/connectors/clp/ClpDataSource.cpp | 5 +- velox/connectors/clp/ClpDataSource.h | 4 +- .../clp/search_lib/BaseClpCursor.cpp | 78 +++++++++++++++++++ .../{ClpCursor.h => BaseClpCursor.h} | 33 +++++--- .../connectors/clp/search_lib/CMakeLists.txt | 5 +- .../clp/search_lib/ClpArchiveCursor.cpp | 51 +----------- .../clp/search_lib/ClpArchiveCursor.h | 10 +-- .../connectors/clp/search_lib/ClpIrCursor.cpp | 21 +++++ velox/connectors/clp/search_lib/ClpIrCursor.h | 49 ++++++++++++ .../clp/search_lib/ClpVectorLoader.cpp | 6 +- .../clp/search_lib/ClpVectorLoader.h | 7 +- 11 files changed, 193 insertions(+), 76 deletions(-) create mode 100644 velox/connectors/clp/search_lib/BaseClpCursor.cpp rename velox/connectors/clp/search_lib/{ClpCursor.h => BaseClpCursor.h} (84%) create mode 100644 velox/connectors/clp/search_lib/ClpIrCursor.cpp create mode 100644 velox/connectors/clp/search_lib/ClpIrCursor.h diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 962ea363ed05..6c702698fd03 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ 
b/velox/connectors/clp/ClpDataSource.cpp @@ -163,7 +163,10 @@ VectorPtr ClpDataSource::createVector( vectorType, vectorSize, std::make_unique( - projectedColumn, projectedType, filteredRows), + projectedColumn, + projectedType, + filteredRows, + cursor_->getSplitType()), std::move(vector)); } diff --git a/velox/connectors/clp/ClpDataSource.h b/velox/connectors/clp/ClpDataSource.h index 70ec45643547..32611525fe20 100644 --- a/velox/connectors/clp/ClpDataSource.h +++ b/velox/connectors/clp/ClpDataSource.h @@ -20,7 +20,7 @@ #include "velox/connectors/Connector.h" #include "velox/connectors/clp/ClpConfig.h" -#include "velox/connectors/clp/search_lib/ClpCursor.h" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" namespace clp_s { class BaseColumnReader; @@ -102,7 +102,7 @@ class ClpDataSource : public DataSource { std::vector fields_; - std::unique_ptr cursor_; + std::unique_ptr cursor_; std::shared_ptr s3AuthProvider_; }; diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp b/velox/connectors/clp/search_lib/BaseClpCursor.cpp new file mode 100644 index 000000000000..eafcccbf7a71 --- /dev/null +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" + +#include "clp_s/search/EvaluateTimestampIndex.hpp" +#include "clp_s/search/ast/ConvertToExists.hpp" +#include "clp_s/search/ast/EmptyExpr.hpp" +#include "clp_s/search/ast/NarrowTypes.hpp" +#include "clp_s/search/ast/OrOfAndForm.hpp" +#include "clp_s/search/ast/SearchUtils.hpp" +#include "clp_s/search/kql/kql.hpp" + +using namespace clp_s; +using namespace clp_s::search; +using namespace clp_s::search::ast; + +namespace facebook::velox::connector::clp::search_lib { + +void BaseClpCursor::executeQuery( + const std::string& query, + const std::vector& outputColumns) { + query_ = query; + outputColumns_ = outputColumns; + errorCode_ = preprocessQuery(); +} + +ErrorCode BaseClpCursor::preprocessQuery() { + auto queryStream = std::istringstream(query_); + expr_ = kql::parse_kql_expression(queryStream); + if (nullptr == expr_) { + VLOG(2) << "Failed to parse query '" << query_ << "'"; + return ErrorCode::InvalidQuerySyntax; + } + + if (std::dynamic_pointer_cast(expr_)) { + VLOG(2) << "Query '" << query_ << "' is logically false"; + return ErrorCode::LogicalError; + } + + OrOfAndForm standardizePass; + if (expr_ = standardizePass.run(expr_); + std::dynamic_pointer_cast(expr_)) { + VLOG(2) << "Query '" << query_ << "' is logically false"; + return ErrorCode::LogicalError; + } + + NarrowTypes narrowPass; + if (expr_ = narrowPass.run(expr_); + std::dynamic_pointer_cast(expr_)) { + VLOG(2) << "Query '" << query_ << "' is logically false"; + return ErrorCode::LogicalError; + } + + ConvertToExists convertPass; + if (expr_ = convertPass.run(expr_); + std::dynamic_pointer_cast(expr_)) { + VLOG(2) << "Query '" << query_ << "' is logically false"; + return ErrorCode::LogicalError; + } + + return ErrorCode::Success; +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h similarity index 84% rename 
from velox/connectors/clp/search_lib/ClpCursor.h rename to velox/connectors/clp/search_lib/BaseClpCursor.h index 55d547706a70..777851403833 100644 --- a/velox/connectors/clp/search_lib/ClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -19,6 +19,7 @@ #include #include +#include "connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/search_lib/ClpQueryRunner.h" namespace clp_s { @@ -69,13 +70,16 @@ struct Field { /// schemas and archives, applying filters, and iterating over the results. It /// abstracts away the low-level details of archive access and schema matching /// while supporting projection and batch-oriented retrieval of filtered rows. -class ClpCursor { +class BaseClpCursor { public: - explicit ClpCursor(clp_s::InputSource inputSource, std::string_view splitPath) + explicit BaseClpCursor( + clp_s::InputSource inputSource, + std::string_view splitPath) : errorCode_(ErrorCode::QueryNotInitialized), inputSource_(inputSource), - splitPath_(std::string(splitPath)) {} - virtual ~ClpCursor() = default; + splitPath_(std::string(splitPath)), + splitType_(ClpConnectorSplit::SplitType::kArchive) {} + virtual ~BaseClpCursor() = default; /// Executes a query. This function parses, validates, and prepares the given /// query for execution. @@ -83,9 +87,9 @@ class ClpCursor { /// @param query The KQL query to execute. /// @param outputColumns A vector specifying the columns to be included in the /// query result. - virtual void executeQuery( + void executeQuery( const std::string& query, - const std::vector& outputColumns) = 0; + const std::vector& outputColumns); /// Fetches the next set of rows from the cursor. If the split and schema /// are not yet loaded, this function will perform the necessary loading. @@ -104,12 +108,14 @@ class ClpCursor { virtual const std::vector& getProjectedColumns() const = 0; - protected: - /// Preprocesses the query, performing parsing, validation, and optimization. 
+ /// Get the type of the split that the cursor is processing. /// - /// @return The error code. - virtual ErrorCode preprocessQuery() = 0; + /// @return The split type. + ClpConnectorSplit::SplitType getSplitType() const { + return splitType_; + } + protected: /// /// @return The error code. virtual ErrorCode loadSplit() = 0; @@ -118,12 +124,19 @@ class ClpCursor { clp_s::InputSource inputSource_{clp_s::InputSource::Filesystem}; std::string splitPath_; + ClpConnectorSplit::SplitType splitType_; std::string query_; std::vector outputColumns_; bool currentSplitLoaded_{false}; std::shared_ptr expr_; + + private: + /// Preprocesses the query, performing parsing, validation, and optimization. + /// + /// @return The error code. + ErrorCode preprocessQuery(); }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index c411181bb880..ae0ed3e9ea2a 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -14,9 +14,12 @@ velox_add_library( clp-s-search STATIC + BaseClpCursor.cpp + BaseClpCursor.h ClpArchiveCursor.cpp ClpArchiveCursor.h - ClpCursor.h + ClpIrCursor.cpp + ClpIrCursor.h ClpPackageS3AuthProvider.cpp ClpPackageS3AuthProvider.h ClpQueryRunner.cpp diff --git a/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp index 9b23c2244d33..5f7a0e0b0456 100644 --- a/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp @@ -20,12 +20,8 @@ #include "clp_s/ArchiveReader.hpp" #include "clp_s/search/EvaluateTimestampIndex.hpp" -#include "clp_s/search/ast/ConvertToExists.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" -#include "clp_s/search/ast/NarrowTypes.hpp" -#include "clp_s/search/ast/OrOfAndForm.hpp" #include "clp_s/search/ast/SearchUtils.hpp" -#include "clp_s/search/kql/kql.hpp" using 
namespace clp_s; using namespace clp_s::search; @@ -36,7 +32,7 @@ namespace facebook::velox::connector::clp::search_lib { ClpArchiveCursor::ClpArchiveCursor( clp_s::InputSource inputSource, std::string_view splitPath) - : ClpCursor(inputSource, splitPath), + : BaseClpCursor(inputSource, splitPath), archiveReader_(std::make_shared()) {} ClpArchiveCursor::~ClpArchiveCursor() { @@ -45,14 +41,6 @@ ClpArchiveCursor::~ClpArchiveCursor() { } } -void ClpArchiveCursor::executeQuery( - const std::string& query, - const std::vector& outputColumns) { - query_ = query; - outputColumns_ = outputColumns; - errorCode_ = preprocessQuery(); -} - uint64_t ClpArchiveCursor::fetchNext( uint64_t numRows, const std::shared_ptr>& filteredRowIndices) { @@ -113,43 +101,6 @@ ClpArchiveCursor::getProjectedColumns() const { return kEmpty; } -ErrorCode ClpArchiveCursor::preprocessQuery() { - auto queryStream = std::istringstream(query_); - expr_ = kql::parse_kql_expression(queryStream); - if (nullptr == expr_) { - VLOG(2) << "Failed to parse query '" << query_ << "'"; - return ErrorCode::InvalidQuerySyntax; - } - - if (std::dynamic_pointer_cast(expr_)) { - VLOG(2) << "Query '" << query_ << "' is logically false"; - return ErrorCode::LogicalError; - } - - OrOfAndForm standardizePass; - if (expr_ = standardizePass.run(expr_); - std::dynamic_pointer_cast(expr_)) { - VLOG(2) << "Query '" << query_ << "' is logically false"; - return ErrorCode::LogicalError; - } - - NarrowTypes narrowPass; - if (expr_ = narrowPass.run(expr_); - std::dynamic_pointer_cast(expr_)) { - VLOG(2) << "Query '" << query_ << "' is logically false"; - return ErrorCode::LogicalError; - } - - ConvertToExists convertPass; - if (expr_ = convertPass.run(expr_); - std::dynamic_pointer_cast(expr_)) { - VLOG(2) << "Query '" << query_ << "' is logically false"; - return ErrorCode::LogicalError; - } - - return ErrorCode::Success; -} - ErrorCode ClpArchiveCursor::loadSplit() { auto networkAuthOption = inputSource_ == 
InputSource::Filesystem ? NetworkAuthOption{.method = AuthMethod::None} diff --git a/velox/connectors/clp/search_lib/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/ClpArchiveCursor.h index 9466bfda0fe9..6167a82db7a0 100644 --- a/velox/connectors/clp/search_lib/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/ClpArchiveCursor.h @@ -16,21 +16,17 @@ #pragma once -#include "velox/connectors/clp/search_lib/ClpCursor.h" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" namespace facebook::velox::connector::clp::search_lib { -class ClpArchiveCursor final : public ClpCursor { +class ClpArchiveCursor final : public BaseClpCursor { public: explicit ClpArchiveCursor( clp_s::InputSource inputSource, std::string_view splitPath); ~ClpArchiveCursor() override; - void executeQuery( - const std::string& query, - const std::vector& outputColumns) override; - uint64_t fetchNext( uint64_t numRows, const std::shared_ptr>& filteredRowIndices) @@ -40,8 +36,6 @@ class ClpArchiveCursor final : public ClpCursor { const override; protected: - ErrorCode preprocessQuery() override; - ErrorCode loadSplit() override; private: diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ClpIrCursor.cpp new file mode 100644 index 000000000000..cf1a6ecec12b --- /dev/null +++ b/velox/connectors/clp/search_lib/ClpIrCursor.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ClpIrCursor.h" + +namespace facebook::velox::connector::clp::search_lib { + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.h b/velox/connectors/clp/search_lib/ClpIrCursor.h new file mode 100644 index 000000000000..82d2e3daf8ef --- /dev/null +++ b/velox/connectors/clp/search_lib/ClpIrCursor.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "streaming_compression/Decompressor.hpp" +#include "streaming_compression/zstd/Decompressor.hpp" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" + +namespace facebook::velox::connector::clp::search_lib { + +class ClpIrCursor final : public BaseClpCursor { + public: + explicit ClpIrCursor( + clp_s::InputSource inputSource, + std::string_view splitPath); + ~ClpIrCursor() override; + + uint64_t fetchNext( + uint64_t numRows, + const std::shared_ptr>& filteredRowIndices) + override; + + const std::vector& getProjectedColumns() + const override; + + protected: + ErrorCode loadSplit() override; + + private: + std::shared_ptr<::clp::ReaderInterface> ir_reader_; + std::shared_ptr<::clp::streaming_compression::zstd::Decompressor> + ir_decompressor_; +}; + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp b/velox/connectors/clp/search_lib/ClpVectorLoader.cpp index 6bd58e8a2a28..65ffb64c2508 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ClpVectorLoader.cpp @@ -124,10 +124,12 @@ auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { ClpVectorLoader::ClpVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices) + std::shared_ptr> filteredRowIndices, + ClpConnectorSplit::SplitType splitType) : columnReader_(columnReader), nodeType_(nodeType), - filteredRowIndices_(std::move(filteredRowIndices)) {} + filteredRowIndices_(std::move(filteredRowIndices)), + splitType_(splitType) {} template void ClpVectorLoader::populateData(RowSet rows, VectorPtr vector) { diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.h b/velox/connectors/clp/search_lib/ClpVectorLoader.h index 36af6d7b807a..0d81b72ff013 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.h +++ b/velox/connectors/clp/search_lib/ClpVectorLoader.h @@ -18,8 +18,9 @@ #include 
"clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" +#include "connectors/clp/ClpConnectorSplit.h" -#include "velox/connectors/clp/search_lib/ClpCursor.h" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/type/Timestamp.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" @@ -38,7 +39,8 @@ class ClpVectorLoader : public VectorLoader { ClpVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices); + std::shared_ptr> filteredRowIndices, + ClpConnectorSplit::SplitType splitType); private: void loadInternal( @@ -58,6 +60,7 @@ class ClpVectorLoader : public VectorLoader { clp_s::BaseColumnReader* columnReader_; ColumnType nodeType_; std::shared_ptr> filteredRowIndices_; + ClpConnectorSplit::SplitType splitType_; inline static thread_local std::unique_ptr arrayParser_ = std::make_unique(); From 65d3bd11f9aa4f68e78ec6aa9194a1311dd43528 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 19 Aug 2025 23:25:34 +0000 Subject: [PATCH 03/34] WIP: implement the ClpIrCursor, one todo is to marshal the row in ClpVectorLoader --- .../connectors/clp/search_lib/ClpIrCursor.cpp | 111 ++++++++++++++++++ velox/connectors/clp/search_lib/ClpIrCursor.h | 102 +++++++++++++++- 2 files changed, 207 insertions(+), 6 deletions(-) diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ClpIrCursor.cpp index cf1a6ecec12b..8e938e4a4da1 100644 --- a/velox/connectors/clp/search_lib/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ClpIrCursor.cpp @@ -15,7 +15,118 @@ */ #include "ClpIrCursor.h" +#include "clp_s/InputConfig.hpp" + +using namespace clp_s; namespace facebook::velox::connector::clp::search_lib { +uint64_t ClpIrCursor::fetchNext( + uint64_t numRows, + const std::shared_ptr>& filteredRowIndices) { + if (ErrorCode::Success != errorCode_) { + return 0; + } + + if (false == currentSplitLoaded_) { + errorCode_ = loadSplit(); + if 
(ErrorCode::Success != errorCode_) { + return 0; + } + } + + size_t rowsFetched{0ULL}; + // while (rowsFetched < numRows && kvir_deserializer_.has_value()) { + // auto const + // result{kvir_deserializer_.value().deserialize_next_ir_unit(kvir_decompressor_)}; + // if (result.has_error() && std::errc::no_message != result.error()) { + // if (ErrorCode::Success != load_next_kvir_stream()) { + // return rowsFetched; + // } + // continue; + // } + // if (result.value() == ::clp::ffi::ir_stream::IrUnitType::EndOfStream) { + // if (ErrorCode::Success != load_next_kvir_stream()) { + // return rowsFetched; + // } + // continue; + // } + // if (result.value() == ::clp::ffi::ir_stream::IrUnitType::LogEvent) { + // auto const& ir_unit_handler = + // kvir_deserializer_.value().get_ir_unit_handler(); + // marshal_row(rowsFetched, column_vectors, ir_unit_handler); + // ++rowsFetched; + // } + // } + return rowsFetched; +} + +const std::vector& ClpIrCursor::getProjectedColumns() + const {} + +ErrorCode ClpIrCursor::loadSplit() { + auto networkAuthOption = inputSource_ == InputSource::Filesystem + ? 
NetworkAuthOption{.method = AuthMethod::None} + : NetworkAuthOption{.method = AuthMethod::S3PresignedUrlV4}; + + auto irHandler{ClpVeloxIrUnitHandler{shared_from_this()}}; + + auto queryHandlerResult{QueryHandlerType::create( + handleProjectionResolution, std::move(expr_), {}, ignoreCase_)}; + if (!queryHandlerResult) { + VLOG(2) << "Failed to create query handler for deserialization."; + return ErrorCode::InternalError; + } + auto queryHandler = std::move(queryHandlerResult).value(); + + auto irPath = Path{.source = inputSource_, .path = splitPath_}; + irReader_ = try_create_reader(irPath, networkAuthOption); + if (nullptr == irReader_) { + VLOG(2) << "Failed to open kv-ir stream \"" << splitPath_ + << "\" for reading."; + return ErrorCode::InternalError; + } + + auto deserializerResult = ::clp::ffi::ir_stream::make_deserializer( + *irReader_, irHandler, std::move(queryHandler)); + if (!deserializerResult) { + VLOG(2) << "Failed to create deserializer for deserialization."; + return ErrorCode::InternalError; + } + irDeserializer_ = std::make_shared<::clp::ffi::ir_stream::Deserializer< + ClpVeloxIrUnitHandler, + QueryHandlerType>>(std::move(deserializerResult).value()); + + return ErrorCode::Success; +} + +void ClpIrCursor::addDeserializedLogEvent( + ::clp::ffi::KeyValuePairLogEvent logEvent) { + deserializedLogEvent_.emplace(std::move(logEvent)); +} + +void ClpIrCursor::addOrderedResolvedId( + size_t idxOfProjectedColumn, + ::clp::ffi::SchemaTree::Node::id_t nodeId, + bool isAutoGenerated) { + orderedResolvedIds_.at(idxOfProjectedColumn) + .emplace_back(std::make_pair(nodeId, isAutoGenerated)); +} + +std::optional ClpIrCursor::findProjectedColumnIdxByKeyName( + std::string_view keyName) { + auto it = projectedColumnToIdx_.find(std::string(keyName)); + if (projectedColumnToIdx_.end() != it) { + return std::nullopt; + } + return it->second; +} + +auto ClpVeloxIrUnitHandler::handle_log_event( + ::clp::ffi::KeyValuePairLogEvent log_event) + -> 
::clp::ffi::ir_stream::IRErrorCode { + cursor_->addDeserializedLogEvent(std::move(log_event)); + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; +} + } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.h b/velox/connectors/clp/search_lib/ClpIrCursor.h index 82d2e3daf8ef..22a3b07d81f2 100644 --- a/velox/connectors/clp/search_lib/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ClpIrCursor.h @@ -16,18 +16,61 @@ #pragma once +#include "ffi/ir_stream/Deserializer.hpp" #include "streaming_compression/Decompressor.hpp" #include "streaming_compression/zstd/Decompressor.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" namespace facebook::velox::connector::clp::search_lib { -class ClpIrCursor final : public BaseClpCursor { +class ClpIrCursor; + +class ClpVeloxIrUnitHandler { + public: + ClpVeloxIrUnitHandler(std::shared_ptr cursor) + : cursor_(std::move(cursor)) {} + + // Destructor + ~ClpVeloxIrUnitHandler() = default; + + // Methods implementing `IrUnitHandlerInterface` + [[nodiscard]] auto handle_log_event( + ::clp::ffi::KeyValuePairLogEvent log_event) + -> ::clp::ffi::ir_stream::IRErrorCode; + + [[nodiscard]] static auto handle_utc_offset_change( + [[maybe_unused]] ::clp::UtcOffset utc_offset_old, + [[maybe_unused]] ::clp::UtcOffset utc_offset_new) + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] static auto handle_schema_tree_node_insertion( + [[maybe_unused]] bool is_auto_generated, + [[maybe_unused]] ::clp::ffi::SchemaTree::NodeLocator + schema_tree_node_locator, + [[maybe_unused]] std::shared_ptr<::clp::ffi::SchemaTree const> const& + schema_tree) -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] auto handle_end_of_stream() + -> ::clp::ffi::ir_stream::IRErrorCode { + return 
::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } + + private: + std::shared_ptr cursor_; +}; + +class ClpIrCursor final : public BaseClpCursor, + std::enable_shared_from_this { public: explicit ClpIrCursor( clp_s::InputSource inputSource, - std::string_view splitPath); - ~ClpIrCursor() override; + std::string_view splitPath, + bool ignoreCase) + : BaseClpCursor(inputSource, splitPath), ignoreCase_(ignoreCase) {} uint64_t fetchNext( uint64_t numRows, @@ -37,13 +80,60 @@ class ClpIrCursor final : public BaseClpCursor { const std::vector& getProjectedColumns() const override; + void addDeserializedLogEvent(::clp::ffi::KeyValuePairLogEvent logEvent); + + std::optional findProjectedColumnIdxByKeyName( + std::string_view keyName); + + void addOrderedResolvedId( + size_t idxOfProjectedColumn, + ::clp::ffi::SchemaTree::Node::id_t nodeId, + bool isAutoGenerated); + protected: ErrorCode loadSplit() override; private: - std::shared_ptr<::clp::ReaderInterface> ir_reader_; - std::shared_ptr<::clp::streaming_compression::zstd::Decompressor> - ir_decompressor_; + using OrderedResolvedId = + std::vector>; + std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; + bool ignoreCase_; + + std::map projectedColumnToIdx_; + std::vector orderedResolvedIds_; + std::optional<::clp::ffi::KeyValuePairLogEvent> deserializedLogEvent_; + + std::function(bool, ::clp::ffi::SchemaTree::Node::id_t, std::string_view)> + handleProjectionResolution = + [this]( + [[maybe_unused]] bool isAutoGenerated, + [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, + [[maybe_unused]] std::string_view keyName) + -> ystdlib::error_handling::Result { + auto projectedColumnIdx = findProjectedColumnIdxByKeyName(keyName); + if (projectedColumnIdx.has_value()) { + addOrderedResolvedId(projectedColumnIdx.value(), nodeId, isAutoGenerated); + return ::clp::ffi::ir_stream::ir_error_code_to_errc( + ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success); + } + return 
::clp::ffi::ir_stream::ir_error_code_to_errc( + ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Corrupted_IR); + }; + using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< + decltype(handleProjectionResolution)>; + + std::shared_ptr<::clp::ffi::ir_stream:: + Deserializer> + irDeserializer_; + + std::optional<::clp::ffi::KeyValuePairLogEvent> const& + getDeserializedLogEvent() const { + return deserializedLogEvent_; + } + std::vector const& getOrderedResolvedIds() const { + return orderedResolvedIds_; + } }; } // namespace facebook::velox::connector::clp::search_lib From d45baf0489e36b38359e9559d6d555f36fec5657 Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 20 Aug 2025 16:20:23 +0000 Subject: [PATCH 04/34] Remove the IR related code and add a subpackage to store archive search related code --- velox/connectors/clp/ClpDataSource.cpp | 2 +- .../connectors/clp/search_lib/BaseClpCursor.h | 2 +- .../connectors/clp/search_lib/CMakeLists.txt | 10 ++-- .../connectors/clp/search_lib/ClpIrCursor.cpp | 21 -------- velox/connectors/clp/search_lib/ClpIrCursor.h | 49 ------------------- .../clp/search_lib/archive/CMakeLists.txt | 19 +++++++ .../{ => archive}/ClpArchiveCursor.cpp | 0 .../{ => archive}/ClpArchiveCursor.h | 0 .../{ => archive}/ClpQueryRunner.cpp | 2 +- .../search_lib/{ => archive}/ClpQueryRunner.h | 0 10 files changed, 26 insertions(+), 79 deletions(-) delete mode 100644 velox/connectors/clp/search_lib/ClpIrCursor.cpp delete mode 100644 velox/connectors/clp/search_lib/ClpIrCursor.h create mode 100644 velox/connectors/clp/search_lib/archive/CMakeLists.txt rename velox/connectors/clp/search_lib/{ => archive}/ClpArchiveCursor.cpp (100%) rename velox/connectors/clp/search_lib/{ => archive}/ClpArchiveCursor.h (100%) rename velox/connectors/clp/search_lib/{ => archive}/ClpQueryRunner.cpp (97%) rename velox/connectors/clp/search_lib/{ => archive}/ClpQueryRunner.h (100%) diff --git a/velox/connectors/clp/ClpDataSource.cpp 
b/velox/connectors/clp/ClpDataSource.cpp index 6c702698fd03..97cf0557ab97 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -17,11 +17,11 @@ #include #include "search_lib/ClpS3AuthProviderBase.h" +#include "search_lib/archive/ClpArchiveCursor.h" #include "velox/connectors/clp/ClpColumnHandle.h" #include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/ClpDataSource.h" #include "velox/connectors/clp/ClpTableHandle.h" -#include "velox/connectors/clp/search_lib/ClpArchiveCursor.h" #include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/vector/FlatVector.h" diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index 777851403833..d481708ed77e 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -19,8 +19,8 @@ #include #include +#include "archive/ClpQueryRunner.h" #include "connectors/clp/ClpConnectorSplit.h" -#include "velox/connectors/clp/search_lib/ClpQueryRunner.h" namespace clp_s { enum class InputSource : uint8_t; diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index ae0ed3e9ea2a..823e135ff4c4 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -16,19 +16,17 @@ velox_add_library( STATIC BaseClpCursor.cpp BaseClpCursor.h - ClpArchiveCursor.cpp - ClpArchiveCursor.h - ClpIrCursor.cpp - ClpIrCursor.h ClpPackageS3AuthProvider.cpp ClpPackageS3AuthProvider.h - ClpQueryRunner.cpp - ClpQueryRunner.h ClpS3AuthProviderBase.cpp ClpS3AuthProviderBase.h ClpVectorLoader.cpp ClpVectorLoader.h) +target_include_directories(clp-s-search PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +add_subdirectory(archive) + velox_link_libraries( clp-s-search PUBLIC clp_s::archive_reader diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.cpp 
b/velox/connectors/clp/search_lib/ClpIrCursor.cpp deleted file mode 100644 index cf1a6ecec12b..000000000000 --- a/velox/connectors/clp/search_lib/ClpIrCursor.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ClpIrCursor.h" - -namespace facebook::velox::connector::clp::search_lib { - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpIrCursor.h b/velox/connectors/clp/search_lib/ClpIrCursor.h deleted file mode 100644 index 82d2e3daf8ef..000000000000 --- a/velox/connectors/clp/search_lib/ClpIrCursor.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "streaming_compression/Decompressor.hpp" -#include "streaming_compression/zstd/Decompressor.hpp" -#include "velox/connectors/clp/search_lib/BaseClpCursor.h" - -namespace facebook::velox::connector::clp::search_lib { - -class ClpIrCursor final : public BaseClpCursor { - public: - explicit ClpIrCursor( - clp_s::InputSource inputSource, - std::string_view splitPath); - ~ClpIrCursor() override; - - uint64_t fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) - override; - - const std::vector& getProjectedColumns() - const override; - - protected: - ErrorCode loadSplit() override; - - private: - std::shared_ptr<::clp::ReaderInterface> ir_reader_; - std::shared_ptr<::clp::streaming_compression::zstd::Decompressor> - ir_decompressor_; -}; - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/archive/CMakeLists.txt b/velox/connectors/clp/search_lib/archive/CMakeLists.txt new file mode 100644 index 000000000000..3777be57d533 --- /dev/null +++ b/velox/connectors/clp/search_lib/archive/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+target_sources( + clp-s-search + PRIVATE ClpArchiveCursor.cpp ClpArchiveCursor.h ClpQueryRunner.cpp + ClpQueryRunner.h) + +target_include_directories(clp-s-search PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/velox/connectors/clp/search_lib/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp similarity index 100% rename from velox/connectors/clp/search_lib/ClpArchiveCursor.cpp rename to velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp diff --git a/velox/connectors/clp/search_lib/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h similarity index 100% rename from velox/connectors/clp/search_lib/ClpArchiveCursor.h rename to velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h diff --git a/velox/connectors/clp/search_lib/ClpQueryRunner.cpp b/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp similarity index 97% rename from velox/connectors/clp/search_lib/ClpQueryRunner.cpp rename to velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp index a1c9032ebd58..dcb6fdc4bb5b 100644 --- a/velox/connectors/clp/search_lib/ClpQueryRunner.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "velox/connectors/clp/search_lib/ClpQueryRunner.h" +#include "ClpQueryRunner.h" using namespace clp_s; using namespace clp_s::search; diff --git a/velox/connectors/clp/search_lib/ClpQueryRunner.h b/velox/connectors/clp/search_lib/archive/ClpQueryRunner.h similarity index 100% rename from velox/connectors/clp/search_lib/ClpQueryRunner.h rename to velox/connectors/clp/search_lib/archive/ClpQueryRunner.h From 7f40b7e041a136ab0e76cf75cd888f384a070961 Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 20 Aug 2025 17:40:38 +0000 Subject: [PATCH 05/34] Address coderabbitai comments --- velox/connectors/clp/ClpDataSource.cpp | 4 ++-- velox/connectors/clp/search_lib/BaseClpCursor.cpp | 4 ++-- velox/connectors/clp/search_lib/BaseClpCursor.h | 13 ++++--------- .../connectors/clp/search_lib/ClpVectorLoader.cpp | 1 + velox/connectors/clp/search_lib/ClpVectorLoader.h | 7 +++++-- .../clp/search_lib/archive/ClpArchiveCursor.cpp | 15 ++++++++------- .../clp/search_lib/archive/ClpArchiveCursor.h | 11 +++++++++++ 7 files changed, 33 insertions(+), 22 deletions(-) diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 97cf0557ab97..03059f584b64 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -16,13 +16,13 @@ #include -#include "search_lib/ClpS3AuthProviderBase.h" -#include "search_lib/archive/ClpArchiveCursor.h" #include "velox/connectors/clp/ClpColumnHandle.h" #include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/ClpDataSource.h" #include "velox/connectors/clp/ClpTableHandle.h" +#include "velox/connectors/clp/search_lib/ClpS3AuthProviderBase.h" #include "velox/connectors/clp/search_lib/ClpVectorLoader.h" +#include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" #include "velox/vector/FlatVector.h" namespace facebook::velox::connector::clp { diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp 
b/velox/connectors/clp/search_lib/BaseClpCursor.cpp index eafcccbf7a71..b0c64bf464dd 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.cpp +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ +#include + #include "velox/connectors/clp/search_lib/BaseClpCursor.h" -#include "clp_s/search/EvaluateTimestampIndex.hpp" #include "clp_s/search/ast/ConvertToExists.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/NarrowTypes.hpp" #include "clp_s/search/ast/OrOfAndForm.hpp" -#include "clp_s/search/ast/SearchUtils.hpp" #include "clp_s/search/kql/kql.hpp" using namespace clp_s; diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index d481708ed77e..aa4152dfbaf8 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -16,23 +16,18 @@ #pragma once +#include #include +#include #include -#include "archive/ClpQueryRunner.h" -#include "connectors/clp/ClpConnectorSplit.h" +#include "clp_s/InputConfig.hpp" +#include "velox/connectors/clp/ClpConnectorSplit.h" namespace clp_s { -enum class InputSource : uint8_t; -class ArchiveReader; class BaseColumnReader; } // namespace clp_s -namespace clp_s::search { -class Projection; -class SchemaMatch; -} // namespace clp_s::search - namespace clp_s::search::ast { class Expression; } // namespace clp_s::search::ast diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp b/velox/connectors/clp/search_lib/ClpVectorLoader.cpp index 65ffb64c2508..1471b66a5e5e 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ClpVectorLoader.cpp @@ -21,6 +21,7 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/type/Timestamp.h" #include 
"velox/vector/ComplexVector.h" diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.h b/velox/connectors/clp/search_lib/ClpVectorLoader.h index 0d81b72ff013..69a7f259f243 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.h +++ b/velox/connectors/clp/search_lib/ClpVectorLoader.h @@ -16,11 +16,12 @@ #pragma once +#include + #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" -#include "connectors/clp/ClpConnectorSplit.h" +#include "velox/connectors/clp/ClpConnectorSplit.h" -#include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/type/Timestamp.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" @@ -31,6 +32,8 @@ class BaseColumnReader; namespace facebook::velox::connector::clp::search_lib { +enum class ColumnType; + /// A custom Velox VectorLoader that populates Velox vectors from a CLP-based /// column reader. It supports various column types including integers, floats, /// booleans, strings, and arrays of strings. 
diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 5f7a0e0b0456..55c5e995df2d 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -16,12 +16,13 @@ #include -#include "ClpArchiveCursor.h" +#include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" #include "clp_s/ArchiveReader.hpp" #include "clp_s/search/EvaluateTimestampIndex.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/SearchUtils.hpp" +#include "velox/connectors/clp/search_lib/archive/ClpQueryRunner.h" using namespace clp_s; using namespace clp_s::search; @@ -53,12 +54,6 @@ uint64_t ClpArchiveCursor::fetchNext( if (ErrorCode::Success != errorCode_) { return 0; } - - archiveReader_->open_packed_streams(); - currentSplitLoaded_ = true; - queryRunner_ = std::make_shared( - schemaMatch_, expr_, archiveReader_, false, projection_); - queryRunner_->global_init(); } while (currentSchemaIndex_ < matchedSchemas_.size()) { @@ -207,6 +202,12 @@ ErrorCode ClpArchiveCursor::loadSplit() { currentSchemaIndex_ = 0; currentSchemaTableLoaded_ = false; + + archiveReader_->open_packed_streams(); + currentSplitLoaded_ = true; + queryRunner_ = std::make_shared( + schemaMatch_, expr_, archiveReader_, false, projection_); + queryRunner_->global_init(); return ErrorCode::Success; } diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index 6167a82db7a0..65859bc5b1a6 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -18,8 +18,19 @@ #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +namespace clp_s { +class ArchiveReader; +} // namespace clp_s + +namespace clp_s::search { +class Projection; +class SchemaMatch; +} // namespace clp_s::search + 
namespace facebook::velox::connector::clp::search_lib { +class ClpQueryRunner; + class ClpArchiveCursor final : public BaseClpCursor { public: explicit ClpArchiveCursor( From c0cac3d302cd1ca6cbaad760086e865d1853c545 Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 20 Aug 2025 22:39:26 +0000 Subject: [PATCH 06/34] Address comments --- .../connectors/clp/search_lib/BaseClpCursor.h | 10 ++++----- .../connectors/clp/search_lib/CMakeLists.txt | 13 ++---------- .../clp/search_lib/archive/CMakeLists.txt | 21 ++++++++++++++----- .../clp/search_lib/archive/ClpArchiveCursor.h | 2 ++ .../clp/search_lib/archive/ClpQueryRunner.cpp | 2 +- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index aa4152dfbaf8..91f44a258f81 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -61,9 +61,9 @@ struct Field { }; /// A query execution interface that manages the lifecycle of a query on a CLP-S -/// archive, including parsing and validating the query, loading the relevant -/// schemas and archives, applying filters, and iterating over the results. It -/// abstracts away the low-level details of archive access and schema matching +/// split (archive or IR), including parsing and validating the query, loading +/// the relevant splits, applying filters, and iterating over the results. It +/// abstracts away the low-level details of split access /// while supporting projection and batch-oriented retrieval of filtered rows. class BaseClpCursor { public: @@ -86,8 +86,8 @@ class BaseClpCursor { const std::string& query, const std::vector& outputColumns); - /// Fetches the next set of rows from the cursor. If the split and schema - /// are not yet loaded, this function will perform the necessary loading. + /// Fetches the next set of rows from the cursor. 
If the split is not yet + /// loaded, this function will perform the necessary loading. /// /// @param numRows The maximum number of rows to fetch. /// @param filteredRowIndices A vector of row indices that match the filter. diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index 823e135ff4c4..4575eec12849 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -23,17 +23,8 @@ velox_add_library( ClpVectorLoader.cpp ClpVectorLoader.h) -target_include_directories(clp-s-search PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - add_subdirectory(archive) -velox_link_libraries( - clp-s-search - PUBLIC clp_s::archive_reader - PRIVATE - clp_s::clp_dependencies - clp_s::io - clp_s::search - clp_s::search::kql - velox_vector) +velox_link_libraries(clp-s-search PUBLIC clp-s-archive-search) + target_compile_features(clp-s-search PRIVATE cxx_std_20) diff --git a/velox/connectors/clp/search_lib/archive/CMakeLists.txt b/velox/connectors/clp/search_lib/archive/CMakeLists.txt index 3777be57d533..3fc3b92add43 100644 --- a/velox/connectors/clp/search_lib/archive/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/archive/CMakeLists.txt @@ -11,9 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-target_sources( - clp-s-search - PRIVATE ClpArchiveCursor.cpp ClpArchiveCursor.h ClpQueryRunner.cpp - ClpQueryRunner.h) +velox_add_library( + clp-s-archive-search + STATIC + ClpArchiveCursor.cpp + ClpArchiveCursor.h + ClpQueryRunner.cpp + ClpQueryRunner.h) -target_include_directories(clp-s-search PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +velox_link_libraries( + clp-s-archive-search + PUBLIC clp_s::archive_reader + PRIVATE + clp_s::clp_dependencies + clp_s::io + clp_s::search + clp_s::search::kql + velox_vector) diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index 65859bc5b1a6..e8b0c1362ae3 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -31,6 +31,8 @@ namespace facebook::velox::connector::clp::search_lib { class ClpQueryRunner; +/// A query execution implementation that manages the lifecycle of a query on a +/// CLP-S archive. class ClpArchiveCursor final : public BaseClpCursor { public: explicit ClpArchiveCursor( diff --git a/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp b/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp index dcb6fdc4bb5b..0691b0af203a 100644 --- a/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpQueryRunner.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "ClpQueryRunner.h" +#include "velox/connectors/clp/search_lib/archive/ClpQueryRunner.h" using namespace clp_s; using namespace clp_s::search; From 27ba89734098c7db8c4b8896869302a737e04e23 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 2 Sep 2025 18:33:40 +0000 Subject: [PATCH 07/34] WIP --- .../ystdlib_cpp.cmake | 1 + velox/connectors/clp/ClpDataSource.cpp | 4 + .../connectors/clp/search_lib/CMakeLists.txt | 1 + .../clp/search_lib/ir/CMakeLists.txt | 10 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 60 +++++------ .../clp/search_lib/ir/ClpIrCursor.h | 93 +----------------- .../search_lib/ir/ClpVeloxIrQueryHandler.cpp | 38 +++++++ .../search_lib/ir/ClpVeloxIrQueryHandler.h | 50 ++++++++++ .../search_lib/ir/ClpVeloxIrUnitHandler.cpp | 78 +++++++++++++++ .../clp/search_lib/ir/ClpVeloxIrUnitHandler.h | 77 +++++++++++++++ .../connectors/clp/tests/ClpConnectorTest.cpp | 34 +++++++ .../clp/tests/examples/example.clps | Bin 0 -> 261 bytes .../clp/tests/examples/example2.clps | Bin 0 -> 261 bytes 13 files changed, 322 insertions(+), 124 deletions(-) create mode 100644 velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp create mode 100644 velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h create mode 100644 velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp create mode 100644 velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h create mode 100644 velox/connectors/clp/tests/examples/example.clps create mode 100644 velox/connectors/clp/tests/examples/example2.clps diff --git a/CMake/resolve_dependency_modules/ystdlib_cpp.cmake b/CMake/resolve_dependency_modules/ystdlib_cpp.cmake index a47b24416c87..931df80ead06 100644 --- a/CMake/resolve_dependency_modules/ystdlib_cpp.cmake +++ b/CMake/resolve_dependency_modules/ystdlib_cpp.cmake @@ -21,3 +21,4 @@ FetchContent_Declare( FetchContent_Populate(ystdlib_cpp) set(CLP_YSTDLIB_SOURCE_DIRECTORY "${ystdlib_cpp_SOURCE_DIR}") +include_directories(${ystdlib_cpp_SOURCE_DIR}/src) diff --git 
a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 03059f584b64..9454c9e5e09d 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -23,6 +23,7 @@ #include "velox/connectors/clp/search_lib/ClpS3AuthProviderBase.h" #include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" #include "velox/vector/FlatVector.h" namespace facebook::velox::connector::clp { @@ -117,6 +118,9 @@ void ClpDataSource::addSplit(std::shared_ptr split) { if (ClpConnectorSplit::SplitType::kArchive == clpSplit->type_) { cursor_ = std::make_unique(inputSource, splitPath); + } else if (ClpConnectorSplit::SplitType::kIr == clpSplit->type_) { + cursor_ = + std::make_unique(inputSource, splitPath, true); } else { VELOX_UNSUPPORTED( "Unsupported split type: {}", static_cast(clpSplit->type_)); diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index 6edf2460935b..d7d69c3370a4 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -24,6 +24,7 @@ velox_add_library( ClpVectorLoader.h) add_subdirectory(archive) +add_subdirectory(ir) velox_link_libraries( clp-s-search diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 44ceb9f1be85..078fc93e32ee 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -11,7 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-velox_add_library(clp-s-ir-search STATIC ClpIrCursor.cpp ClpIrCursor.h) +velox_add_library( + clp-s-ir-search + STATIC + ClpIrCursor.cpp + ClpIrCursor.h + ClpVeloxIrQueryHandler.cpp + ClpVeloxIrQueryHandler.h + ClpVeloxIrUnitHandler.cpp + ClpVeloxIrUnitHandler.h) velox_link_libraries( clp-s-ir-search diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 8e938e4a4da1..9687b0931e67 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -14,7 +14,10 @@ * limitations under the License. */ -#include "ClpIrCursor.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" +#include "ffi/ir_stream/search/QueryHandler.hpp" + +#include "clp_s/ColumnReader.hpp" #include "clp_s/InputConfig.hpp" using namespace clp_s; @@ -36,6 +39,7 @@ uint64_t ClpIrCursor::fetchNext( } size_t rowsFetched{0ULL}; + deserialize(); // while (rowsFetched < numRows && kvir_deserializer_.has_value()) { // auto const // result{kvir_deserializer_.value().deserialize_next_ir_unit(kvir_decompressor_)}; @@ -61,18 +65,33 @@ uint64_t ClpIrCursor::fetchNext( return rowsFetched; } +ystdlib::error_handling::Result ClpIrCursor::deserialize() const { + while (::clp::ffi::ir_stream::IrUnitType::EndOfStream != + YSTDLIB_ERROR_HANDLING_TRYX( + irDeserializer_->deserialize_next_ir_unit(*irReader_))) { + } + return ystdlib::error_handling::success(); +} + const std::vector& ClpIrCursor::getProjectedColumns() - const {} + const { + auto projectedColumns = + std::make_unique>(); + for (Field field : outputColumns_) { + auto nodeId = + irDeserializer_->get_ir_unit_handler().findNodeIdByName(field.name); + } +} ErrorCode ClpIrCursor::loadSplit() { auto networkAuthOption = inputSource_ == InputSource::Filesystem ? 
NetworkAuthOption{.method = AuthMethod::None} : NetworkAuthOption{.method = AuthMethod::S3PresignedUrlV4}; - auto irHandler{ClpVeloxIrUnitHandler{shared_from_this()}}; + auto irHandler{ClpVeloxIrUnitHandler{}}; - auto queryHandlerResult{QueryHandlerType::create( - handleProjectionResolution, std::move(expr_), {}, ignoreCase_)}; + auto queryHandlerResult{ir::QueryHandlerType::create( + ir::handleProjectionResolution, std::move(expr_), {}, ignoreCase_)}; if (!queryHandlerResult) { VLOG(2) << "Failed to create query handler for deserialization."; return ErrorCode::InternalError; @@ -95,38 +114,9 @@ ErrorCode ClpIrCursor::loadSplit() { } irDeserializer_ = std::make_shared<::clp::ffi::ir_stream::Deserializer< ClpVeloxIrUnitHandler, - QueryHandlerType>>(std::move(deserializerResult).value()); + ir::QueryHandlerType>>(std::move(deserializerResult).value()); return ErrorCode::Success; } -void ClpIrCursor::addDeserializedLogEvent( - ::clp::ffi::KeyValuePairLogEvent logEvent) { - deserializedLogEvent_.emplace(std::move(logEvent)); -} - -void ClpIrCursor::addOrderedResolvedId( - size_t idxOfProjectedColumn, - ::clp::ffi::SchemaTree::Node::id_t nodeId, - bool isAutoGenerated) { - orderedResolvedIds_.at(idxOfProjectedColumn) - .emplace_back(std::make_pair(nodeId, isAutoGenerated)); -} - -std::optional ClpIrCursor::findProjectedColumnIdxByKeyName( - std::string_view keyName) { - auto it = projectedColumnToIdx_.find(std::string(keyName)); - if (projectedColumnToIdx_.end() != it) { - return std::nullopt; - } - return it->second; -} - -auto ClpVeloxIrUnitHandler::handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event) - -> ::clp::ffi::ir_stream::IRErrorCode { - cursor_->addDeserializedLogEvent(std::move(log_event)); - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; -} - } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index 
22a3b07d81f2..b7435f6b04e4 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -18,53 +18,13 @@ #include "ffi/ir_stream/Deserializer.hpp" #include "streaming_compression/Decompressor.hpp" -#include "streaming_compression/zstd/Decompressor.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h" +#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h" namespace facebook::velox::connector::clp::search_lib { -class ClpIrCursor; - -class ClpVeloxIrUnitHandler { - public: - ClpVeloxIrUnitHandler(std::shared_ptr cursor) - : cursor_(std::move(cursor)) {} - - // Destructor - ~ClpVeloxIrUnitHandler() = default; - - // Methods implementing `IrUnitHandlerInterface` - [[nodiscard]] auto handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event) - -> ::clp::ffi::ir_stream::IRErrorCode; - - [[nodiscard]] static auto handle_utc_offset_change( - [[maybe_unused]] ::clp::UtcOffset utc_offset_old, - [[maybe_unused]] ::clp::UtcOffset utc_offset_new) - -> ::clp::ffi::ir_stream::IRErrorCode { - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; - } - - [[nodiscard]] static auto handle_schema_tree_node_insertion( - [[maybe_unused]] bool is_auto_generated, - [[maybe_unused]] ::clp::ffi::SchemaTree::NodeLocator - schema_tree_node_locator, - [[maybe_unused]] std::shared_ptr<::clp::ffi::SchemaTree const> const& - schema_tree) -> ::clp::ffi::ir_stream::IRErrorCode { - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; - } - - [[nodiscard]] auto handle_end_of_stream() - -> ::clp::ffi::ir_stream::IRErrorCode { - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; - } - - private: - std::shared_ptr cursor_; -}; - -class ClpIrCursor final : public BaseClpCursor, - std::enable_shared_from_this { +class ClpIrCursor final : public BaseClpCursor { public: explicit ClpIrCursor( clp_s::InputSource 
inputSource, @@ -80,60 +40,17 @@ class ClpIrCursor final : public BaseClpCursor, const std::vector& getProjectedColumns() const override; - void addDeserializedLogEvent(::clp::ffi::KeyValuePairLogEvent logEvent); - - std::optional findProjectedColumnIdxByKeyName( - std::string_view keyName); - - void addOrderedResolvedId( - size_t idxOfProjectedColumn, - ::clp::ffi::SchemaTree::Node::id_t nodeId, - bool isAutoGenerated); - protected: ErrorCode loadSplit() override; private: - using OrderedResolvedId = - std::vector>; std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; bool ignoreCase_; - - std::map projectedColumnToIdx_; - std::vector orderedResolvedIds_; - std::optional<::clp::ffi::KeyValuePairLogEvent> deserializedLogEvent_; - - std::function(bool, ::clp::ffi::SchemaTree::Node::id_t, std::string_view)> - handleProjectionResolution = - [this]( - [[maybe_unused]] bool isAutoGenerated, - [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, - [[maybe_unused]] std::string_view keyName) - -> ystdlib::error_handling::Result { - auto projectedColumnIdx = findProjectedColumnIdxByKeyName(keyName); - if (projectedColumnIdx.has_value()) { - addOrderedResolvedId(projectedColumnIdx.value(), nodeId, isAutoGenerated); - return ::clp::ffi::ir_stream::ir_error_code_to_errc( - ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success); - } - return ::clp::ffi::ir_stream::ir_error_code_to_errc( - ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Corrupted_IR); - }; - using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< - decltype(handleProjectionResolution)>; - std::shared_ptr<::clp::ffi::ir_stream:: - Deserializer> + Deserializer> irDeserializer_; - std::optional<::clp::ffi::KeyValuePairLogEvent> const& - getDeserializedLogEvent() const { - return deserializedLogEvent_; - } - std::vector const& getOrderedResolvedIds() const { - return orderedResolvedIds_; - } + ystdlib::error_handling::Result deserialize() const; }; } // namespace 
facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp new file mode 100644 index 000000000000..1adadd48631c --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h" + +namespace facebook::velox::connector::clp::search_lib { + +void ClpVeloxIrQueryHandler::addOrderedResolvedId( + size_t idxOfProjectedColumn, + ::clp::ffi::SchemaTree::Node::id_t nodeId, + bool isAutoGenerated) { + orderedResolvedIds_.at(idxOfProjectedColumn) + .emplace_back(std::make_pair(nodeId, isAutoGenerated)); +} + +std::optional ClpVeloxIrQueryHandler::findProjectedColumnIdxByKeyName( + std::string_view keyName) { + auto it = projectedColumnToIdx_.find(std::string(keyName)); + if (projectedColumnToIdx_.end() != it) { + return std::nullopt; + } + return it->second; +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h b/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h new file mode 100644 index 000000000000..8207e5d6461f --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 
Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "ffi/ir_stream/Deserializer.hpp" + +namespace facebook::velox::connector::clp::search_lib { + +namespace ir { +static std::function handleProjectionResolution = + []([[maybe_unused]] bool isAutoGenerated, + [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, + [[maybe_unused]] std::string_view keyName) + -> ystdlib::error_handling::Result { + return ystdlib::error_handling::success(); +}; +using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< + decltype(handleProjectionResolution)>; +} // namespace ir + +class ClpVeloxIrQueryHandler { + private: + std::map projectedColumnToIdx_; + std::vector>> + orderedResolvedIds_; + + std::optional findProjectedColumnIdxByKeyName( + std::string_view keyName); + + void addOrderedResolvedId( + size_t idxOfProjectedColumn, + ::clp::ffi::SchemaTree::Node::id_t nodeId, + bool isAutoGenerated); +}; + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp new file mode 100644 index 000000000000..2c6cf590197e --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h" + +#include "clp_s/SchemaTree.hpp" +#include "common/base/Exceptions.h" + +namespace facebook::velox::connector::clp::search_lib { + +auto ClpVeloxIrUnitHandler::handle_log_event( + ::clp::ffi::KeyValuePairLogEvent log_event) + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; +} + +auto ClpVeloxIrUnitHandler::handle_schema_tree_node_insertion( + bool is_auto_generated, + ::clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, + std::shared_ptr<::clp::ffi::SchemaTree const> const& schema_tree) + -> ::clp::ffi::ir_stream::IRErrorCode { + auto parentNodeId = schema_tree_node_locator.get_parent_id(); + auto selfNodeId = static_cast<::clp::ffi::SchemaTree::Node::id_t>( + schema_tree->get_size() - 1); + if (is_auto_generated) { + std::string actualNodeName; + if (schema_tree->get_node(parentNodeId).is_root()) { + actualNodeName = schema_tree_node_locator.get_key_name(); + } else { + actualNodeName = fmt::format( + "{}.{}", + autoGenNodeIdNameMap[parentNodeId], + schema_tree_node_locator.get_key_name()); + } + autoGenNodeIdNameMap[selfNodeId] = actualNodeName; + autoGenNodeNameIdMap[actualNodeName] = selfNodeId; + } else { + std::string actualNodeName; + if (schema_tree->get_node(parentNodeId).is_root()) { + actualNodeName = schema_tree_node_locator.get_key_name(); + } else { + 
actualNodeName = fmt::format( + "{}.{}", + userGenNodeIdNameMap[parentNodeId], + schema_tree_node_locator.get_key_name()); + } + userGenNodeIdNameMap[selfNodeId] = actualNodeName; + userGenNodeNameIdMap[actualNodeName] = selfNodeId; + } + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; +} + +::clp::ffi::SchemaTree::Node::id_t ClpVeloxIrUnitHandler::findNodeIdByName( + std::string_view name) const { + auto name_str = std::string(name); + if (0 != autoGenNodeNameIdMap.count(name_str)) { + return autoGenNodeNameIdMap.at(name_str); + } + if (0 != userGenNodeNameIdMap.count(name_str)) { + return userGenNodeNameIdMap.at(name_str); + } + VELOX_USER_FAIL(fmt::format("No field named: {}", name_str)); +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h new file mode 100644 index 000000000000..a172567cdc2f --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "ffi/ir_stream/Deserializer.hpp" + +namespace facebook::velox::connector::clp::search_lib { + +class ClpVeloxIrUnitHandler { + public: + ClpVeloxIrUnitHandler() { + autoGenNodeIdNameMap = + std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string>{}; + autoGenNodeNameIdMap = + std::unordered_map{}; + userGenNodeIdNameMap = + std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string>{}; + userGenNodeNameIdMap = + std::unordered_map{}; + } + + // Destructor + ~ClpVeloxIrUnitHandler() = default; + + // Methods implementing `IrUnitHandlerInterface` + [[nodiscard]] auto handle_log_event( + ::clp::ffi::KeyValuePairLogEvent log_event) + -> ::clp::ffi::ir_stream::IRErrorCode; + + [[nodiscard]] auto handle_utc_offset_change( + [[maybe_unused]] ::clp::UtcOffset utc_offset_old, + [[maybe_unused]] ::clp::UtcOffset utc_offset_new) + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] auto handle_schema_tree_node_insertion( + [[maybe_unused]] bool is_auto_generated, + [[maybe_unused]] ::clp::ffi::SchemaTree::NodeLocator + schema_tree_node_locator, + [[maybe_unused]] std::shared_ptr<::clp::ffi::SchemaTree const> const& + schema_tree) -> ::clp::ffi::ir_stream::IRErrorCode; + + [[nodiscard]] auto handle_end_of_stream() + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } + + ::clp::ffi::SchemaTree::Node::id_t findNodeIdByName( + std::string_view name) const; + + private: + std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string> + autoGenNodeIdNameMap; + std::unordered_map + autoGenNodeNameIdMap; + std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string> + userGenNodeIdNameMap; + std::unordered_map + userGenNodeNameIdMap; +}; + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp 
b/velox/connectors/clp/tests/ClpConnectorTest.cpp index 0e20dbb23b92..096032663196 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -86,6 +86,40 @@ class ClpConnectorTest : public exec::test::OperatorTestBase { } }; +TEST_F(ClpConnectorTest, testIr) { + const std::shared_ptr kqlQuery = nullptr; + auto plan = + PlanBuilder() + .startTableScan() + .outputType( + ROW({"level", "message", "user"}, + {VARCHAR(), + VARCHAR(), + ROW({"uid", "ip"}, {BIGINT(), VARCHAR()})})) + .tableHandle( + std::make_shared(kClpConnectorId, "example")) + .assignments({ + {"level", + std::make_shared("level", "level", VARCHAR())}, + {"message", + std::make_shared( + "message", "message", VARCHAR())}, + {"user", + std::make_shared( + "user", "user", ROW({"uid", "ip"}, {BIGINT(), VARCHAR()}))}, + }) + .endTableScan() + .filter("level = 'INFO'") + .planNode(); + auto output = getResults( + plan, + {makeClpSplit( + getExampleFilePath("example2.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + std::cout << "Live" << std::endl; +} + TEST_F(ClpConnectorTest, test1NoPushdown) { const std::shared_ptr kqlQuery = nullptr; auto plan = PlanBuilder() diff --git a/velox/connectors/clp/tests/examples/example.clps b/velox/connectors/clp/tests/examples/example.clps new file mode 100644 index 0000000000000000000000000000000000000000..bfac0b001ae4b9a4cb7c1d609b390b65086e42ce GIT binary patch literal 261 zcmeyXzg3e_a9Xuem}8Kqqmz$oaD1?{hpVq+yr+wjl~Qtku3lwva(+RoUUE)>URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 z!wf+txdsJ$`uhQ`HPAEEGf=86N%-%`nv+_Vnp2X%;K-hvT3no%o|^j4k;T)`%|Def zQc)l{wWusJIaQ&!B(bO@HASxssF0URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 z!wf+txdsJ$`uhQ`HPAEEGf=86N%-%`nv+_Vnp2X%;K-hvT3no%o|^j4k;T)`%|Def zQc)l{wWusJIaQ&!B(bO@HASxssF0 Date: Tue, 2 Sep 2025 19:27:13 +0000 Subject: [PATCH 08/34] Move the current VectorLoader to Archive-specific --- velox/connectors/clp/ClpDataSource.cpp | 53 +--------------- 
velox/connectors/clp/ClpDataSource.h | 19 ------ .../connectors/clp/search_lib/BaseClpCursor.h | 31 +++++----- .../connectors/clp/search_lib/CMakeLists.txt | 4 +- .../clp/search_lib/archive/CMakeLists.txt | 2 + .../search_lib/archive/ClpArchiveCursor.cpp | 62 +++++++++++++++++++ .../clp/search_lib/archive/ClpArchiveCursor.h | 18 +++++- .../ClpArchiveVectorLoader.cpp} | 35 ++++++----- .../ClpArchiveVectorLoader.h} | 9 +-- 9 files changed, 122 insertions(+), 111 deletions(-) rename velox/connectors/clp/search_lib/{ClpVectorLoader.cpp => archive/ClpArchiveVectorLoader.cpp} (91%) rename velox/connectors/clp/search_lib/{ClpVectorLoader.h => archive/ClpArchiveVectorLoader.h} (87%) diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 03059f584b64..87d3d9b71727 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -16,12 +16,12 @@ #include +#include "search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/connectors/clp/ClpColumnHandle.h" #include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/ClpDataSource.h" #include "velox/connectors/clp/ClpTableHandle.h" #include "velox/connectors/clp/search_lib/ClpS3AuthProviderBase.h" -#include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" #include "velox/vector/FlatVector.h" @@ -130,46 +130,6 @@ void ClpDataSource::addSplit(std::shared_ptr split) { } } -VectorPtr ClpDataSource::createVector( - const TypePtr& vectorType, - size_t vectorSize, - const std::vector& projectedColumns, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) { - if (vectorType->kind() == TypeKind::ROW) { - std::vector children; - auto& rowType = vectorType->as(); - for (uint32_t i = 0; i < rowType.size(); ++i) { - children.push_back(createVector( - rowType.childAt(i), - vectorSize, - projectedColumns, - filteredRows, - readerIndex)); - } - return 
std::make_shared( - pool_, vectorType, nullptr, vectorSize, std::move(children)); - } - auto vector = BaseVector::create(vectorType, vectorSize, pool_); - vector->setNulls(allocateNulls(vectorSize, pool_, bits::kNull)); - - VELOX_CHECK_LT( - readerIndex, projectedColumns.size(), "Reader index out of bounds"); - auto projectedColumn = projectedColumns[readerIndex]; - auto projectedType = fields_[readerIndex].type; - readerIndex++; - return std::make_shared( - pool_, - vectorType, - vectorSize, - std::make_unique( - projectedColumn, - projectedType, - filteredRows, - cursor_->getSplitType()), - std::move(vector)); -} - std::optional ClpDataSource::next( uint64_t size, ContinueFuture& future) { @@ -181,15 +141,8 @@ std::optional ClpDataSource::next( } completedRows_ += rowsScanned; size_t readerIndex = 0; - const auto& projectedColumns = cursor_->getProjectedColumns(); - VELOX_CHECK_EQ( - projectedColumns.size(), - fields_.size(), - "Projected columns size {} does not match fields size {}", - projectedColumns.size(), - fields_.size()); - return std::dynamic_pointer_cast(createVector( - outputType_, rowsFiltered, projectedColumns, filteredRows, readerIndex)); + return std::dynamic_pointer_cast(cursor_->createVector( + pool_, outputType_, rowsFiltered, filteredRows, readerIndex)); } } // namespace facebook::velox::connector::clp diff --git a/velox/connectors/clp/ClpDataSource.h b/velox/connectors/clp/ClpDataSource.h index 32611525fe20..a500219f3f82 100644 --- a/velox/connectors/clp/ClpDataSource.h +++ b/velox/connectors/clp/ClpDataSource.h @@ -74,25 +74,6 @@ class ClpDataSource : public DataSource { const TypePtr& columnType, const std::string& parentName); - /// Creates a Vector of the specified type and size. - /// - /// This method recursively creates vectors for complex types like ROW. For - /// primitive types, it creates a LazyVector that will load the data from the - /// underlying data source when it is accessed. 
- /// - /// @param vectorType - /// @param vectorSize - /// @param projectedColumns The readers of the projected columns. - /// @param filteredRows The rows to be read. - /// @param readerIndex The index of the column reader. - /// @return A Vector of the specified type and size. - VectorPtr createVector( - const TypePtr& vectorType, - size_t vectorSize, - const std::vector& projectedColumns, - const std::shared_ptr>& filteredRows, - size_t& readerIndex); - ClpConfig::StorageType storageType_; velox::memory::MemoryPool* pool_; RowTypePtr outputType_; diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index 91f44a258f81..0929a4196fe1 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -72,8 +72,7 @@ class BaseClpCursor { std::string_view splitPath) : errorCode_(ErrorCode::QueryNotInitialized), inputSource_(inputSource), - splitPath_(std::string(splitPath)), - splitType_(ClpConnectorSplit::SplitType::kArchive) {} + splitPath_(std::string(splitPath)) {} virtual ~BaseClpCursor() = default; /// Executes a query. This function parses, validates, and prepares the given @@ -96,19 +95,24 @@ class BaseClpCursor { uint64_t numRows, const std::shared_ptr>& filteredRowIndices) = 0; - /// Retrieves the projected columns. + /// Creates a Vector of the specified type and size. /// - /// @return A vector of BaseColumnReader pointers representing the projected - /// columns. - virtual const std::vector& getProjectedColumns() - const = 0; - - /// Get the type of the split that the cursor is processing. + /// This method recursively creates vectors for complex types like ROW. For + /// primitive types, it creates a LazyVector that will load the data from the + /// underlying data source when it is accessed. /// - /// @return The split type. 
- ClpConnectorSplit::SplitType getSplitType() const { - return splitType_; - } + /// @param pool The memory pool used by ClpDataSource to create the vector + /// @param vectorType + /// @param vectorSize + /// @param filteredRows The rows to be read. + /// @param readerIndex The index of the column reader. + /// @return A Vector of the specified type and size. + virtual VectorPtr createVector( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) = 0; protected: /// @@ -119,7 +123,6 @@ class BaseClpCursor { clp_s::InputSource inputSource_{clp_s::InputSource::Filesystem}; std::string splitPath_; - ClpConnectorSplit::SplitType splitType_; std::string query_; std::vector outputColumns_; diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index 4575eec12849..2dd05a96cf35 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -19,9 +19,7 @@ velox_add_library( ClpPackageS3AuthProvider.cpp ClpPackageS3AuthProvider.h ClpS3AuthProviderBase.cpp - ClpS3AuthProviderBase.h - ClpVectorLoader.cpp - ClpVectorLoader.h) + ClpS3AuthProviderBase.h) add_subdirectory(archive) diff --git a/velox/connectors/clp/search_lib/archive/CMakeLists.txt b/velox/connectors/clp/search_lib/archive/CMakeLists.txt index 3fc3b92add43..3f09cce6fd0a 100644 --- a/velox/connectors/clp/search_lib/archive/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/archive/CMakeLists.txt @@ -16,6 +16,8 @@ velox_add_library( STATIC ClpArchiveCursor.cpp ClpArchiveCursor.h + ClpArchiveVectorLoader.cpp + ClpArchiveVectorLoader.h ClpQueryRunner.cpp ClpQueryRunner.h) diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 55c5e995df2d..99c6bccd2083 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ 
b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -22,6 +22,7 @@ #include "clp_s/search/EvaluateTimestampIndex.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/SearchUtils.hpp" +#include "velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/connectors/clp/search_lib/archive/ClpQueryRunner.h" using namespace clp_s; @@ -87,6 +88,67 @@ uint64_t ClpArchiveCursor::fetchNext( return 0; } +VectorPtr ClpArchiveCursor::createVector( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) { + auto projectedColumns = getProjectedColumns(); + VELOX_CHECK_EQ( + projectedColumns.size(), + outputColumns_.size(), + "Projected columns size {} does not match fields size {}", + projectedColumns.size(), + outputColumns_.size()); + return createVectorHelper( + pool, + vectorType, + vectorSize, + projectedColumns, + filteredRows, + readerIndex); +} + +VectorPtr ClpArchiveCursor::createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::vector& projectedColumns, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) { + if (vectorType->kind() == TypeKind::ROW) { + std::vector children; + auto& rowType = vectorType->as(); + for (uint32_t i = 0; i < rowType.size(); ++i) { + children.push_back(createVectorHelper( + pool, + rowType.childAt(i), + vectorSize, + projectedColumns, + filteredRows, + readerIndex)); + } + return std::make_shared( + pool, vectorType, nullptr, vectorSize, std::move(children)); + } + auto vector = BaseVector::create(vectorType, vectorSize, pool); + vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); + + VELOX_CHECK_LT( + readerIndex, projectedColumns.size(), "Reader index out of bounds"); + auto projectedColumn = projectedColumns[readerIndex]; + auto projectedType = outputColumns_[readerIndex].type; + readerIndex++; + return std::make_shared( + 
pool, + vectorType, + vectorSize, + std::make_unique( + projectedColumn, projectedType, filteredRows), + std::move(vector)); +} + const std::vector& ClpArchiveCursor::getProjectedColumns() const { if (queryRunner_) { diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index e8b0c1362ae3..942520281965 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -45,8 +45,12 @@ class ClpArchiveCursor final : public BaseClpCursor { const std::shared_ptr>& filteredRowIndices) override; - const std::vector& getProjectedColumns() - const override; + VectorPtr createVector( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) override; protected: ErrorCode loadSplit() override; @@ -62,6 +66,16 @@ class ClpArchiveCursor final : public BaseClpCursor { std::shared_ptr projection_; std::shared_ptr archiveReader_; + + const std::vector& getProjectedColumns() const; + + VectorPtr createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::vector& projectedColumns, + const std::shared_ptr>& filteredRows, + size_t& readerIndex); }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp similarity index 91% rename from velox/connectors/clp/search_lib/ClpVectorLoader.cpp rename to velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 1471b66a5e5e..29b5bbf50f8e 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -21,8 +21,8 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" +#include "ClpArchiveVectorLoader.h" #include 
"velox/connectors/clp/search_lib/BaseClpCursor.h" -#include "velox/connectors/clp/search_lib/ClpVectorLoader.h" #include "velox/type/Timestamp.h" #include "velox/vector/ComplexVector.h" #include "velox/vector/FlatVector.h" @@ -122,18 +122,16 @@ auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { } // namespace -ClpVectorLoader::ClpVectorLoader( +ClpArchiveVectorLoader::ClpArchiveVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices, - ClpConnectorSplit::SplitType splitType) + std::shared_ptr> filteredRowIndices) : columnReader_(columnReader), nodeType_(nodeType), - filteredRowIndices_(std::move(filteredRowIndices)), - splitType_(splitType) {} + filteredRowIndices_(std::move(filteredRowIndices)) {} template -void ClpVectorLoader::populateData(RowSet rows, VectorPtr vector) { +void ClpArchiveVectorLoader::populateData(RowSet rows, VectorPtr vector) { if (columnReader_ == nullptr) { for (int vectorIndex : rows) { vector->setNull(vectorIndex, true); @@ -158,7 +156,7 @@ void ClpVectorLoader::populateData(RowSet rows, VectorPtr vector) { } template -void ClpVectorLoader::populateTimestampData( +void ClpArchiveVectorLoader::populateTimestampData( RowSet rows, FlatVector* vector) { bool supportedNodeType{false}; @@ -202,7 +200,7 @@ void ClpVectorLoader::populateTimestampData( } } -void ClpVectorLoader::loadInternal( +void ClpArchiveVectorLoader::loadInternal( RowSet rows, ValueHook* hook, vector_size_t resultSize, @@ -310,29 +308,32 @@ void ClpVectorLoader::loadInternal( } // Explicit template instantiations for linker -template void ClpVectorLoader::populateData( +template void ClpArchiveVectorLoader::populateData( RowSet rows, FlatVector* vector); -template void ClpVectorLoader::populateData( +template void ClpArchiveVectorLoader::populateData( RowSet rows, FlatVector* vector); -template void ClpVectorLoader::populateData( +template void ClpArchiveVectorLoader::populateData( RowSet rows, FlatVector* 
vector); -template void ClpVectorLoader::populateData( +template void ClpArchiveVectorLoader::populateData( RowSet rows, FlatVector* vector); -template void ClpVectorLoader::populateTimestampData( +template void +ClpArchiveVectorLoader::populateTimestampData( RowSet rows, FlatVector* vector); -template void ClpVectorLoader::populateTimestampData( +template void +ClpArchiveVectorLoader::populateTimestampData( RowSet rows, FlatVector* vector); template void -ClpVectorLoader::populateTimestampData( +ClpArchiveVectorLoader::populateTimestampData( RowSet rows, FlatVector* vector); -template void ClpVectorLoader::populateTimestampData( +template void +ClpArchiveVectorLoader::populateTimestampData( RowSet rows, FlatVector* vector); diff --git a/velox/connectors/clp/search_lib/ClpVectorLoader.h b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h similarity index 87% rename from velox/connectors/clp/search_lib/ClpVectorLoader.h rename to velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h index 69a7f259f243..35ad6c5ae700 100644 --- a/velox/connectors/clp/search_lib/ClpVectorLoader.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h @@ -20,7 +20,6 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" -#include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/type/Timestamp.h" #include "velox/vector/FlatVector.h" @@ -37,13 +36,12 @@ enum class ColumnType; /// A custom Velox VectorLoader that populates Velox vectors from a CLP-based /// column reader. It supports various column types including integers, floats, /// booleans, strings, and arrays of strings. 
-class ClpVectorLoader : public VectorLoader { +class ClpArchiveVectorLoader : public VectorLoader { public: - ClpVectorLoader( + ClpArchiveVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices, - ClpConnectorSplit::SplitType splitType); + std::shared_ptr> filteredRowIndices); private: void loadInternal( @@ -63,7 +61,6 @@ class ClpVectorLoader : public VectorLoader { clp_s::BaseColumnReader* columnReader_; ColumnType nodeType_; std::shared_ptr> filteredRowIndices_; - ClpConnectorSplit::SplitType splitType_; inline static thread_local std::unique_ptr arrayParser_ = std::make_unique(); From f53c6c75ce75dcdb9520e9d2f7fc38e6084d432b Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 3 Sep 2025 15:31:43 +0000 Subject: [PATCH 09/34] Rename and WIP --- .../clp/search_lib/ir/CMakeLists.txt | 8 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 64 +++++++++++--- .../clp/search_lib/ir/ClpIrCursor.h | 18 ++-- ...QueryHandler.cpp => ClpIrQueryHandler.cpp} | 6 +- ...oxIrQueryHandler.h => ClpIrQueryHandler.h} | 2 +- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 38 +++++++++ ...eloxIrUnitHandler.h => ClpIrUnitHandler.h} | 28 +------ .../search_lib/ir/ClpVeloxIrUnitHandler.cpp | 78 ------------------ .../clp/tests/examples/example2.clps | Bin 261 -> 261 bytes 9 files changed, 113 insertions(+), 129 deletions(-) rename velox/connectors/clp/search_lib/ir/{ClpVeloxIrQueryHandler.cpp => ClpIrQueryHandler.cpp} (85%) rename velox/connectors/clp/search_lib/ir/{ClpVeloxIrQueryHandler.h => ClpIrQueryHandler.h} (98%) create mode 100644 velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp rename velox/connectors/clp/search_lib/ir/{ClpVeloxIrUnitHandler.h => ClpIrUnitHandler.h} (63%) delete mode 100644 velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 078fc93e32ee..33837d19fd7b 100644 --- 
a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -16,10 +16,10 @@ velox_add_library( STATIC ClpIrCursor.cpp ClpIrCursor.h - ClpVeloxIrQueryHandler.cpp - ClpVeloxIrQueryHandler.h - ClpVeloxIrUnitHandler.cpp - ClpVeloxIrUnitHandler.h) + ClpIrQueryHandler.cpp + ClpIrQueryHandler.h + ClpIrUnitHandler.cpp + ClpIrUnitHandler.h) velox_link_libraries( clp-s-ir-search diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 9687b0931e67..fbad906494f4 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -65,6 +65,15 @@ uint64_t ClpIrCursor::fetchNext( return rowsFetched; } +VectorPtr ClpIrCursor::createVector( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) { + return nullptr; +} + ystdlib::error_handling::Result ClpIrCursor::deserialize() const { while (::clp::ffi::ir_stream::IrUnitType::EndOfStream != YSTDLIB_ERROR_HANDLING_TRYX( @@ -73,25 +82,19 @@ ystdlib::error_handling::Result ClpIrCursor::deserialize() const { return ystdlib::error_handling::success(); } -const std::vector& ClpIrCursor::getProjectedColumns() - const { - auto projectedColumns = - std::make_unique>(); - for (Field field : outputColumns_) { - auto nodeId = - irDeserializer_->get_ir_unit_handler().findNodeIdByName(field.name); - } -} - ErrorCode ClpIrCursor::loadSplit() { auto networkAuthOption = inputSource_ == InputSource::Filesystem ? 
NetworkAuthOption{.method = AuthMethod::None} : NetworkAuthOption{.method = AuthMethod::S3PresignedUrlV4}; - auto irHandler{ClpVeloxIrUnitHandler{}}; + auto irHandler{ClpIrUnitHandler{}}; + auto projections = splitFieldsToNamesAndTypes(); auto queryHandlerResult{ir::QueryHandlerType::create( - ir::handleProjectionResolution, std::move(expr_), {}, ignoreCase_)}; + ir::handleProjectionResolution, + std::move(expr_), + projections, + ignoreCase_)}; if (!queryHandlerResult) { VLOG(2) << "Failed to create query handler for deserialization."; return ErrorCode::InternalError; @@ -113,10 +116,45 @@ ErrorCode ClpIrCursor::loadSplit() { return ErrorCode::InternalError; } irDeserializer_ = std::make_shared<::clp::ffi::ir_stream::Deserializer< - ClpVeloxIrUnitHandler, + ClpIrUnitHandler, ir::QueryHandlerType>>(std::move(deserializerResult).value()); return ErrorCode::Success; } +std::vector> +ClpIrCursor::splitFieldsToNamesAndTypes() const { + auto result = std::vector< + std::pair>{}; + for (size_t i{0}; i < outputColumns_.size(); ++i) { + auto column = outputColumns_[i]; + clp_s::search::ast::literal_type_bitmask_t literalType; + switch (column.type) { + case ColumnType::Array: + literalType = clp_s::search::ast::LiteralType::ArrayT; + break; + case ColumnType::Boolean: + literalType = clp_s::search::ast::LiteralType::BooleanT; + break; + case ColumnType::Float: + literalType = clp_s::search::ast::LiteralType::FloatT; + break; + case ColumnType::Integer: + literalType = clp_s::search::ast::LiteralType::IntegerT; + break; + case ColumnType::String: + literalType = clp_s::search::ast::LiteralType::VarStringT; + break; + case ColumnType::Timestamp: + literalType = clp_s::search::ast::LiteralType::EpochDateT; + break; + default: + literalType = clp_s::search::ast::LiteralType::UnknownT; + break; + } + result.emplace_back(column.name, literalType); + } + return result; +} + } // namespace facebook::velox::connector::clp::search_lib diff --git 
a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index b7435f6b04e4..32f3612dc197 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -19,8 +19,8 @@ #include "ffi/ir_stream/Deserializer.hpp" #include "streaming_compression/Decompressor.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" -#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h" -#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" namespace facebook::velox::connector::clp::search_lib { @@ -37,8 +37,12 @@ class ClpIrCursor final : public BaseClpCursor { const std::shared_ptr>& filteredRowIndices) override; - const std::vector& getProjectedColumns() - const override; + VectorPtr createVector( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::shared_ptr>& filteredRows, + size_t& readerIndex) override; protected: ErrorCode loadSplit() override; @@ -47,10 +51,14 @@ class ClpIrCursor final : public BaseClpCursor { std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; bool ignoreCase_; std::shared_ptr<::clp::ffi::ir_stream:: - Deserializer> + Deserializer> irDeserializer_; ystdlib::error_handling::Result deserialize() const; + + std::vector< + std::pair> + splitFieldsToNamesAndTypes() const; }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp similarity index 85% rename from velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp rename to velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp index 1adadd48631c..3959cd8df001 100644 --- a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.cpp +++ 
b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h" namespace facebook::velox::connector::clp::search_lib { -void ClpVeloxIrQueryHandler::addOrderedResolvedId( +void ClpIrQueryHandler::addOrderedResolvedId( size_t idxOfProjectedColumn, ::clp::ffi::SchemaTree::Node::id_t nodeId, bool isAutoGenerated) { @@ -26,7 +26,7 @@ void ClpVeloxIrQueryHandler::addOrderedResolvedId( .emplace_back(std::make_pair(nodeId, isAutoGenerated)); } -std::optional ClpVeloxIrQueryHandler::findProjectedColumnIdxByKeyName( +std::optional ClpIrQueryHandler::findProjectedColumnIdxByKeyName( std::string_view keyName) { auto it = projectedColumnToIdx_.find(std::string(keyName)); if (projectedColumnToIdx_.end() != it) { diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h similarity index 98% rename from velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h rename to velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h index 8207e5d6461f..00fe727a8632 100644 --- a/velox/connectors/clp/search_lib/ir/ClpVeloxIrQueryHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h @@ -32,7 +32,7 @@ using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< decltype(handleProjectionResolution)>; } // namespace ir -class ClpVeloxIrQueryHandler { +class ClpIrQueryHandler { private: std::map projectedColumnToIdx_; std::vector>> diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp new file mode 100644 index 000000000000..f6969492b492 --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" + +#include "clp_s/SchemaTree.hpp" +#include "common/base/Exceptions.h" + +namespace facebook::velox::connector::clp::search_lib { + +auto ClpIrUnitHandler::handle_log_event( + ::clp::ffi::KeyValuePairLogEvent log_event) + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; +} + +auto ClpIrUnitHandler::handle_schema_tree_node_insertion( + bool is_auto_generated, + ::clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, + std::shared_ptr<::clp::ffi::SchemaTree const> const& schema_tree) + -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h similarity index 63% rename from velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h rename to velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index a172567cdc2f..9ebb740dd269 100644 --- a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -20,21 +20,12 @@ namespace facebook::velox::connector::clp::search_lib { -class ClpVeloxIrUnitHandler { +class ClpIrUnitHandler { public: - ClpVeloxIrUnitHandler() { - 
autoGenNodeIdNameMap = - std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string>{}; - autoGenNodeNameIdMap = - std::unordered_map{}; - userGenNodeIdNameMap = - std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string>{}; - userGenNodeNameIdMap = - std::unordered_map{}; - } + ClpIrUnitHandler() {} // Destructor - ~ClpVeloxIrUnitHandler() = default; + ~ClpIrUnitHandler() = default; // Methods implementing `IrUnitHandlerInterface` [[nodiscard]] auto handle_log_event( @@ -59,19 +50,6 @@ class ClpVeloxIrUnitHandler { -> ::clp::ffi::ir_stream::IRErrorCode { return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } - - ::clp::ffi::SchemaTree::Node::id_t findNodeIdByName( - std::string_view name) const; - - private: - std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string> - autoGenNodeIdNameMap; - std::unordered_map - autoGenNodeNameIdMap; - std::unordered_map<::clp::ffi::SchemaTree::Node::id_t, std::string> - userGenNodeIdNameMap; - std::unordered_map - userGenNodeNameIdMap; }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp deleted file mode 100644 index 2c6cf590197e..000000000000 --- a/velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/connectors/clp/search_lib/ir/ClpVeloxIrUnitHandler.h" - -#include "clp_s/SchemaTree.hpp" -#include "common/base/Exceptions.h" - -namespace facebook::velox::connector::clp::search_lib { - -auto ClpVeloxIrUnitHandler::handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event) - -> ::clp::ffi::ir_stream::IRErrorCode { - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; -} - -auto ClpVeloxIrUnitHandler::handle_schema_tree_node_insertion( - bool is_auto_generated, - ::clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, - std::shared_ptr<::clp::ffi::SchemaTree const> const& schema_tree) - -> ::clp::ffi::ir_stream::IRErrorCode { - auto parentNodeId = schema_tree_node_locator.get_parent_id(); - auto selfNodeId = static_cast<::clp::ffi::SchemaTree::Node::id_t>( - schema_tree->get_size() - 1); - if (is_auto_generated) { - std::string actualNodeName; - if (schema_tree->get_node(parentNodeId).is_root()) { - actualNodeName = schema_tree_node_locator.get_key_name(); - } else { - actualNodeName = fmt::format( - "{}.{}", - autoGenNodeIdNameMap[parentNodeId], - schema_tree_node_locator.get_key_name()); - } - autoGenNodeIdNameMap[selfNodeId] = actualNodeName; - autoGenNodeNameIdMap[actualNodeName] = selfNodeId; - } else { - std::string actualNodeName; - if (schema_tree->get_node(parentNodeId).is_root()) { - actualNodeName = schema_tree_node_locator.get_key_name(); - } else { - actualNodeName = fmt::format( - "{}.{}", - userGenNodeIdNameMap[parentNodeId], - schema_tree_node_locator.get_key_name()); - } - userGenNodeIdNameMap[selfNodeId] = actualNodeName; - userGenNodeNameIdMap[actualNodeName] = selfNodeId; - } - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; -} - -::clp::ffi::SchemaTree::Node::id_t ClpVeloxIrUnitHandler::findNodeIdByName( - std::string_view name) const { - auto name_str = std::string(name); - if 
(0 != autoGenNodeNameIdMap.count(name_str)) { - return autoGenNodeNameIdMap.at(name_str); - } - if (0 != userGenNodeNameIdMap.count(name_str)) { - return userGenNodeNameIdMap.at(name_str); - } - VELOX_USER_FAIL(fmt::format("No field named: {}", name_str)); -} - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/tests/examples/example2.clps b/velox/connectors/clp/tests/examples/example2.clps index bfac0b001ae4b9a4cb7c1d609b390b65086e42ce..90526176069a89fa528d05a49a6a01a2ef687aa4 100644 GIT binary patch delta 67 zcmZo=YGs-*pOInW0ufHeR3=9jPd_*RiF@U=m>ro*GgC?um>roi3sM Date: Wed, 3 Sep 2025 18:13:43 +0000 Subject: [PATCH 10/34] Put more archive specific thing into Archive code --- velox/connectors/clp/ClpDataSource.cpp | 10 ++-- .../connectors/clp/search_lib/BaseClpCursor.h | 13 ++--- .../search_lib/archive/ClpArchiveCursor.cpp | 48 ++++++++----------- .../clp/search_lib/archive/ClpArchiveCursor.h | 18 ++++--- .../archive/ClpArchiveVectorLoader.cpp | 8 ++-- .../archive/ClpArchiveVectorLoader.h | 2 +- 6 files changed, 40 insertions(+), 59 deletions(-) diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 87d3d9b71727..7ee225be50b0 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -133,16 +133,14 @@ void ClpDataSource::addSplit(std::shared_ptr split) { std::optional ClpDataSource::next( uint64_t size, ContinueFuture& future) { - auto filteredRows = std::make_shared>(); - auto rowsScanned = cursor_->fetchNext(size, filteredRows); - auto rowsFiltered = filteredRows->size(); + auto rowsScanned = cursor_->fetchNext(size); + auto rowsFiltered = cursor_->getNumFilteredRows(); if (rowsFiltered == 0) { return nullptr; } completedRows_ += rowsScanned; - size_t readerIndex = 0; - return std::dynamic_pointer_cast(cursor_->createVector( - pool_, outputType_, rowsFiltered, filteredRows, readerIndex)); + return std::dynamic_pointer_cast( + 
cursor_->createVector(pool_, outputType_, rowsFiltered)); } } // namespace facebook::velox::connector::clp diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index 0929a4196fe1..b4f40cc5340a 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -89,11 +89,8 @@ class BaseClpCursor { /// loaded, this function will perform the necessary loading. /// /// @param numRows The maximum number of rows to fetch. - /// @param filteredRowIndices A vector of row indices that match the filter. /// @return The number of rows scanned. - virtual uint64_t fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) = 0; + virtual uint64_t fetchNext(uint64_t numRows) = 0; /// Creates a Vector of the specified type and size. /// @@ -104,15 +101,13 @@ class BaseClpCursor { /// @param pool The memory pool used by ClpDataSource to create the vector /// @param vectorType /// @param vectorSize - /// @param filteredRows The rows to be read. - /// @param readerIndex The index of the column reader. /// @return A Vector of the specified type and size. 
virtual VectorPtr createVector( memory::MemoryPool* pool, const TypePtr& vectorType, - size_t vectorSize, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) = 0; + size_t vectorSize) = 0; + + virtual size_t getNumFilteredRows() = 0; protected: /// diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 99c6bccd2083..e84aeed890b6 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -43,9 +43,10 @@ ClpArchiveCursor::~ClpArchiveCursor() { } } -uint64_t ClpArchiveCursor::fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) { +uint64_t ClpArchiveCursor::fetchNext(uint64_t numRows) { + filteredRowIndices_->clear(); + readerIndex_ = 0; + if (ErrorCode::Success != errorCode_) { return 0; } @@ -76,8 +77,8 @@ uint64_t ClpArchiveCursor::fetchNext( currentSchemaTableLoaded_ = true; } - auto rowsScanned = queryRunner_->fetchNext(numRows, filteredRowIndices); - if (false == filteredRowIndices->empty()) { + auto rowsScanned = queryRunner_->fetchNext(numRows, filteredRowIndices_); + if (false == filteredRowIndices_->empty()) { return rowsScanned; } @@ -88,12 +89,14 @@ uint64_t ClpArchiveCursor::fetchNext( return 0; } +size_t ClpArchiveCursor::getNumFilteredRows() { + return filteredRowIndices_->size(); +} + VectorPtr ClpArchiveCursor::createVector( memory::MemoryPool* pool, const TypePtr& vectorType, - size_t vectorSize, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) { + size_t vectorSize) { auto projectedColumns = getProjectedColumns(); VELOX_CHECK_EQ( projectedColumns.size(), @@ -101,33 +104,20 @@ VectorPtr ClpArchiveCursor::createVector( "Projected columns size {} does not match fields size {}", projectedColumns.size(), outputColumns_.size()); - return createVectorHelper( - pool, - vectorType, - vectorSize, - projectedColumns, - filteredRows, - 
readerIndex); + return createVectorHelper(pool, vectorType, vectorSize, projectedColumns); } VectorPtr ClpArchiveCursor::createVectorHelper( memory::MemoryPool* pool, const TypePtr& vectorType, size_t vectorSize, - const std::vector& projectedColumns, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) { + const std::vector& projectedColumns) { if (vectorType->kind() == TypeKind::ROW) { std::vector children; auto& rowType = vectorType->as(); for (uint32_t i = 0; i < rowType.size(); ++i) { children.push_back(createVectorHelper( - pool, - rowType.childAt(i), - vectorSize, - projectedColumns, - filteredRows, - readerIndex)); + pool, rowType.childAt(i), vectorSize, projectedColumns)); } return std::make_shared( pool, vectorType, nullptr, vectorSize, std::move(children)); @@ -136,16 +126,16 @@ VectorPtr ClpArchiveCursor::createVectorHelper( vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); VELOX_CHECK_LT( - readerIndex, projectedColumns.size(), "Reader index out of bounds"); - auto projectedColumn = projectedColumns[readerIndex]; - auto projectedType = outputColumns_[readerIndex].type; - readerIndex++; + readerIndex_, projectedColumns.size(), "Reader index out of bounds"); + auto projectedColumn = projectedColumns[readerIndex_]; + auto projectedType = outputColumns_[readerIndex_].type; + readerIndex_++; return std::make_shared( pool, vectorType, vectorSize, std::make_unique( - projectedColumn, projectedType, filteredRows), + projectedColumn, projectedType, filteredRowIndices_), std::move(vector)); } diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index 942520281965..2e098e89ef37 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -40,22 +40,22 @@ class ClpArchiveCursor final : public BaseClpCursor { std::string_view splitPath); ~ClpArchiveCursor() override; - uint64_t 
fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) - override; + uint64_t fetchNext(uint64_t numRows) override; VectorPtr createVector( memory::MemoryPool* pool, const TypePtr& vectorType, - size_t vectorSize, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) override; + size_t vectorSize) override; + + size_t getNumFilteredRows() override; protected: ErrorCode loadSplit() override; private: + size_t readerIndex_{0}; + std::shared_ptr> filteredRowIndices_ = + std::make_shared>(); std::vector matchedSchemas_; size_t currentSchemaIndex_{0}; int32_t currentSchemaId_{-1}; @@ -73,9 +73,7 @@ class ClpArchiveCursor final : public BaseClpCursor { memory::MemoryPool* pool, const TypePtr& vectorType, size_t vectorSize, - const std::vector& projectedColumns, - const std::shared_ptr>& filteredRows, - size_t& readerIndex); + const std::vector& projectedColumns); }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 29b5bbf50f8e..1ea6e6e8f695 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -125,7 +125,7 @@ auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { ClpArchiveVectorLoader::ClpArchiveVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices) + const std::shared_ptr> filteredRowIndices) : columnReader_(columnReader), nodeType_(nodeType), filteredRowIndices_(std::move(filteredRowIndices)) {} @@ -140,7 +140,7 @@ void ClpArchiveVectorLoader::populateData(RowSet rows, VectorPtr vector) { } for (int vectorIndex : rows) { - auto messageIndex = (*filteredRowIndices_)[vectorIndex]; + auto messageIndex = filteredRowIndices_->at(vectorIndex); if constexpr (std::is_same_v) { auto string_value = @@ -177,7 +177,7 @@ 
void ClpArchiveVectorLoader::populateTimestampData( } for (int vectorIndex : rows) { - auto messageIndex = (*filteredRowIndices_)[vectorIndex]; + auto messageIndex = filteredRowIndices_->at(vectorIndex); if (clp_s::NodeType::Float == Type) { auto reader = static_cast(columnReader_); @@ -241,7 +241,7 @@ void ClpArchiveVectorLoader::loadInternal( vector_size_t elementIndex = 0; for (int vectorIndex : rows) { - auto messageIndex = (*filteredRowIndices_)[vectorIndex]; + auto messageIndex = filteredRowIndices_->at(vectorIndex); auto jsonString = std::get(columnReader_->extract_value(messageIndex)); diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h index 35ad6c5ae700..24992b2402e4 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h @@ -41,7 +41,7 @@ class ClpArchiveVectorLoader : public VectorLoader { ClpArchiveVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, - std::shared_ptr> filteredRowIndices); + const std::shared_ptr> filteredRowIndices); private: void loadInternal( From d6ba0da4d7a81ad53ffbf260953a54a1698b763d Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 3 Sep 2025 20:21:36 +0000 Subject: [PATCH 11/34] WIP --- .../clp/search_lib/ir/CMakeLists.txt | 4 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 43 +++++----------- .../clp/search_lib/ir/ClpIrCursor.h | 13 +++-- .../clp/search_lib/ir/ClpIrQueryHandler.cpp | 17 ------- .../clp/search_lib/ir/ClpIrQueryHandler.h | 13 +---- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 2 + .../clp/search_lib/ir/ClpIrUnitHandler.h | 16 +++++- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 40 +++++++++++++++ .../clp/search_lib/ir/ClpIrVectorLoader.h | 51 +++++++++++++++++++ 9 files changed, 131 insertions(+), 68 deletions(-) create mode 100644 velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp create mode 100644 
velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 33837d19fd7b..bb8a9c2f5b3b 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -19,7 +19,9 @@ velox_add_library( ClpIrQueryHandler.cpp ClpIrQueryHandler.h ClpIrUnitHandler.cpp - ClpIrUnitHandler.h) + ClpIrUnitHandler.h + ClpIrVectorLoader.cpp + ClpIrVectorLoader.h) velox_link_libraries( clp-s-ir-search diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index fbad906494f4..b336a55e1e34 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -24,9 +24,7 @@ using namespace clp_s; namespace facebook::velox::connector::clp::search_lib { -uint64_t ClpIrCursor::fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) { +uint64_t ClpIrCursor::fetchNext(uint64_t numRows) { if (ErrorCode::Success != errorCode_) { return 0; } @@ -38,39 +36,22 @@ uint64_t ClpIrCursor::fetchNext( } } - size_t rowsFetched{0ULL}; - deserialize(); - // while (rowsFetched < numRows && kvir_deserializer_.has_value()) { - // auto const - // result{kvir_deserializer_.value().deserialize_next_ir_unit(kvir_decompressor_)}; - // if (result.has_error() && std::errc::no_message != result.error()) { - // if (ErrorCode::Success != load_next_kvir_stream()) { - // return rowsFetched; - // } - // continue; - // } - // if (result.value() == ::clp::ffi::ir_stream::IrUnitType::EndOfStream) { - // if (ErrorCode::Success != load_next_kvir_stream()) { - // return rowsFetched; - // } - // continue; - // } - // if (result.value() == ::clp::ffi::ir_stream::IrUnitType::LogEvent) { - // auto const& ir_unit_handler = - // kvir_deserializer_.value().get_ir_unit_handler(); - // marshal_row(rowsFetched, column_vectors, 
ir_unit_handler); - // ++rowsFetched; - // } - // } - return rowsFetched; + auto deserializeResult = deserialize(); + if (ystdlib::error_handling::success() != deserializeResult) { + VELOX_FAIL( + "IR file {} might be broken, failed to deserialize", this->splitPath_); + } + return irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()->size(); +} + +size_t ClpIrCursor::getNumFilteredRows() { + return irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()->size(); } VectorPtr ClpIrCursor::createVector( memory::MemoryPool* pool, const TypePtr& vectorType, - size_t vectorSize, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) { + size_t vectorSize) { return nullptr; } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index 32f3612dc197..c95610eb7612 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -32,17 +32,16 @@ class ClpIrCursor final : public BaseClpCursor { bool ignoreCase) : BaseClpCursor(inputSource, splitPath), ignoreCase_(ignoreCase) {} - uint64_t fetchNext( - uint64_t numRows, - const std::shared_ptr>& filteredRowIndices) - override; + // TODO: Need to expose an API in CLP to get the internal counter of scanned + // log events. Currently this returns the same number of filtered rows. 
+ uint64_t fetchNext(uint64_t numRows) override; + + size_t getNumFilteredRows() override; VectorPtr createVector( memory::MemoryPool* pool, const TypePtr& vectorType, - size_t vectorSize, - const std::shared_ptr>& filteredRows, - size_t& readerIndex) override; + size_t vectorSize) override; protected: ErrorCode loadSplit() override; diff --git a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp index 3959cd8df001..86285cdef9dd 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp @@ -18,21 +18,4 @@ namespace facebook::velox::connector::clp::search_lib { -void ClpIrQueryHandler::addOrderedResolvedId( - size_t idxOfProjectedColumn, - ::clp::ffi::SchemaTree::Node::id_t nodeId, - bool isAutoGenerated) { - orderedResolvedIds_.at(idxOfProjectedColumn) - .emplace_back(std::make_pair(nodeId, isAutoGenerated)); -} - -std::optional ClpIrQueryHandler::findProjectedColumnIdxByKeyName( - std::string_view keyName) { - auto it = projectedColumnToIdx_.find(std::string(keyName)); - if (projectedColumnToIdx_.end() != it) { - return std::nullopt; - } - return it->second; -} - } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h index 00fe727a8632..8c36b8e3d2c3 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h @@ -34,17 +34,8 @@ using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< class ClpIrQueryHandler { private: - std::map projectedColumnToIdx_; - std::vector>> - orderedResolvedIds_; - - std::optional findProjectedColumnIdxByKeyName( - std::string_view keyName); - - void addOrderedResolvedId( - size_t idxOfProjectedColumn, - ::clp::ffi::SchemaTree::Node::id_t nodeId, - bool isAutoGenerated); + std::unordered_map + 
projectedColumnNameNodeIdMap_; }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp index f6969492b492..61f056f20bfd 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp @@ -24,6 +24,8 @@ namespace facebook::velox::connector::clp::search_lib { auto ClpIrUnitHandler::handle_log_event( ::clp::ffi::KeyValuePairLogEvent log_event) -> ::clp::ffi::ir_stream::IRErrorCode { + filteredLogEvents_->push_back( + std::make_unique<::clp::ffi::KeyValuePairLogEvent>(std::move(log_event))); return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index 9ebb740dd269..5058236ca9f0 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -22,7 +22,10 @@ namespace facebook::velox::connector::clp::search_lib { class ClpIrUnitHandler { public: - ClpIrUnitHandler() {} + ClpIrUnitHandler() { + filteredLogEvents_ = std::make_shared< + std::vector>>(); + } // Destructor ~ClpIrUnitHandler() = default; @@ -50,6 +53,17 @@ class ClpIrUnitHandler { -> ::clp::ffi::ir_stream::IRErrorCode { return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } + + std::shared_ptr< + const std::vector>> + getFilteredLogEvents() { + return filteredLogEvents_; + } + + private: + std::shared_ptr< + std::vector>> + filteredLogEvents_; }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp new file mode 100644 index 000000000000..780d9b31c751 --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Facebook, 
Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" + +namespace facebook::velox::connector::clp::search_lib { + +void ClpIrVectorLoader::loadInternal( + RowSet rows, + ValueHook* hook, + vector_size_t resultSize, + VectorPtr* result) { + auto vector = *result; + for (int vectorIndex : rows) { + filteredLogEvents_->at(vectorIndex)-> + } + switch (nodeType_) { + case ColumnType::Integer: { + auto intVector = vector->asFlatVector(); + } + case ColumnType::Float: { + } + } +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h new file mode 100644 index 000000000000..c659a9648a5e --- /dev/null +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "ffi/ir_stream/Deserializer.hpp" + +#include "velox/type/Timestamp.h" +#include "velox/vector/FlatVector.h" +#include "velox/vector/LazyVector.h" + +namespace facebook::velox::connector::clp::search_lib { + +enum class ColumnType; + +class ClpIrVectorLoader : public VectorLoader { + public: + ClpIrVectorLoader( + ::clp::ffi::SchemaTree::Node::id_t nodeId, + const std::shared_ptr< + std::vector>> + filteredLogEvents) + : nodeId_(nodeId), filteredLogEvents_(filteredLogEvents) {} + + private: + void loadInternal( + RowSet rows, + ValueHook* hook, + vector_size_t resultSize, + VectorPtr* result) override; + + ::clp::ffi::SchemaTree::Node::id_t nodeId_; + std::shared_ptr< + std::vector>> + filteredLogEvents_; +}; + +} // namespace facebook::velox::connector::clp::search_lib From 221bec690ccc7730a4ff7203e41238b3b83e4997 Mon Sep 17 00:00:00 2001 From: anlowee Date: Wed, 3 Sep 2025 23:10:37 +0000 Subject: [PATCH 12/34] WIP --- CMake/resolve_dependency_modules/clp.cmake | 2 +- .../clp/search_lib/ir/CMakeLists.txt | 2 - .../clp/search_lib/ir/ClpIrCursor.cpp | 74 +++++++++--- .../clp/search_lib/ir/ClpIrCursor.h | 28 ++++- .../clp/search_lib/ir/ClpIrQueryHandler.cpp | 21 ---- .../clp/search_lib/ir/ClpIrQueryHandler.h | 41 ------- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 105 +++++++++++++++++- .../clp/search_lib/ir/ClpIrVectorLoader.h | 16 ++- 8 files changed, 196 insertions(+), 93 deletions(-) delete mode 100644 velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp delete mode 100644 velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h diff --git a/CMake/resolve_dependency_modules/clp.cmake b/CMake/resolve_dependency_modules/clp.cmake index 5cb4c731de29..998044773bed 100644 --- a/CMake/resolve_dependency_modules/clp.cmake +++ b/CMake/resolve_dependency_modules/clp.cmake @@ -16,7 +16,7 @@ include_guard(GLOBAL) 
FetchContent_Declare( clp GIT_REPOSITORY https://github.com/y-scope/clp.git - GIT_TAG 581bd46198a97a89b174849851cc3f1f2100e466) + GIT_TAG 0de99a9e8485ca3dc48710ad2cae31bffe20cd62) set(CLP_BUILD_CLP_REGEX_UTILS OFF diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index bb8a9c2f5b3b..480d9346dcbf 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -16,8 +16,6 @@ velox_add_library( STATIC ClpIrCursor.cpp ClpIrCursor.h - ClpIrQueryHandler.cpp - ClpIrQueryHandler.h ClpIrUnitHandler.cpp ClpIrUnitHandler.h ClpIrVectorLoader.cpp diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index b336a55e1e34..0a0543797159 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -16,6 +16,7 @@ #include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" #include "ffi/ir_stream/search/QueryHandler.hpp" +#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" #include "clp_s/ColumnReader.hpp" #include "clp_s/InputConfig.hpp" @@ -25,6 +26,7 @@ using namespace clp_s; namespace facebook::velox::connector::clp::search_lib { uint64_t ClpIrCursor::fetchNext(uint64_t numRows) { + readerIndex_ = 0; if (ErrorCode::Success != errorCode_) { return 0; } @@ -36,7 +38,7 @@ uint64_t ClpIrCursor::fetchNext(uint64_t numRows) { } } - auto deserializeResult = deserialize(); + auto deserializeResult = deserialize(numRows); if (ystdlib::error_handling::success() != deserializeResult) { VELOX_FAIL( "IR file {} might be broken, failed to deserialize", this->splitPath_); @@ -52,13 +54,59 @@ VectorPtr ClpIrCursor::createVector( memory::MemoryPool* pool, const TypePtr& vectorType, size_t vectorSize) { - return nullptr; + VELOX_CHECK_EQ( + projectedColumnNameNodeIdMap_.size(), + outputColumns_.size(), + "Projected columns size {} does not 
match fields size {}", + projectedColumnNameNodeIdMap_.size(), + outputColumns_.size()); + return createVectorHelper(pool, vectorType, vectorSize); } -ystdlib::error_handling::Result ClpIrCursor::deserialize() const { - while (::clp::ffi::ir_stream::IrUnitType::EndOfStream != - YSTDLIB_ERROR_HANDLING_TRYX( - irDeserializer_->deserialize_next_ir_unit(*irReader_))) { +VectorPtr ClpIrCursor::createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize) { + if (vectorType->kind() == TypeKind::ROW) { + std::vector children; + auto& rowType = vectorType->as(); + for (uint32_t i = 0; i < rowType.size(); ++i) { + children.push_back( + createVectorHelper(pool, rowType.childAt(i), vectorSize)); + } + return std::make_shared( + pool, vectorType, nullptr, vectorSize, std::move(children)); + } + auto vector = BaseVector::create(vectorType, vectorSize, pool); + vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); + VELOX_CHECK_LT( + readerIndex_, + projectedColumnNameNodeIdMap_.size(), + "Reader index out of bounds"); + auto projectedColumn = outputColumns_[readerIndex_]; + auto projectedColumnType = projectedColumn.type; + auto projectedColumnNodeId = + projectedColumnNameNodeIdMap_.at(projectedColumn.name); + readerIndex_++; + return std::make_shared( + pool, + vectorType, + vectorSize, + std::make_unique( + projectedColumnType, + projectedColumnNodeId, + irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()), + std::move(vector)); +} + +ystdlib::error_handling::Result ClpIrCursor::deserialize( + uint64_t numRows) const { + uint64_t cnt{0}; + while (cnt < numRows && + ::clp::ffi::ir_stream::IrUnitType::EndOfStream != + YSTDLIB_ERROR_HANDLING_TRYX( + irDeserializer_->deserialize_next_ir_unit(*irReader_))) { + cnt++; } return ystdlib::error_handling::success(); } @@ -71,11 +119,8 @@ ErrorCode ClpIrCursor::loadSplit() { auto irHandler{ClpIrUnitHandler{}}; auto projections = splitFieldsToNamesAndTypes(); - auto 
queryHandlerResult{ir::QueryHandlerType::create( - ir::handleProjectionResolution, - std::move(expr_), - projections, - ignoreCase_)}; + auto queryHandlerResult{QueryHandlerType::create( + handleProjectionResolution, std::move(expr_), projections, ignoreCase_)}; if (!queryHandlerResult) { VLOG(2) << "Failed to create query handler for deserialization."; return ErrorCode::InternalError; @@ -96,10 +141,11 @@ ErrorCode ClpIrCursor::loadSplit() { VLOG(2) << "Failed to create deserializer for deserialization."; return ErrorCode::InternalError; } - irDeserializer_ = std::make_shared<::clp::ffi::ir_stream::Deserializer< - ClpIrUnitHandler, - ir::QueryHandlerType>>(std::move(deserializerResult).value()); + irDeserializer_ = std::make_shared< + ::clp::ffi::ir_stream::Deserializer>( + std::move(deserializerResult).value()); + currentSplitLoaded_ = true; return ErrorCode::Success; } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index c95610eb7612..0678123dfd27 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -19,7 +19,6 @@ #include "ffi/ir_stream/Deserializer.hpp" #include "streaming_compression/Decompressor.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" -#include "velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h" #include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" namespace facebook::velox::connector::clp::search_lib { @@ -49,15 +48,36 @@ class ClpIrCursor final : public BaseClpCursor { private: std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; bool ignoreCase_; - std::shared_ptr<::clp::ffi::ir_stream:: - Deserializer> + std::function(bool, ::clp::ffi::SchemaTree::Node::id_t, std::string_view)> + handleProjectionResolution = + [this]( + [[maybe_unused]] bool isAutoGenerated, + [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, + [[maybe_unused]] std::string_view keyName) + -> 
ystdlib::error_handling::Result { + projectedColumnNameNodeIdMap_.insert({std::string(keyName), nodeId}); + return ystdlib::error_handling::success(); + }; + using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< + decltype(handleProjectionResolution)>; + std::shared_ptr< + ::clp::ffi::ir_stream::Deserializer> irDeserializer_; - ystdlib::error_handling::Result deserialize() const; + ystdlib::error_handling::Result deserialize(uint64_t numRows) const; + size_t readerIndex_{0}; + std::unordered_map + projectedColumnNameNodeIdMap_; std::vector< std::pair> splitFieldsToNamesAndTypes() const; + + VectorPtr createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize); }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp deleted file mode 100644 index 86285cdef9dd..000000000000 --- a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h" - -namespace facebook::velox::connector::clp::search_lib { - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h deleted file mode 100644 index 8c36b8e3d2c3..000000000000 --- a/velox/connectors/clp/search_lib/ir/ClpIrQueryHandler.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "ffi/ir_stream/Deserializer.hpp" - -namespace facebook::velox::connector::clp::search_lib { - -namespace ir { -static std::function handleProjectionResolution = - []([[maybe_unused]] bool isAutoGenerated, - [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, - [[maybe_unused]] std::string_view keyName) - -> ystdlib::error_handling::Result { - return ystdlib::error_handling::success(); -}; -using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< - decltype(handleProjectionResolution)>; -} // namespace ir - -class ClpIrQueryHandler { - private: - std::unordered_map - projectedColumnNameNodeIdMap_; -}; - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index 780d9b31c751..50dd0a978691 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -26,13 +26,106 @@ void ClpIrVectorLoader::loadInternal( VectorPtr* result) { auto vector = *result; for (int vectorIndex : rows) { - filteredLogEvents_->at(vectorIndex)-> - } - switch (nodeType_) { - case ColumnType::Integer: { - auto intVector = vector->asFlatVector(); + auto& logEvent = filteredLogEvents_->at(vectorIndex); + // TODO: also need to support auto gen + auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); + vector->setNull(vectorIndex, true); + if (0 == userGenNodeIdValueMap.count(nodeId_)) { + continue; + } + auto value = userGenNodeIdValueMap.at(nodeId_); + if (!value.has_value()) { + continue; } - case ColumnType::Float: { + switch (nodeType_) { + case ColumnType::String: { + auto stringVector = vector->asFlatVector(); + if (value->is()) { + auto stringValue = value->get_immutable_view(); + stringVector->set(vectorIndex, StringView(stringValue)); + } else if (value->is<::clp::ir::EightByteEncodedTextAst>()) { + auto 
decodeResult = + value->get_immutable_view<::clp::ir::EightByteEncodedTextAst>() + .decode_and_unparse(); + if (!decodeResult.has_value()) { + continue; + } + stringVector->set(vectorIndex, StringView(decodeResult.value())); + } else if (value->is<::clp::ir::FourByteEncodedTextAst>()) { + auto decodeResult = + value->get_immutable_view<::clp::ir::FourByteEncodedTextAst>() + .decode_and_unparse(); + if (!decodeResult.has_value()) { + continue; + } + stringVector->set(vectorIndex, StringView(decodeResult.value())); + } else { + continue; + } + vector->setNull(vectorIndex, false); + break; + } + case ColumnType::Integer: { + auto intVector = vector->asFlatVector(); + intVector->set( + vectorIndex, value->get_immutable_view<::clp::ffi::value_int_t>()); + vector->setNull(vectorIndex, false); + break; + } + case ColumnType::Float: { + auto floatVector = vector->asFlatVector(); + floatVector->set( + vectorIndex, + value->get_immutable_view<::clp::ffi::value_float_t>()); + vector->setNull(vectorIndex, false); + break; + } + case ColumnType::Boolean: { + auto boolVector = vector->asFlatVector(); + boolVector->set( + vectorIndex, value->get_immutable_view<::clp::ffi::value_bool_t>()); + vector->setNull(vectorIndex, false); + break; + } + case ColumnType::Array: { + auto arrayVector = std::dynamic_pointer_cast(vector); + std::string jsonString; + if (value->is<::clp::ir::EightByteEncodedTextAst>()) { + auto decodeResult = + value->get_immutable_view<::clp::ir::EightByteEncodedTextAst>() + .decode_and_unparse(); + if (!decodeResult.has_value()) { + continue; + } + jsonString = std::move(decodeResult.value()); + } else { + auto decodeResult = + value->get_immutable_view<::clp::ir::FourByteEncodedTextAst>() + .decode_and_unparse(); + if (!decodeResult.has_value()) { + continue; + } + jsonString = std::move(decodeResult.value()); + } + + size_t numElements{0ULL}; + auto elements = arrayVector->elements()->asFlatVector(); + auto obj = arrayParser_->iterate(jsonString); + 
std::vector rawElements; + for (auto arrayElement : obj.get_array()) { + auto raw_element = simdjson::to_json_string(arrayElement).value(); + rawElements.emplace_back(raw_element); + } + elements->resize(rawElements.size()); + for (auto& raw_element : rawElements) { + elements->set(numElements++, StringView(raw_element)); + } + arrayVector->setOffsetAndSize(vectorIndex, 0ULL, numElements); + arrayVector->setNull(vectorIndex, false); + break; + } + default: + VELOX_FAIL("Unsupported column type"); } } } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index c659a9648a5e..21c0b9b22b84 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -16,6 +16,8 @@ #pragma once +#include +#include "connectors/clp/search_lib/BaseClpCursor.h" #include "ffi/ir_stream/Deserializer.hpp" #include "velox/type/Timestamp.h" @@ -29,11 +31,14 @@ enum class ColumnType; class ClpIrVectorLoader : public VectorLoader { public: ClpIrVectorLoader( + ColumnType nodeType, ::clp::ffi::SchemaTree::Node::id_t nodeId, - const std::shared_ptr< - std::vector>> + std::shared_ptr< + const std::vector>> filteredLogEvents) - : nodeId_(nodeId), filteredLogEvents_(filteredLogEvents) {} + : nodeType_(nodeType), + nodeId_(nodeId), + filteredLogEvents_(filteredLogEvents) {} private: void loadInternal( @@ -42,10 +47,13 @@ class ClpIrVectorLoader : public VectorLoader { vector_size_t resultSize, VectorPtr* result) override; + ColumnType nodeType_; ::clp::ffi::SchemaTree::Node::id_t nodeId_; std::shared_ptr< - std::vector>> + const std::vector>> filteredLogEvents_; + inline static thread_local std::unique_ptr + arrayParser_ = std::make_unique(); }; } // namespace facebook::velox::connector::clp::search_lib From f8f75c3acce2e34385aef65b8ad01dfd5da24b02 Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 4 Sep 2025 18:31:20 +0000 Subject: [PATCH 13/34] Get it work --- 
.../clp/search_lib/ir/ClpIrCursor.cpp | 24 ++++++++++++++----- .../clp/search_lib/ir/ClpIrCursor.h | 13 ++++++---- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 4 ++-- .../clp/search_lib/ir/ClpIrUnitHandler.h | 16 ++++++++++--- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 0a0543797159..4bc35e91061c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -39,9 +39,13 @@ uint64_t ClpIrCursor::fetchNext(uint64_t numRows) { } auto deserializeResult = deserialize(numRows); - if (ystdlib::error_handling::success() != deserializeResult) { + if (deserializeResult.has_error()) { + auto error = deserializeResult.error(); VELOX_FAIL( - "IR file {} might be broken, failed to deserialize", this->splitPath_); + "IR file {} might be broken, failed to deserialize. {}: {}", + this->splitPath_, + error.category().name(), + error.message()); } return irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()->size(); } @@ -101,11 +105,19 @@ VectorPtr ClpIrCursor::createVectorHelper( ystdlib::error_handling::Result ClpIrCursor::deserialize( uint64_t numRows) const { + irDeserializer_->get_ir_unit_handler().clearFilteredLogEvents(); uint64_t cnt{0}; - while (cnt < numRows && - ::clp::ffi::ir_stream::IrUnitType::EndOfStream != - YSTDLIB_ERROR_HANDLING_TRYX( - irDeserializer_->deserialize_next_ir_unit(*irReader_))) { + while (cnt < numRows) { + auto deserializeResult = + irDeserializer_->deserialize_next_ir_unit(*irReader_); + if (deserializeResult.has_error()) { + auto error = deserializeResult.error(); + if (std::errc::result_out_of_range == error || + irDeserializer_->get_ir_unit_handler().isEndOfStream()) { + break; + } + return error; + } cnt++; } return ystdlib::error_handling::success(); diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h 
b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index 0678123dfd27..85c9dda610c0 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -17,7 +17,6 @@ #pragma once #include "ffi/ir_stream/Deserializer.hpp" -#include "streaming_compression/Decompressor.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" @@ -48,15 +47,19 @@ class ClpIrCursor final : public BaseClpCursor { private: std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; bool ignoreCase_; - std::function(bool, ::clp::ffi::SchemaTree::Node::id_t, std::string_view)> + std::function( + bool, + ::clp::ffi::SchemaTree::Node::id_t, + std::pair)> handleProjectionResolution = [this]( [[maybe_unused]] bool isAutoGenerated, [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, - [[maybe_unused]] std::string_view keyName) + [[maybe_unused]] std::pair + projected_key_and_index) -> ystdlib::error_handling::Result { - projectedColumnNameNodeIdMap_.insert({std::string(keyName), nodeId}); + projectedColumnNameNodeIdMap_.insert( + {std::string(projected_key_and_index.first), nodeId}); return ystdlib::error_handling::success(); }; using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp index 61f056f20bfd..bb8bea72996c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp @@ -22,8 +22,8 @@ namespace facebook::velox::connector::clp::search_lib { auto ClpIrUnitHandler::handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event) - -> ::clp::ffi::ir_stream::IRErrorCode { + ::clp::ffi::KeyValuePairLogEvent log_event, + size_t log_event_idx) -> ::clp::ffi::ir_stream::IRErrorCode { filteredLogEvents_->push_back( 
std::make_unique<::clp::ffi::KeyValuePairLogEvent>(std::move(log_event))); return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index 5058236ca9f0..c374f9d30168 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -32,8 +32,8 @@ class ClpIrUnitHandler { // Methods implementing `IrUnitHandlerInterface` [[nodiscard]] auto handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event) - -> ::clp::ffi::ir_stream::IRErrorCode; + ::clp::ffi::KeyValuePairLogEvent log_event, + size_t log_event_idx) -> ::clp::ffi::ir_stream::IRErrorCode; [[nodiscard]] auto handle_utc_offset_change( [[maybe_unused]] ::clp::UtcOffset utc_offset_old, @@ -51,19 +51,29 @@ class ClpIrUnitHandler { [[nodiscard]] auto handle_end_of_stream() -> ::clp::ffi::ir_stream::IRErrorCode { + endOfStream_ = true; return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } std::shared_ptr< const std::vector>> - getFilteredLogEvents() { + getFilteredLogEvents() const { return filteredLogEvents_; } + void clearFilteredLogEvents() { + filteredLogEvents_->clear(); + } + + bool isEndOfStream() { + return endOfStream_; + } + private: std::shared_ptr< std::vector>> filteredLogEvents_; + bool endOfStream_{false}; }; } // namespace facebook::velox::connector::clp::search_lib From 1c091c7ca801362f0ece3b0a4f2e7764a32dfea0 Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 4 Sep 2025 18:59:03 +0000 Subject: [PATCH 14/34] Code clean up --- CMake/resolve_dependency_modules/clp.cmake | 2 +- .../connectors/clp/search_lib/BaseClpCursor.h | 20 +++--- .../search_lib/archive/ClpArchiveCursor.cpp | 64 +++++++++---------- .../clp/search_lib/archive/ClpArchiveCursor.h | 20 +++--- .../archive/ClpArchiveVectorLoader.cpp | 2 +- .../archive/ClpArchiveVectorLoader.h | 14 ++-- 6 files changed, 61 insertions(+), 61 
deletions(-) diff --git a/CMake/resolve_dependency_modules/clp.cmake b/CMake/resolve_dependency_modules/clp.cmake index 93d57bf4da61..f2d3e5b86b59 100644 --- a/CMake/resolve_dependency_modules/clp.cmake +++ b/CMake/resolve_dependency_modules/clp.cmake @@ -16,7 +16,7 @@ include_guard(GLOBAL) FetchContent_Declare( clp GIT_REPOSITORY https://github.com/y-scope/clp.git - GIT_TAG 19cd534e629d746395efc64343a60f768b0c9a2d) + GIT_TAG 0de99a9e8485ca3dc48710ad2cae31bffe20cd62) set(CLP_BUILD_CLP_REGEX_UTILS OFF diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index b4f40cc5340a..a3f58bec2e5a 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -92,6 +92,12 @@ class BaseClpCursor { /// @return The number of rows scanned. virtual uint64_t fetchNext(uint64_t numRows) = 0; + /// Gets the count of rows that satisfy the query (used to size the result + /// vector). + /// + /// @return Count of rows matching the query. + virtual size_t getNumFilteredRows() = 0; + /// Creates a Vector of the specified type and size. /// /// This method recursively creates vectors for complex types like ROW. For @@ -107,23 +113,19 @@ class BaseClpCursor { const TypePtr& vectorType, size_t vectorSize) = 0; - virtual size_t getNumFilteredRows() = 0; - protected: + /// Loads the split from archive or IR stream. /// /// @return The error code. virtual ErrorCode loadSplit() = 0; + bool currentSplitLoaded_{false}; ErrorCode errorCode_; - + std::shared_ptr expr_; clp_s::InputSource inputSource_{clp_s::InputSource::Filesystem}; - std::string splitPath_; - std::string query_; std::vector outputColumns_; - - bool currentSplitLoaded_{false}; - - std::shared_ptr expr_; + std::string query_; + std::string splitPath_; private: /// Preprocesses the query, performing parsing, validation, and optimization. 
diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 9cca7c053cde..9210eda825b6 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -107,38 +107,6 @@ VectorPtr ClpArchiveCursor::createVector( return createVectorHelper(pool, vectorType, vectorSize, projectedColumns); } -VectorPtr ClpArchiveCursor::createVectorHelper( - memory::MemoryPool* pool, - const TypePtr& vectorType, - size_t vectorSize, - const std::vector& projectedColumns) { - if (vectorType->kind() == TypeKind::ROW) { - std::vector children; - auto& rowType = vectorType->as(); - for (uint32_t i = 0; i < rowType.size(); ++i) { - children.push_back(createVectorHelper( - pool, rowType.childAt(i), vectorSize, projectedColumns)); - } - return std::make_shared( - pool, vectorType, nullptr, vectorSize, std::move(children)); - } - auto vector = BaseVector::create(vectorType, vectorSize, pool); - vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); - - VELOX_CHECK_LT( - readerIndex_, projectedColumns.size(), "Reader index out of bounds"); - auto projectedColumn = projectedColumns[readerIndex_]; - auto projectedType = outputColumns_[readerIndex_].type; - readerIndex_++; - return std::make_shared( - pool, - vectorType, - vectorSize, - std::make_unique( - projectedColumn, projectedType, filteredRowIndices_), - std::move(vector)); -} - const std::vector& ClpArchiveCursor::getProjectedColumns() const { if (queryRunner_) { @@ -263,4 +231,36 @@ ErrorCode ClpArchiveCursor::loadSplit() { return ErrorCode::Success; } +VectorPtr ClpArchiveCursor::createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize, + const std::vector& projectedColumns) { + if (vectorType->kind() == TypeKind::ROW) { + std::vector children; + auto& rowType = vectorType->as(); + for (uint32_t i = 0; i < rowType.size(); ++i) { + 
children.push_back(createVectorHelper( + pool, rowType.childAt(i), vectorSize, projectedColumns)); + } + return std::make_shared( + pool, vectorType, nullptr, vectorSize, std::move(children)); + } + auto vector = BaseVector::create(vectorType, vectorSize, pool); + vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); + + VELOX_CHECK_LT( + readerIndex_, projectedColumns.size(), "Reader index out of bounds"); + auto projectedColumn = projectedColumns[readerIndex_]; + auto projectedType = outputColumns_[readerIndex_].type; + readerIndex_++; + return std::make_shared( + pool, + vectorType, + vectorSize, + std::make_unique( + projectedColumn, projectedType, filteredRowIndices_), + std::move(vector)); +} + } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index 2e098e89ef37..ca34837cb291 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -42,30 +42,28 @@ class ClpArchiveCursor final : public BaseClpCursor { uint64_t fetchNext(uint64_t numRows) override; + size_t getNumFilteredRows() override; + VectorPtr createVector( memory::MemoryPool* pool, const TypePtr& vectorType, size_t vectorSize) override; - size_t getNumFilteredRows() override; - protected: ErrorCode loadSplit() override; private: - size_t readerIndex_{0}; + std::shared_ptr archiveReader_; + int32_t currentSchemaId_{-1}; + size_t currentSchemaIndex_{0}; + bool currentSchemaTableLoaded_{false}; std::shared_ptr> filteredRowIndices_ = std::make_shared>(); std::vector matchedSchemas_; - size_t currentSchemaIndex_{0}; - int32_t currentSchemaId_{-1}; - bool currentSchemaTableLoaded_{false}; - - std::shared_ptr schemaMatch_; - std::shared_ptr queryRunner_; std::shared_ptr projection_; - - std::shared_ptr archiveReader_; + std::shared_ptr queryRunner_; + size_t readerIndex_{0}; + 
std::shared_ptr schemaMatch_; const std::vector& getProjectedColumns() const; diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 1ea6e6e8f695..9238674671a6 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -21,8 +21,8 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" -#include "ClpArchiveVectorLoader.h" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include "velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/type/Timestamp.h" #include "velox/vector/ComplexVector.h" #include "velox/vector/FlatVector.h" diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h index 24992b2402e4..171a2d7c8664 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h @@ -44,6 +44,13 @@ class ClpArchiveVectorLoader : public VectorLoader { const std::shared_ptr> filteredRowIndices); private: + inline static thread_local std::unique_ptr + arrayParser_ = std::make_unique(); + + clp_s::BaseColumnReader* columnReader_; + ColumnType nodeType_; + std::shared_ptr> filteredRowIndices_; + void loadInternal( RowSet rows, ValueHook* hook, @@ -57,13 +64,6 @@ class ClpArchiveVectorLoader : public VectorLoader { void populateTimestampData( RowSet rows, FlatVector* vector); - - clp_s::BaseColumnReader* columnReader_; - ColumnType nodeType_; - std::shared_ptr> filteredRowIndices_; - - inline static thread_local std::unique_ptr - arrayParser_ = std::make_unique(); }; } // namespace facebook::velox::connector::clp::search_lib From 7f252108b71f392df70783c434ce5a003000c541 Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 4 Sep 2025 19:47:49 +0000 Subject: [PATCH 
15/34] Address coderabbitai comments --- velox/connectors/clp/ClpDataSource.cpp | 1 - velox/connectors/clp/search_lib/BaseClpCursor.cpp | 1 + velox/connectors/clp/search_lib/BaseClpCursor.h | 4 ++-- .../connectors/clp/search_lib/archive/CMakeLists.txt | 11 ++++------- .../clp/search_lib/archive/ClpArchiveCursor.cpp | 2 +- .../clp/search_lib/archive/ClpArchiveCursor.h | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/velox/connectors/clp/ClpDataSource.cpp b/velox/connectors/clp/ClpDataSource.cpp index 7ee225be50b0..075704091117 100644 --- a/velox/connectors/clp/ClpDataSource.cpp +++ b/velox/connectors/clp/ClpDataSource.cpp @@ -16,7 +16,6 @@ #include -#include "search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/connectors/clp/ClpColumnHandle.h" #include "velox/connectors/clp/ClpConnectorSplit.h" #include "velox/connectors/clp/ClpDataSource.h" diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp b/velox/connectors/clp/search_lib/BaseClpCursor.cpp index b0c64bf464dd..2741fc582062 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.cpp +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -18,6 +18,7 @@ #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include #include "clp_s/search/ast/ConvertToExists.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/NarrowTypes.hpp" diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index a3f58bec2e5a..92940081f909 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -96,7 +96,7 @@ class BaseClpCursor { /// vector). /// /// @return Count of rows matching the query. - virtual size_t getNumFilteredRows() = 0; + virtual size_t getNumFilteredRows() const = 0; /// Creates a Vector of the specified type and size. 
/// @@ -122,7 +122,7 @@ class BaseClpCursor { bool currentSplitLoaded_{false}; ErrorCode errorCode_; std::shared_ptr expr_; - clp_s::InputSource inputSource_{clp_s::InputSource::Filesystem}; + clp_s::InputSource inputSource_; std::vector outputColumns_; std::string query_; std::string splitPath_; diff --git a/velox/connectors/clp/search_lib/archive/CMakeLists.txt b/velox/connectors/clp/search_lib/archive/CMakeLists.txt index 3f09cce6fd0a..9340e22ffbb7 100644 --- a/velox/connectors/clp/search_lib/archive/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/archive/CMakeLists.txt @@ -23,10 +23,7 @@ velox_add_library( velox_link_libraries( clp-s-archive-search - PUBLIC clp_s::archive_reader - PRIVATE - clp_s::clp_dependencies - clp_s::io - clp_s::search - clp_s::search::kql - velox_vector) + PUBLIC clp_s::archive_reader velox_vector + PRIVATE clp_s::clp_dependencies clp_s::io clp_s::search clp_s::search::kql) + +target_compile_features(clp-s-archive-search PRIVATE cxx_std_20) diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 9210eda825b6..615eaefeb2d9 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -89,7 +89,7 @@ uint64_t ClpArchiveCursor::fetchNext(uint64_t numRows) { return 0; } -size_t ClpArchiveCursor::getNumFilteredRows() { +size_t ClpArchiveCursor::getNumFilteredRows() const { return filteredRowIndices_->size(); } diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index ca34837cb291..6760495b1581 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -42,7 +42,7 @@ class ClpArchiveCursor final : public BaseClpCursor { uint64_t fetchNext(uint64_t numRows) override; - size_t getNumFilteredRows() override; + size_t 
getNumFilteredRows() const override; VectorPtr createVector( memory::MemoryPool* pool, From 595e74b524ff81d12f8e5c75439b217edcc08f1a Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 4 Sep 2025 21:41:30 +0000 Subject: [PATCH 16/34] Minor fix --- .../clp/search_lib/ir/ClpIrCursor.cpp | 144 +++++++++--------- .../clp/search_lib/ir/ClpIrCursor.h | 14 +- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 4 + .../clp/search_lib/ir/ClpIrUnitHandler.h | 10 +- .../clp/search_lib/ir/ClpIrVectorLoader.h | 15 +- 5 files changed, 92 insertions(+), 95 deletions(-) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 4bc35e91061c..98dcbbeee5ba 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" -#include "ffi/ir_stream/search/QueryHandler.hpp" -#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" - #include "clp_s/ColumnReader.hpp" #include "clp_s/InputConfig.hpp" +#include "ffi/ir_stream/search/QueryHandler.hpp" +#include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" + using namespace clp_s; namespace facebook::velox::connector::clp::search_lib { @@ -47,10 +47,10 @@ uint64_t ClpIrCursor::fetchNext(uint64_t numRows) { error.category().name(), error.message()); } - return irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()->size(); + return irDeserializer_->get_num_log_events_deserialized(); } -size_t ClpIrCursor::getNumFilteredRows() { +size_t ClpIrCursor::getNumFilteredRows() const { return irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()->size(); } @@ -67,62 +67,6 @@ VectorPtr ClpIrCursor::createVector( return createVectorHelper(pool, vectorType, vectorSize); } -VectorPtr ClpIrCursor::createVectorHelper( - memory::MemoryPool* 
pool, - const TypePtr& vectorType, - size_t vectorSize) { - if (vectorType->kind() == TypeKind::ROW) { - std::vector children; - auto& rowType = vectorType->as(); - for (uint32_t i = 0; i < rowType.size(); ++i) { - children.push_back( - createVectorHelper(pool, rowType.childAt(i), vectorSize)); - } - return std::make_shared( - pool, vectorType, nullptr, vectorSize, std::move(children)); - } - auto vector = BaseVector::create(vectorType, vectorSize, pool); - vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); - VELOX_CHECK_LT( - readerIndex_, - projectedColumnNameNodeIdMap_.size(), - "Reader index out of bounds"); - auto projectedColumn = outputColumns_[readerIndex_]; - auto projectedColumnType = projectedColumn.type; - auto projectedColumnNodeId = - projectedColumnNameNodeIdMap_.at(projectedColumn.name); - readerIndex_++; - return std::make_shared( - pool, - vectorType, - vectorSize, - std::make_unique( - projectedColumnType, - projectedColumnNodeId, - irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()), - std::move(vector)); -} - -ystdlib::error_handling::Result ClpIrCursor::deserialize( - uint64_t numRows) const { - irDeserializer_->get_ir_unit_handler().clearFilteredLogEvents(); - uint64_t cnt{0}; - while (cnt < numRows) { - auto deserializeResult = - irDeserializer_->deserialize_next_ir_unit(*irReader_); - if (deserializeResult.has_error()) { - auto error = deserializeResult.error(); - if (std::errc::result_out_of_range == error || - irDeserializer_->get_ir_unit_handler().isEndOfStream()) { - break; - } - return error; - } - cnt++; - } - return ystdlib::error_handling::success(); -} - ErrorCode ClpIrCursor::loadSplit() { auto networkAuthOption = inputSource_ == InputSource::Filesystem ? 
NetworkAuthOption{.method = AuthMethod::None} @@ -161,34 +105,34 @@ ErrorCode ClpIrCursor::loadSplit() { return ErrorCode::Success; } -std::vector> +std::vector> ClpIrCursor::splitFieldsToNamesAndTypes() const { auto result = std::vector< - std::pair>{}; + std::pair>{}; for (size_t i{0}; i < outputColumns_.size(); ++i) { auto column = outputColumns_[i]; - clp_s::search::ast::literal_type_bitmask_t literalType; + search::ast::literal_type_bitmask_t literalType; switch (column.type) { case ColumnType::Array: - literalType = clp_s::search::ast::LiteralType::ArrayT; + literalType = search::ast::LiteralType::ArrayT; break; case ColumnType::Boolean: - literalType = clp_s::search::ast::LiteralType::BooleanT; + literalType = search::ast::LiteralType::BooleanT; break; case ColumnType::Float: - literalType = clp_s::search::ast::LiteralType::FloatT; + literalType = search::ast::LiteralType::FloatT; break; case ColumnType::Integer: - literalType = clp_s::search::ast::LiteralType::IntegerT; + literalType = search::ast::LiteralType::IntegerT; break; case ColumnType::String: - literalType = clp_s::search::ast::LiteralType::VarStringT; + literalType = search::ast::LiteralType::VarStringT; break; case ColumnType::Timestamp: - literalType = clp_s::search::ast::LiteralType::EpochDateT; + literalType = search::ast::LiteralType::EpochDateT; break; default: - literalType = clp_s::search::ast::LiteralType::UnknownT; + literalType = search::ast::LiteralType::UnknownT; break; } result.emplace_back(column.name, literalType); @@ -196,4 +140,60 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { return result; } +ystdlib::error_handling::Result ClpIrCursor::deserialize( + uint64_t numRows) const { + irDeserializer_->get_ir_unit_handler().clearFilteredLogEvents(); + uint64_t cnt{0}; + while (cnt < numRows) { + auto deserializeResult = + irDeserializer_->deserialize_next_ir_unit(*irReader_); + if (deserializeResult.has_error()) { + auto error = deserializeResult.error(); + if 
(std::errc::result_out_of_range == error || + irDeserializer_->is_stream_completed()) { + break; + } + return error; + } + cnt++; + } + return ystdlib::error_handling::success(); +} + +VectorPtr ClpIrCursor::createVectorHelper( + memory::MemoryPool* pool, + const TypePtr& vectorType, + size_t vectorSize) { + if (vectorType->kind() == TypeKind::ROW) { + std::vector children; + auto& rowType = vectorType->as(); + for (uint32_t i = 0; i < rowType.size(); ++i) { + children.push_back( + createVectorHelper(pool, rowType.childAt(i), vectorSize)); + } + return std::make_shared( + pool, vectorType, nullptr, vectorSize, std::move(children)); + } + auto vector = BaseVector::create(vectorType, vectorSize, pool); + vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); + VELOX_CHECK_LT( + readerIndex_, + projectedColumnNameNodeIdMap_.size(), + "Reader index out of bounds"); + auto projectedColumn = outputColumns_[readerIndex_]; + auto projectedColumnType = projectedColumn.type; + auto projectedColumnNodeId = + projectedColumnNameNodeIdMap_.at(projectedColumn.name); + readerIndex_++; + return std::make_shared( + pool, + vectorType, + vectorSize, + std::make_unique( + projectedColumnType, + projectedColumnNodeId, + irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()), + std::move(vector)); +} + } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index 85c9dda610c0..7ef21123a025 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -34,7 +34,7 @@ class ClpIrCursor final : public BaseClpCursor { // log events. Currently this returns the same number of filtered rows. 
uint64_t fetchNext(uint64_t numRows) override; - size_t getNumFilteredRows() override; + size_t getNumFilteredRows() const override; VectorPtr createVector( memory::MemoryPool* pool, @@ -45,8 +45,6 @@ class ClpIrCursor final : public BaseClpCursor { ErrorCode loadSplit() override; private: - std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; - bool ignoreCase_; std::function( bool, ::clp::ffi::SchemaTree::Node::id_t, @@ -64,19 +62,21 @@ class ClpIrCursor final : public BaseClpCursor { }; using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< decltype(handleProjectionResolution)>; + bool ignoreCase_; std::shared_ptr< ::clp::ffi::ir_stream::Deserializer> irDeserializer_; - - ystdlib::error_handling::Result deserialize(uint64_t numRows) const; - - size_t readerIndex_{0}; + std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; std::unordered_map projectedColumnNameNodeIdMap_; + size_t readerIndex_{0}; + std::vector< std::pair> splitFieldsToNamesAndTypes() const; + ystdlib::error_handling::Result deserialize(uint64_t numRows) const; + VectorPtr createVectorHelper( memory::MemoryPool* pool, const TypePtr& vectorType, diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp index bb8bea72996c..4cab686a8891 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp @@ -37,4 +37,8 @@ auto ClpIrUnitHandler::handle_schema_tree_node_insertion( return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } +void ClpIrUnitHandler::clearFilteredLogEvents() { + filteredLogEvents_->clear(); +} + } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index c374f9d30168..b8d55aa94b0a 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h +++ 
b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -51,7 +51,6 @@ class ClpIrUnitHandler { [[nodiscard]] auto handle_end_of_stream() -> ::clp::ffi::ir_stream::IRErrorCode { - endOfStream_ = true; return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } @@ -61,19 +60,12 @@ class ClpIrUnitHandler { return filteredLogEvents_; } - void clearFilteredLogEvents() { - filteredLogEvents_->clear(); - } - - bool isEndOfStream() { - return endOfStream_; - } + void clearFilteredLogEvents(); private: std::shared_ptr< std::vector>> filteredLogEvents_; - bool endOfStream_{false}; }; } // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 21c0b9b22b84..381bfd9e962f 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -41,19 +41,20 @@ class ClpIrVectorLoader : public VectorLoader { filteredLogEvents_(filteredLogEvents) {} private: - void loadInternal( - RowSet rows, - ValueHook* hook, - vector_size_t resultSize, - VectorPtr* result) override; + inline static thread_local std::unique_ptr + arrayParser_ = std::make_unique(); ColumnType nodeType_; ::clp::ffi::SchemaTree::Node::id_t nodeId_; std::shared_ptr< const std::vector>> filteredLogEvents_; - inline static thread_local std::unique_ptr - arrayParser_ = std::make_unique(); + + void loadInternal( + RowSet rows, + ValueHook* hook, + vector_size_t resultSize, + VectorPtr* result) override; }; } // namespace facebook::velox::connector::clp::search_lib From 1b12402d673cb50ce18d13fd953b283daaa1bb56 Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 4 Sep 2025 23:28:48 +0000 Subject: [PATCH 17/34] Fix unitest --- .../connectors/clp/tests/ClpConnectorTest.cpp | 66 +++++++++--------- .../clp/tests/examples/example.clps | Bin 261 -> 0 bytes .../clp/tests/examples/example2.clps | Bin 261 -> 0 bytes 
.../clp/tests/examples/test_1_ir.clps | Bin 0 -> 1068 bytes 4 files changed, 32 insertions(+), 34 deletions(-) delete mode 100644 velox/connectors/clp/tests/examples/example.clps delete mode 100644 velox/connectors/clp/tests/examples/example2.clps create mode 100644 velox/connectors/clp/tests/examples/test_1_ir.clps diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp b/velox/connectors/clp/tests/ClpConnectorTest.cpp index 096032663196..342379831752 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -86,40 +86,6 @@ class ClpConnectorTest : public exec::test::OperatorTestBase { } }; -TEST_F(ClpConnectorTest, testIr) { - const std::shared_ptr kqlQuery = nullptr; - auto plan = - PlanBuilder() - .startTableScan() - .outputType( - ROW({"level", "message", "user"}, - {VARCHAR(), - VARCHAR(), - ROW({"uid", "ip"}, {BIGINT(), VARCHAR()})})) - .tableHandle( - std::make_shared(kClpConnectorId, "example")) - .assignments({ - {"level", - std::make_shared("level", "level", VARCHAR())}, - {"message", - std::make_shared( - "message", "message", VARCHAR())}, - {"user", - std::make_shared( - "user", "user", ROW({"uid", "ip"}, {BIGINT(), VARCHAR()}))}, - }) - .endTableScan() - .filter("level = 'INFO'") - .planNode(); - auto output = getResults( - plan, - {makeClpSplit( - getExampleFilePath("example2.clps"), - ClpConnectorSplit::SplitType::kIr, - kqlQuery)}); - std::cout << "Live" << std::endl; -} - TEST_F(ClpConnectorTest, test1NoPushdown) { const std::shared_ptr kqlQuery = nullptr; auto plan = PlanBuilder() @@ -166,6 +132,30 @@ TEST_F(ClpConnectorTest, test1NoPushdown) { "GET", })}); test::assertEqualVectors(expected, output); + + // The IR stream will be deserialized in order, so the exepect vector is di + auto irExpected = makeRowVector( + {// requestId + makeFlatVector( + {"req-100", "req-102", "req-105", "req-107", "req-109"}), + // userId + makeNullableFlatVector( + {"user201", std::nullopt, "user204", 
"user202", "user203"}), + // method + makeFlatVector({ + "GET", + "GET", + "GET", + "GET", + "GET", + })}); + auto irOutput = getResults( + plan, + {makeClpSplit( + getExampleFilePath("test_1_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + test::assertEqualVectors(irExpected, irOutput); } TEST_F(ClpConnectorTest, test1Pushdown) { @@ -206,6 +196,14 @@ TEST_F(ClpConnectorTest, test1Pushdown) { // path makeFlatVector({"/auth/login"})}); test::assertEqualVectors(expected, output); + + auto irOutput = getResults( + plan, + {makeClpSplit( + getExampleFilePath("test_1_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + test::assertEqualVectors(expected, irOutput); } TEST_F(ClpConnectorTest, test2NoPushdown) { diff --git a/velox/connectors/clp/tests/examples/example.clps b/velox/connectors/clp/tests/examples/example.clps deleted file mode 100644 index bfac0b001ae4b9a4cb7c1d609b390b65086e42ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 261 zcmeyXzg3e_a9Xuem}8Kqqmz$oaD1?{hpVq+yr+wjl~Qtku3lwva(+RoUUE)>URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 z!wf+txdsJ$`uhQ`HPAEEGf=86N%-%`nv+_Vnp2X%;K-hvT3no%o|^j4k;T)`%|Def zQc)l{wWusJIaQ&!B(bO@HASxssF0URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 z!wf+txdsJ$`uhQ`HPAEEGf=86Nnmhf%}Fgw%>l94b5o0p6Vp>u8B>`YSv>vR{38_w zf>VpiGLuskic1oUN>Wqw%76-4N{dsA3KN(enM*TMN`N#|W&uzWODd})OSof@UoZpX RBL_!LLnCuN10XbF007-9Pw)T$ diff --git a/velox/connectors/clp/tests/examples/test_1_ir.clps b/velox/connectors/clp/tests/examples/test_1_ir.clps new file mode 100644 index 0000000000000000000000000000000000000000..182de479b49c0adc7099a346eb8d669903b4112b GIT binary patch literal 1068 zcma)*-HOvd6o9j~+bZg!%iaiHNbeSGGf5g<3nF6^wt?+Wek|fmOp~!0Hpyf&6JZg- zr|`z-@EsInd34<3L2GFU^$f^RQO*LSBga}IsW7)%Bs zNk`W-Bsm&|Nj_Q!1r9a|qHY+YG+3*H=QkI9b1_{S*5cfso8D|`xH}1E7Lo{O+c{z} ziRfCXNT%aj%LOkBM#=~bzOs)JyvJ~gC=1fOjtX>D^3aN^s8>={{6D5OsLFQpfW=q* 
za8w|glZ+ydBj&V9^*D*~TLYy$|MGbaV??l6xp_IzH?e9ZVTd%R8DX?dl z7p7;zle*=MF#HLJ0mFPmvSvSXq(8U5q}pog0z7USKdPxwK;sPwipY_E2-96JsI{gq z{B|TIi{*rDab_EWvTI$mR_mqEQX8$N*=Ai-IYKPJ3AIo6p^jU(GeOni;R<@w8GKl` tOA%xI7g0z0wimJWd?ko{%GFvwt$AW;-tf7gEh(;FZwR@h_Ipm3{sF)MDAxc0 literal 0 HcmV?d00001 From 5db1cf9a0a3d0f7c011de5875c77496d2ed27fd2 Mon Sep 17 00:00:00 2001 From: anlowee Date: Fri, 5 Sep 2025 14:07:40 +0000 Subject: [PATCH 18/34] Fix building issues --- CMake/resolve_dependency_modules/clp.cmake | 1 - velox/connectors/clp/tests/ClpConnectorTest.cpp | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMake/resolve_dependency_modules/clp.cmake b/CMake/resolve_dependency_modules/clp.cmake index 998044773bed..f2d3e5b86b59 100644 --- a/CMake/resolve_dependency_modules/clp.cmake +++ b/CMake/resolve_dependency_modules/clp.cmake @@ -39,6 +39,5 @@ set(CLP_BUILD_TESTING FetchContent_Populate(clp) -list(APPEND CMAKE_MODULE_PATH "${clp_SOURCE_DIR}/components/core/cmake/Modules") add_subdirectory(${clp_SOURCE_DIR}/components/core ${clp_BINARY_DIR}/components/core) diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp b/velox/connectors/clp/tests/ClpConnectorTest.cpp index 342379831752..f2f7f121796d 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -133,7 +133,8 @@ TEST_F(ClpConnectorTest, test1NoPushdown) { })}); test::assertEqualVectors(expected, output); - // The IR stream will be deserialized in order, so the exepect vector is di + // The IR stream will be deserialized in order, so the expected vector is + // different auto irExpected = makeRowVector( {// requestId makeFlatVector( From e71a9576b986e4e12e6a93f6feb8b5c1031a7427 Mon Sep 17 00:00:00 2001 From: anlowee Date: Fri, 5 Sep 2025 14:27:43 +0000 Subject: [PATCH 19/34] Address comments --- velox/connectors/clp/search_lib/BaseClpCursor.cpp | 5 ++--- .../connectors/clp/search_lib/archive/ClpArchiveCursor.cpp | 6 +++--- 
velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h | 3 +-- .../clp/search_lib/archive/ClpArchiveVectorLoader.cpp | 1 - .../clp/search_lib/archive/ClpArchiveVectorLoader.h | 7 +++---- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp b/velox/connectors/clp/search_lib/BaseClpCursor.cpp index 2741fc582062..b8ab7db9457b 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.cpp +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -14,16 +14,15 @@ * limitations under the License. */ +#include #include -#include "velox/connectors/clp/search_lib/BaseClpCursor.h" - -#include #include "clp_s/search/ast/ConvertToExists.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/NarrowTypes.hpp" #include "clp_s/search/ast/OrOfAndForm.hpp" #include "clp_s/search/kql/kql.hpp" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" using namespace clp_s; using namespace clp_s::search; diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp index 615eaefeb2d9..115cc20bc614 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.cpp @@ -16,12 +16,11 @@ #include -#include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" - #include "clp_s/ArchiveReader.hpp" #include "clp_s/search/EvaluateTimestampIndex.hpp" #include "clp_s/search/ast/EmptyExpr.hpp" #include "clp_s/search/ast/SearchUtils.hpp" +#include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" #include "velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/connectors/clp/search_lib/archive/ClpQueryRunner.h" @@ -35,7 +34,8 @@ ClpArchiveCursor::ClpArchiveCursor( clp_s::InputSource inputSource, std::string_view splitPath) : BaseClpCursor(inputSource, splitPath), - archiveReader_(std::make_shared()) {} + 
archiveReader_(std::make_shared()), + filteredRowIndices_(std::make_shared>()) {} ClpArchiveCursor::~ClpArchiveCursor() { if (currentSplitLoaded_) { diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h index 6760495b1581..09bc042d740f 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h @@ -57,8 +57,7 @@ class ClpArchiveCursor final : public BaseClpCursor { int32_t currentSchemaId_{-1}; size_t currentSchemaIndex_{0}; bool currentSchemaTableLoaded_{false}; - std::shared_ptr> filteredRowIndices_ = - std::make_shared>(); + std::shared_ptr> filteredRowIndices_; std::vector matchedSchemas_; std::shared_ptr projection_; std::shared_ptr queryRunner_; diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 9238674671a6..1a48f9a6a911 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -20,7 +20,6 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" - #include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/type/Timestamp.h" diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h index 171a2d7c8664..99fe5c6b5052 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h @@ -20,7 +20,6 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" - #include "velox/type/Timestamp.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" @@ -33,9 +32,9 @@ namespace facebook::velox::connector::clp::search_lib 
{ enum class ColumnType; -/// A custom Velox VectorLoader that populates Velox vectors from a CLP-based -/// column reader. It supports various column types including integers, floats, -/// booleans, strings, and arrays of strings. +/// A custom Velox VectorLoader that populates Velox vectors using a CLP-based +/// column reader over archives. It supports various column types including +/// integers, floats, booleans, strings, and arrays of strings. class ClpArchiveVectorLoader : public VectorLoader { public: ClpArchiveVectorLoader( From 3ca736102ceea695eeeb111dd4427c66c8d934a1 Mon Sep 17 00:00:00 2001 From: anlowee Date: Fri, 5 Sep 2025 15:42:37 +0000 Subject: [PATCH 20/34] Try out zhihao fix --- CMake/resolve_dependency_modules/clp.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMake/resolve_dependency_modules/clp.cmake b/CMake/resolve_dependency_modules/clp.cmake index f2d3e5b86b59..86a99e176e70 100644 --- a/CMake/resolve_dependency_modules/clp.cmake +++ b/CMake/resolve_dependency_modules/clp.cmake @@ -15,8 +15,8 @@ include_guard(GLOBAL) FetchContent_Declare( clp - GIT_REPOSITORY https://github.com/y-scope/clp.git - GIT_TAG 0de99a9e8485ca3dc48710ad2cae31bffe20cd62) + GIT_REPOSITORY https://github.com/LinZhihao-723/clp.git + GIT_TAG f5922f5eb292784a865872b810399da7e7b94a52) set(CLP_BUILD_CLP_REGEX_UTILS OFF From 184f9de7ef2ad0f5eb40fa108fcf0a95c54427fd Mon Sep 17 00:00:00 2001 From: anlowee Date: Fri, 5 Sep 2025 19:21:28 +0000 Subject: [PATCH 21/34] Apply Zhihao fix --- CMake/resolve_dependency_modules/clp.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMake/resolve_dependency_modules/clp.cmake b/CMake/resolve_dependency_modules/clp.cmake index 86a99e176e70..996c6b9f75d8 100644 --- a/CMake/resolve_dependency_modules/clp.cmake +++ b/CMake/resolve_dependency_modules/clp.cmake @@ -15,8 +15,8 @@ include_guard(GLOBAL) FetchContent_Declare( clp - GIT_REPOSITORY https://github.com/LinZhihao-723/clp.git - GIT_TAG 
f5922f5eb292784a865872b810399da7e7b94a52) + GIT_REPOSITORY https://github.com/y-scope/clp.git + GIT_TAG bfd4f60ffe9c5d69618cc8416ec6729c76ee9862) set(CLP_BUILD_CLP_REGEX_UTILS OFF From f9e140993227d562e1180a4ff2ab2b43ab23a4dc Mon Sep 17 00:00:00 2001 From: anlowee Date: Fri, 5 Sep 2025 20:22:42 +0000 Subject: [PATCH 22/34] Address some coderabbitai comments after merging the first PR --- .../clp/search_lib/ir/ClpIrCursor.cpp | 21 +++++++++++++------ .../clp/search_lib/ir/ClpIrCursor.h | 13 +++++++----- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 3 +-- .../clp/search_lib/ir/ClpIrUnitHandler.h | 3 +++ .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 2 +- .../clp/search_lib/ir/ClpIrVectorLoader.h | 9 +++----- 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 98dcbbeee5ba..6785967ee2fc 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -72,11 +72,14 @@ ErrorCode ClpIrCursor::loadSplit() { ? 
NetworkAuthOption{.method = AuthMethod::None} : NetworkAuthOption{.method = AuthMethod::S3PresignedUrlV4}; - auto irHandler{ClpIrUnitHandler{}}; + auto irHandler = ClpIrUnitHandler{}; auto projections = splitFieldsToNamesAndTypes(); auto queryHandlerResult{QueryHandlerType::create( - handleProjectionResolution, std::move(expr_), projections, ignoreCase_)}; + projectionResolutionCallback_, + std::move(expr_), + projections, + ignoreCase_)}; if (!queryHandlerResult) { VLOG(2) << "Failed to create query handler for deserialization."; return ErrorCode::InternalError; @@ -92,7 +95,7 @@ ErrorCode ClpIrCursor::loadSplit() { } auto deserializerResult = ::clp::ffi::ir_stream::make_deserializer( - *irReader_, irHandler, std::move(queryHandler)); + *irReader_, std::move(irHandler), std::move(queryHandler)); if (!deserializerResult) { VLOG(2) << "Failed to create deserializer for deserialization."; return ErrorCode::InternalError; @@ -129,6 +132,8 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { literalType = search::ast::LiteralType::VarStringT; break; case ColumnType::Timestamp: + // TODO: IR timestamp support pending; constrain to Unknown to avoid + // mismatched projections. 
literalType = search::ast::LiteralType::EpochDateT; break; default: @@ -141,7 +146,7 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { } ystdlib::error_handling::Result ClpIrCursor::deserialize( - uint64_t numRows) const { + uint64_t numRows) { irDeserializer_->get_ir_unit_handler().clearFilteredLogEvents(); uint64_t cnt{0}; while (cnt < numRows) { @@ -182,8 +187,12 @@ VectorPtr ClpIrCursor::createVectorHelper( "Reader index out of bounds"); auto projectedColumn = outputColumns_[readerIndex_]; auto projectedColumnType = projectedColumn.type; - auto projectedColumnNodeId = - projectedColumnNameNodeIdMap_.at(projectedColumn.name); + auto it = projectedColumnNameNodeIdMap_.find(projectedColumn.name); + VELOX_CHECK( + it != projectedColumnNameNodeIdMap_.end(), + "Projected column '{}' not found in node id map", + projectedColumn.name); + auto projectedColumnNodeId = it->second; readerIndex_++; return std::make_shared( pool, diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index 7ef21123a025..f0f319740930 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -30,8 +30,11 @@ class ClpIrCursor final : public BaseClpCursor { bool ignoreCase) : BaseClpCursor(inputSource, splitPath), ignoreCase_(ignoreCase) {} - // TODO: Need to expose an API in CLP to get the internal counter of scanned - // log events. Currently this returns the same number of filtered rows. 
+ ClpIrCursor(const ClpIrCursor&) = delete; + ClpIrCursor& operator=(const ClpIrCursor&) = delete; + ClpIrCursor(ClpIrCursor&&) = delete; + ClpIrCursor& operator=(ClpIrCursor&&) = delete; + uint64_t fetchNext(uint64_t numRows) override; size_t getNumFilteredRows() const override; @@ -49,7 +52,7 @@ class ClpIrCursor final : public BaseClpCursor { bool, ::clp::ffi::SchemaTree::Node::id_t, std::pair)> - handleProjectionResolution = + projectionResolutionCallback_ = [this]( [[maybe_unused]] bool isAutoGenerated, [[maybe_unused]] ::clp::ffi::SchemaTree::Node::id_t nodeId, @@ -61,7 +64,7 @@ class ClpIrCursor final : public BaseClpCursor { return ystdlib::error_handling::success(); }; using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< - decltype(handleProjectionResolution)>; + decltype(projectionResolutionCallback_)>; bool ignoreCase_; std::shared_ptr< ::clp::ffi::ir_stream::Deserializer> @@ -75,7 +78,7 @@ class ClpIrCursor final : public BaseClpCursor { std::pair> splitFieldsToNamesAndTypes() const; - ystdlib::error_handling::Result deserialize(uint64_t numRows) const; + ystdlib::error_handling::Result deserialize(uint64_t numRows); VectorPtr createVectorHelper( memory::MemoryPool* pool, diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp index 4cab686a8891..e063265e029c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp @@ -15,9 +15,8 @@ */ #include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" - #include "clp_s/SchemaTree.hpp" -#include "common/base/Exceptions.h" +#include "velox/common/base/Exceptions.h" namespace facebook::velox::connector::clp::search_lib { diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index b8d55aa94b0a..b6b5208fb9b1 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h +++ 
b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include "ffi/ir_stream/Deserializer.hpp" namespace facebook::velox::connector::clp::search_lib { diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index 50dd0a978691..787fea5e27db 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -27,7 +27,7 @@ void ClpIrVectorLoader::loadInternal( auto vector = *result; for (int vectorIndex : rows) { auto& logEvent = filteredLogEvents_->at(vectorIndex); - // TODO: also need to support auto gen + // TODO: also need to support auto-generated keys auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); vector->setNull(vectorIndex, true); if (0 == userGenNodeIdValueMap.count(nodeId_)) { diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 381bfd9e962f..67d105c5505c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -17,24 +17,21 @@ #pragma once #include + #include "connectors/clp/search_lib/BaseClpCursor.h" #include "ffi/ir_stream/Deserializer.hpp" - -#include "velox/type/Timestamp.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" namespace facebook::velox::connector::clp::search_lib { -enum class ColumnType; - class ClpIrVectorLoader : public VectorLoader { public: ClpIrVectorLoader( ColumnType nodeType, ::clp::ffi::SchemaTree::Node::id_t nodeId, - std::shared_ptr< - const std::vector>> + const std::shared_ptr< + const std::vector>>& filteredLogEvents) : nodeType_(nodeType), nodeId_(nodeId), From 4ef6e96938210c463785a9bdd588a1985a69ca9b Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 9 Sep 2025 15:26:36 +0000 Subject: [PATCH 23/34] Address comments --- 
.../clp/search_lib/ir/CMakeLists.txt | 3 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 24 ++--- .../clp/search_lib/ir/ClpIrCursor.h | 8 +- .../clp/search_lib/ir/ClpIrUnitHandler.cpp | 43 --------- .../clp/search_lib/ir/ClpIrUnitHandler.h | 18 +++- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 16 ++-- .../clp/search_lib/ir/ClpIrVectorLoader.h | 8 +- .../connectors/clp/tests/ClpConnectorTest.cpp | 84 ++++++++++++++++++ .../clp/tests/examples/test_2_ir.clps | Bin 0 -> 1845 bytes 9 files changed, 131 insertions(+), 73 deletions(-) delete mode 100644 velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp create mode 100644 velox/connectors/clp/tests/examples/test_2_ir.clps diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 480d9346dcbf..35a49d592613 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -16,7 +16,6 @@ velox_add_library( STATIC ClpIrCursor.cpp ClpIrCursor.h - ClpIrUnitHandler.cpp ClpIrUnitHandler.h ClpIrVectorLoader.cpp ClpIrVectorLoader.h) @@ -25,6 +24,8 @@ velox_link_libraries( clp-s-ir-search PUBLIC clp_s::archive_reader PRIVATE + # Once the IR-stream-related targets are exported, this should be updated to + # use more fine-grained dependencies. 
clp_s::clp_dependencies clp_s::io clp_s::search diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 6785967ee2fc..33be75a9dbd2 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -59,10 +59,10 @@ VectorPtr ClpIrCursor::createVector( const TypePtr& vectorType, size_t vectorSize) { VELOX_CHECK_EQ( - projectedColumnNameNodeIdMap_.size(), + projectedColumnIdNodeIdMap_.size(), outputColumns_.size(), "Projected columns size {} does not match fields size {}", - projectedColumnNameNodeIdMap_.size(), + projectedColumnIdNodeIdMap_.size(), outputColumns_.size()); return createVectorHelper(pool, vectorType, vectorSize); } @@ -129,7 +129,8 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { literalType = search::ast::LiteralType::IntegerT; break; case ColumnType::String: - literalType = search::ast::LiteralType::VarStringT; + literalType = search::ast::LiteralType::VarStringT | + search::ast::LiteralType::ClpStringT; break; case ColumnType::Timestamp: // TODO: IR timestamp support pending; constrain to Unknown to avoid @@ -182,23 +183,22 @@ VectorPtr ClpIrCursor::createVectorHelper( auto vector = BaseVector::create(vectorType, vectorSize, pool); vector->setNulls(allocateNulls(vectorSize, pool, bits::kNull)); VELOX_CHECK_LT( - readerIndex_, - projectedColumnNameNodeIdMap_.size(), - "Reader index out of bounds"); + readerIndex_, outputColumns_.size(), "Reader index out of bounds"); auto projectedColumn = outputColumns_[readerIndex_]; auto projectedColumnType = projectedColumn.type; - auto it = projectedColumnNameNodeIdMap_.find(projectedColumn.name); - VELOX_CHECK( - it != projectedColumnNameNodeIdMap_.end(), - "Projected column '{}' not found in node id map", - projectedColumn.name); - auto projectedColumnNodeId = it->second; + auto it = projectedColumnIdNodeIdMap_.find(readerIndex_); + bool isResolved = it != 
projectedColumnIdNodeIdMap_.end(); + ::clp::ffi::SchemaTree::Node::id_t projectedColumnNodeId; + if (isResolved) { + projectedColumnNodeId = it->second; + } readerIndex_++; return std::make_shared( pool, vectorType, vectorSize, std::make_unique( + isResolved, projectedColumnType, projectedColumnNodeId, irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()), diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index f0f319740930..ebc33223e2fc 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -59,8 +59,8 @@ class ClpIrCursor final : public BaseClpCursor { [[maybe_unused]] std::pair projected_key_and_index) -> ystdlib::error_handling::Result { - projectedColumnNameNodeIdMap_.insert( - {std::string(projected_key_and_index.first), nodeId}); + projectedColumnIdNodeIdMap_.insert( + {projected_key_and_index.second, nodeId}); return ystdlib::error_handling::success(); }; using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< @@ -70,8 +70,8 @@ class ClpIrCursor final : public BaseClpCursor { ::clp::ffi::ir_stream::Deserializer> irDeserializer_; std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; - std::unordered_map - projectedColumnNameNodeIdMap_; + std::unordered_map + projectedColumnIdNodeIdMap_; size_t readerIndex_{0}; std::vector< diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp deleted file mode 100644 index e063265e029c..000000000000 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" -#include "clp_s/SchemaTree.hpp" -#include "velox/common/base/Exceptions.h" - -namespace facebook::velox::connector::clp::search_lib { - -auto ClpIrUnitHandler::handle_log_event( - ::clp::ffi::KeyValuePairLogEvent log_event, - size_t log_event_idx) -> ::clp::ffi::ir_stream::IRErrorCode { - filteredLogEvents_->push_back( - std::make_unique<::clp::ffi::KeyValuePairLogEvent>(std::move(log_event))); - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; -} - -auto ClpIrUnitHandler::handle_schema_tree_node_insertion( - bool is_auto_generated, - ::clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, - std::shared_ptr<::clp::ffi::SchemaTree const> const& schema_tree) - -> ::clp::ffi::ir_stream::IRErrorCode { - return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; -} - -void ClpIrUnitHandler::clearFilteredLogEvents() { - filteredLogEvents_->clear(); -} - -} // namespace facebook::velox::connector::clp::search_lib diff --git a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h index b6b5208fb9b1..e4c2ca77fe3d 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h @@ -19,7 +19,10 @@ #include #include +#include "clp_s/SchemaTree.hpp" #include "ffi/ir_stream/Deserializer.hpp" +#include "velox/common/base/Exceptions.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrUnitHandler.h" namespace 
facebook::velox::connector::clp::search_lib { @@ -36,7 +39,12 @@ class ClpIrUnitHandler { // Methods implementing `IrUnitHandlerInterface` [[nodiscard]] auto handle_log_event( ::clp::ffi::KeyValuePairLogEvent log_event, - size_t log_event_idx) -> ::clp::ffi::ir_stream::IRErrorCode; + size_t log_event_idx) -> ::clp::ffi::ir_stream::IRErrorCode { + filteredLogEvents_->push_back( + std::make_unique<::clp::ffi::KeyValuePairLogEvent>( + std::move(log_event))); + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } [[nodiscard]] auto handle_utc_offset_change( [[maybe_unused]] ::clp::UtcOffset utc_offset_old, @@ -50,7 +58,9 @@ class ClpIrUnitHandler { [[maybe_unused]] ::clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, [[maybe_unused]] std::shared_ptr<::clp::ffi::SchemaTree const> const& - schema_tree) -> ::clp::ffi::ir_stream::IRErrorCode; + schema_tree) -> ::clp::ffi::ir_stream::IRErrorCode { + return ::clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + } [[nodiscard]] auto handle_end_of_stream() -> ::clp::ffi::ir_stream::IRErrorCode { @@ -63,7 +73,9 @@ class ClpIrUnitHandler { return filteredLogEvents_; } - void clearFilteredLogEvents(); + void clearFilteredLogEvents() { + filteredLogEvents_->clear(); + } private: std::shared_ptr< diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index 787fea5e27db..c082aecb85d0 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -26,17 +26,19 @@ void ClpIrVectorLoader::loadInternal( VectorPtr* result) { auto vector = *result; for (int vectorIndex : rows) { - auto& logEvent = filteredLogEvents_->at(vectorIndex); - // TODO: also need to support auto-generated keys - auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); vector->setNull(vectorIndex, true); - if (0 == userGenNodeIdValueMap.count(nodeId_)) { + if (!isResolved_) { 
continue; } - auto value = userGenNodeIdValueMap.at(nodeId_); - if (!value.has_value()) { + auto& logEvent = filteredLogEvents_->at(vectorIndex); + // TODO: also need to support auto-generated keys + auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); + auto const value_it{userGenNodeIdValueMap.find(nodeId_)}; + if (userGenNodeIdValueMap.end() == value_it || + false == value_it->second.has_value()) { continue; } + auto const& value{value_it->second}; switch (nodeType_) { case ColumnType::String: { auto stringVector = vector->asFlatVector(); @@ -110,7 +112,7 @@ void ClpIrVectorLoader::loadInternal( size_t numElements{0ULL}; auto elements = arrayVector->elements()->asFlatVector(); - auto obj = arrayParser_->iterate(jsonString); + auto obj = arrayParser_.iterate(jsonString); std::vector rawElements; for (auto arrayElement : obj.get_array()) { auto raw_element = simdjson::to_json_string(arrayElement).value(); diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 67d105c5505c..bdbf3fd8a58c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -28,19 +28,21 @@ namespace facebook::velox::connector::clp::search_lib { class ClpIrVectorLoader : public VectorLoader { public: ClpIrVectorLoader( + bool isResolved, ColumnType nodeType, ::clp::ffi::SchemaTree::Node::id_t nodeId, const std::shared_ptr< const std::vector>>& filteredLogEvents) - : nodeType_(nodeType), + : isResolved_(isResolved), + nodeType_(nodeType), nodeId_(nodeId), filteredLogEvents_(filteredLogEvents) {} private: - inline static thread_local std::unique_ptr - arrayParser_ = std::make_unique(); + simdjson::ondemand::parser arrayParser_; + bool isResolved_; ColumnType nodeType_; ::clp::ffi::SchemaTree::Node::id_t nodeId_; std::shared_ptr< diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp 
b/velox/connectors/clp/tests/ClpConnectorTest.cpp index f2f7f121796d..3bec10d9ef5d 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -256,6 +256,50 @@ TEST_F(ClpConnectorTest, test2NoPushdown) { makeFlatVector({"WARNING"}), })}); test::assertEqualVectors(expected, output); + + // IR stream currently does not support TIMESTAMP type; will merge into the + // plan above as soon as this feature is implemented + auto irPlan = + PlanBuilder(pool_.get()) + .startTableScan() + .outputType( + ROW({"event"}, + {ROW( + {"type", "subtype", "severity", "tags"}, + {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())})})) + .tableHandle( + std::make_shared(kClpConnectorId, "test_2")) + .assignments( + {{"event", + std::make_shared( + "event", + "event", + ROW({"type", "subtype", "severity", "tags"}, + {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())}))}}) + .endTableScan() + .filter( + "event.severity IN ('WARNING', 'ERROR') AND " + "((event.type = 'network' AND event.subtype = 'connection') OR " + "(event.type = 'storage' AND event.subtype LIKE 'disk_usage%'))") + .planNode(); + auto irOutput = getResults( + irPlan, + {makeClpSplit( + getExampleFilePath("test_2_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + auto irExpected = makeRowVector( + {// event + makeRowVector( + {// event.type + makeFlatVector({"storage"}), + // event.subtype + makeFlatVector({"disk_usage"}), + // event.severity + makeFlatVector({"WARNING"}), + // event.tags + makeArrayVector({{"\"backup\"", "\"daily\""}})})}); + test::assertEqualVectors(irExpected, irOutput); } TEST_F(ClpConnectorTest, test2Pushdown) { @@ -305,6 +349,46 @@ TEST_F(ClpConnectorTest, test2Pushdown) { makeFlatVector({"WARNING"}), })}); test::assertEqualVectors(expected, output); + + // IR stream currently does not support TIMESTAMP type; will merge into the + // plan above as soon as this feature is implemented + auto irPlan = + PlanBuilder(pool_.get()) + 
.startTableScan() + .outputType( + ROW({"event"}, + {ROW( + {"type", "subtype", "severity", "tags"}, + {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())})})) + .tableHandle( + std::make_shared(kClpConnectorId, "test_2")) + .assignments( + {{"event", + std::make_shared( + "event", + "event", + ROW({"type", "subtype", "severity", "tags"}, + {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())}))}}) + .endTableScan() + .planNode(); + auto irOutput = getResults( + irPlan, + {makeClpSplit( + getExampleFilePath("test_2_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + auto irExpected = makeRowVector( + {// event + makeRowVector({// event.type + makeFlatVector({"storage"}), + // event.subtype + makeFlatVector({"disk_usage"}), + // event.severity + makeFlatVector({"WARNING"}), + // event.tags + makeArrayVector( + {{"\"filesystem\"", "\"monitoring\""}})})}); + test::assertEqualVectors(irExpected, irOutput); } TEST_F(ClpConnectorTest, test2Hybrid) { diff --git a/velox/connectors/clp/tests/examples/test_2_ir.clps b/velox/connectors/clp/tests/examples/test_2_ir.clps new file mode 100644 index 0000000000000000000000000000000000000000..06c3691ef34e5417e2ab35fdddb46a8ab80061b9 GIT binary patch literal 1845 zcmaJ>U31$+6ipM^az07gl&?aFRSLA!wrn>{8+gc)xQ%BL8`~K(#&oQ;_QqZ!X=Uxo z8N={5_#M3PFW`m$!wh%jI5mu?deMH~d-j}j?>~h<9^cvd?H8*}npN`cS*6u$9iLXt zNw0ch9a^sN^AqWc5b~}U=G!#lwC{mzxkK=2yJS7Rz1XYNj_W7Y+N<7q<>It{(t2xz zO5h5{gTXndp8SE<`~i>>kPy5fGREm<*e!D5&n!& zL<;V}3K)=rG_1lpY>?fOT{_6w&vIq^!hU{uU?1A{55x>WU5n_7thpixz}2CMQLVkK zcd|QOiwAuX2Mk}R+|YVWHi|Dw`Qlg4^W}WeCYhq0*Y7=Oe5Vjj%Ez-x153q{4q)VL z5i@3J#)I@}la6w9vLd7CA$H8qrj_ELy;YK28 z8h>kjj95aXoV(L}dXV-LTWAdF2&mM`XcXZbihHmN_u<_|NRWjs8ml2v#@$3%VqT&H z;s{8p(rnh7o$PMc!guljDUjgC+B66&HHgs-WJSthC%esqG4(jx52=)>FC!ZzyR5}3 zmh&Y$Q48OxATd@PYNw8z_fY{rA22xwb{pP<4;DZzY0~iZW9|Za6NANajx|OB^+AY3 zrL=QW`p<|N#X*3IHMh2PY9oq0bo7T>ri3!&egIe{CJI6)o}lyUJ%Eqk<3&)AHLRa0 z6(mJH)PxwP)mNv;MAuS76hdjXMxJ&`t8rG%*>;hv7HG%|j4Fzxwkl_p{jEIeEQwv4=?NqiAaZt8@j 
zYxU+iIqPIMyH*6#GMCot#&-KCyOezGqrK@sGqZ}VEffJ-M8iQ16gGi@avsfG|A=IS z$J*43QB<4+aGgx%(A|Si;WPLgzF341>D)vcT=GF2O+A=)hcv*#yy*<+K+60Z2PWAO zE)uWTNJbOdrxJ*%Z6#vW*lT=~0LNxL4X9^ENYzje)^XM6Qex|oG0sz$9=5;#fPD;4 z;7fSA0PHiQ)6i{!q^X+}@SBElf7&om`7q*RJTm}>I87~X5%o9F7mTbIK)D4uq)c3s z)#y4m{W@8^E+LRy@iAZw8Z%x0n)8*PL|-|%S=Hxe{?~gMc0=08v2W`s@r~p^qfbiz z^GO~GP=xZLr=bq%K6S6+aF$>uz}>Dj-9TIn+k Date: Tue, 9 Sep 2025 19:36:26 +0000 Subject: [PATCH 24/34] WIP timestamp --- .../clp/search_lib/BaseClpCursor.cpp | 67 +++++++++++++ .../connectors/clp/search_lib/BaseClpCursor.h | 37 +++++++ .../archive/ClpArchiveVectorLoader.cpp | 93 ------------------ .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 32 +++++- .../connectors/clp/tests/ClpConnectorTest.cpp | 92 ++++------------- .../clp/tests/examples/test_3_ir.clps | Bin 0 -> 261 bytes 6 files changed, 155 insertions(+), 166 deletions(-) create mode 100644 velox/connectors/clp/tests/examples/test_3_ir.clps diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp b/velox/connectors/clp/search_lib/BaseClpCursor.cpp index b8ab7db9457b..acae8d0b2a27 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.cpp +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -30,6 +30,73 @@ using namespace clp_s::search::ast; namespace facebook::velox::connector::clp::search_lib { +template +auto estimatePrecision(T timestamp) -> TimestampPrecision { + constexpr int64_t kEpochMilliseconds1971{31536000000}; + constexpr int64_t kEpochMicroseconds1971{31536000000000}; + constexpr int64_t kEpochNanoseconds1971{31536000000000000}; + auto absTimestamp = timestamp >= 0 ? 
timestamp : -timestamp; + + if (absTimestamp > kEpochNanoseconds1971) { + return TimestampPrecision::Nanoseconds; + } else if (absTimestamp > kEpochMicroseconds1971) { + return TimestampPrecision::Microseconds; + } else if (absTimestamp > kEpochMilliseconds1971) { + return TimestampPrecision::Milliseconds; + } else { + return TimestampPrecision::Seconds; + } +} + +auto convertToVeloxTimestamp(double timestamp) -> Timestamp { + switch (estimatePrecision(timestamp)) { + case TimestampPrecision::Nanoseconds: + timestamp /= Timestamp::kNanosInSecond; + break; + case TimestampPrecision::Microseconds: + timestamp /= Timestamp::kMicrosecondsInSecond; + break; + case TimestampPrecision::Milliseconds: + timestamp /= Timestamp::kMillisecondsInSecond; + break; + case TimestampPrecision::Seconds: + break; + } + double seconds{std::floor(timestamp)}; + double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; + return Timestamp( + static_cast(seconds), static_cast(nanoseconds)); +} + +auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { + int64_t precisionDifference{Timestamp::kNanosInSecond}; + switch (estimatePrecision(timestamp)) { + case TimestampPrecision::Nanoseconds: + break; + case TimestampPrecision::Microseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; + break; + case TimestampPrecision::Milliseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; + break; + case TimestampPrecision::Seconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; + break; + } + int64_t seconds{timestamp / precisionDifference}; + int64_t nanoseconds{ + (timestamp % precisionDifference) * + (Timestamp::kNanosInSecond / precisionDifference)}; + if (nanoseconds < 0) { + seconds -= 1; + nanoseconds += Timestamp::kNanosInSecond; + } + return Timestamp(seconds, static_cast(nanoseconds)); +} + void BaseClpCursor::executeQuery( const std::string& 
query, const std::vector& outputColumns) { diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index 92940081f909..c9a37a04f1d1 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -60,6 +60,43 @@ struct Field { std::string name; }; +enum class TimestampPrecision : uint8_t { + Seconds, + Milliseconds, + Microseconds, + Nanoseconds +}; + +/// Estimates the precision of an epoch timestamp as seconds, milliseconds, +/// microseconds, or nanoseconds. +/// +/// This heuristic relies on the fact that 1 year of epoch nanoseconds is +/// approximately 1000 years of epoch microseconds and so on. This heuristic +/// can be unreliable for timestamps sufficiently close to the epoch, but +/// should otherwise be accurate for the next 1000 years. +/// +/// Note: Future versions of the clp-s archive format will adopt a +/// nanosecond-precision integer timestamp format (as opposed to the current +/// format which allows other precisions), at which point we can remove this +/// heuristic. +/// +/// @param timestamp +/// @return the estimated timestamp precision +template +auto estimatePrecision(T timestamp) -> TimestampPrecision; + +/// Converts a float value into a Velox timestamp. +/// +/// @param timestamp the input timestamp as a float +/// @return the corresponding Velox timestamp +auto convertToVeloxTimestamp(double timestamp) -> Timestamp; + +/// Converts an integer value into a Velox timestamp. +/// +/// @param timestamp the input timestamp as an integer +/// @return the corresponding Velox timestamp +auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp; + /// A query execution interface that manages the lifecycle of a query on a CLP-S /// split (archive or IR), including parsing and validating the query, loading /// the relevant splits, applying filters, and iterating over the results. 
It diff --git a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 1a48f9a6a911..907fa95954dd 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -28,99 +28,6 @@ namespace facebook::velox::connector::clp::search_lib { -namespace { - -enum class TimestampPrecision : uint8_t { - Seconds, - Milliseconds, - Microseconds, - Nanoseconds -}; - -/// Estimates the precision of an epoch timestamp as seconds, milliseconds, -/// microseconds, or nanoseconds. -/// -/// This heuristic relies on the fact that 1 year of epoch nanoseconds is -/// approximately 1000 years of epoch microseconds and so on. This heuristic -/// can be unreliable for timestamps sufficiently close to the epoch, but -/// should otherwise be accurate for the next 1000 years. -/// -/// Note: Future versions of the clp-s archive format will adopt a -/// nanosecond-precision integer timestamp format (as opposed to the current -/// format which allows other precisions), at which point we can remove this -/// heuristic. -/// -/// @param timestamp -/// @return the estimated timestamp precision -template -auto estimatePrecision(T timestamp) -> TimestampPrecision { - constexpr int64_t kEpochMilliseconds1971{31536000000}; - constexpr int64_t kEpochMicroseconds1971{31536000000000}; - constexpr int64_t kEpochNanoseconds1971{31536000000000000}; - auto absTimestamp = timestamp >= 0 ? 
timestamp : -timestamp; - - if (absTimestamp > kEpochNanoseconds1971) { - return TimestampPrecision::Nanoseconds; - } else if (absTimestamp > kEpochMicroseconds1971) { - return TimestampPrecision::Microseconds; - } else if (absTimestamp > kEpochMilliseconds1971) { - return TimestampPrecision::Milliseconds; - } else { - return TimestampPrecision::Seconds; - } -} - -auto convertToVeloxTimestamp(double timestamp) -> Timestamp { - switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: - timestamp /= Timestamp::kNanosInSecond; - break; - case TimestampPrecision::Microseconds: - timestamp /= Timestamp::kMicrosecondsInSecond; - break; - case TimestampPrecision::Milliseconds: - timestamp /= Timestamp::kMillisecondsInSecond; - break; - case TimestampPrecision::Seconds: - break; - } - double seconds{std::floor(timestamp)}; - double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; - return Timestamp( - static_cast(seconds), static_cast(nanoseconds)); -} - -auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { - int64_t precisionDifference{Timestamp::kNanosInSecond}; - switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: - break; - case TimestampPrecision::Microseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; - break; - case TimestampPrecision::Milliseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; - break; - case TimestampPrecision::Seconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; - break; - } - int64_t seconds{timestamp / precisionDifference}; - int64_t nanoseconds{ - (timestamp % precisionDifference) * - (Timestamp::kNanosInSecond / precisionDifference)}; - if (nanoseconds < 0) { - seconds -= 1; - nanoseconds += Timestamp::kNanosInSecond; - } - return Timestamp(seconds, static_cast(nanoseconds)); -} - -} // namespace - 
ClpArchiveVectorLoader::ClpArchiveVectorLoader( clp_s::BaseColumnReader* columnReader, ColumnType nodeType, diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index c082aecb85d0..a3db463dff21 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -14,8 +14,10 @@ * limitations under the License. */ -#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" +#include + #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" namespace facebook::velox::connector::clp::search_lib { @@ -89,6 +91,34 @@ void ClpIrVectorLoader::loadInternal( vector->setNull(vectorIndex, false); break; } + case ColumnType::Timestamp: { + auto timestampVector = vector->asFlatVector(); + if (value->is()) { + timestampVector->set( + vectorIndex, + convertToVeloxTimestamp(value->get_immutable_view())); + } else if (value->is()) { + timestampVector->set( + vectorIndex, + convertToVeloxTimestamp(value->get_immutable_view())); + } else if (value->is()) { + auto stringValue = + std::string(value->get_immutable_view().data()); + std::istringstream in{stringValue.substr(0, 19)}; + std::tm tm = {}; + in >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%S"); + if (in.fail()) { + VELOX_FAIL( + "Failed to parse the timestamp format: %Y-%m-%dT%H:%M:%S"); + } + time_t epoch_seconds = timegm(&tm); + timestampVector->set( + vectorIndex, convertToVeloxTimestamp(epoch_seconds)); + } else { + VELOX_FAIL("Unsupported timestamp type"); + } + break; + } case ColumnType::Array: { auto arrayVector = std::dynamic_pointer_cast(vector); std::string jsonString; diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp b/velox/connectors/clp/tests/ClpConnectorTest.cpp index 3bec10d9ef5d..23a52e5f8978 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ 
b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -257,49 +257,13 @@ TEST_F(ClpConnectorTest, test2NoPushdown) { })}); test::assertEqualVectors(expected, output); - // IR stream currently does not support TIMESTAMP type; will merge into the - // plan above as soon as this feature is implemented - auto irPlan = - PlanBuilder(pool_.get()) - .startTableScan() - .outputType( - ROW({"event"}, - {ROW( - {"type", "subtype", "severity", "tags"}, - {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())})})) - .tableHandle( - std::make_shared(kClpConnectorId, "test_2")) - .assignments( - {{"event", - std::make_shared( - "event", - "event", - ROW({"type", "subtype", "severity", "tags"}, - {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())}))}}) - .endTableScan() - .filter( - "event.severity IN ('WARNING', 'ERROR') AND " - "((event.type = 'network' AND event.subtype = 'connection') OR " - "(event.type = 'storage' AND event.subtype LIKE 'disk_usage%'))") - .planNode(); auto irOutput = getResults( - irPlan, + plan, {makeClpSplit( getExampleFilePath("test_2_ir.clps"), ClpConnectorSplit::SplitType::kIr, kqlQuery)}); - auto irExpected = makeRowVector( - {// event - makeRowVector( - {// event.type - makeFlatVector({"storage"}), - // event.subtype - makeFlatVector({"disk_usage"}), - // event.severity - makeFlatVector({"WARNING"}), - // event.tags - makeArrayVector({{"\"backup\"", "\"daily\""}})})}); - test::assertEqualVectors(irExpected, irOutput); + test::assertEqualVectors(expected, irOutput); } TEST_F(ClpConnectorTest, test2Pushdown) { @@ -350,45 +314,13 @@ TEST_F(ClpConnectorTest, test2Pushdown) { })}); test::assertEqualVectors(expected, output); - // IR stream currently does not support TIMESTAMP type; will merge into the - // plan above as soon as this feature is implemented - auto irPlan = - PlanBuilder(pool_.get()) - .startTableScan() - .outputType( - ROW({"event"}, - {ROW( - {"type", "subtype", "severity", "tags"}, - {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())})})) - 
.tableHandle( - std::make_shared(kClpConnectorId, "test_2")) - .assignments( - {{"event", - std::make_shared( - "event", - "event", - ROW({"type", "subtype", "severity", "tags"}, - {VARCHAR(), VARCHAR(), VARCHAR(), ARRAY(VARCHAR())}))}}) - .endTableScan() - .planNode(); auto irOutput = getResults( - irPlan, + plan, {makeClpSplit( getExampleFilePath("test_2_ir.clps"), ClpConnectorSplit::SplitType::kIr, kqlQuery)}); - auto irExpected = makeRowVector( - {// event - makeRowVector({// event.type - makeFlatVector({"storage"}), - // event.subtype - makeFlatVector({"disk_usage"}), - // event.severity - makeFlatVector({"WARNING"}), - // event.tags - makeArrayVector( - {{"\"filesystem\"", "\"monitoring\""}})})}); - test::assertEqualVectors(irExpected, irOutput); + test::assertEqualVectors(expected, irOutput); } TEST_F(ClpConnectorTest, test2Hybrid) { @@ -442,6 +374,14 @@ TEST_F(ClpConnectorTest, test2Hybrid) { }); test::assertEqualVectors(expected, output); + + auto irOutput = getResults( + plan, + {makeClpSplit( + getExampleFilePath("test_2_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + test::assertEqualVectors(expected, irOutput); } TEST_F(ClpConnectorTest, test3TimestampMarshalling) { @@ -473,6 +413,14 @@ TEST_F(ClpConnectorTest, test3TimestampMarshalling) { Timestamp(kTestTimestampSeconds, kTestTimestampNanoseconds)}), }); test::assertEqualVectors(expected, output); + + auto irOutput = getResults( + plan, + {makeClpSplit( + getExampleFilePath("test_3_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + test::assertEqualVectors(expected, irOutput); } } // namespace diff --git a/velox/connectors/clp/tests/examples/test_3_ir.clps b/velox/connectors/clp/tests/examples/test_3_ir.clps new file mode 100644 index 0000000000000000000000000000000000000000..7f185fb9b3ea1253522f50af6ada48fed473e43c GIT binary patch literal 261 zcmeyXzg3e_a9Xuem}8Kqqmz$oaD1?{hpVq+yr+wjl~Qtku3lwva(+RoUUE)>URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 
z!wf+txdsJ$`uhQ`HPAEEGf=86Nnmi~EXmAGEiOsSEl6c_lrSQ zHLx-;)iW?Kh$=!=$rR>zi=~p^fq|hARf0K$fz9|g!{d_ Date: Wed, 10 Sep 2025 18:37:19 +0000 Subject: [PATCH 25/34] WIP --- .../clp/search_lib/ir/CMakeLists.txt | 2 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 10 ++++--- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 17 ++++-------- .../clp/search_lib/ir/ClpIrVectorLoader.h | 26 +++++++++++-------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 35a49d592613..c6ec4aa0f633 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -22,7 +22,7 @@ velox_add_library( velox_link_libraries( clp-s-ir-search - PUBLIC clp_s::archive_reader + PUBLIC clp_s::archive_reader clp_s::archive_writer PRIVATE # Once the IR-stream-related targets are exported, this should be updated to # use more fine-grained dependencies. diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 124fa5e0b063..4ea2061c7e64 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -135,7 +135,10 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { case ColumnType::Timestamp: // TODO: IR timestamp support pending; constrain to Unknown to avoid // mismatched projections. 
- literalType = search::ast::LiteralType::EpochDateT; + literalType = search::ast::LiteralType::FloatT | + search::ast::LiteralType::IntegerT | + search::ast::LiteralType::VarStringT | + search::ast::LiteralType::ClpStringT; break; default: literalType = search::ast::LiteralType::UnknownT; @@ -201,10 +204,11 @@ VectorPtr ClpIrCursor::createVectorHelper( vectorType, vectorSize, std::make_unique( + irDeserializer_->get_ir_unit_handler().getFilteredLogEvents(), isResolved, - projectedColumnType, projectedColumnNodeId, - irDeserializer_->get_ir_unit_handler().getFilteredLogEvents()), + projectedColumn.name, + projectedColumnType), std::move(vector)); } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index a3db463dff21..e7fbdb1eaf4d 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -102,18 +102,11 @@ void ClpIrVectorLoader::loadInternal( vectorIndex, convertToVeloxTimestamp(value->get_immutable_view())); } else if (value->is()) { - auto stringValue = - std::string(value->get_immutable_view().data()); - std::istringstream in{stringValue.substr(0, 19)}; - std::tm tm = {}; - in >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%S"); - if (in.fail()) { - VELOX_FAIL( - "Failed to parse the timestamp format: %Y-%m-%dT%H:%M:%S"); - } - time_t epoch_seconds = timegm(&tm); - timestampVector->set( - vectorIndex, convertToVeloxTimestamp(epoch_seconds)); + auto stringValue = value->get_immutable_view().data(); + uint64_t encodingId{}; + auto const timestamp = timestampDict_.ingest_entry( + nodeName_, nodeId_, stringValue, encodingId); + timestampVector->set(vectorIndex, convertToVeloxTimestamp(timestamp)); } else { VELOX_FAIL("Unsupported timestamp type"); } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index bdbf3fd8a58c..757d8ba6b7c4 100644 --- 
a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -18,6 +18,7 @@ #include +#include "clp_s/TimestampDictionaryWriter.hpp" #include "connectors/clp/search_lib/BaseClpCursor.h" #include "ffi/ir_stream/Deserializer.hpp" #include "velox/vector/FlatVector.h" @@ -28,26 +29,29 @@ namespace facebook::velox::connector::clp::search_lib { class ClpIrVectorLoader : public VectorLoader { public: ClpIrVectorLoader( - bool isResolved, - ColumnType nodeType, - ::clp::ffi::SchemaTree::Node::id_t nodeId, const std::shared_ptr< const std::vector>>& - filteredLogEvents) - : isResolved_(isResolved), - nodeType_(nodeType), + filteredLogEvents, + bool isResolved, + ::clp::ffi::SchemaTree::Node::id_t nodeId, + std::string_view nodeName, + ColumnType nodeType) + : filteredLogEvents_(filteredLogEvents), + isResolved_(isResolved), nodeId_(nodeId), - filteredLogEvents_(filteredLogEvents) {} + nodeName_(nodeName), + nodeType_(nodeType) {} private: simdjson::ondemand::parser arrayParser_; - - bool isResolved_; - ColumnType nodeType_; - ::clp::ffi::SchemaTree::Node::id_t nodeId_; std::shared_ptr< const std::vector>> filteredLogEvents_; + bool isResolved_; + ::clp::ffi::SchemaTree::Node::id_t nodeId_; + std::string nodeName_; + ColumnType nodeType_; + clp_s::TimestampDictionaryWriter timestampDict_; void loadInternal( RowSet rows, From b25793818ab9a4bf0fa130fada1607b88c422d04 Mon Sep 17 00:00:00 2001 From: anlowee Date: Mon, 15 Sep 2025 19:28:55 +0000 Subject: [PATCH 26/34] Fix the unit tests --- .../clp/search_lib/ir/CMakeLists.txt | 1 + .../clp/search_lib/ir/ClpIrCursor.cpp | 18 +++--- .../clp/search_lib/ir/ClpIrCursor.h | 15 +++-- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 19 ++++-- .../clp/search_lib/ir/ClpIrVectorLoader.h | 6 +- .../connectors/clp/tests/ClpConnectorTest.cpp | 56 +++++++++++++++++- .../clp/tests/examples/test_1_ir.clps | Bin 1068 -> 898 bytes .../clp/tests/examples/test_2_ir.clps | Bin 1845 -> 1675 
bytes .../clp/tests/examples/test_4.ndjson | 6 ++ .../clp/tests/examples/test_4_ir.clps | Bin 0 -> 194 bytes 10 files changed, 96 insertions(+), 25 deletions(-) create mode 100644 velox/connectors/clp/tests/examples/test_4.ndjson create mode 100644 velox/connectors/clp/tests/examples/test_4_ir.clps diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index c6ec4aa0f633..0dfae454c641 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -29,5 +29,6 @@ velox_link_libraries( clp_s::clp_dependencies clp_s::io clp_s::search + clp_s::search::ast clp_s::search::kql velox_vector) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 4ea2061c7e64..5c2fbb72048a 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -59,10 +59,10 @@ VectorPtr ClpIrCursor::createVector( const TypePtr& vectorType, size_t vectorSize) { VELOX_CHECK_EQ( - projectedColumnIdxNodeIdMap_.size(), + projectedColumnIdxNodeIdsMap_.size(), outputColumns_.size(), "Projected columns size {} does not match fields size {}", - projectedColumnIdxNodeIdMap_.size(), + projectedColumnIdxNodeIdsMap_.size(), outputColumns_.size()); return createVectorHelper(pool, vectorType, vectorSize); } @@ -136,9 +136,7 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { // TODO: IR timestamp support pending; constrain to Unknown to avoid // mismatched projections. 
literalType = search::ast::LiteralType::FloatT | - search::ast::LiteralType::IntegerT | - search::ast::LiteralType::VarStringT | - search::ast::LiteralType::ClpStringT; + search::ast::LiteralType::IntegerT; break; default: literalType = search::ast::LiteralType::UnknownT; @@ -192,11 +190,11 @@ VectorPtr ClpIrCursor::createVectorHelper( readerIndex_, outputColumns_.size(), "Reader index out of bounds"); auto projectedColumn = outputColumns_[readerIndex_]; auto projectedColumnType = projectedColumn.type; - auto it = projectedColumnIdxNodeIdMap_.find(readerIndex_); - bool isResolved = it != projectedColumnIdxNodeIdMap_.end(); - ::clp::ffi::SchemaTree::Node::id_t projectedColumnNodeId; + auto it = projectedColumnIdxNodeIdsMap_.find(readerIndex_); + bool isResolved = it != projectedColumnIdxNodeIdsMap_.end(); + std::vector<::clp::ffi::SchemaTree::Node::id_t> projectedColumnNodeIds; if (isResolved) { - projectedColumnNodeId = it->second; + projectedColumnNodeIds = it->second; } readerIndex_++; return std::make_shared( @@ -206,7 +204,7 @@ VectorPtr ClpIrCursor::createVectorHelper( std::make_unique( irDeserializer_->get_ir_unit_handler().getFilteredLogEvents(), isResolved, - projectedColumnNodeId, + projectedColumnNodeIds, projectedColumn.name, projectedColumnType), std::move(vector)); diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h index a132bc3ba351..c583a5b228ea 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.h @@ -59,8 +59,15 @@ class ClpIrCursor final : public BaseClpCursor { [[maybe_unused]] std::pair projected_key_and_index) -> ystdlib::error_handling::Result { - projectedColumnIdxNodeIdMap_.insert( - {projected_key_and_index.second, nodeId}); + auto it = + projectedColumnIdxNodeIdsMap_.find(projected_key_and_index.second); + if (it == projectedColumnIdxNodeIdsMap_.end()) { + projectedColumnIdxNodeIdsMap_.insert( + 
{projected_key_and_index.second, + std::vector<::clp::ffi::SchemaTree::Node::id_t>{nodeId}}); + return ystdlib::error_handling::success(); + } + it->second.emplace_back(nodeId); return ystdlib::error_handling::success(); }; using QueryHandlerType = ::clp::ffi::ir_stream::search::QueryHandler< @@ -70,8 +77,8 @@ class ClpIrCursor final : public BaseClpCursor { ::clp::ffi::ir_stream::Deserializer> irDeserializer_; std::shared_ptr<::clp::ReaderInterface> irReader_{nullptr}; - std::unordered_map - projectedColumnIdxNodeIdMap_; + std::unordered_map> + projectedColumnIdxNodeIdsMap_; size_t readerIndex_{0}; std::vector< diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index e7fbdb1eaf4d..5c110aaaceb7 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -35,12 +35,21 @@ void ClpIrVectorLoader::loadInternal( auto& logEvent = filteredLogEvents_->at(vectorIndex); // TODO: also need to support auto-generated keys auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); - auto const value_it{userGenNodeIdValueMap.find(nodeId_)}; - if (userGenNodeIdValueMap.end() == value_it || - false == value_it->second.has_value()) { + std::unordered_map>::iterator + valueIt; + ::clp::ffi::SchemaTree::Node::id_t nodeId; + for (size_t i{0}; i < nodeIds_.size(); ++i) { + valueIt = userGenNodeIdValueMap.find(nodeIds_[i]); + if (valueIt != userGenNodeIdValueMap.end()) { + nodeId = nodeIds_[i]; + break; + } + } + if (userGenNodeIdValueMap.end() == valueIt || + false == valueIt->second.has_value()) { continue; } - auto const& value{value_it->second}; + auto const& value{valueIt->second}; switch (nodeType_) { case ColumnType::String: { auto stringVector = vector->asFlatVector(); @@ -105,7 +114,7 @@ void ClpIrVectorLoader::loadInternal( auto stringValue = value->get_immutable_view().data(); uint64_t encodingId{}; auto const 
timestamp = timestampDict_.ingest_entry( - nodeName_, nodeId_, stringValue, encodingId); + nodeName_, nodeId, stringValue, encodingId); timestampVector->set(vectorIndex, convertToVeloxTimestamp(timestamp)); } else { VELOX_FAIL("Unsupported timestamp type"); diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 757d8ba6b7c4..a0dc279d65f4 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -33,12 +33,12 @@ class ClpIrVectorLoader : public VectorLoader { const std::vector>>& filteredLogEvents, bool isResolved, - ::clp::ffi::SchemaTree::Node::id_t nodeId, + std::vector<::clp::ffi::SchemaTree::Node::id_t> nodeIds, std::string_view nodeName, ColumnType nodeType) : filteredLogEvents_(filteredLogEvents), isResolved_(isResolved), - nodeId_(nodeId), + nodeIds_(nodeIds), nodeName_(nodeName), nodeType_(nodeType) {} @@ -48,7 +48,7 @@ class ClpIrVectorLoader : public VectorLoader { const std::vector>> filteredLogEvents_; bool isResolved_; - ::clp::ffi::SchemaTree::Node::id_t nodeId_; + std::vector<::clp::ffi::SchemaTree::Node::id_t> nodeIds_; std::string nodeName_; ColumnType nodeType_; clp_s::TimestampDictionaryWriter timestampDict_; diff --git a/velox/connectors/clp/tests/ClpConnectorTest.cpp b/velox/connectors/clp/tests/ClpConnectorTest.cpp index 23a52e5f8978..dc12b270eff2 100644 --- a/velox/connectors/clp/tests/ClpConnectorTest.cpp +++ b/velox/connectors/clp/tests/ClpConnectorTest.cpp @@ -413,14 +413,64 @@ TEST_F(ClpConnectorTest, test3TimestampMarshalling) { Timestamp(kTestTimestampSeconds, kTestTimestampNanoseconds)}), }); test::assertEqualVectors(expected, output); +} - auto irOutput = getResults( +TEST_F(ClpConnectorTest, test4IrTimestampNoPushdown) { + const std::shared_ptr kqlQuery = nullptr; + auto plan = PlanBuilder(pool_.get()) + .startTableScan() + .outputType(ROW({"timestamp"}, {TIMESTAMP()})) + 
.tableHandle(std::make_shared( + kClpConnectorId, "test_4")) + .assignments( + {{"timestamp", + std::make_shared( + "timestamp", "timestamp", TIMESTAMP())}}) + .endTableScan() + .filter("\"timestamp\" < timestamp '2025-08-24 02:36:45'") + .planNode(); + + auto output = getResults( plan, {makeClpSplit( - getExampleFilePath("test_3_ir.clps"), + getExampleFilePath("test_4_ir.clps"), ClpConnectorSplit::SplitType::kIr, kqlQuery)}); - test::assertEqualVectors(expected, irOutput); + auto expected = makeRowVector({ + // timestamp + makeFlatVector( + {Timestamp(kTestTimestampSeconds, kTestTimestampNanoseconds)}), + }); + test::assertEqualVectors(expected, output); +} + +TEST_F(ClpConnectorTest, test4IrTimestampPushdown) { + const std::shared_ptr kqlQuery = + std::make_shared("(timestamp < 1756003005000000)"); + auto plan = PlanBuilder(pool_.get()) + .startTableScan() + .outputType(ROW({"timestamp"}, {TIMESTAMP()})) + .tableHandle(std::make_shared( + kClpConnectorId, "test_4")) + .assignments( + {{"timestamp", + std::make_shared( + "timestamp", "timestamp", TIMESTAMP())}}) + .endTableScan() + .planNode(); + + auto output = getResults( + plan, + {makeClpSplit( + getExampleFilePath("test_4_ir.clps"), + ClpConnectorSplit::SplitType::kIr, + kqlQuery)}); + auto expected = makeRowVector({ + // timestamp + makeFlatVector( + {Timestamp(kTestTimestampSeconds, kTestTimestampNanoseconds)}), + }); + test::assertEqualVectors(expected, output); } } // namespace diff --git a/velox/connectors/clp/tests/examples/test_1_ir.clps b/velox/connectors/clp/tests/examples/test_1_ir.clps index 182de479b49c0adc7099a346eb8d669903b4112b..e09efcfb870baa87fe52a40cc7dcdf7811677539 100644 GIT binary patch delta 111 zcmZ3((ZoJsE@R=ud1x0EV0Q48BM?}>B-zo5Lt!EkxUR- l)ybQftiiHclbM+fz%2dAp3DwlmdWJ#%r;<_<>Zgd)&QRQe`9h;OfP?s}H4^}Clt}859b(sKl6|f$~ MrppAV%Ze=!02ZG@egFUf diff --git a/velox/connectors/clp/tests/examples/test_4.ndjson b/velox/connectors/clp/tests/examples/test_4.ndjson new file mode 100644 index 
000000000000..287efb953b0a --- /dev/null +++ b/velox/connectors/clp/tests/examples/test_4.ndjson @@ -0,0 +1,6 @@ +{"timestamp": 1746003005.0} +{"timestamp": 1746003005000000} +{"timestamp": 1746003005000000000} +{"timestamp": 1766003005.0} +{"timestamp": 1766003005000000} +{"timestamp": 1766003005000000000} diff --git a/velox/connectors/clp/tests/examples/test_4_ir.clps b/velox/connectors/clp/tests/examples/test_4_ir.clps new file mode 100644 index 0000000000000000000000000000000000000000..938be4e7809123358ae6a52646ce144d88236f4b GIT binary patch literal 194 zcmeyXzg3e_a9Xuem}8Kqqmz$oaD1?{hpVq+yr+wjl~Qtku3lwva(+RoUUE)>URYvL zW@1uKYH@IKMrv+in30kWTw}bepR>P^HVbO(tT4)GV)W3 z!wf+txdsJ$`uhQ`HPAEEGf=86Oki;2EXmAGEiOsSEl6byVPG@<&G7i71DNTiJ@+K1 G0|NjPX+AXo literal 0 HcmV?d00001 From 10e222ac911fcfd441da184d3a9e50630ffa24e7 Mon Sep 17 00:00:00 2001 From: anlowee Date: Mon, 15 Sep 2025 20:10:24 +0000 Subject: [PATCH 27/34] Add string type timestamp logevent in test ir file --- .../clp/tests/examples/test_4_ir.clps | Bin 194 -> 236 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/velox/connectors/clp/tests/examples/test_4_ir.clps b/velox/connectors/clp/tests/examples/test_4_ir.clps index 938be4e7809123358ae6a52646ce144d88236f4b..1494cbc1c63529a485c6977d2fea4bfb8766f553 100644 GIT binary patch delta 86 zcmX@a_=a)9T>p{;21m}4%-q!ClEmDCR7OV$BLgE-T>}$cV}lR_3oBCtD+5zK0|SGo cLR6JZAq;HBzZo8%bVvm;-L&VP Date: Mon, 15 Sep 2025 20:44:29 +0000 Subject: [PATCH 28/34] Add docs --- velox/docs/develop/connectors.rst | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/velox/docs/develop/connectors.rst b/velox/docs/develop/connectors.rst index ec9946d6ab03..181598181b9e 100644 --- a/velox/docs/develop/connectors.rst +++ b/velox/docs/develop/connectors.rst @@ -134,7 +134,13 @@ ClpConnectorSplit ``ClpConnectorSplit`` describes a data chunk using ``path``. 
This path may be the absolute file path to the split file if it is stored on a local file system, or the complete (or partial) URL of the split if it is stored on S3. In the latter case, when only a partial URL is provided, ``ClpS3AuthProviderBase`` provides a hook in ``ClpDataSource`` to -assist in constructing the full URL. Refer to :ref:`ClpS3AuthProviderBase` for details. +assist in constructing the full URL. Refer to :ref:`ClpS3AuthProviderBase` for details. It also +includes a ``type`` property that specifies whether the split is an archive or an IR stream. + +BaseClpCursor +~~~~~~~~~~~~~ +``BaseClpCursor``` is responsible for preparing pushdown operations, loading splits, filtering data, and returning the +results. Each split type—archive and IR stream—has its own corresponding subclass. ClpDataSource ~~~~~~~~~~~~~ @@ -146,10 +152,11 @@ each output column, accessing its handle to get its type and original name. For traverses the nested structure to process each field; for non-row types, it directly maps the Velox column type to a CLP column type. -When a split is added, a ``ClpCursor`` is created with the split path and input source. The query is parsed -and simplified into an AST. On ``next``, the cursor finds matching row indices and, if any exist, -``ClpDataSource`` recursively creates a row vector composed of lazy vectors, which use CLP column readers to -decode and load data as needed during execution. +When a split is added, a ``BaseClpCursor`` instance is created with the split path and input source, which +may be either a ``ClpArchiveCursor`` or a ``ClpIrCursor``. The query is parsed and simplified into an AST. +On ``next``, the cursor finds matching row indices and, if any exist, ``ClpDataSource`` recursively creates +a row vector composed of lazy vectors, which use CLP column readers to decode and load data as needed during +execution. .. 
_ClpS3AuthProviderBase From 1982c3bc5403f1c3ce7209ead5354c3a481f87c1 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 16 Sep 2025 16:16:42 +0000 Subject: [PATCH 29/34] Address coderabbitai comments --- .../clp/search_lib/BaseClpCursor.cpp | 67 ----------------- .../connectors/clp/search_lib/BaseClpCursor.h | 73 +++++++++++++++++-- .../clp/search_lib/ir/ClpIrCursor.cpp | 2 +- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 9 +-- .../clp/search_lib/ir/ClpIrVectorLoader.h | 2 +- .../clp/tests/examples/test_4.ndjson | 5 +- velox/docs/develop/connectors.rst | 8 +- 7 files changed, 78 insertions(+), 88 deletions(-) diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.cpp b/velox/connectors/clp/search_lib/BaseClpCursor.cpp index acae8d0b2a27..b8ab7db9457b 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.cpp +++ b/velox/connectors/clp/search_lib/BaseClpCursor.cpp @@ -30,73 +30,6 @@ using namespace clp_s::search::ast; namespace facebook::velox::connector::clp::search_lib { -template -auto estimatePrecision(T timestamp) -> TimestampPrecision { - constexpr int64_t kEpochMilliseconds1971{31536000000}; - constexpr int64_t kEpochMicroseconds1971{31536000000000}; - constexpr int64_t kEpochNanoseconds1971{31536000000000000}; - auto absTimestamp = timestamp >= 0 ? 
timestamp : -timestamp; - - if (absTimestamp > kEpochNanoseconds1971) { - return TimestampPrecision::Nanoseconds; - } else if (absTimestamp > kEpochMicroseconds1971) { - return TimestampPrecision::Microseconds; - } else if (absTimestamp > kEpochMilliseconds1971) { - return TimestampPrecision::Milliseconds; - } else { - return TimestampPrecision::Seconds; - } -} - -auto convertToVeloxTimestamp(double timestamp) -> Timestamp { - switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: - timestamp /= Timestamp::kNanosInSecond; - break; - case TimestampPrecision::Microseconds: - timestamp /= Timestamp::kMicrosecondsInSecond; - break; - case TimestampPrecision::Milliseconds: - timestamp /= Timestamp::kMillisecondsInSecond; - break; - case TimestampPrecision::Seconds: - break; - } - double seconds{std::floor(timestamp)}; - double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; - return Timestamp( - static_cast(seconds), static_cast(nanoseconds)); -} - -auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { - int64_t precisionDifference{Timestamp::kNanosInSecond}; - switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: - break; - case TimestampPrecision::Microseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; - break; - case TimestampPrecision::Milliseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; - break; - case TimestampPrecision::Seconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; - break; - } - int64_t seconds{timestamp / precisionDifference}; - int64_t nanoseconds{ - (timestamp % precisionDifference) * - (Timestamp::kNanosInSecond / precisionDifference)}; - if (nanoseconds < 0) { - seconds -= 1; - nanoseconds += Timestamp::kNanosInSecond; - } - return Timestamp(seconds, static_cast(nanoseconds)); -} - void BaseClpCursor::executeQuery( const std::string& 
query, const std::vector& outputColumns) { diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index c9a37a04f1d1..d88389cd5d25 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -23,6 +23,7 @@ #include "clp_s/InputConfig.hpp" #include "velox/connectors/clp/ClpConnectorSplit.h" +#include "velox/type/Timestamp.h" namespace clp_s { class BaseColumnReader; @@ -83,19 +84,79 @@ enum class TimestampPrecision : uint8_t { /// @param timestamp /// @return the estimated timestamp precision template -auto estimatePrecision(T timestamp) -> TimestampPrecision; - -/// Converts a float value into a Velox timestamp. +auto estimatePrecision(T timestamp) -> TimestampPrecision { + constexpr int64_t kEpochMilliseconds1971{31536000000}; + constexpr int64_t kEpochMicroseconds1971{31536000000000}; + constexpr int64_t kEpochNanoseconds1971{31536000000000000}; + auto absTimestamp = timestamp >= 0 ? timestamp : -timestamp; + + if (absTimestamp > kEpochNanoseconds1971) { + return TimestampPrecision::Nanoseconds; + } else if (absTimestamp > kEpochMicroseconds1971) { + return TimestampPrecision::Microseconds; + } else if (absTimestamp > kEpochMilliseconds1971) { + return TimestampPrecision::Milliseconds; + } else { + return TimestampPrecision::Seconds; + } +} + +/// Converts a double value into a Velox timestamp. 
/// -/// @param timestamp the input timestamp as a float +/// @param timestamp the input timestamp as a double /// @return the corresponding Velox timestamp -auto convertToVeloxTimestamp(double timestamp) -> Timestamp; +auto inline convertToVeloxTimestamp(double timestamp) -> Timestamp { + switch (estimatePrecision(timestamp)) { + case TimestampPrecision::Nanoseconds: + timestamp /= Timestamp::kNanosInSecond; + break; + case TimestampPrecision::Microseconds: + timestamp /= Timestamp::kMicrosecondsInSecond; + break; + case TimestampPrecision::Milliseconds: + timestamp /= Timestamp::kMillisecondsInSecond; + break; + case TimestampPrecision::Seconds: + break; + } + double seconds{std::floor(timestamp)}; + double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; + return Timestamp( + static_cast(seconds), static_cast(nanoseconds)); +} /// Converts an integer value into a Velox timestamp. /// /// @param timestamp the input timestamp as an integer /// @return the corresponding Velox timestamp -auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp; +auto inline convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { + int64_t precisionDifference{Timestamp::kNanosInSecond}; + switch (estimatePrecision(timestamp)) { + case TimestampPrecision::Nanoseconds: + break; + case TimestampPrecision::Microseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; + break; + case TimestampPrecision::Milliseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; + break; + case TimestampPrecision::Seconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; + break; + } + int64_t seconds{timestamp / precisionDifference}; + int64_t nanoseconds{ + (timestamp % precisionDifference) * + (Timestamp::kNanosInSecond / precisionDifference)}; + if (nanoseconds < 0) { + seconds -= 1; + nanoseconds += Timestamp::kNanosInSecond; + } + return Timestamp(seconds, 
static_cast(nanoseconds)); +} /// A query execution interface that manages the lifecycle of a query on a CLP-S /// split (archive or IR), including parsing and validating the query, loading diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 5c2fbb72048a..4724ef792e46 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -204,7 +204,7 @@ VectorPtr ClpIrCursor::createVectorHelper( std::make_unique( irDeserializer_->get_ir_unit_handler().getFilteredLogEvents(), isResolved, - projectedColumnNodeIds, + std::move(projectedColumnNodeIds), projectedColumn.name, projectedColumnType), std::move(vector)); diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index 5c110aaaceb7..a76ec4b2e4cf 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -14,10 +14,8 @@ * limitations under the License. 
*/ -#include - -#include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" namespace facebook::velox::connector::clp::search_lib { @@ -35,9 +33,8 @@ void ClpIrVectorLoader::loadInternal( auto& logEvent = filteredLogEvents_->at(vectorIndex); // TODO: also need to support auto-generated keys auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); - std::unordered_map>::iterator - valueIt; - ::clp::ffi::SchemaTree::Node::id_t nodeId; + auto valueIt = userGenNodeIdValueMap.end(); + ::clp::ffi::SchemaTree::Node::id_t nodeId{}; for (size_t i{0}; i < nodeIds_.size(); ++i) { valueIt = userGenNodeIdValueMap.find(nodeIds_[i]); if (valueIt != userGenNodeIdValueMap.end()) { diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index a0dc279d65f4..9843c81f0f29 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -38,7 +38,7 @@ class ClpIrVectorLoader : public VectorLoader { ColumnType nodeType) : filteredLogEvents_(filteredLogEvents), isResolved_(isResolved), - nodeIds_(nodeIds), + nodeIds_(std::move(nodeIds)), nodeName_(nodeName), nodeType_(nodeType) {} diff --git a/velox/connectors/clp/tests/examples/test_4.ndjson b/velox/connectors/clp/tests/examples/test_4.ndjson index 287efb953b0a..950b7e7da58d 100644 --- a/velox/connectors/clp/tests/examples/test_4.ndjson +++ b/velox/connectors/clp/tests/examples/test_4.ndjson @@ -1,6 +1,3 @@ -{"timestamp": 1746003005.0} +{"timestamp": "2025-04-30T08:50:05.000Z"} {"timestamp": 1746003005000000} -{"timestamp": 1746003005000000000} -{"timestamp": 1766003005.0} {"timestamp": 1766003005000000} -{"timestamp": 1766003005000000000} diff --git a/velox/docs/develop/connectors.rst b/velox/docs/develop/connectors.rst index 181598181b9e..834af68657b6 100644 --- 
a/velox/docs/develop/connectors.rst +++ b/velox/docs/develop/connectors.rst @@ -135,12 +135,14 @@ ClpConnectorSplit if it is stored on a local file system, or the complete (or partial) URL of the split if it is stored on S3. In the latter case, when only a partial URL is provided, ``ClpS3AuthProviderBase`` provides a hook in ``ClpDataSource`` to assist in constructing the full URL. Refer to :ref:`ClpS3AuthProviderBase` for details. It also -includes a ``type`` property that specifies whether the split is an archive or an IR stream. +includes a ``type`` property that specifies whether the split is an archive or an IR (Intermediate Representation) +stream. BaseClpCursor ~~~~~~~~~~~~~ -``BaseClpCursor``` is responsible for preparing pushdown operations, loading splits, filtering data, and returning the -results. Each split type—archive and IR stream—has its own corresponding subclass. +``BaseClpCursor`` is responsible for preparing pushdown operations, loading splits, filtering data, and returning the +results. Each split type—archive and IR stream—has its own corresponding subclass. See also: ``ClpArchiveCursor``, +``ClpIrCursor``. 
ClpDataSource ~~~~~~~~~~~~~ From 947fded676bb565d155836bddbb7c3e7ee8278e2 Mon Sep 17 00:00:00 2001 From: anlowee Date: Tue, 16 Sep 2025 17:41:46 +0000 Subject: [PATCH 30/34] Address coderabbitai comments --- .../connectors/clp/search_lib/BaseClpCursor.h | 32 +++++++++---------- .../clp/search_lib/ir/CMakeLists.txt | 2 +- .../clp/search_lib/ir/ClpIrCursor.cpp | 7 ++-- .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 6 ---- .../clp/search_lib/ir/ClpIrVectorLoader.h | 4 +-- 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index d88389cd5d25..bf0e09ee615d 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -61,7 +61,7 @@ struct Field { std::string name; }; -enum class TimestampPrecision : uint8_t { +enum class InputTimestampPrecision : uint8_t { Seconds, Milliseconds, Microseconds, @@ -84,20 +84,20 @@ enum class TimestampPrecision : uint8_t { /// @param timestamp /// @return the estimated timestamp precision template -auto estimatePrecision(T timestamp) -> TimestampPrecision { +auto estimatePrecision(T timestamp) -> InputTimestampPrecision { constexpr int64_t kEpochMilliseconds1971{31536000000}; constexpr int64_t kEpochMicroseconds1971{31536000000000}; constexpr int64_t kEpochNanoseconds1971{31536000000000000}; auto absTimestamp = timestamp >= 0 ? 
timestamp : -timestamp; if (absTimestamp > kEpochNanoseconds1971) { - return TimestampPrecision::Nanoseconds; + return InputTimestampPrecision::Nanoseconds; } else if (absTimestamp > kEpochMicroseconds1971) { - return TimestampPrecision::Microseconds; + return InputTimestampPrecision::Microseconds; } else if (absTimestamp > kEpochMilliseconds1971) { - return TimestampPrecision::Milliseconds; + return InputTimestampPrecision::Milliseconds; } else { - return TimestampPrecision::Seconds; + return InputTimestampPrecision::Seconds; } } @@ -105,18 +105,18 @@ auto estimatePrecision(T timestamp) -> TimestampPrecision { /// /// @param timestamp the input timestamp as a double /// @return the corresponding Velox timestamp -auto inline convertToVeloxTimestamp(double timestamp) -> Timestamp { +inline auto convertToVeloxTimestamp(double timestamp) -> Timestamp { switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: + case InputTimestampPrecision::Nanoseconds: timestamp /= Timestamp::kNanosInSecond; break; - case TimestampPrecision::Microseconds: + case InputTimestampPrecision::Microseconds: timestamp /= Timestamp::kMicrosecondsInSecond; break; - case TimestampPrecision::Milliseconds: + case InputTimestampPrecision::Milliseconds: timestamp /= Timestamp::kMillisecondsInSecond; break; - case TimestampPrecision::Seconds: + case InputTimestampPrecision::Seconds: break; } double seconds{std::floor(timestamp)}; @@ -129,20 +129,20 @@ auto inline convertToVeloxTimestamp(double timestamp) -> Timestamp { /// /// @param timestamp the input timestamp as an integer /// @return the corresponding Velox timestamp -auto inline convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { +inline auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { int64_t precisionDifference{Timestamp::kNanosInSecond}; switch (estimatePrecision(timestamp)) { - case TimestampPrecision::Nanoseconds: + case InputTimestampPrecision::Nanoseconds: break; - case 
TimestampPrecision::Microseconds: + case InputTimestampPrecision::Microseconds: precisionDifference = Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; break; - case TimestampPrecision::Milliseconds: + case InputTimestampPrecision::Milliseconds: precisionDifference = Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; break; - case TimestampPrecision::Seconds: + case InputTimestampPrecision::Seconds: precisionDifference = Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; break; diff --git a/velox/connectors/clp/search_lib/ir/CMakeLists.txt b/velox/connectors/clp/search_lib/ir/CMakeLists.txt index 0dfae454c641..1dce25d1af67 100644 --- a/velox/connectors/clp/search_lib/ir/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/ir/CMakeLists.txt @@ -22,7 +22,7 @@ velox_add_library( velox_link_libraries( clp-s-ir-search - PUBLIC clp_s::archive_reader clp_s::archive_writer + PUBLIC clp_s::archive_reader PRIVATE # Once the IR-stream-related targets are exported, this should be updated to # use more fine-grained dependencies. 
diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 4724ef792e46..d076bb68a525 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -61,7 +61,7 @@ VectorPtr ClpIrCursor::createVector( VELOX_CHECK_EQ( projectedColumnIdxNodeIdsMap_.size(), outputColumns_.size(), - "Projected columns size {} does not match fields size {}", + "Resolved node-id map size ({}) must not exceed projected columns ({})", projectedColumnIdxNodeIdsMap_.size(), outputColumns_.size()); return createVectorHelper(pool, vectorType, vectorSize); @@ -191,8 +191,9 @@ VectorPtr ClpIrCursor::createVectorHelper( auto projectedColumn = outputColumns_[readerIndex_]; auto projectedColumnType = projectedColumn.type; auto it = projectedColumnIdxNodeIdsMap_.find(readerIndex_); - bool isResolved = it != projectedColumnIdxNodeIdsMap_.end(); - std::vector<::clp::ffi::SchemaTree::Node::id_t> projectedColumnNodeIds; + std::vector<::clp::ffi::SchemaTree::Node::id_t> projectedColumnNodeIds{}; + bool isResolved = + it != projectedColumnIdxNodeIdsMap_.end() && !it->second.empty(); if (isResolved) { projectedColumnNodeIds = it->second; } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index a76ec4b2e4cf..4595ac72b5d2 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -107,12 +107,6 @@ void ClpIrVectorLoader::loadInternal( timestampVector->set( vectorIndex, convertToVeloxTimestamp(value->get_immutable_view())); - } else if (value->is()) { - auto stringValue = value->get_immutable_view().data(); - uint64_t encodingId{}; - auto const timestamp = timestampDict_.ingest_entry( - nodeName_, nodeId, stringValue, encodingId); - timestampVector->set(vectorIndex, convertToVeloxTimestamp(timestamp)); } else { 
VELOX_FAIL("Unsupported timestamp type"); } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 9843c81f0f29..690e1c9bddec 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -18,9 +18,8 @@ #include -#include "clp_s/TimestampDictionaryWriter.hpp" -#include "connectors/clp/search_lib/BaseClpCursor.h" #include "ffi/ir_stream/Deserializer.hpp" +#include "velox/connectors/clp/search_lib/BaseClpCursor.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" @@ -51,7 +50,6 @@ class ClpIrVectorLoader : public VectorLoader { std::vector<::clp::ffi::SchemaTree::Node::id_t> nodeIds_; std::string nodeName_; ColumnType nodeType_; - clp_s::TimestampDictionaryWriter timestampDict_; void loadInternal( RowSet rows, From 03d251d74864b5aac50f2368dd00306fbfc38104 Mon Sep 17 00:00:00 2001 From: anlowee Date: Mon, 22 Sep 2025 15:18:54 +0000 Subject: [PATCH 31/34] Address comments --- .../connectors/clp/search_lib/BaseClpCursor.h | 98 -------------- .../connectors/clp/search_lib/CMakeLists.txt | 3 +- .../clp/search_lib/ClpTimestampsUtils.h | 120 ++++++++++++++++++ .../archive/ClpArchiveVectorLoader.cpp | 1 + .../clp/search_lib/ir/ClpIrCursor.cpp | 1 + .../clp/search_lib/ir/ClpIrVectorLoader.cpp | 8 +- .../clp/search_lib/ir/ClpIrVectorLoader.h | 2 +- 7 files changed, 130 insertions(+), 103 deletions(-) create mode 100644 velox/connectors/clp/search_lib/ClpTimestampsUtils.h diff --git a/velox/connectors/clp/search_lib/BaseClpCursor.h b/velox/connectors/clp/search_lib/BaseClpCursor.h index bf0e09ee615d..92940081f909 100644 --- a/velox/connectors/clp/search_lib/BaseClpCursor.h +++ b/velox/connectors/clp/search_lib/BaseClpCursor.h @@ -23,7 +23,6 @@ #include "clp_s/InputConfig.hpp" #include "velox/connectors/clp/ClpConnectorSplit.h" -#include "velox/type/Timestamp.h" namespace clp_s { class BaseColumnReader; @@ 
-61,103 +60,6 @@ struct Field { std::string name; }; -enum class InputTimestampPrecision : uint8_t { - Seconds, - Milliseconds, - Microseconds, - Nanoseconds -}; - -/// Estimates the precision of an epoch timestamp as seconds, milliseconds, -/// microseconds, or nanoseconds. -/// -/// This heuristic relies on the fact that 1 year of epoch nanoseconds is -/// approximately 1000 years of epoch microseconds and so on. This heuristic -/// can be unreliable for timestamps sufficiently close to the epoch, but -/// should otherwise be accurate for the next 1000 years. -/// -/// Note: Future versions of the clp-s archive format will adopt a -/// nanosecond-precision integer timestamp format (as opposed to the current -/// format which allows other precisions), at which point we can remove this -/// heuristic. -/// -/// @param timestamp -/// @return the estimated timestamp precision -template <typename T> -auto estimatePrecision(T timestamp) -> InputTimestampPrecision { - constexpr int64_t kEpochMilliseconds1971{31536000000}; - constexpr int64_t kEpochMicroseconds1971{31536000000000}; - constexpr int64_t kEpochNanoseconds1971{31536000000000000}; - auto absTimestamp = timestamp >= 0 ? timestamp : -timestamp; - - if (absTimestamp > kEpochNanoseconds1971) { - return InputTimestampPrecision::Nanoseconds; - } else if (absTimestamp > kEpochMicroseconds1971) { - return InputTimestampPrecision::Microseconds; - } else if (absTimestamp > kEpochMilliseconds1971) { - return InputTimestampPrecision::Milliseconds; - } else { - return InputTimestampPrecision::Seconds; - } -} - -/// Converts a double value into a Velox timestamp. 
-/// -/// @param timestamp the input timestamp as a double -/// @return the corresponding Velox timestamp -inline auto convertToVeloxTimestamp(double timestamp) -> Timestamp { - switch (estimatePrecision(timestamp)) { - case InputTimestampPrecision::Nanoseconds: - timestamp /= Timestamp::kNanosInSecond; - break; - case InputTimestampPrecision::Microseconds: - timestamp /= Timestamp::kMicrosecondsInSecond; - break; - case InputTimestampPrecision::Milliseconds: - timestamp /= Timestamp::kMillisecondsInSecond; - break; - case InputTimestampPrecision::Seconds: - break; - } - double seconds{std::floor(timestamp)}; - double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; - return Timestamp( - static_cast<int64_t>(seconds), static_cast<uint64_t>(nanoseconds)); -} - -/// Converts an integer value into a Velox timestamp. -/// -/// @param timestamp the input timestamp as an integer -/// @return the corresponding Velox timestamp -inline auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { - int64_t precisionDifference{Timestamp::kNanosInSecond}; - switch (estimatePrecision(timestamp)) { - case InputTimestampPrecision::Nanoseconds: - break; - case InputTimestampPrecision::Microseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; - break; - case InputTimestampPrecision::Milliseconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; - break; - case InputTimestampPrecision::Seconds: - precisionDifference = - Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; - break; - } - int64_t seconds{timestamp / precisionDifference}; - int64_t nanoseconds{ - (timestamp % precisionDifference) * - (Timestamp::kNanosInSecond / precisionDifference)}; - if (nanoseconds < 0) { - seconds -= 1; - nanoseconds += Timestamp::kNanosInSecond; - } - return Timestamp(seconds, static_cast<uint64_t>(nanoseconds)); -} - /// A query execution interface that manages the lifecycle of a query on a CLP-S /// split (archive 
or IR), including parsing and validating the query, loading /// the relevant splits, applying filters, and iterating over the results. It diff --git a/velox/connectors/clp/search_lib/CMakeLists.txt b/velox/connectors/clp/search_lib/CMakeLists.txt index bb9452b8cc25..69451d2fc79f 100644 --- a/velox/connectors/clp/search_lib/CMakeLists.txt +++ b/velox/connectors/clp/search_lib/CMakeLists.txt @@ -19,7 +19,8 @@ velox_add_library( ClpPackageS3AuthProvider.cpp ClpPackageS3AuthProvider.h ClpS3AuthProviderBase.cpp - ClpS3AuthProviderBase.h) + ClpS3AuthProviderBase.h + ClpTimestampsUtils.h) add_subdirectory(archive) add_subdirectory(ir) diff --git a/velox/connectors/clp/search_lib/ClpTimestampsUtils.h b/velox/connectors/clp/search_lib/ClpTimestampsUtils.h new file mode 100644 index 000000000000..db870fcbf64c --- /dev/null +++ b/velox/connectors/clp/search_lib/ClpTimestampsUtils.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/type/Timestamp.h" + +namespace facebook::velox::connector::clp::search_lib { + +enum class InputTimestampPrecision : uint8_t { + Seconds, + Milliseconds, + Microseconds, + Nanoseconds +}; + +/// Estimates the precision of an epoch timestamp as seconds, milliseconds, +/// microseconds, or nanoseconds. 
+/// +/// This heuristic relies on the fact that 1 year of epoch nanoseconds is +/// approximately 1000 years of epoch microseconds and so on. This heuristic +/// can be unreliable for timestamps sufficiently close to the epoch, but +/// should otherwise be accurate for the next 1000 years. +/// +/// Note: Future versions of the clp-s archive format will adopt a +/// nanosecond-precision integer timestamp format (as opposed to the current +/// format which allows other precisions), at which point we can remove this +/// heuristic. +/// +/// @param timestamp +/// @return the estimated timestamp precision +template <typename T> +auto estimatePrecision(T timestamp) -> InputTimestampPrecision { + constexpr int64_t kEpochMilliseconds1971{31536000000}; + constexpr int64_t kEpochMicroseconds1971{31536000000000}; + constexpr int64_t kEpochNanoseconds1971{31536000000000000}; + auto absTimestamp = timestamp >= 0 ? timestamp : -timestamp; + + if (absTimestamp > kEpochNanoseconds1971) { + return InputTimestampPrecision::Nanoseconds; + } else if (absTimestamp > kEpochMicroseconds1971) { + return InputTimestampPrecision::Microseconds; + } else if (absTimestamp > kEpochMilliseconds1971) { + return InputTimestampPrecision::Milliseconds; + } else { + return InputTimestampPrecision::Seconds; + } +} + +/// Converts a double value into a Velox timestamp. 
+/// +/// @param timestamp the input timestamp as a double +/// @return the corresponding Velox timestamp +inline auto convertToVeloxTimestamp(double timestamp) -> Timestamp { + switch (estimatePrecision(timestamp)) { + case InputTimestampPrecision::Nanoseconds: + timestamp /= Timestamp::kNanosInSecond; + break; + case InputTimestampPrecision::Microseconds: + timestamp /= Timestamp::kMicrosecondsInSecond; + break; + case InputTimestampPrecision::Milliseconds: + timestamp /= Timestamp::kMillisecondsInSecond; + break; + case InputTimestampPrecision::Seconds: + break; + } + double seconds{std::floor(timestamp)}; + double nanoseconds{(timestamp - seconds) * Timestamp::kNanosInSecond}; + return Timestamp( + static_cast<int64_t>(seconds), static_cast<uint64_t>(nanoseconds)); +} + +/// Converts an integer value into a Velox timestamp. +/// +/// @param timestamp the input timestamp as an integer +/// @return the corresponding Velox timestamp +inline auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { + int64_t precisionDifference{Timestamp::kNanosInSecond}; + switch (estimatePrecision(timestamp)) { + case InputTimestampPrecision::Nanoseconds: + break; + case InputTimestampPrecision::Microseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMicrosecond; + break; + case InputTimestampPrecision::Milliseconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosecondsInMillisecond; + break; + case InputTimestampPrecision::Seconds: + precisionDifference = + Timestamp::kNanosInSecond / Timestamp::kNanosInSecond; + break; + } + int64_t seconds{timestamp / precisionDifference}; + int64_t nanoseconds{ + (timestamp % precisionDifference) * + (Timestamp::kNanosInSecond / precisionDifference)}; + if (nanoseconds < 0) { + seconds -= 1; + nanoseconds += Timestamp::kNanosInSecond; + } + return Timestamp(seconds, static_cast<uint64_t>(nanoseconds)); +} + +} // namespace facebook::velox::connector::clp::search_lib diff --git 
a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp index 907fa95954dd..c8b362da6ac6 100644 --- a/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.cpp @@ -21,6 +21,7 @@ #include "clp_s/ColumnReader.hpp" #include "clp_s/SchemaTree.hpp" #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include "velox/connectors/clp/search_lib/ClpTimestampsUtils.h" #include "velox/connectors/clp/search_lib/archive/ClpArchiveVectorLoader.h" #include "velox/type/Timestamp.h" #include "velox/vector/ComplexVector.h" diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index d076bb68a525..f598566157ab 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -18,6 +18,7 @@ #include "clp_s/InputConfig.hpp" #include "ffi/ir_stream/search/QueryHandler.hpp" +#include "velox/connectors/clp/search_lib/ClpTimestampsUtils.h" #include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" #include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp index 4595ac72b5d2..09528a2ccfb0 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.cpp @@ -15,7 +15,9 @@ */ #include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" + #include "velox/connectors/clp/search_lib/BaseClpCursor.h" +#include "velox/connectors/clp/search_lib/ClpTimestampsUtils.h" namespace facebook::velox::connector::clp::search_lib { @@ -35,10 +37,10 @@ void ClpIrVectorLoader::loadInternal( auto userGenNodeIdValueMap = logEvent->get_user_gen_node_id_value_pairs(); auto valueIt = userGenNodeIdValueMap.end(); 
::clp::ffi::SchemaTree::Node::id_t nodeId{}; - for (size_t i{0}; i < nodeIds_.size(); ++i) { - valueIt = userGenNodeIdValueMap.find(nodeIds_[i]); + for (auto const candidateNodeId : nodeIds_) { + valueIt = userGenNodeIdValueMap.find(candidateNodeId); if (valueIt != userGenNodeIdValueMap.end()) { - nodeId = nodeIds_[i]; + nodeId = candidateNodeId; break; } } diff --git a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h index 690e1c9bddec..5e8d6ea4e663 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h +++ b/velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h @@ -46,7 +46,7 @@ class ClpIrVectorLoader : public VectorLoader { std::shared_ptr< const std::vector>> filteredLogEvents_; - bool isResolved_; + bool isResolved_{}; std::vector<::clp::ffi::SchemaTree::Node::id_t> nodeIds_; std::string nodeName_; ColumnType nodeType_; From da66e9997d80af0892b2ac82d02239f26a49019c Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 25 Sep 2025 18:15:56 +0000 Subject: [PATCH 32/34] Address comment --- velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index f598566157ab..4f68db024d0c 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -134,8 +134,6 @@ ClpIrCursor::splitFieldsToNamesAndTypes() const { search::ast::LiteralType::ClpStringT; break; case ColumnType::Timestamp: - // TODO: IR timestamp support pending; constrain to Unknown to avoid - // mismatched projections. 
literalType = search::ast::LiteralType::FloatT | search::ast::LiteralType::IntegerT; break; From ae977665c7f748431f9c95a8a9ac50263d926309 Mon Sep 17 00:00:00 2001 From: anlowee Date: Thu, 25 Sep 2025 20:16:45 +0000 Subject: [PATCH 33/34] Address comment --- velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp index 4f68db024d0c..1f51bf83cfda 100644 --- a/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp +++ b/velox/connectors/clp/search_lib/ir/ClpIrCursor.cpp @@ -18,7 +18,6 @@ #include "clp_s/InputConfig.hpp" #include "ffi/ir_stream/search/QueryHandler.hpp" -#include "velox/connectors/clp/search_lib/ClpTimestampsUtils.h" #include "velox/connectors/clp/search_lib/ir/ClpIrCursor.h" #include "velox/connectors/clp/search_lib/ir/ClpIrVectorLoader.h" From 4c3e4e865610adcc421b5105680a1a4497bb06e4 Mon Sep 17 00:00:00 2001 From: "Xiaochong(Eddy) Wei" <40865608+anlowee@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:05:51 -0400 Subject: [PATCH 34/34] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- velox/docs/develop/connectors.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/velox/docs/develop/connectors.rst b/velox/docs/develop/connectors.rst index b12f606fa8d8..22886134e3b2 100644 --- a/velox/docs/develop/connectors.rst +++ b/velox/docs/develop/connectors.rst @@ -134,15 +134,15 @@ ClpConnectorSplit ``ClpConnectorSplit`` describes a data chunk using ``path``. This path may be the absolute file path to the split file if it is stored on a local file system, or the complete (or partial) URL of the split if it is stored on S3. In the latter case, when only a partial URL is provided, ``ClpS3AuthProviderBase`` provides a hook in ``ClpDataSource`` to -assist in constructing the full URL. 
Refer to :ref:`ClpS3AuthProviderBase` for details. It also -includes a ``type`` property that specifies whether the split is an archive or an IR (Intermediate Representation) -stream. +assist in constructing the full URL. Refer to :ref:`ClpS3AuthProviderBase` for details. +``ClpConnectorSplit`` also includes a ``type`` property that specifies whether the split is an archive or an IR +(Internal Representation) stream. BaseClpCursor ~~~~~~~~~~~~~ -``BaseClpCursor`` is responsible for preparing pushdown operations, loading splits, filtering data, and returning the -results. Each split type—archive and IR stream—has its own corresponding subclass. See also: ``ClpArchiveCursor``, -``ClpIrCursor``. +``BaseClpCursor`` is responsible for preparing pushdown operations, loading splits, filtering data, and returning +results. Each split type (archive or IR stream) has its own corresponding subclass (``ClpArchiveCursor`` and +``ClpIrCursor``). ClpDataSource ~~~~~~~~~~~~~ @@ -154,8 +154,8 @@ each output column, accessing its handle to get its type and original name. For traverses the nested structure to process each field; for non-row types, it directly maps the Velox column type to a CLP column type. -When a split is added, a ``BaseClpCursor`` instance is created with the split path and input source, which -may be either a ``ClpArchiveCursor`` or a ``ClpIrCursor``. The query is parsed and simplified into an AST. +When a split is added, a ``BaseClpCursor`` instance is created with the split path and input source (which +may be either a ``ClpArchiveCursor`` or a ``ClpIrCursor``). The query is parsed and simplified into an AST. On ``next``, the cursor finds matching row indices and, if any exist, ``ClpDataSource`` recursively creates a row vector composed of lazy vectors, which use CLP column readers to decode and load data as needed during execution.