-
Notifications
You must be signed in to change notification settings - Fork 85
Feat/disk based sorting #3514
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: stable-23.10
Are you sure you want to change the base?
Feat/disk based sorting #3514
Changes from all commits
9caaa79
25d3387
5af5e1f
1c4328a
6a31569
cb48d02
a0cf507
9c3b826
8258716
7e32958
3176eff
d9187e6
929289e
bf4c17f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,72 @@ | ||||||||||
| /* Copyright (C) 2025 MariaDB Corp. | ||||||||||
|
|
||||||||||
| This program is free software; you can redistribute it and/or | ||||||||||
| modify it under the terms of the GNU General Public License | ||||||||||
| as published by the Free Software Foundation; version 2 of | ||||||||||
| the License. | ||||||||||
|
|
||||||||||
| This program is distributed in the hope that it will be useful, | ||||||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||||||||
| GNU General Public License for more details. | ||||||||||
|
|
||||||||||
| You should have received a copy of the GNU General Public License | ||||||||||
| along with this program; if not, write to the Free Software | ||||||||||
| Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | ||||||||||
| MA 02110-1301, USA. */ | ||||||||||
|
|
||||||||||
| #include <vector> | ||||||||||
|
|
||||||||||
| #include "dumper.h" | ||||||||||
| #include "disk-based-topnorderby.h" | ||||||||||
| namespace joblist | ||||||||||
| { | ||||||||||
|
|
||||||||||
| // The caller ensures lifetime of dl and rg | ||||||||||
| void DiskBasedTopNOrderBy::flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup rg, const size_t numberOfRGs, const bool firstFlush) | ||||||||||
| { | ||||||||||
| size_t rgid = (firstFlush) ? numberOfRGs : 0; | ||||||||||
| rowgroup::RGData rgData; | ||||||||||
|
|
||||||||||
| size_t generation = (firstFlush) ? getGenerationCounter() : 0; // WIP | ||||||||||
|
|
||||||||||
| bool more = dl.next(0, &rgData); | ||||||||||
| while (more) | ||||||||||
| { | ||||||||||
| saveRG(rgid, generation, rg, &rgData); | ||||||||||
| more = dl.next(0, &rgData); | ||||||||||
| rgid = (firstFlush) ? rgid - 1 : rgid + 1; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| if (firstFlush) | ||||||||||
| { | ||||||||||
| incrementGenerationCounter(); | ||||||||||
| } | ||||||||||
| else | ||||||||||
| { | ||||||||||
|
|
||||||||||
| } | ||||||||||
| } | ||||||||||
| void DiskBasedTopNOrderBy::diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& /*dataLists*/) | ||||||||||
| { | ||||||||||
| } | ||||||||||
|
|
||||||||||
| std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t batchSize) | ||||||||||
| { | ||||||||||
| // assert(getGenerationFilesNumber() > batchSize); | ||||||||||
| auto totalNumberOfFilesYetToMerge = getGenerationFilesNumber() - batchSize; | ||||||||||
| auto batchSizeOrFilesLeftNumber = std::max(getGenerationFilesNumber(), batchSize); | ||||||||||
| auto actualBatchSize = std::min(totalNumberOfFilesYetToMerge, batchSizeOrFilesLeftNumber); | ||||||||||
| // add state for the starting offset + wraparound | ||||||||||
| size_t startOffset = 0; | ||||||||||
| std::vector<std::string> res; | ||||||||||
| res.reserve(actualBatchSize); | ||||||||||
| for (size_t i = 0; i < startOffset + actualBatchSize; ++i) | ||||||||||
| { | ||||||||||
| res.push_back(makeRGFilePrefix(i)); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| return res; | ||||||||||
|
|
||||||||||
|
|
||||||||||
| } // namespace joblist | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This closing brace appears to close the
Suggested change
|
||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,77 @@ | ||||||||||||||
| /* Copyright (C) 2025 MariaDB Corp. | ||||||||||||||
|
|
||||||||||||||
| This program is free software; you can redistribute it and/or | ||||||||||||||
| modify it under the terms of the GNU General Public License | ||||||||||||||
| as published by the Free Software Foundation; version 2 of | ||||||||||||||
| the License. | ||||||||||||||
|
|
||||||||||||||
| This program is distributed in the hope that it will be useful, | ||||||||||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||||||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||||||||||||
| GNU General Public License for more details. | ||||||||||||||
|
|
||||||||||||||
| You should have received a copy of the GNU General Public License | ||||||||||||||
| along with this program; if not, write to the Free Software | ||||||||||||||
| Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | ||||||||||||||
| MA 02110-1301, USA. */ | ||||||||||||||
|
|
||||||||||||||
| #pragma once | ||||||||||||||
|
|
||||||||||||||
| #include <cstdint> | ||||||||||||||
| #include <queue> | ||||||||||||||
| #include <string> | ||||||||||||||
| #include <vector> | ||||||||||||||
|
|
||||||||||||||
| #include "dumper.h" | ||||||||||||||
| #include "elementtype.h" | ||||||||||||||
| #include "resourcemanager.h" | ||||||||||||||
| namespace joblist | ||||||||||||||
| { | ||||||||||||||
|
|
||||||||||||||
| class DiskBasedTopNOrderBy : public rowgroup::RGDumper | ||||||||||||||
| { | ||||||||||||||
| // std::string fTmpDir = | ||||||||||||||
| // config::Config::makeConfig()->getTempFileDir(config::Config::TempDirPurpose::Aggregates); | ||||||||||||||
| // std::string fCompStr = config::Config::makeConfig()->getConfig("RowAggregation", "Compression"); | ||||||||||||||
| public: | ||||||||||||||
| // TODO Parametrize compression, tmpdir and memory manager (can be temp) | ||||||||||||||
| DiskBasedTopNOrderBy(ResourceManager* /*rm*/) | ||||||||||||||
| : RGDumper(compress::getCompressInterfaceByName("LZ4"), std::make_unique<rowgroup::MemManager>(), | ||||||||||||||
| config::Config::makeConfig()->getTempFileDir(config::Config::TempDirPurpose::Sorting), | ||||||||||||||
| "Sorting", reinterpret_cast<std::uintptr_t>(this)) | ||||||||||||||
| { | ||||||||||||||
| } | ||||||||||||||
| ~DiskBasedTopNOrderBy() = default; | ||||||||||||||
|
|
||||||||||||||
| void incrementGenerationCounter() | ||||||||||||||
| { | ||||||||||||||
| ++fGenerationCounter; | ||||||||||||||
| uint64_t newGeneration = (fGenerations.empty()) ? 1 : fGenerations.back() + 1; | ||||||||||||||
| fGenerations.push(newGeneration); | ||||||||||||||
| } | ||||||||||||||
| uint64_t getGenerationCounter() const | ||||||||||||||
| { | ||||||||||||||
| return (fGenerations.empty()) ? 0 : fGenerations.back(); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| bool isDiskBased() const | ||||||||||||||
| { | ||||||||||||||
| return fGenerationCounter > 0; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| size_t getGenerationFilesNumber() const | ||||||||||||||
| { | ||||||||||||||
| return 0; | ||||||||||||||
| } | ||||||||||||||
| std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize); | ||||||||||||||
|
|
||||||||||||||
| // The caller ensures lifetime of dl and rg | ||||||||||||||
| void flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup rg, const size_t numberOfRGs, const bool firstFlush); | ||||||||||||||
| void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists); | ||||||||||||||
|
|
||||||||||||||
| // private: | ||||||||||||||
| uint64_t fGenerationCounter{0}; | ||||||||||||||
| std::queue<uint64_t> fGenerations; | ||||||||||||||
|
Comment on lines
+72
to
+74
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These member variables are currently public, but the commented-out
Suggested change
|
||||||||||||||
| }; | ||||||||||||||
|
|
||||||||||||||
| } // namespace joblist | ||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This logic for calculating
actualBatchSizeis confusing and contains a bug.totalNumberOfFilesYetToMergewill underflow ifgetGenerationFilesNumber()is less thanbatchSize. Given thatgetGenerationFilesNumber()is a stub returning 0, this is a definite issue. Please revise this logic for correctness and clarity. A safer way to calculatetotalNumberOfFilesYetToMergewould besize_t totalNumberOfFilesYetToMerge = (num_files > batchSize) ? num_files - batchSize : 0;. The overall logic might need simplification depending on the intended behavior.