Skip to content

Commit 3927cb3

Browse files
authored
snapshots seg compressor: pattern covering search (#1794)
1 parent 0cdd97b commit 3927cb3

File tree

3 files changed

+464
-0
lines changed

3 files changed

+464
-0
lines changed
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/*
2+
Copyright 2024 The Silkworm Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
#include "pattern_covering.hpp"
18+
19+
#include <cstdint>
20+
#include <limits>
21+
22+
#include <boost/circular_buffer.hpp>
23+
24+
namespace silkworm::snapshots::seg {
25+
26+
//! One entry of the dynamic programming table built by the covering search.
//! Each cell describes the best covering found for the part of the word
//! that begins at `optim_start`.
struct DynamicCell {
    //! Word position from which this cell's partial solution applies.
    size_t optim_start = 0;
    //! Start of the first pattern used by the partial solution
    //! (the search seeds it with the word size when no pattern is used yet).
    size_t cover_start = 0;
    //! Accumulated gain: covered bytes minus a fixed charge per included pattern.
    int compression = 0;
    //! Sum of the scores of the patterns included so far.
    uint64_t score = 0;
    //! Index of the head entry in the flat linked list of chosen matches; 0 means "none".
    size_t pattern_index = 0;
};
34+
35+
//! Double-ended buffer of DynamicCell entries; the search pushes to both ends and truncates the back.
using Ring = boost::circular_buffer_space_optimized<DynamicCell>;
36+
//! Shorthand for the result type nested in the public PatternCoveringSearch class.
using Result = PatternCoveringSearch::Result;
37+
38+
class PatternCoveringSearchImpl {
39+
public:
40+
PatternCoveringSearchImpl(
41+
const PatriciaTree& patterns_tree,
42+
absl::FunctionRef<uint64_t(void*)> pattern_score_getter)
43+
: match_finder_(patterns_tree),
44+
pattern_score_getter_(pattern_score_getter),
45+
cell_ring_(std::numeric_limits<size_t>::max()) {}
46+
47+
const Result& cover_word(ByteView word);
48+
49+
private:
50+
PatriciaTreeMatchFinder match_finder_;
51+
absl::FunctionRef<uint64_t(void*)> pattern_score_getter_;
52+
Ring cell_ring_;
53+
std::vector<size_t> pattern_indexes_;
54+
Result result_;
55+
};
56+
57+
//! Empties both result vectors so the object can be refilled by the next search.
void Result::clear() {
    uncovered_ranges.clear();
    pattern_positions.clear();
}
61+
62+
/**
 * Covers a word with patterns using dynamic programming over its longest matches.
 *
 * The matches are visited from the last one to the first. For every match the search
 * decides whether including it in front of one of the already computed partial solutions
 * beats skipping it: a higher compression wins, and the total pattern score breaks ties.
 * The chosen matches are then reported in the order they are linked, together with
 * the ranges of the word that no chosen pattern covers.
 *
 * NOTE(review): the arithmetic assumes that matches are ordered by their start position - confirm
 * against PatriciaTreeMatchFinder.
 *
 * \param word the word to cover
 * \return a reference to the internal result; it is overwritten by the next call
 */
const Result& PatternCoveringSearchImpl::cover_word(ByteView word) {
    result_.clear();

    auto& matches = match_finder_.find_longest_matches(word);
    if (matches.empty()) {
        // Nothing matched: the entire word is one uncovered range.
        result_.uncovered_ranges.emplace_back(0, word.size());
        return result_;
    }

    cell_ring_.clear();

    // A singly linked list of chosen matches stored flat, two slots per entry:
    //  * slot [k] holds a match index;
    //  * slot [k + 1] holds the position of the next entry, or zero at the tail.
    // Entry zero is a sentinel - [0, 0] - so that a zero link can mean "end of list".
    auto& chain = pattern_indexes_;
    chain.assign(2, size_t{0});

    // Seed one empty cell per position of the last match: nothing is covered yet,
    // which is expressed by a cover_start equal to the word size.
    const auto& tail_match = matches.back();
    for (size_t pos = tail_match.start; pos < tail_match.end; ++pos) {
        DynamicCell seed{};
        seed.optim_start = pos + 1;
        seed.cover_start = word.size();
        cell_ring_.push_back(seed);
    }

    // Walk the matches backwards, starting from the last one.
    for (size_t remaining = matches.size(); remaining > 0; --remaining) {
        const size_t match_index = remaining - 1;
        const auto& match = matches[match_index];
        const uint64_t pattern_score = pattern_score_getter_(match.value);

        // Baseline: skip this match and inherit the current front cell as is.
        DynamicCell best_cell = cell_ring_[0];
        int best_compression = best_cell.compression;
        uint64_t best_score = best_cell.score;
        bool include_match = false;

        for (size_t e = 0; e < cell_ring_.size(); ++e) {
            const auto& candidate = cell_ring_[e];

            // Bytes gained by placing this match in front of the candidate: the full match length,
            // or only the part before the candidate's first pattern when the two overlap.
            const int gain = (candidate.cover_start >= match.end)
                                 ? static_cast<int>(match.end - match.start)
                                 : static_cast<int>(candidate.cover_start - match.start);
            // Every included pattern is charged a fixed cost of 4.
            // NOTE(review): presumably an estimate of the encoding overhead of one pattern - confirm.
            const int comp = candidate.compression - 4 + gain;
            const uint64_t score = candidate.score + pattern_score;

            const bool improves =
                (comp > best_compression) ||
                ((comp == best_compression) && (score > best_score));
            if (improves) {
                best_compression = comp;
                best_score = score;
                include_match = true;
                best_cell = candidate;
                continue;
            }

            if (candidate.optim_start > match.end) {
                // Drop this cell and everything behind it, then stop scanning.
                cell_ring_.resize(e);
                break;
            }
        }

        DynamicCell head{};
        head.optim_start = match.start;
        head.compression = best_compression;
        head.score = best_score;
        if (include_match) {
            // Prepend a new list entry that links to the list of the cell we extended.
            head.cover_start = match.start;
            head.pattern_index = chain.size();
            chain.push_back(match_index);
            chain.push_back(best_cell.pattern_index);
        } else {
            head.cover_start = best_cell.cover_start;
            head.pattern_index = best_cell.pattern_index;
        }
        cell_ring_.push_front(head);
    }

    // The front cell now holds the solution for the whole word: follow its list of matches
    // and record the gaps between them as uncovered ranges.
    auto& uncovered = result_.uncovered_ranges;
    size_t covered_until = 0;
    size_t link = cell_ring_[0].pattern_index;
    while (link != 0) {
        const auto& match = matches[chain[link]];

        if (match.start > covered_until) {
            uncovered.emplace_back(covered_until, match.start);
        }
        covered_until = match.end;

        result_.pattern_positions.emplace_back(match.start, match.value);
        link = chain[link + 1];
    }

    if (word.size() > covered_until) {
        uncovered.emplace_back(covered_until, word.size());
    }

    return result_;
}
165+
166+
PatternCoveringSearch::PatternCoveringSearch(
167+
const PatriciaTree& patterns_tree,
168+
absl::FunctionRef<uint64_t(void*)> pattern_score_getter)
169+
: p_impl_(std::make_unique<PatternCoveringSearchImpl>(patterns_tree, pattern_score_getter)) {}
170+
// The destructor is defined here, where PatternCoveringSearchImpl is a complete type:
// the header only forward-declares it, and destroying the std::unique_ptr member needs the full definition.
// The static_assert(true) statement has no effect at run time.
// NOTE(review): presumably it keeps the body non-empty on purpose (instead of `= default`) - confirm the reason.
PatternCoveringSearch::~PatternCoveringSearch() { static_assert(true); }
171+
172+
//! Delegates to the private implementation and hands back its result reference unchanged.
const Result& PatternCoveringSearch::cover_word(ByteView word) {
    PatternCoveringSearchImpl& impl = *p_impl_;
    return impl.cover_word(word);
}
175+
176+
} // namespace silkworm::snapshots::seg
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
Copyright 2024 The Silkworm Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <memory>
20+
#include <utility>
21+
#include <vector>
22+
23+
#include <absl/functional/function_ref.h>
24+
25+
#include <silkworm/core/common/bytes.hpp>
26+
27+
#include "patricia_tree.hpp"
28+
29+
namespace silkworm::snapshots::seg {
30+
31+
//! Private implementation of PatternCoveringSearch; only forward-declared in this header.
class PatternCoveringSearchImpl;
32+
33+
class PatternCoveringSearch {
34+
public:
35+
PatternCoveringSearch(
36+
const PatriciaTree& patterns_tree,
37+
absl::FunctionRef<uint64_t(void*)> pattern_score_getter);
38+
~PatternCoveringSearch();
39+
40+
struct Result {
41+
/**
42+
* Positions of patterns found in a word.
43+
* Patterns are represented by their corresponding values in the PatriciaTree.
44+
*/
45+
std::vector<std::pair<size_t, void*>> pattern_positions;
46+
47+
/**
48+
* Ranges in a word that were not covered by patterns.
49+
* Each range has a start and end index.
50+
*/
51+
std::vector<std::pair<size_t, size_t>> uncovered_ranges;
52+
53+
void clear();
54+
};
55+
56+
/**
57+
* Find an optimal covering of a given word with patterns.
58+
* Ideally we want a covering that has maximal score and no intersections.
59+
*/
60+
const Result& cover_word(ByteView word);
61+
62+
private:
63+
std::unique_ptr<PatternCoveringSearchImpl> p_impl_;
64+
};
65+
66+
} // namespace silkworm::snapshots::seg

0 commit comments

Comments
 (0)