Skip to content

Commit 9c780e6

Browse files
authored
Initial working sample (#3)
1 parent 7aa1206 commit 9c780e6

File tree

9 files changed

+261
-78
lines changed

9 files changed

+261
-78
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,4 @@
3535
build/
3636
.cache/
3737
.vscode/
38+
samples/

CMakePresets.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"description": "Preset for ARM macOS",
2727
"generator": "Ninja",
2828
"cacheVariables": {
29+
"CMAKE_BUILD_TYPE": "Release",
2930
"CMAKE_SYSTEM_NAME": "Darwin",
3031
"VCPKG_TARGET_TRIPLET": "arm64-osx"
3132
}

include/sparser.h

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ constexpr size_t kMaxPred = 10;
2121
constexpr size_t kMaxConj = 10;
2222
constexpr size_t kTotalMaxRfs = kMaxRfsInPred * kMaxPred * kMaxConj;
2323

24+
/**
 * Groups the static helpers that turn an input source into data the rest of the library consumes.
 * The class declares no state; both members are static.
 */
class InputReader {
public:
    /**
     * @param filename Name of the file to read.
     * @return The file's data as a std::string.
     * NOTE(review): error handling is defined in the implementation, which is not visible from this header.
     */
    static std::string ReadFile(const std::string& filename);

    /**
     * @param input Text to split into records.
     * @return One std::string_view per record.
     * NOTE(review): the returned views presumably point into `input`, so `input` would have to outlive
     * them - confirm against the implementation before passing a temporary.
     */
    static std::vector<std::string_view> ReadRecords(const std::string& input);
};
29+
2430
struct EstimationResult {
2531
double total_parser_runtime;
2632
std::array<double, kTotalMaxRfs> total_rf_runtimes;
@@ -32,10 +38,6 @@ struct RawFilterData {
3238
std::array<std::array<size_t, kMaxPred>, kMaxConj> rf_count = {};
3339
std::array<size_t, kMaxConj> pred_count = {};
3440
size_t conj_count = 0;
35-
36-
static size_t GetFlatIdx(size_t conj_idx, size_t pred_idx, size_t rf_idx) {
37-
return conj_idx * kMaxPred * kMaxRfsInPred + pred_idx * kMaxRfsInPred + rf_idx;
38-
}
3941
};
4042

4143
class RawFilterQueryGenerator {
@@ -44,18 +46,6 @@ class RawFilterQueryGenerator {
4446
static std::vector<std::string_view> GenerateRawFiltersFromPredicate(const std::string_view& input);
4547
};
4648

47-
class Sparser {
48-
public:
49-
explicit Sparser(std::unique_ptr<JsonQueryDriver>&& json_query_driver = {})
50-
: json_query_driver_(std::move(json_query_driver)) {}
51-
52-
EstimationResult Calibrate(const std::vector<std::string_view>& input, const JsonQuery& json_query,
53-
const RawFilterData& rf_data);
54-
55-
private:
56-
std::unique_ptr<JsonQueryDriver> json_query_driver_;
57-
};
58-
5949
enum class NodeType { INTER, FAIL, PARSE };
6050

6151
struct Node {
@@ -76,6 +66,23 @@ struct Node {
7666
type(node_type) {}
7767
};
7868

69+
class Sparser {
70+
public:
71+
explicit Sparser(std::unique_ptr<JsonQueryDriver>&& json_query_driver = {})
72+
: json_query_driver_(std::move(json_query_driver)) {}
73+
74+
void Run(const std::string& input_path, const JsonQuery& json_query);
75+
76+
EstimationResult Calibrate(const std::vector<std::string_view>& input, const JsonQuery& json_query,
77+
const RawFilterData& rf_data);
78+
void SearchCascade(const std::vector<std::string_view>& input, const JsonQuery& json_query,
79+
const RawFilterData& rf_data, const std::shared_ptr<Node>);
80+
void SearchNaive(const std::vector<std::string_view>& input, const JsonQuery& json_query);
81+
82+
private:
83+
std::unique_ptr<JsonQueryDriver> json_query_driver_;
84+
};
85+
7986
class CascadeBuilder {
8087
public:
8188
CascadeBuilder(const PredicateDisjunction& disjunction, const RawFilterData& raw_filter_data)
@@ -88,8 +95,8 @@ class CascadeBuilder {
8895
std::shared_ptr<Node> parse_node = std::make_shared<Node>(0, 0, 0, nullptr, nullptr, NodeType::PARSE);
8996
const PredicateDisjunction& disjunction_;
9097
const RawFilterData& rf_data_;
91-
std::bitset<kMaxDepth> used_conjunctions_;
92-
std::array<std::array<std::bitset<10>, kMaxDepth>, kMaxDepth> used_predicates_; // TODO: Add correct dimensions
98+
std::bitset<kMaxConj> used_conjunctions_;
99+
std::array<std::array<std::bitset<kMaxRfsInPred>, kMaxPred>, kMaxConj> used_rfs_;
93100

94101
std::vector<std::shared_ptr<Node>> HandleFail(const size_t current_depth);
95102
std::vector<std::shared_ptr<Node>> HandleSuccess(const size_t current_depth, const size_t conjunction_idx);
@@ -112,4 +119,8 @@ class CascadeEvaluator {
112119
void EvaluateNodeRec(std::shared_ptr<Node> node, std::bitset<kSampleSize> cumulative_bitset);
113120
};
114121

122+
static inline size_t GetFlatIdx(size_t conj_idx, size_t pred_idx, size_t rf_idx) {
123+
return conj_idx * kMaxPred * kMaxRfsInPred + pred_idx * kMaxRfsInPred + rf_idx;
124+
}
125+
115126
#endif // SPARSER_H_

scripts/generate_sample_data.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import json
2+
import random
3+
from typing import List, Dict, Any
4+
5+
6+
def generate_json_records(
    num_records: int, schema: Dict[str, Any], key: str, value: Any, percentage: float
) -> List[str]:
    """
    Build a list of JSON-encoded records, one string per record.

    Each record has one entry per field of ``schema``. The field named ``key`` is set to
    ``value`` with probability ``percentage / 100``; every other field, and ``key`` itself
    when the draw fails, is filled by ``generate_value``.

    :param num_records: Number of JSON records to generate.
    :param schema: Mapping of field name to field type.
    :param key: Field that may receive the fixed value.
    :param value: Fixed value used for ``key``.
    :param percentage: Percentage (0-100) of records expected to carry ``value`` under ``key``.
    :return: A list of JSON strings, one per record.
    """

    def build_record() -> Dict[str, Any]:
        row = {}
        for name, kind in schema.items():
            # The random draw happens only for the targeted field (short-circuit).
            use_fixed = name == key and random.random() < percentage / 100
            row[name] = value if use_fixed else generate_value(kind)
        return row

    return [json.dumps(build_record()) for _ in range(num_records)]
29+
30+
31+
def generate_value(field_type: Any) -> Any:
    """
    Generate a random value based on the field type.

    Type objects are compared by identity (``is``), which is the idiomatic way to test for
    an exact type.

    :param field_type: Type of the field, can be str, int, float, list, or dict.
    :return: A randomly generated value:
        - str: one of "alpha", "beta", "gamma", "delta"
        - int: an integer in [0, 100]
        - float: a float in [0, 100] rounded to two decimals
        - list: 1 to 3 distinct items drawn from "a".."e"
        - dict: {"subfield1": "x"|"y"|"z", "subfield2": integer in [1, 10]}
        - anything else: None
    """
    if field_type is str:
        return random.choice(["alpha", "beta", "gamma", "delta"])
    if field_type is int:
        return random.randint(0, 100)
    if field_type is float:
        return round(random.uniform(0, 100), 2)
    if field_type is list:
        return random.sample(["a", "b", "c", "d", "e"], k=random.randint(1, 3))
    if field_type is dict:
        return {
            "subfield1": random.choice(["x", "y", "z"]),
            "subfield2": random.randint(1, 10),
        }
    # Unsupported field types yield no value.
    return None
53+
54+
55+
# Example usage
56+
if __name__ == "__main__":
    import os

    schema_definition = {
        "id": int,
        "name": str,
        "score": float,
        "tags": list,
        "details": dict,
    }

    json_records = generate_json_records(
        num_records=100000,
        schema=schema_definition,
        key="name",
        value="Trump",
        percentage=30,
    )

    output_path = "samples/generated_records.json"

    # The output directory may not exist yet (e.g. when it is not tracked by version control),
    # so create it instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Records are newline-delimited; no trailing newline is written after the last record.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write("\n".join(json_records))

    print(f"JSON records have been written to '{output_path}'")

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ add_subdirectory(sparser)
22

33
add_executable(SparserMain main.cpp)
44
target_link_libraries(SparserMain PRIVATE SparserCpp)
5+
target_link_libraries(SparserMain PRIVATE rapidjson)

src/main.cpp

Lines changed: 10 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,9 @@
1-
#include <exception>
2-
#include <fstream>
3-
#include <iostream>
41
#include <span>
5-
#include <stdexcept>
6-
#include <string>
7-
#include <string_view>
8-
9-
constexpr double GIGABYTE = 1e9;
10-
11-
/**
12-
* Reads the contents of a file into a dynamically allocated buffer.
13-
*
14-
* @param filename The name of the file to be read.
15-
* @return A string containing the contents of the file.
16-
*/
17-
std::string readFile(std::string filename) {
18-
std::ifstream file(filename, std::ios::binary | std::ios::ate);
19-
20-
if (!file) {
21-
throw std::runtime_error("Error opening file: " + std::string(filename));
22-
}
23-
24-
auto fileSize = file.tellg();
25-
file.seekg(0, std::ios::beg);
262

27-
std::string buffer(fileSize, '\0');
28-
29-
if (!file.read(buffer.data(), fileSize)) {
30-
throw std::runtime_error("Error reading file: " + std::string(filename));
31-
}
3+
#include "json_facade.h"
4+
#include "sparser.h"
325

33-
return buffer;
34-
}
6+
// constexpr double GIGABYTE = 1e9;
357

368
int main(int argc, char* argv[]) {
379
try {
@@ -44,13 +16,14 @@ int main(int argc, char* argv[]) {
4416

4517
const std::string filename = args[1];
4618

47-
std::cout << "Reading file: " << filename << "\n";
48-
auto buffer = readFile(filename);
49-
std::cout << "Done reading! File size: " << static_cast<double>(buffer.size()) / GIGABYTE << " GB" << "\n";
19+
Predicate pred1{.key = "name", .value = "Trump"};
5020

51-
if (buffer.empty()) {
52-
return 1;
53-
}
21+
PredicateConjunction conj1{{pred1}};
22+
PredicateDisjunction disj{{conj1}};
23+
24+
auto json_query_driver = new JsonQueryDriver(std::make_unique<RapidJsonFacade>());
25+
auto sparser = Sparser(std::unique_ptr<JsonQueryDriver>(json_query_driver));
26+
sparser.Run(filename, JsonQuery(disj));
5427

5528
} catch (const std::exception& e) {
5629
std::cerr << "Exception caught: " << e.what() << "\n";

src/sparser/json_facade.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream& os, const JsonQuery& query) {
3737
}
3838

3939
void RapidJsonFacade::Parse(std::string_view jsonStr) {
40-
rapidjson::ParseResult ok = doc_.Parse(jsonStr.data());
40+
rapidjson::ParseResult ok = doc_.Parse(jsonStr.data(), jsonStr.size());
4141
if (!ok || !doc_.IsObject()) {
4242
throw std::runtime_error("Failed to parse JSON string");
4343
}

0 commit comments

Comments
 (0)