1+ #include " filter_database.hh"
2+
3+ #include < fcntl.h>
4+ #include < sys/mman.h>
5+ #include < sys/stat.h>
6+ #include < unistd.h>
7+
8+ #include < cstring>
9+ #include < filesystem>
10+ #include < fstream>
11+ #include < iostream>
12+ #include < string>
13+ #include < thread>
14+
15+ #include " mmap_file.hh"
16+ #include " pdqsort.h"
17+
18+ namespace {
19+
20+ void copy_subset (const std::filesystem::path& source_path,
21+ const std::filesystem::path& destination_path,
22+ const std::vector<size_t >& offsets) {
23+ MmapFile source (source_path);
24+
25+ absl::Span<const uint64_t > byte_offsets = source.data <uint64_t >();
26+
27+ std::string_view data = source.bytes ();
28+
29+ std::ofstream destination (destination_path, std::ios_base::out |
30+ std::ios_base::binary |
31+ std::ios_base::trunc);
32+
33+ std::vector<const char *> result_pointers;
34+ result_pointers.reserve (offsets.size ());
35+
36+ std::vector<uint64_t > result_byte_offsets;
37+ result_byte_offsets.reserve (offsets.size () + 1 );
38+
39+ uint64_t current_offset = sizeof (uint64_t ) * (offsets.size () + 1 );
40+
41+ for (size_t offset : offsets) {
42+ uint64_t start = byte_offsets[offset];
43+ uint64_t end = byte_offsets[offset + 1 ];
44+
45+ result_byte_offsets.push_back (current_offset);
46+ result_pointers.push_back (data.data () + start);
47+
48+ current_offset += (end - start);
49+ }
50+
51+ result_byte_offsets.push_back (current_offset);
52+
53+ destination.write ((const char *)result_byte_offsets.data (),
54+ sizeof (uint64_t ) * result_byte_offsets.size ());
55+
56+ for (size_t i = 0 ; i < result_pointers.size (); i++) {
57+ uint64_t length = result_byte_offsets[i + 1 ] - result_byte_offsets[i];
58+ destination.write (result_pointers[i], length);
59+ }
60+ }
61+
62+ void filter_database_property (const std::filesystem::path& source_path,
63+ const std::filesystem::path& destination_path,
64+ const std::vector<size_t >& offsets,
65+ const std::string& property_name) {
66+ std::filesystem::path source_property_path = source_path / property_name;
67+ std::filesystem::path destination_property_path =
68+ destination_path / property_name;
69+
70+ std::filesystem::create_directory (destination_property_path);
71+
72+ std::filesystem::path source_dictionary_path =
73+ source_property_path / " dictionary" ;
74+
75+ if (std::filesystem::exists (source_dictionary_path)) {
76+ std::filesystem::copy (source_dictionary_path,
77+ destination_property_path / " dictionary" );
78+ }
79+
80+ std::filesystem::copy (source_property_path / " zdict" ,
81+ destination_property_path / " zdict" );
82+
83+ copy_subset (source_property_path / " data" ,
84+ destination_property_path / " data" , offsets);
85+ }
86+
87+ } // namespace
88+
89+ void filter_database (const char * source, const char * destination,
90+ const char * subject_ids_file, int num_threads) {
91+ std::filesystem::path source_path (source);
92+ std::filesystem::path destination_path (destination);
93+
94+ MmapFile subject_ids_data{std::string (subject_ids_file)};
95+
96+ absl::Span<const int64_t > unsorted_subject_ids =
97+ subject_ids_data.data <int64_t >();
98+
99+ std::vector<int64_t > sorted_subject_ids (std::begin (unsorted_subject_ids),
100+ std::end (unsorted_subject_ids));
101+
102+ pdqsort (std::begin (sorted_subject_ids), std::end (sorted_subject_ids));
103+
104+ absl::Span<const int64_t > subject_ids (sorted_subject_ids.data (),
105+ sorted_subject_ids.size ());
106+
107+ std::filesystem::create_directory (destination_path);
108+
109+ std::filesystem::copy (source_path / " metadata" ,
110+ destination_path / " metadata" );
111+
112+ {
113+ std::ofstream subject_ids_file (
114+ destination_path / " subject_id" ,
115+ std::ios_base::out | std::ios_base::binary | std::ios_base::trunc);
116+
117+ subject_ids_file.write ((const char *)subject_ids.data (),
118+ sizeof (int64_t ) * subject_ids.size ());
119+ }
120+
121+ std::vector<size_t > offsets;
122+ offsets.reserve (subject_ids.size ());
123+
124+ {
125+ MmapFile source_subject_ids_file (source_path / " subject_id" );
126+ absl::Span<const int64_t > source_subject_ids =
127+ source_subject_ids_file.data <int64_t >();
128+
129+ auto first =
130+ std::lower_bound (std::begin (source_subject_ids),
131+ std::end (source_subject_ids), subject_ids.front ());
132+ auto last =
133+ std::upper_bound (std::begin (source_subject_ids),
134+ std::end (source_subject_ids), subject_ids.back ());
135+
136+ for (int64_t subject_id : subject_ids) {
137+ auto iter = std::lower_bound (first, last, subject_id);
138+ if (*iter != subject_id) {
139+ throw std::runtime_error (
140+ std::string (" Could not find subject_id " ) +
141+ std::to_string (subject_id) + " in database " +
142+ std::to_string (*iter));
143+ }
144+
145+ offsets.push_back (iter - std::begin (source_subject_ids));
146+
147+ first = ++iter;
148+ }
149+ }
150+
151+ {
152+ MmapFile source_subject_lengths_file (source_path /
153+ " meds_reader.length" );
154+ absl::Span<const uint32_t > source_subject_lengths =
155+ source_subject_lengths_file.data <uint32_t >();
156+ std::vector<uint32_t > subject_lengths;
157+ subject_lengths.reserve (subject_ids.size ());
158+
159+ for (size_t offset : offsets) {
160+ subject_lengths.push_back (source_subject_lengths[offset]);
161+ }
162+
163+ std::ofstream subject_lengths_file (
164+ destination_path / " meds_reader.length" ,
165+ std::ios_base::out | std::ios_base::binary | std::ios_base::trunc);
166+
167+ subject_lengths_file.write ((const char *)subject_lengths.data (),
168+ sizeof (uint32_t ) * subject_lengths.size ());
169+ }
170+
171+ std::vector<std::string> properties;
172+
173+ {
174+ MmapFile property_file (source_path / " meds_reader.properties" );
175+
176+ const char * data = property_file.bytes ().data ();
177+ const char * end =
178+ property_file.bytes ().data () + property_file.bytes ().size ();
179+
180+ while (data != end) {
181+ size_t name_length = *(const size_t *)data;
182+ data += sizeof (size_t );
183+ properties.push_back (std::string (data, name_length));
184+ data += name_length;
185+ data += sizeof (int64_t );
186+
187+ std::cout << " Got property " << properties.back () << std::endl;
188+ }
189+ }
190+
191+ properties.push_back (" meds_reader.null_map" );
192+
193+ for (const auto & property : properties) {
194+ filter_database_property (source_path, destination_path, offsets,
195+ property);
196+ }
197+
198+ std::filesystem::copy (source_path / " meds_reader.properties" ,
199+ destination_path / " meds_reader.properties" );
200+
201+ std::filesystem::copy (source_path / " meds_reader.version" ,
202+ destination_path / " meds_reader.version" );
203+ }
0 commit comments