1414#include " alignment/sequence_mapper_notifier.hpp"
1515#include " alignment/sequence_mapper.hpp"
1616
17+ #include < algorithm>
18+ #include < cmath>
1719#include < random>
1820#include < string>
1921#include < unordered_set>
@@ -199,31 +201,29 @@ class FrameBarcodeIndexBuilder {
199201 template <class ReadType >
200202 void ConstructBarcodeIndex (io::ReadStreamList<ReadType> read_streams,
201203 FrameBarcodeIndex<Graph> &barcode_index,
202- const io::SequencingLibraryBase &lib,
203204 bool is_tellseq);
204205
205- void DownsampleBarcodeIndex (FrameBarcodeIndex<Graph> &downsampled_index, FrameBarcodeIndex<Graph> &original_index, double sampling_factor) {
206+ void DownsampleBarcodeIndex (FrameBarcodeIndex<Graph> &downsampled_index,
207+ FrameBarcodeIndex<Graph> &original_index,
208+ double sampling_factor,
209+ int seed) {
206210 std::unordered_set<BarcodeId> barcodes;
207- std::unordered_set<BarcodeId> passed_barcodes;
208- BarcodeId min_barcode = std::numeric_limits<BarcodeId>::max ();
209- BarcodeId max_barcode = std::numeric_limits<BarcodeId>::min ();
210211 for (auto it = original_index.begin (); it != original_index.end (); ++it) {
211212 const auto &barcode_distribution = it->second .GetDistribution ();
212213 for (const auto &entry: barcode_distribution) {
213- BarcodeId current_barcode = entry.first ;
214- barcodes.insert (current_barcode);
215- min_barcode = std::min (min_barcode, current_barcode);
216- max_barcode = std::max (max_barcode, current_barcode);
214+ barcodes.insert (entry.first );
217215 }
218216 }
219217 INFO (" Number of encountered barcodes: " << barcodes.size ());
220- INFO (" Barcode id range: " << min_barcode << " , " << max_barcode);
221- double barcode_thr = static_cast <double >(max_barcode - min_barcode) * sampling_factor;
222- for (const auto &barcode: barcodes) {
223- if (math::le (static_cast <double >(barcode - min_barcode), barcode_thr)) {
224- passed_barcodes.insert (barcode);
225- }
226- }
218+ size_t target = static_cast <size_t >(std::round (static_cast <double >(barcodes.size ()) * sampling_factor));
219+ std::unordered_set<BarcodeId> passed_barcodes;
220+ passed_barcodes.reserve (target);
221+ std::mt19937 rng (seed);
222+ std::sample (barcodes.begin (),
223+ barcodes.end (),
224+ std::inserter (passed_barcodes, passed_barcodes.end ()),
225+ target,
226+ rng);
227227 INFO (" Passed barcodes: " << passed_barcodes.size ());
228228
229229 downsampled_index.InitialFillMap ();
@@ -248,7 +248,6 @@ class FrameBarcodeIndexBuilder {
248248template <class ReadType >
249249void FrameBarcodeIndexBuilder::ConstructBarcodeIndex (io::ReadStreamList<ReadType> read_streams,
250250 FrameBarcodeIndex<Graph> &barcode_index,
251- const io::SequencingLibraryBase &lib,
252251 bool is_tellseq) {
253252 {
254253 size_t starting_barcode = 0 ;
0 commit comments