Skip to content

Commit 95e9029

Browse files
committed
Refactor fwd/rev counting
1 parent c9e1cfb commit 95e9029

6 files changed

Lines changed: 126 additions & 10 deletions

File tree

include/input.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ struct UserInputTeloscope : UserInput {
1616
unsigned short int canonicalSize = 6;
1717
std::vector<std::string> rawPatterns = {"TTAGGG", "CCCTAA"};
1818
std::vector<std::string> patterns = {"TTAGGG", "CCCTAA"};
19+
std::vector<std::pair<std::string, bool>> patternInfo; // (pattern, isForward)
1920

2021
uint32_t windowSize = 1000;
2122
uint32_t step = 500;

include/teloscope.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class Trie {
1414
struct TrieNode {
1515
std::array<int32_t, 4> children = {-1, -1, -1, -1}; // A=0, C=1, G=2, T=3
1616
bool isEndOfWord = false;
17+
bool isForward = false; // true if pattern is forward-oriented (closer to canonicalFwd)
1718
};
1819

1920
std::vector<TrieNode> nodes; // contiguous node pool
@@ -33,7 +34,7 @@ class Trie {
3334
public:
3435
Trie() { nodes.emplace_back(); } // root at index 0
3536

36-
void insertPattern(const std::string& pattern);
37+
void insertPattern(const std::string& pattern, bool isForward);
3738

3839
// Return root index (always 0)
3940
int32_t getRoot() const { return 0; }
@@ -49,6 +50,11 @@ class Trie {
4950
return nodes[nodeIdx].isEndOfWord;
5051
}
5152

53+
// Check if pattern ending at this node is forward-oriented
54+
bool isForward(int32_t nodeIdx) const {
55+
return nodes[nodeIdx].isForward;
56+
}
57+
5258
unsigned short int getLongestPatternSize() const {
5359
return longestPatternSize;
5460
}
@@ -199,8 +205,8 @@ class Teloscope {
199205
Teloscope(UserInputTeloscope userInput) : userInput(userInput),
200206
canonicalFwdView(this->userInput.canonicalFwd),
201207
canonicalRevView(this->userInput.canonicalRev) {
202-
for (const auto& pattern : this->userInput.patterns) {
203-
trie.insertPattern(pattern);
208+
for (const auto& [pattern, isForward] : this->userInput.patternInfo) {
209+
trie.insertPattern(pattern, isForward);
204210
}
205211
}
206212

include/tools.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,9 @@ std::vector<std::string> expandPatterns(
2323
const std::vector<std::string> &rawPatterns,
2424
uint8_t editDistance);
2525

26+
std::vector<std::pair<std::string, bool>> expandPatternsWithOrientation(
27+
const std::vector<std::string> &rawPatterns,
28+
uint8_t editDistance,
29+
const std::string &canonicalFwd);
30+
2631
#endif // TOOLS_H

src/main.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,10 +158,16 @@ int main(int argc, char **argv) {
158158
exit(EXIT_FAILURE);
159159
}
160160

161-
// Store canonical pattern and its reverse complement
161+
// Store canonical pattern with lexicographically smaller as Fwd
162162
userInput.canonicalSize = canonicalPattern.size();
163-
userInput.canonicalFwd = canonicalPattern;
164-
userInput.canonicalRev = revCom(canonicalPattern);
163+
std::string revComp = revCom(canonicalPattern);
164+
if (canonicalPattern <= revComp) {
165+
userInput.canonicalFwd = canonicalPattern;
166+
userInput.canonicalRev = revComp;
167+
} else {
168+
userInput.canonicalFwd = revComp;
169+
userInput.canonicalRev = canonicalPattern;
170+
}
165171
fprintf(stderr, "Setting canonical pattern: %s and its reverse complement: %s\n", userInput.canonicalFwd.c_str(), userInput.canonicalRev.c_str());
166172
}
167173
break;
@@ -434,9 +440,17 @@ int main(int argc, char **argv) {
434440
userInput.rawPatterns = {"TTAGGG", "CCCTAA"};
435441
}
436442

437-
// Expand IUPAC + edit-distance variants once and log the result
443+
// Expand IUPAC + edit-distance variants with orientation info
438444
lg.verbose("Input variables assigned");
439-
userInput.patterns = expandPatterns(userInput.rawPatterns, userInput.editDistance);
445+
userInput.patternInfo = expandPatternsWithOrientation(
446+
userInput.rawPatterns, userInput.editDistance, userInput.canonicalFwd);
447+
448+
// Also populate patterns vector for compatibility
449+
userInput.patterns.clear();
450+
userInput.patterns.reserve(userInput.patternInfo.size());
451+
for (const auto& [pattern, isForward] : userInput.patternInfo) {
452+
userInput.patterns.push_back(pattern);
453+
}
440454

441455
fprintf(stderr, "Scanning %zu telomeric variants (includes reverse complements).\n",
442456
userInput.patterns.size());

src/teloscope.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#include "input.h"
2727

2828
// Insert pattern into the flat node pool. Each char maps to index 0-3.
29-
void Trie::insertPattern(const std::string& pattern) {
29+
void Trie::insertPattern(const std::string& pattern, bool isForward) {
3030
int32_t current = 0; // start at root
3131
for (char ch : pattern) {
3232
int8_t idx = charToIndex(ch);
@@ -40,6 +40,7 @@ void Trie::insertPattern(const std::string& pattern) {
4040
current = nodes[current].children[idx];
4141
}
4242
nodes[current].isEndOfWord = true;
43+
nodes[current].isForward = isForward;
4344

4445
if (pattern.size() > longestPatternSize) {
4546
longestPatternSize = pattern.size();
@@ -485,7 +486,7 @@ void Teloscope::analyzeWindow(const std::string_view &window, uint32_t windowSta
485486
std::string_view pattern(window.data() + i, (j - i + 1));
486487
bool isTerminal = (windowStart + i <= terminalLimit ||
487488
windowStart + i >= segmentSize - terminalLimit); // Check i/j handles
488-
bool isForward = (pattern.size() >= 3 && pattern.compare(0, 3, "CCC") == 0);
489+
bool isForward = trie.isForward(current); // O(1) lookup from Trie node
489490
bool isCanonical = (pattern == canonicalFwdView ||
490491
pattern == canonicalRevView);
491492
float densityGain = (static_cast<float>(pattern.size()) / window.size()) * 100.0f;

src/tools.cpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,4 +146,93 @@ std::vector<std::string> expandPatterns(
146146
preparedPatterns.end());
147147

148148
return preparedPatterns;
149+
}
150+
151+
152+
std::vector<std::pair<std::string, bool>> expandPatternsWithOrientation(
153+
const std::vector<std::string> &rawPatterns,
154+
uint8_t editDistance,
155+
const std::string &canonicalFwd) {
156+
157+
// Helper: check if pattern is closer to canonicalFwd than canonicalRev
158+
auto isCloserToFwd = [&](const std::string &pattern) -> bool {
159+
std::string canonicalRev = revCom(canonicalFwd);
160+
size_t patLen = pattern.size();
161+
size_t canLen = canonicalFwd.size();
162+
163+
// Same length: direct Hamming comparison
164+
if (patLen == canLen) {
165+
uint8_t distFwd = 0, distRev = 0;
166+
for (size_t i = 0; i < patLen; ++i) {
167+
if (pattern[i] != canonicalFwd[i]) ++distFwd;
168+
if (pattern[i] != canonicalRev[i]) ++distRev;
169+
}
170+
// Tie-break: lexicographically smaller canonical wins (canonicalFwd is always lex-smaller)
171+
return distFwd <= distRev;
172+
}
173+
174+
// Different lengths: find best alignment for each
175+
const std::string &shorter = (patLen < canLen) ? pattern : canonicalFwd;
176+
const std::string &longer = (patLen < canLen) ? canonicalFwd : pattern;
177+
const std::string longerRev = (patLen < canLen) ? canonicalRev : revCom(pattern);
178+
size_t shortLen = shorter.size();
179+
size_t longLen = longer.size();
180+
181+
uint8_t minDistFwd = 255, minDistRev = 255;
182+
for (size_t offset = 0; offset <= longLen - shortLen; ++offset) {
183+
uint8_t dist = 0;
184+
for (size_t i = 0; i < shortLen; ++i) {
185+
if (shorter[i] != longer[offset + i]) ++dist;
186+
}
187+
minDistFwd = std::min(minDistFwd, dist);
188+
}
189+
for (size_t offset = 0; offset <= longLen - shortLen; ++offset) {
190+
uint8_t dist = 0;
191+
for (size_t i = 0; i < shortLen; ++i) {
192+
if (shorter[i] != longerRev[offset + i]) ++dist;
193+
}
194+
minDistRev = std::min(minDistRev, dist);
195+
}
196+
return minDistFwd <= minDistRev;
197+
};
198+
199+
std::vector<std::pair<std::string, bool>> result;
200+
201+
for (const auto &seed : rawPatterns) {
202+
if (seed.empty()) continue;
203+
204+
std::vector<std::string> combinations;
205+
std::string working = seed;
206+
getCombinations(seed, working, 0, combinations);
207+
208+
for (const auto &combo : combinations) {
209+
// Determine if this seed combo is forward-oriented
210+
bool seedIsForward = isCloserToFwd(combo);
211+
212+
std::vector<std::string> variants;
213+
variants.push_back(combo);
214+
215+
if (editDistance > 0) {
216+
auto edits = getEditVariants(combo, editDistance);
217+
variants.insert(variants.end(), edits.begin(), edits.end());
218+
}
219+
220+
for (const auto &variant : variants) {
221+
// Forward variant inherits seed orientation
222+
result.emplace_back(variant, seedIsForward);
223+
// RevCom is opposite orientation
224+
result.emplace_back(revCom(variant), !seedIsForward);
225+
}
226+
}
227+
}
228+
229+
// Sort and deduplicate, keeping first occurrence (orientation consistent)
230+
std::sort(result.begin(), result.end(),
231+
[](const auto &a, const auto &b) { return a.first < b.first; });
232+
result.erase(
233+
std::unique(result.begin(), result.end(),
234+
[](const auto &a, const auto &b) { return a.first == b.first; }),
235+
result.end());
236+
237+
return result;
149238
}

0 commit comments

Comments
 (0)