Skip to content

Commit d711b0c

Browse files
committed
Don't evaluate TLEN in duplication evaluation, to save RAM and CPU
1 parent 8995b3e commit d711b0c

File tree

8 files changed

+48
-73
lines changed

8 files changed

+48
-73
lines changed

src/duplicate.cpp

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "duplicate.h"
22
#include "overlapanalysis.h"
33
#include <memory.h>
4+
#include <math.h>
45

56
Duplicate::Duplicate(Options* opt) {
67
mOptions = opt;
@@ -10,10 +11,8 @@ Duplicate::Duplicate(Options* opt) {
1011
memset(mDups, 0, sizeof(uint64)*mKeyLenInBit);
1112
mCounts = new uint16[mKeyLenInBit];
1213
memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit);
13-
mLength = new uint16[mKeyLenInBit];
14-
memset(mLength, 0, sizeof(uint16)*mKeyLenInBit);
15-
mGC = new uint16[mKeyLenInBit];
16-
memset(mGC, 0, sizeof(uint16)*mKeyLenInBit);
14+
mGC = new uint8[mKeyLenInBit];
15+
memset(mGC, 0, sizeof(uint8)*mKeyLenInBit);
1716
}
1817

1918
Duplicate::~Duplicate(){
@@ -48,11 +47,10 @@ uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid)
4847
return ret;
4948
}
5049

51-
void Duplicate::addRecord(uint32 key, uint64 kmer32, int tlen, int gc) {
50+
void Duplicate::addRecord(uint32 key, uint64 kmer32, uint8 gc) {
5251
if(mCounts[key] == 0) {
5352
mCounts[key] = 1;
5453
mDups[key] = kmer32;
55-
mLength[key] = tlen;
5654
mGC[key] = gc;
5755
} else {
5856
if(mDups[key] == kmer32)
@@ -89,7 +87,9 @@ void Duplicate::statRead(Read* r) {
8987
}
9088
}
9189

92-
addRecord(key, kmer32, r->length(), gc);
90+
gc = round(255.0 * (double) gc / (double) r->length());
91+
92+
addRecord(key, kmer32, (uint8)gc);
9393
}
9494

9595
void Duplicate::statPair(Read* r1, Read* r2) {
@@ -112,33 +112,27 @@ void Duplicate::statPair(Read* r1, Read* r2) {
112112
int gc = 0;
113113

114114
// not calculated
115-
int tlen = 0;
116115
if(mCounts[key] == 0) {
117-
OverlapResult ov = OverlapAnalysis::analyze(r1, r2);
118-
if(ov.overlap_len > 30) {
119-
if(ov.offset < 0)
120-
tlen = ov.overlap_len;
121-
else
122-
tlen = r1->length() + r2->length() - ov.overlap_len;
123-
}
124-
for(int i=0; i<r1->length() && i<tlen; i++) {
116+
for(int i=0; i<r1->length(); i++) {
125117
if(data1[i] == 'G' || data1[i] == 'C')
126118
gc++;
127119
}
128-
for(int i=0; i<r2->length() && i<tlen-r1->length(); i++) {
120+
for(int i=0; i<r2->length(); i++) {
129121
if(data2[i] == 'G' || data2[i] == 'C')
130122
gc++;
131123
}
132124
}
133125

134-
addRecord(key, kmer32, tlen, gc);
126+
gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length()));
127+
128+
addRecord(key, kmer32, gc);
135129
}
136130

137-
double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN, double* meanGC, int histSize) {
131+
double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanGC, int histSize) {
138132
long totalNum = 0;
139133
long dupNum = 0;
140-
int* gcTlenNum = new int[histSize];
141-
memset(gcTlenNum, 0, sizeof(int)*histSize);
134+
int* gcStatNum = new int[histSize];
135+
memset(gcStatNum, 0, sizeof(int)*histSize);
142136
for(int key=0; key<list[0]->mKeyLenInBit; key++) {
143137
bool consistent = true;
144138
for(int i=0; i<list.size()-1; i++) {
@@ -148,15 +142,13 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,
148142
}
149143
if(consistent) {
150144
int count = 0;
151-
int numSum = 0;
152-
double tlenSum = 0;
153145
double gcSum = 0;
146+
int num = 0;
154147
for(int i=0; i<list.size(); i++) {
155148
count += list[i]->mCounts[key];
156-
if(list[i]->mLength[key] > 0) {
157-
numSum++;
158-
tlenSum += list[i]->mLength[key];
159-
gcSum += (double)list[i]->mGC[key] / (double)list[i]->mLength[key];
149+
if(list[i]->mGC[key]>0) {
150+
gcSum += (double)list[i]->mGC[key];
151+
num++;
160152
}
161153
}
162154

@@ -166,32 +158,30 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,
166158

167159
if(count >= histSize){
168160
hist[histSize-1]++;
169-
if(numSum > 0) {
170-
meanTLEN[histSize-1] += tlenSum/numSum;
171-
meanGC[histSize-1] += gcSum/numSum;
172-
gcTlenNum[histSize-1]++;
161+
if(num>0) {
162+
meanGC[histSize-1] += gcSum/num;
163+
gcStatNum[histSize-1]++;
173164
}
174165
}
175166
else{
176167
hist[count]++;
177-
if(numSum > 0) {
178-
meanTLEN[count] += tlenSum/numSum;
179-
meanGC[count] += gcSum/numSum;
180-
gcTlenNum[count]++;
168+
if(num>0) {
169+
meanGC[count] += gcSum/num;
170+
gcStatNum[count]++;
181171
}
172+
182173
}
183174
}
184175
}
185176
}
186177

187178
for(int i=0; i<histSize; i++) {
188-
if(gcTlenNum[i] > 0) {
189-
meanTLEN[i] = meanTLEN[i] / gcTlenNum[i];
190-
meanGC[i] = meanGC[i] / gcTlenNum[i];
179+
if(gcStatNum[i] > 0) {
180+
meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i];
191181
}
192182
}
193183

194-
delete[] gcTlenNum;
184+
delete[] gcStatNum;
195185

196186
if(totalNum == 0)
197187
return 0.0;

src/duplicate.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,18 @@ class Duplicate{
1818
void statRead(Read* r1);
1919
void statPair(Read* r1, Read* r2);
2020
uint64 seq2int(const char* data, int start, int keylen, bool& valid);
21-
void addRecord(uint32 key, uint64 kmer32, int tlen, int gc);
21+
void addRecord(uint32 key, uint64 kmer32, uint8 gc);
2222

2323
// make histogram and get duplication rate
24-
static double statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN, double* meanGC, int histSize);
24+
static double statAll(vector<Duplicate*>& list, int* hist, double* meanGC, int histSize);
2525

2626
private:
2727
Options* mOptions;
2828
int mKeyLenInBase;
2929
int mKeyLenInBit;
3030
uint64* mDups;
3131
uint16* mCounts;
32-
uint16* mLength;
33-
uint16* mGC;
32+
uint8* mGC;
3433

3534
};
3635

src/htmlreporter.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "htmlreporter.h"
22
#include <chrono>
3+
#include <memory.h>
34

45
extern string command;
56

@@ -12,9 +13,8 @@ HtmlReporter::HtmlReporter(Options* opt){
1213
HtmlReporter::~HtmlReporter(){
1314
}
1415

15-
void HtmlReporter::setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate) {
16+
void HtmlReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) {
1617
mDupHist = dupHist;
17-
mDupMeanTlen = dupMeanTlen;
1818
mDupMeanGC = dupMeanGC;
1919
mDupRate = dupRate;
2020
}
@@ -185,8 +185,11 @@ void HtmlReporter::reportDuplication(ofstream& ofs) {
185185
allCount += mDupHist[i+1];
186186
}
187187
double* percents = new double[total];
188-
for(int i=0; i<total; i++) {
189-
percents[i] = (double)mDupHist[i+1] * 100.0 / (double)allCount;
188+
memset(percents, 0, sizeof(double)*total);
189+
if(allCount > 0) {
190+
for(int i=0; i<total; i++) {
191+
percents[i] = (double)mDupHist[i+1] * 100.0 / (double)allCount;
192+
}
190193
}
191194
int maxGC = total;
192195
double* gc = new double[total];
@@ -196,7 +199,6 @@ void HtmlReporter::reportDuplication(ofstream& ofs) {
196199
if(percents[i] <= 0.05 && maxGC == total)
197200
maxGC = i;
198201
}
199-
double* tlen = mDupMeanTlen + 1;
200202

201203
json_str += "{";
202204
json_str += "x:[" + Stats::list2string(x, total) + "],";

src/htmlreporter.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class HtmlReporter{
1616
public:
1717
HtmlReporter(Options* opt);
1818
~HtmlReporter();
19-
void setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate);
19+
void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
2020
void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);
2121

2222
static void outputRow(ofstream& ofs, string key, long value);
@@ -35,7 +35,6 @@ class HtmlReporter{
3535
private:
3636
Options* mOptions;
3737
int* mDupHist;
38-
double* mDupMeanTlen;
3938
double* mDupMeanGC;
4039
double mDupRate;
4140
};

src/jsonreporter.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ JsonReporter::JsonReporter(Options* opt){
99
JsonReporter::~JsonReporter(){
1010
}
1111

12-
void JsonReporter::setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate) {
12+
void JsonReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) {
1313
mDupHist = dupHist;
14-
mDupMeanTlen = dupMeanTlen;
1514
mDupMeanGC = dupMeanGC;
1615
mDupRate = dupRate;
1716
}
@@ -104,13 +103,6 @@ void JsonReporter::report(FilterResult* result, Stats* preStats1, Stats* postSta
104103
ofs << ",";
105104
}
106105
ofs << "]," << endl;
107-
ofs << "\t\t\"mean_tlen\": [";
108-
for(int d=1; d<mOptions->duplicate.histSize; d++) {
109-
ofs << mDupMeanTlen[d];
110-
if(d!=mOptions->duplicate.histSize-1)
111-
ofs << ",";
112-
}
113-
ofs << "]," << endl;
114106
ofs << "\t\t\"mean_gc\": [";
115107
for(int d=1; d<mOptions->duplicate.histSize; d++) {
116108
ofs << mDupMeanGC[d];

src/jsonreporter.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,12 @@ class JsonReporter{
1717
JsonReporter(Options* opt);
1818
~JsonReporter();
1919

20-
void setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate);
20+
void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
2121
void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);
2222

2323
private:
2424
Options* mOptions;
2525
int* mDupHist;
26-
double* mDupMeanTlen;
2726
double* mDupMeanGC;
2827
double mDupRate;
2928
};

src/peprocessor.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,27 +154,25 @@ bool PairEndProcessor::process(){
154154
if(mOptions->duplicate.enabled) {
155155
dupHist = new int[mOptions->duplicate.histSize];
156156
memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize);
157-
dupMeanTlen = new double[mOptions->duplicate.histSize];
158-
memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize);
159157
dupMeanGC = new double[mOptions->duplicate.histSize];
160158
memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize);
161159
vector<Duplicate*> dupList;
162160
for(int t=0; t<mOptions->thread; t++){
163161
dupList.push_back(configs[t]->getDuplicate());
164162
}
165-
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize);
163+
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize);
166164
cout << endl;
167165
cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl;
168166
}
169167

170168
// make JSON report
171169
JsonReporter jr(mOptions);
172-
jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
170+
jr.setDupHist(dupHist, dupMeanGC, dupRate);
173171
jr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2);
174172

175173
// make HTML report
176174
HtmlReporter hr(mOptions);
177-
hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
175+
hr.setDupHist(dupHist, dupMeanGC, dupRate);
178176
hr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2);
179177

180178
// clean up
@@ -193,7 +191,6 @@ bool PairEndProcessor::process(){
193191

194192
if(mOptions->duplicate.enabled) {
195193
delete[] dupHist;
196-
delete[] dupMeanTlen;
197194
delete[] dupMeanGC;
198195
}
199196

src/seprocessor.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,27 +128,25 @@ bool SingleEndProcessor::process(){
128128
if(mOptions->duplicate.enabled) {
129129
dupHist = new int[mOptions->duplicate.histSize];
130130
memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize);
131-
dupMeanTlen = new double[mOptions->duplicate.histSize];
132-
memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize);
133131
dupMeanGC = new double[mOptions->duplicate.histSize];
134132
memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize);
135133
vector<Duplicate*> dupList;
136134
for(int t=0; t<mOptions->thread; t++){
137135
dupList.push_back(configs[t]->getDuplicate());
138136
}
139-
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize);
137+
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize);
140138
cout << endl;
141139
cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl;
142140
}
143141

144142
// make JSON report
145143
JsonReporter jr(mOptions);
146-
jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
144+
jr.setDupHist(dupHist, dupMeanGC, dupRate);
147145
jr.report(finalFilterResult, finalPreStats, finalPostStats);
148146

149147
// make HTML report
150148
HtmlReporter hr(mOptions);
151-
hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
149+
hr.setDupHist(dupHist, dupMeanGC, dupRate);
152150
hr.report(finalFilterResult, finalPreStats, finalPostStats);
153151

154152
// clean up
@@ -165,7 +163,6 @@ bool SingleEndProcessor::process(){
165163

166164
if(mOptions->duplicate.enabled) {
167165
delete[] dupHist;
168-
delete[] dupMeanTlen;
169166
delete[] dupMeanGC;
170167
}
171168

0 commit comments

Comments
 (0)