11#include " duplicate.h"
22#include " overlapanalysis.h"
33#include < memory.h>
4+ #include < math.h>
45
56Duplicate::Duplicate (Options* opt) {
67 mOptions = opt;
@@ -10,10 +11,8 @@ Duplicate::Duplicate(Options* opt) {
1011 memset (mDups , 0 , sizeof (uint64)*mKeyLenInBit );
1112 mCounts = new uint16[mKeyLenInBit ];
1213 memset (mCounts , 0 , sizeof (uint16)*mKeyLenInBit );
13- mLength = new uint16[mKeyLenInBit ];
14- memset (mLength , 0 , sizeof (uint16)*mKeyLenInBit );
15- mGC = new uint16[mKeyLenInBit ];
16- memset (mGC , 0 , sizeof (uint16)*mKeyLenInBit );
14+ mGC = new uint8[mKeyLenInBit ];
15+ memset (mGC , 0 , sizeof (uint8)*mKeyLenInBit );
1716}
1817
1918Duplicate::~Duplicate (){
@@ -48,11 +47,10 @@ uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid)
4847 return ret;
4948}
5049
51- void Duplicate::addRecord (uint32 key, uint64 kmer32, int tlen, int gc) {
50+ void Duplicate::addRecord (uint32 key, uint64 kmer32, uint8 gc) {
5251 if (mCounts [key] == 0 ) {
5352 mCounts [key] = 1 ;
5453 mDups [key] = kmer32;
55- mLength [key] = tlen;
5654 mGC [key] = gc;
5755 } else {
5856 if (mDups [key] == kmer32)
@@ -89,7 +87,9 @@ void Duplicate::statRead(Read* r) {
8987 }
9088 }
9189
92- addRecord (key, kmer32, r->length (), gc);
90+ gc = round (255.0 * (double ) gc / (double ) r->length ());
91+
92+ addRecord (key, kmer32, (uint8)gc);
9393}
9494
9595void Duplicate::statPair (Read* r1, Read* r2) {
@@ -112,33 +112,27 @@ void Duplicate::statPair(Read* r1, Read* r2) {
112112 int gc = 0 ;
113113
114114 // not calculated
115- int tlen = 0 ;
116115 if (mCounts [key] == 0 ) {
117- OverlapResult ov = OverlapAnalysis::analyze (r1, r2);
118- if (ov.overlap_len > 30 ) {
119- if (ov.offset < 0 )
120- tlen = ov.overlap_len ;
121- else
122- tlen = r1->length () + r2->length () - ov.overlap_len ;
123- }
124- for (int i=0 ; i<r1->length () && i<tlen; i++) {
116+ for (int i=0 ; i<r1->length (); i++) {
125117 if (data1[i] == ' G' || data1[i] == ' C' )
126118 gc++;
127119 }
128- for (int i=0 ; i<r2->length () && i<tlen-r1-> length () ; i++) {
120+ for (int i=0 ; i<r2->length (); i++) {
129121 if (data2[i] == ' G' || data2[i] == ' C' )
130122 gc++;
131123 }
132124 }
133125
134- addRecord (key, kmer32, tlen, gc);
126+ gc = round (255.0 * (double ) gc / (double )( r1->length () + r2->length ()));
127+
128+ addRecord (key, kmer32, gc);
135129}
136130
137- double Duplicate::statAll (vector<Duplicate*>& list, int * hist, double * meanTLEN, double * meanGC, int histSize) {
131+ double Duplicate::statAll (vector<Duplicate*>& list, int * hist, double * meanGC, int histSize) {
138132 long totalNum = 0 ;
139133 long dupNum = 0 ;
140- int * gcTlenNum = new int [histSize];
141- memset (gcTlenNum , 0 , sizeof (int )*histSize);
134+ int * gcStatNum = new int [histSize];
135+ memset (gcStatNum , 0 , sizeof (int )*histSize);
142136 for (int key=0 ; key<list[0 ]->mKeyLenInBit ; key++) {
143137 bool consistent = true ;
144138 for (int i=0 ; i<list.size ()-1 ; i++) {
@@ -148,15 +142,13 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,
148142 }
149143 if (consistent) {
150144 int count = 0 ;
151- int numSum = 0 ;
152- double tlenSum = 0 ;
153145 double gcSum = 0 ;
146+ int num = 0 ;
154147 for (int i=0 ; i<list.size (); i++) {
155148 count += list[i]->mCounts [key];
156- if (list[i]->mLength [key] > 0 ) {
157- numSum++;
158- tlenSum += list[i]->mLength [key];
159- gcSum += (double )list[i]->mGC [key] / (double )list[i]->mLength [key];
149+ if (list[i]->mGC [key]>0 ) {
150+ gcSum += (double )list[i]->mGC [key];
151+ num++;
160152 }
161153 }
162154
@@ -166,32 +158,30 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,
166158
167159 if (count >= histSize){
168160 hist[histSize-1 ]++;
169- if (numSum > 0 ) {
170- meanTLEN[histSize-1 ] += tlenSum/numSum;
171- meanGC[histSize-1 ] += gcSum/numSum;
172- gcTlenNum[histSize-1 ]++;
161+ if (num>0 ) {
162+ meanGC[histSize-1 ] += gcSum/num;
163+ gcStatNum[histSize-1 ]++;
173164 }
174165 }
175166 else {
176167 hist[count]++;
177- if (numSum > 0 ) {
178- meanTLEN[count] += tlenSum/numSum;
179- meanGC[count] += gcSum/numSum;
180- gcTlenNum[count]++;
168+ if (num>0 ) {
169+ meanGC[count] += gcSum/num;
170+ gcStatNum[count]++;
181171 }
172+
182173 }
183174 }
184175 }
185176 }
186177
187178 for (int i=0 ; i<histSize; i++) {
188- if (gcTlenNum[i] > 0 ) {
189- meanTLEN[i] = meanTLEN[i] / gcTlenNum[i];
190- meanGC[i] = meanGC[i] / gcTlenNum[i];
179+ if (gcStatNum[i] > 0 ) {
180+ meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i];
191181 }
192182 }
193183
194- delete[] gcTlenNum ;
184+ delete[] gcStatNum ;
195185
196186 if (totalNum == 0 )
197187 return 0.0 ;
0 commit comments