Skip to content

Commit 69e3ad2

Browse files
add the parameter -m to set the minimum genome length to filter out short genome
1 parent 73d2bee commit 69e3ad2

File tree

4 files changed

+19
-13
lines changed

4 files changed

+19
-13
lines changed

src/SketchInfo.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,15 @@ int producer_fasta_task(std::string file, rabbit::fa::FastaDataPool* fastaPool,
112112
return 0;
113113
}
114114

115-
void consumer_fasta_seqSize(rabbit::fa::FastaDataPool* fastaPool, FaChunkQueue &dq, uint64_t* maxSize, uint64_t* minSize, uint64_t* totalSize, uint64_t* number, uint64_t* badNumber){
115+
void consumer_fasta_seqSize(rabbit::fa::FastaDataPool* fastaPool, FaChunkQueue &dq, uint64_t minLen, uint64_t* maxSize, uint64_t* minSize, uint64_t* totalSize, uint64_t* number, uint64_t* badNumber){
116116
rabbit::int64 id = 0;
117117
rabbit::fa::FastaChunk *faChunk;
118118
while(dq.Pop(id, faChunk)){
119119
std::vector<Reference> data;
120120
int ref_num = rabbit::fa::chunkListFormat(*faChunk, data);
121121
for(Reference &r: data){
122122
uint64_t length = (uint64_t)r.length;
123-
if(length < 10000){
123+
if(length < minLen){
124124
*badNumber = *badNumber + 1;
125125
continue;
126126
}
@@ -209,7 +209,7 @@ void consumer_fasta_task(rabbit::fa::FastaDataPool* fastaPool, FaChunkQueue &dq,
209209
}
210210
#endif
211211

212-
void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize){
212+
void calSize(bool sketchByFile, string inputFile, int threads, uint64_t minLen, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize){
213213
maxSize = 0;
214214
minSize = 1 << 31;
215215
averageSize = 0;
@@ -238,7 +238,7 @@ void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize
238238
stat(line.c_str(), &statbuf);
239239
curSize = statbuf.st_size;
240240
}
241-
if(curSize < 10000){
241+
if(curSize < minLen){
242242
badNumber++;
243243
continue;
244244
}
@@ -270,7 +270,7 @@ void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize
270270
std::thread producer(producer_fasta_task, inputFile, fastaPool, std::ref(queue1));
271271
std::thread **threadArr = new std::thread* [th];
272272
for(int t = 0; t < th; t++){
273-
threadArr[t] = new std::thread(std::bind(consumer_fasta_seqSize, fastaPool, std::ref(queue1), &maxSizeArr[t], &minSizeArr[t], &totalSizeArr[t], &numArr[t], &badNumArr[t]));
273+
threadArr[t] = new std::thread(std::bind(consumer_fasta_seqSize, fastaPool, std::ref(queue1), minLen, &maxSizeArr[t], &minSizeArr[t], &totalSizeArr[t], &numArr[t], &badNumArr[t]));
274274
}
275275
producer.join();
276276

@@ -294,7 +294,7 @@ void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize
294294
while(1){
295295
int length = kseq_read(ks1);
296296
if(length < 0) break;
297-
if(length < 10000){
297+
if(length < minLen){
298298
badNumber++;
299299
continue;
300300
}
@@ -313,7 +313,7 @@ void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize
313313

314314
if((double)badNumber / totalNumber >= 0.2)
315315
{
316-
fprintf(stderr, "Warning: there are %d poor quality (length < 10000) genome assemblies in the total %d genome assemblied.\n", badNumber, totalNumber);
316+
fprintf(stderr, "Warning: there are %d poor quality (length < %ld) genome assemblies in the total %d genome assemblied.\n", badNumber, minLen, totalNumber);
317317
}
318318
cerr << "\t===the totalSize is: " << totalSize << endl;
319319
cerr << "\t===the maxSize is: " << maxSize << endl;
@@ -520,7 +520,7 @@ bool sketchSequences(string inputFile, int kmerSize, int sketchSize, string sket
520520
return true;
521521
}
522522

523-
bool sketchFiles(string inputFile, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads){
523+
bool sketchFiles(string inputFile, uint64_t minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads){
524524
fprintf(stderr, "input fileList, sketch by file\n");
525525
fstream fs(inputFile);
526526
if(!fs){
@@ -691,7 +691,7 @@ bool sketchFiles(string inputFile, int kmerSize, int sketchSize, string sketchFu
691691
tmpSketchInfo.fileName = fileList[i];
692692
tmpSketchInfo.totalSeqLength = totalLength;
693693
tmpSketchInfo.fileSeqs = curFileSeqs;
694-
if(totalLength >= 10000)//filter the poor quality genome assemblies whose length less than 10k bp(fastANI paper)
694+
if(totalLength >= minLen)//filter the poor quality genome assemblies whose length less than minLen(fastANI paper)
695695
sketches.push_back(tmpSketchInfo);
696696
if(i % 10000 == 0) cerr << "finished sketching: " << i << " genomes" << endl;
697697
}

src/SketchInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@ struct SketchInfo{
3636

3737

3838

39-
void calSize(bool sketchByFile, string inputFile, int threads, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize);
39+
void calSize(bool sketchByFile, string inputFile, int threads, uint64_t minLen, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize);
4040
bool sketchSequences(string inputFile, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads);
41-
bool sketchFiles(string inputFile, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads);
41+
bool sketchFiles(string inputFile, uint64_t minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads);
4242
bool cmpSketch(SketchInfo s1, SketchInfo s2);
4343

4444

src/main.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ int main(int argc, char * argv[]){
6363
string mstSketchFile = "sketch.info";
6464
bool isSave = true;
6565
bool isSetKmer = false;
66+
uint64_t minLen = 0;
6667
while(argIndex < argc){
6768
switch(argv[argIndex][1]){
6869
case 't':
@@ -84,6 +85,10 @@ int main(int argc, char * argv[]){
8485
case 'e':
8586
isSave = false;
8687
break;
88+
case 'm':
89+
minLen = stoi(argv[++argIndex]);
90+
fprintf(stderr, "set the filter minimum length: %ld\n", minLen);
91+
break;
8792
case 'k':
8893
isSetKmer = true;
8994
kmerSize = stoi(argv[++argIndex]);
@@ -237,7 +242,7 @@ int main(int argc, char * argv[]){
237242
}//end useFile
238243

239244
uint64_t maxSize, minSize, averageSize;
240-
calSize(sketchByFile, inputFile, threads, maxSize, minSize, averageSize);
245+
calSize(sketchByFile, inputFile, threads, minLen, maxSize, minSize, averageSize);
241246

242247
if(isContainment && isJaccard){
243248
cerr << "conflict distance measurement of Mash distance (fixed-sketch-size) and AAF distance (variable-sketch-size) " << endl;
@@ -334,7 +339,7 @@ int main(int argc, char * argv[]){
334339

335340
}//end sketch by sequence
336341
else{
337-
if(!sketchFiles(inputFile, kmerSize, sketchSize, sketchFunc, isContainment, containCompress, sketches, threads)){
342+
if(!sketchFiles(inputFile, minLen, kmerSize, sketchSize, sketchFunc, isContainment, containCompress, sketches, threads)){
338343
printUsage();
339344
return 1;
340345
}

src/parameter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ inline void printUsage(){
4747
fprintf(stdout, "usage: clust-greedy [-h] [-l] [-t] <int> [-d] <double> [-F] <string> [-i] <string> [-o] <string> \n");
4848
fprintf(stdout, "usage: clust-greedy [-h] [-f] [-d] <double> [-i] <string> <string> [-o] <string>\n");
4949
fprintf(stdout, " -h\t\t: this help message\n");
50+
fprintf(stdout, " -m <int>\t: set the filter minimum genome length (minLen), genome with total length less the minLen will be ignore, for both clust-mst and clust-greedy\n");
5051
fprintf(stdout, " -k <int>\t: set kmer size, automatically calculate the kmer size without -k option, for both clust-mst and clust-greedy\n");
5152
fprintf(stdout, " -s <int>\t: set sketch size, default 1000, for both clust-mst and clust-greedy\n");
5253
fprintf(stdout, " -c <int>\t: set sampling ratio to compute viriable sketchSize, sketchSize = genomeSize/samplingRatio, only support with MinHash sketch function of clust-greedy\n");

0 commit comments

Comments
 (0)