diff --git a/.gitignore b/.gitignore index c208510..4471d0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +famsa FAMSA.VC.VC.opendb Debug x64 @@ -9,3 +10,6 @@ FAMSA.VC.db *.opendb /.vs /src/famsa.vcxproj.user +**/*.o +libs/mimalloc/*.o +.vscode/** diff --git a/src/core/defs.h b/src/core/defs.h index 6dc6b72..8883053 100644 --- a/src/core/defs.h +++ b/src/core/defs.h @@ -81,6 +81,7 @@ const symbol_t NO_AA_SYMBOLS = UNKNOWN_SYMBOL; // no. of symbols that can be #define MAX3(x, y, z) (max(x, max(y, z))) #define ABS(x) ((x) >= 0 ? (x) : -(x)) +enum class seq_t {AA, DNA, RNA}; inline void *my_align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space) { diff --git a/src/core/io_service.cpp b/src/core/io_service.cpp index 05b8e4d..4507c9e 100644 --- a/src/core/io_service.cpp +++ b/src/core/io_service.cpp @@ -25,7 +25,19 @@ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys using namespace std; // ******************************************************************* -bool IOService::saveAlignment(const std::string& file_name, vector& sequences, int no_threads, int gzip_level) +bool IOService::saveAlignment(const std::string& file_name, vector& sequences, const std::string& format, seq_t seq_type, int no_threads, int gzip_level) +{ + if (format == "fasta"){ + return saveFASTAFormat(file_name, sequences, no_threads, gzip_level); + } else if (format == "clustal"){ + return saveCLUSTALFormat(file_name, sequences, seq_type); + } else { + return saveFASTAFormat(file_name, sequences, no_threads, gzip_level); + } +} + +// ******************************************************************* +bool IOService::saveFASTAFormat(const std::string& file_name, vector& sequences, int no_threads, int gzip_level) { string s; string id, seq; @@ -180,3 +192,254 @@ bool IOService::saveAlignment(const std::string& file_name, vector& sequences, seq_t seq_type) +{ + int idx; /* counter for sequences */ + int len; /* tmp variable for name lengths */ + int namelen; /* maximum name length used */ + int pos; /* position counter */ + char *buf; /* buffer for writing seq */ + int cpl = 60; //msa->alenalen+10 : (iWrap > 0 ? iWrap : 60); /* char per line (< 64) */ + + /* consensus line stuff */ + int subpos; + char first; + int bail; + int strong_bins[9]; + int weak_bins[11]; + /*int cons;*/ + int bin; + int nseq = sequences.size(); // msa->nseq + bool bResno = false; + FILE *fp = openFile(file_name, "w"); + int *piResCnt = NULL; + + if (1 == bResno) { + if (NULL == (piResCnt = (int *)malloc(nseq * sizeof(int)))) { + printf("%s:%s:%d: could not malloc %d int for residue count", + __FUNCTION__, __FILE__, __LINE__, nseq); + return false; + } else { + memset(piResCnt, 0, nseq * sizeof(int)); + } + } /* do print residue numbers */ + + if (NULL == (buf = (char *)malloc(cpl+20))) { + printf("%s:%s:%d: could not malloc %d char for buffer", + __FUNCTION__, __FILE__, __LINE__, cpl+20); + return false; + } else { + memset(buf, 0, cpl+20); + } + + /* calculate max namelen used */ + namelen = 0; + for (idx = 0; idx < nseq; idx++) + { + /*if ((len = strlen(msa->sqname[idx])) > namelen) */ /* strlen() gives problems for unicode, FS, -> 290 */ + if ((len = utf8len(sequences[idx]->id.c_str())) > namelen) + namelen = len; + } + + fprintf(fp, "CLUSTAL multiple sequence alignment format. Alignment performed with FAMSA.\n"); + + /***************************************************** + * Write the sequences + *****************************************************/ + + fprintf(fp, "\n"); /* original had two blank lines */ + + map alignments; + int alen = 0; // length of global alignment + + string firstseq; + for (idx = 0; idx < nseq; idx++) + { + auto p = sequences[idx]; + string seq = p->Decode(); + string id = p->id; + if (idx == 0){ + alen = seq.size(); + firstseq = seq; + } + alignments[id] = seq; + + // cout << id << " -- " << alignments[id] << endl; + } + + for (pos = 0; pos < alen; pos += cpl) + { + fprintf(fp, "\n"); /* Blank line between sequence blocks */ + for (idx = 0; idx < nseq; idx++) + { + auto p = sequences[idx]; + string id = p->id; + string sqname = id.c_str();// msa->sqname[idx] + if (sqname[0] == '>') { + sqname.erase(0, 1); + } + const char *aseq = alignments[id].c_str(); // msa->aseq + const char *csqname = sqname.c_str(); + strncpy(buf, aseq + pos, cpl); + + buf[cpl] = '\0'; + if (1 == bResno) { + char *pc = NULL; + for (pc = buf; *pc != '\0'; pc++){ + if ( ( (*pc >= 'a') && (*pc <= 'z') ) || ( (*pc >= 'A') && (*pc <= 'Z') ) ) { + piResCnt[idx]++; + } + } + /* printf("%*s") gives problems for unicode, FS, -> 290 */ + /*fprintf(fp, "%-*s\t%s\t%d\n", namelen+5, msa->sqname[idx], buf, piResCnt[idx]);*/ + fprintf(fp, "%s%*s %s\t%d\n", csqname, (int)(namelen+5-utf8len(csqname)), "", buf, piResCnt[idx]); + } else { + /* printf("%*s") gives problems for unicode, FS, -> 290 */ + /*fprintf(fp, "%-*s\t%s\n", namelen+5, msa->sqname[idx], buf);*/ + fprintf(fp, "%s%*s %s\n", csqname, (int)(namelen+5-utf8len(csqname)), "", buf); + } + } + /* do consensus dots */ + + /* print namelen+5 spaces */ + for(subpos = 0; subpos <= namelen+5; subpos++) + fprintf(fp, " "); + + const char *firstaseq = firstseq.c_str(); // msa->aseq + for(subpos = pos; subpos < min(pos + cpl, alen); subpos++) + { + /* see if 100% conservation */ + first = firstaseq[subpos]; + bail = 0; + for (idx = 1; idx < nseq; idx++) + { + auto p = sequences[idx]; + string id = p->id; + const char *aseq = alignments[id].c_str(); // msa->aseq + if(toupper(aseq[subpos]) != toupper(first)) { /* toupper makes consensus case-insensitive, FS, r290 -> */ + bail = 1; + break; + } + } + if(!bail) + fprintf(fp, "*"); + else { + /* if not then check strong */ + for(bin = 0; bin < 9; bin++) + strong_bins[bin] = 0; /* clear the bins */ + + for (idx = 0; (seq_type == seq_t::AA) && (idx < nseq); idx++) + { /* do this only for amino acids, no strong/weak consensus for nucleotide, FS, r290 -> */ + auto p = sequences[idx]; + string id = p->id; + const char *aseq = alignments[id].c_str(); // msa->aseq + + switch(toupper(aseq[subpos])) /* toupper makes consensus case-insensitive, FS, r290 -> */ + { + case 'S': strong_bins[0]++; break; + case 'T': strong_bins[0]++; break; + case 'A': strong_bins[0]++; break; + case 'N': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; break; + case 'E': strong_bins[1]++; strong_bins[3]++; break; + case 'Q': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; strong_bins[4]++; break; + case 'K': strong_bins[1]++; strong_bins[2]++; strong_bins[4]++; break; + case 'D': strong_bins[3]++; break; + case 'R': strong_bins[4]++; break; + case 'H': strong_bins[2]++; strong_bins[4]++; strong_bins[7]++; break; /* added bin-2 (NHQK), FS 2016-07-14 */ + case 'M': strong_bins[5]++; strong_bins[6]++; break; + case 'I': strong_bins[5]++; strong_bins[6]++; break; + case 'L': strong_bins[5]++; strong_bins[6]++; break; + case 'V': strong_bins[5]++; break; + case 'F': strong_bins[6]++; strong_bins[8]++; break; + case 'Y': strong_bins[7]++; strong_bins[8]++; break; + case 'W': strong_bins[8]++; break; + } + } + bail = 0; + for(bin = 0; bin < 9; bin++) + if(strong_bins[bin] == nseq) { + bail = 1; + break; + } + + if(bail) + fprintf(fp, ":"); + else { + /* check weak */ + for(bin = 0; bin < 11; bin++) + weak_bins[bin] = 0; /* clear the bins */ + + for(idx = 0; (seq_type == seq_t::AA) && (idx < nseq); idx++) + { /* do this only for amino acids, no strong/weak consensus for nucleotide, FS, r290 -> */ + auto p = sequences[idx]; + string id = p->id; + const char *aseq = alignments[id].c_str(); // msa->aseq + + switch(toupper(aseq[subpos])) /* toupper makes consensus case-insensitive, FS, r290 -> */ + { + case 'C': weak_bins[0]++; break; + case 'S': weak_bins[0]++; weak_bins[2]++; weak_bins[3]++; weak_bins[4]++; weak_bins[5]++; weak_bins[6]++; break; + case 'A': weak_bins[0]++; weak_bins[1]++; weak_bins[2]++; weak_bins[4]++; break; + case 'T': weak_bins[1]++; weak_bins[3]++; weak_bins[4]++; break; + case 'V': weak_bins[1]++; weak_bins[9]++; break; + case 'G': weak_bins[2]++; weak_bins[5]++; break; /* Added bin-5, FS 2016-07-14 */ + case 'N': weak_bins[3]++; weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; + case 'K': weak_bins[3]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; + case 'D': weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; break; + case 'E': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; + case 'Q': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; + case 'H': weak_bins[7]++; weak_bins[8]++; weak_bins[10]++; break; + case 'R': weak_bins[8]++; break; + case 'F': weak_bins[9]++; weak_bins[10]++; break; + case 'L': weak_bins[9]++; break; + case 'I': weak_bins[9]++; break; + case 'M': weak_bins[9]++; break; + case 'Y': weak_bins[10]++; break; + } + } + bail = 0; + for(bin = 0; bin < 11; bin++) + if(weak_bins[bin] == nseq) { + bail = 1; + break; + } + if(bail) + fprintf(fp, "."); + else + fprintf(fp, " "); + } + } + } + fprintf(fp,"\n"); + } // end position for loop + + free(piResCnt); piResCnt = NULL; + return true; +} diff --git a/src/core/io_service.h b/src/core/io_service.h index 48b6a43..faa12b1 100644 --- a/src/core/io_service.h +++ b/src/core/io_service.h @@ -18,7 +18,12 @@ class IOService { public: template static size_t loadFasta(const std::string& file_name, std::vector& sequences, memory_monotonic_safe* mma = nullptr); - static bool saveAlignment(const std::string& file_name, vector & sequences, int no_threads, int gzip_level); + static bool saveAlignment(const std::string& file_name, vector & sequences, const std::string& format, seq_t seq_type, int no_threads, int gzip_level); + +private: + static bool saveFASTAFormat(const std::string& file_name, vector & sequences, int no_threads, int gzip_level); + static bool saveCLUSTALFormat(const std::string& file_name, vector & sequences, seq_t seq_type); + }; diff --git a/src/core/params.cpp b/src/core/params.cpp index 94f3437..886a732 100644 --- a/src/core/params.cpp +++ b/src/core/params.cpp @@ -90,7 +90,9 @@ void CParams::show_usage(bool expert) << " -gz - enable gzipped output (default: " << bool2str[gzippd_output] << ")\n" << " -gz-lev - gzip compression level [0-9] (default: " << gzip_level << ")\n" - << " -refine_mode - refinement mode (default: auto - the refinement is enabled for sets <= " << thr_refinement << " seq.)\n\n"; + << " -refine_mode - refinement mode (default: auto - the refinement is enabled for sets <= " << thr_refinement << " seq.)\n" + << " -output_format - output format type (default: fasta)\n" + << " -seq_type - input sequence type type, only used for clustal output format (default: aa)\n\n"; if (expert) { @@ -186,6 +188,28 @@ bool CParams::parse(int argc, char** argv, bool& showExpert) findOption(params, "-cluster_fraction", cluster_fraction); findOption(params, "-cluster_iters", cluster_iters); + string def_val = output_format; + findOption(params, "-output_format", output_format); + if (output_format != "fasta" && output_format != "clustal") + { + LOG_NORMAL << "Incorrect output format: " << output_format << ". This was changed to default value: " << def_val << endl; + output_format = def_val; + } + + string input_seq_type = "aa"; // default is amino acid. + def_val = input_seq_type; + findOption(params, "-seq_type", input_seq_type); + if (input_seq_type == "aa") + seq_type = seq_t::AA; + else if (input_seq_type == "dna") + seq_type = seq_t::DNA; + else if (input_seq_type == "rna") + seq_type = seq_t::RNA; + else { + LOG_NORMAL << "Incorrect seq_type: " << input_seq_type << ". This was changed to default value: " << def_val << endl; + seq_type = seq_t::AA; + } + export_tree = findSwitch(params, "-gt_export"); export_distances = findSwitch(params, "-dist_export"); generate_square_matrix = findSwitch(params, "-square_matrix"); diff --git a/src/core/params.h b/src/core/params.h index e889bbe..a10361f 100644 --- a/src/core/params.h +++ b/src/core/params.h @@ -114,6 +114,8 @@ class CParams string input_file_name; string input_file_name_2; string output_file_name; + string output_format = "fasta"; + seq_t seq_type = seq_t::AA; vector> score_matrix; vector score_vector; diff --git a/src/famsa.cpp b/src/famsa.cpp index 706d135..d60c6bb 100644 --- a/src/famsa.cpp +++ b/src/famsa.cpp @@ -48,7 +48,7 @@ int main(int argc, char *argv[]) CParams params; if (!params.parse(argc, argv, showExpert)) { - // some parameters could be parsed - used default values for printing + // some parameters could be parsed - used clustal values for printing CParams def_params; def_params.show_usage(showExpert); return 0; @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) profile_aligner.GetAlignment(result); - IOService::saveAlignment(params.output_file_name, result, params.n_threads, + IOService::saveAlignment(params.output_file_name, result, params.output_format, params.seq_type, params.n_threads, params.gzippd_output ? params.gzip_level : -1); return 0; } @@ -113,7 +113,8 @@ int main(int argc, char *argv[]) famsa.getStatistics().put("alignment.length", result[0]->gapped_size); LOG_VERBOSE << "Saving alignment in " << params.output_file_name; - ok = IOService::saveAlignment(params.output_file_name, result, params.n_threads, + + ok = IOService::saveAlignment(params.output_file_name, result, params.output_format, params.seq_type, params.n_threads, params.gzippd_output ? params.gzip_level : -1); LOG_VERBOSE << " [OK]" << endl; diff --git a/test/clustal_output/amino.fa b/test/clustal_output/amino.fa new file mode 100644 index 0000000..a547696 --- /dev/null +++ b/test/clustal_output/amino.fa @@ -0,0 +1,36 @@ +>1 +ALFYKPDEKKDFQVCGAWDVTVQFHTMSVSEIQKHHTQKPRPHFEHAGRCEFATEQAMDR +REGCRNYIQCPKSNHTNIHRPVCCYCMRTIMVSQMAQVYIMHRNLRMCFCLSAHGCDLWA +YLGDIDNEWCVYDYTSPMTCKMAHSTVFLTFFWAMARYRHVIPCMPNLRQQEIFFTWGKW +LCEFMNYQPINCCVWWCVHRHCYGRRRSICIYVVFANLFDVSSRGWVPKTIMDPRWSCSF +VRRHQNTAAMICWGILAFWGLQLEVCERDPEQILTMRESMGNIKPKWILMFIYFNKTIHD +YTCQNYLEYEKGMEHGPWVCDFECCVYWHVVCQDQGRKWQESQSTSNGMADELLVHMMYA +KMMCIVVVEVRECKYHTKNQSFIFACNYTRRINETSGLAVSNLMWMVVKWACAVQWETRI +LLAIMIAWRKQPPVYTCKWCKVRFRWLYYEQSGQAQHHCTKGFLSCFFWNKRGLVQDGYG +YFTRDVVDYYICTMELWFMEASHPADWREFWSFKQIDMPGCKKQNIYFIVMECDQAKAIC +FGNSTMMPQFCGCWDPNYTNRRVEKAYLCRMDSVCKFVGQYQPSARMYLPSMSWYCLLSS +WPSNKPANALPDCAFYESNDMKQGQLLSYRSFWYKTTQMEENMGSMWPQHDYDRTLGSTF +KACRLCCIEIWTFDKSLSGMWMCIGSWFYMWWCKCFISGNANVNYWLDTKVGIPRLFNGN +HNPEDHVKSGSRVHVWQGCALLQENECIIPEYSGPTKARWMPSHWTTGTHYKTQDDFRLR +LTPKDKQVHNTKDQYSWAETAFEFASESEARGHSTEHVMKFWFYVVQGGMFHLSHISRLR +WITGSHSVSNLPEIQIAVAVSYQAAALWNMFQPYLNPLGMRECIYTPGRCKRMPLRMNYQ +EKLKMIPMRFKNRKMFILNQRVNTWMDKKTFVPDLENCAKMQGEGRNMYEWHGERRKYNH +ENITRKGKRVLITWPYFQATPWMQIGRFTLKNHTGGALWK +>2 +ALFYKPDEKKDFQVCGAWDVTVQFHTMSVSEIQKHHTQKPRPHFEHAGRCEFATEQAMDR +REGCRNYIQCPKSNHTNIHRPVCCYCMRTIMVSQMAQVYIMHRNLRMCFCLSAHGCDLWA +YLGDIDNEWCVYDYTSPMTCKMAHSTVFLTFFWgMARYRHVIPCMPNLRQQEIFFTWGKW +LCEFMNYQPINCCVWWCVHRHCYGRRRSICIYVVFANLFDVSSRGWVPKTIMDPRWSCSF +VRRHQNTAAMICWGILAFWGLQLEVCERDPEQILTMRESMGNIKPKWILMFIYFNKTIHD +YTCQNYLEYEKGMEHGPWVCDFECCVYWHVVCQsQGRKWQESQSTSNGMADELLVHMMYA +KMMCIVVVEVRECKYHTKNQSFIFACNYTRRINETSGLAVSNLMWMVVKWACAVQWETRI +LLAIMIAWRKQPPVYTCKWCKVRFRWLYYEQSGQAKGFLSCFFWNKRGLVQDGYG +YFTRDVVDYYICTMELWFMEASHPADWREFWSFKQIDMPGCKKQNIYFIVMECDQAKAIC +FGNSTMMPQFCGCWDPNYTNRRVEKAYLCRMDStCKFVGQYQPSARMYLPSMSWYCLLSS +WPSNKPANALPDCAFYESNDMKQGQLLSYRSFWYKTTQMEENMGSMWPQHDYDRTLGSTF +KACRLCCIEIWTFDKSLSGMWMCIGSWFYMWWCKCFISGNANVNYWLDTKVGIPRLFNGN +HNPEDHVKSGSRVHVWQGCALLQENECIIPEYSrPTKARWMPSHWTTGTHYKTQDDFRLR +LTPKDKQVHNTKDQYSWAETAFEFASESEARGHSTEHVMKFWFYVVQGGMFHLSHISRLR +WITGSHSVSNLPEIQIAVAVSYQAAALWNMFQPYLNPLGMRECIYTPGRCKRMPLRMNYQ +EKLKMIPMRFKNRKMFILNQRVNTWMDKKTFVPDLENCAKMQGEGRNMYEWHGERRKYNH +ENITRKGKRVLITWPYFQATPWMQIGRFTLKNHTGGALWK diff --git a/test/clustal_output/dna.fa b/test/clustal_output/dna.fa new file mode 100644 index 0000000..0797da6 --- /dev/null +++ b/test/clustal_output/dna.fa @@ -0,0 +1,4 @@ +>1 +CAAGGAAGTTACCAATTTTTTGTTTCCGAATGACACGCGTCTCCTTGCGGGTAGATCGCCGACCGCAGAACTTACGAGCCAGGGGAAACAGTAAGGCCTAATTAGGTAAAGGGAGTAAGTGCTCGAACGCTTCAGATGTAACCATATACTTACGCTGGATCTTCTCCCGCGAATTTTAACCCTCACCAACTACGAGATTTGAGGTAAACCAAATAAGCACGTAGTGGCGCTATCCGACTGTTCCCAAATTGTAACTTATCGTTCCGTGAAGGCCAGAGTTACTTCCCGGCCCTTTCCATGCGCGCACCATACCCTCCTAGTTCCCCGGTTATCTCTCCGAGGAGGGAGTGAGCGATCCTCCGTTTACGTTTTGTTACCAATGACGTAGCTATGTATTTTGTACAGGTTGCCAACGGGTTTCACAATTCACAGATAGTGGGGATCCCGGCAAAGGGCCTATATTTGCGGTCCAACTTAGGCGTAAACTACGATGGTACCTACTCAGACCCAGCTCGCGCGGCGTAAATAACGCACTCATCCCAGCTGATTCTCGGCGATCTACGCAGCGACATGATTATCAACAGCTGTCTGGCAGCTCTAATCTTTTACCATGGTCGTAAAAGCCTCCAAGAGTTAGATCATACCTAACGCCACAAAAGTGACACGACGCCGATGGGTACCGGACTTTAGGTCGACCACAGTTCGGTAAGGGAGAGGCCCTGCGGCGTACTTCATTTTGTATATGCAACGTGCCCAAGTGGCGCCAGGCAAGTCTCAGCTGGTTCCTGTGTTAGCTCGAGGCTAGGCATGGGAGCTGATTGAACATGGGTTGGGGGCCTCGAACCGTCGAGGACCCCATAGTACCTCGGACACCAAGTAGGGCAGCCTATAGTTTGAAGCAGTACTATTTCAGGGGGGGAGCCCTCATGGTCTCTTCTACTGATGACTCAACACGCTAGGGACGTGAAGTCGATTCCTTCGATGGTTATAAATCAAAGGC +>2 +CAAGGAAGTTACCAATTTTTTGTTTCCGAATGACACGCGTCTCCTTGCGGGTAGATCGCCGACCGCAGAACTTACGtGCCAGGGGAAACAGTAAcGCCTAATTAGGTAAAGGGAGTAAGTGCTCGAACaCTTCAGATGTAACCATATACTTACGCTGgATCTTCTCCCGCGAATTTTAACCCTCACCAACTACGAGATTTGAGGTAAACCAAATAAGCACGTAGTGGCGCTATCCGACTGTTCCCAAATTGTAACTTATCGTTCCGTGAAGGCCAGAGTTACTTCCCGGCCCTTTCCATGCGCGCACCATACCCTCCTAGTTCCCCGGTTATCTCTCCGAGGAGGGAGTGAGCGATCCTCCGTTTACGTTTTGTTACCAATGACGTAGCTATGTATTTTGTACAGGTTGCCAACGGGTTTCACAATTCACAGATAGTGGGGATCCCGGCAAAGGGCCTATATTTGCGGTCCAACTTAGGCGTAAACTACGATGGTACCTACTCAGACCCAGCTCGCGCGGCGTAAATAACGCACTCATCCCAGCTGATTCTCGGCGATCTACGCAGCGACATGATTATCAACAGCTGTCTGGCAGCTCTAATCTTTTACCATGGTCGTAAAAGCCTCCAAGAGTTAGATCATACCTAACGCCACAAAAGTGACACGACGCCGATGGGTACCGGACTTTAGGTCGACCACAGTTCGGTAAGGGAGAGGCCCTGCGGCGTACTTCATTTTGTATATGCAACGTGCCCAAGTGGCGCCAGGCAAGTCTCAGCTGGTTCCTGTGTTAGCTCGAGGCTAGGCATGGGAGCTGATTGAACATGGGTTGGGGGCCTCGAACCGTCGAGGACCCCATAGTACCTCGGACACCAAGTAGGGCAGCCTATAGTTTGAAGCAGTACTATTTCAGGGGGGGAGCTCTACTGATGACTCAACACGCTAGGGACGTGAAGTCGATTCCTTCGATGGTTATAAATCAAAGGC