5858public class AnnotationFileConverter extends DatastoreFileConverter {
5959
6060 private static final Logger LOG = Logger .getLogger (AnnotationFileConverter .class );
61- private static final String TEMPFILE = "/tmp/annotation.gff3" ;
61+ private static final String TEMPGENEFILE = "/tmp/gene_models_main.gff3" ;
62+ private static final String TEMPIPRSCANFILE = "/tmp/iprscan.gff3" ;
6263
6364 // GFF sourced
6465 Map <String ,Item > chromosomes = new HashMap <>();
@@ -142,9 +143,6 @@ public void process(Reader reader) throws IOException {
142143 System .out .println ("## Processing " +getCurrentFile ().getName ());
143144 processGFAFile ();
144145 gfaFileExists = true ;
145- } else if (getCurrentFile ().getName ().endsWith (".pathway.tsv.gz" )) {
146- System .out .println ("## Processing " +getCurrentFile ().getName ());
147- processPathwayFile ();
148146 } else if (getCurrentFile ().getName ().endsWith (".protein.faa.gz" ) || getCurrentFile ().getName ().endsWith (".protein_primary.faa.gz" )) {
149147 System .out .println ("## Processing " +getCurrentFile ().getName ());
150148 processProteinFasta ();
@@ -157,8 +155,14 @@ public void process(Reader reader) throws IOException {
157155 System .out .println ("## Processing " +getCurrentFile ().getName ());
158156 processMRNAFasta ();
159157 mrnaFileExists = true ;
158+ } else if (getCurrentFile ().getName ().endsWith (".pathway.tsv.gz" )) {
159+ System .out .println ("## Processing " +getCurrentFile ().getName ());
160+ processPathwayFile ();
161+ } else if (getCurrentFile ().getName ().endsWith (".iprscan.gff3.gz" )) {
162+ System .out .println ("## Processing " +getCurrentFile ().getName ());
163+ processIPRScanGFF3 ();
160164 } else {
161- System .out .println ("## Skipping file " +getCurrentFile ().getName ());
165+ System .out .println ("## - Skipping " +getCurrentFile ().getName ());
162166 }
163167 }
164168
@@ -170,12 +174,11 @@ public void close() throws ObjectStoreException, RuntimeException {
170174 if (readme ==null ) {
171175 throw new RuntimeException ("README file not found. Aborting." );
172176 }
173- if (!cdsFileExists ) System .err .println ("ERROR: cds FASTA file is missing." );
174- if (!mrnaFileExists ) System .err .println ("ERROR: mrna FASTA file is missing." );
177+ if (!cdsFileExists && !mrnaFileExists ) System .err .println ("ERROR: neither mrna nor cds FASTA file is present. One must be." );
175178 if (!proteinFileExists ) System .err .println ("ERROR: protein FASTA file is missing." );
176179 if (!gfaFileExists ) System .err .println ("ERROR: gfa file is missing." );
177180 if (!gff3FileExists ) System .err .println ("ERROR: GFF3 file is missing." );
178- if (!cdsFileExists || !mrnaFileExists || !proteinFileExists || !gfaFileExists || !gff3FileExists ) {
181+ if (( !cdsFileExists && !mrnaFileExists ) || !proteinFileExists || !gfaFileExists || !gff3FileExists ) {
179182 throw new RuntimeException ("Missing required annotation file(s). Aborting." );
180183 }
181184 // set references and collections for objects loaded from FASTAs based on matching identifiers
@@ -376,7 +379,6 @@ void processGFAFile() throws IOException, RuntimeException {
376379 geneFamily .addToCollection ("genes" , gene );
377380 geneFamily .addToCollection ("proteins" , protein );
378381 }
379- br .close ();
380382 }
381383
382384 /**
@@ -411,7 +413,7 @@ void processGFF3File() throws IOException, RuntimeException {
411413 throw new RuntimeException ("README not read before " +getCurrentFile ().getName ()+". Aborting." );
412414 }
413415 // uncompress the gff3.gz file to a temp file
414- File tempfile = new File (TEMPFILE );
416+ File tempfile = new File (TEMPGENEFILE );
415417 tempfile .delete ();
416418 BufferedWriter writer = new BufferedWriter (new FileWriter (tempfile ));
417419 BufferedReader reader = GZIPBufferedReader .getReader (getCurrentFile ());
@@ -422,7 +424,7 @@ void processGFF3File() throws IOException, RuntimeException {
422424 }
423425 writer .close ();
424426 // now load the uncompressed GFF
425- FeatureList featureList = GFF3Reader .read (TEMPFILE );
427+ FeatureList featureList = GFF3Reader .read (TEMPGENEFILE );
426428 for (FeatureI featureI : featureList ) {
427429 String seqname = featureI .seqname ();
428430 Location location = featureI .location ();
@@ -535,6 +537,61 @@ void processGFF3File() throws IOException, RuntimeException {
535537 }
536538 }
537539
540+ /**
541+ * Process an IPRScan GFF file which has Proteins in the sequence column.
542+ * 0 1 2 3 4 5 6 7
543+ * medsa.XinJiangDaYe.gnm1.ann1.RKB9.iprscan.gff3.gz
544+ *
545+ * medsa.XinJiangDaYe.gnm1.ann1.MS_gene000000.t1 PANTHER protein_match 1 463 . + . Name=PTHR10178:SF14;status=T;ID=match$1047424_1_463;date=04-02-2021
546+ * medsa.XinJiangDaYe.gnm1.ann1.MS_gene000008.t1 ProSiteProfiles protein_match 10 264 . + . Name=PS50294;status=T;ID=match$461640_10_264;date=03-02-2021;
547+ * signature_desc=Trp-Asp (WD) repeats circular profile.
548+ * medsa.XinJiangDaYe.gnm1.ann1.MS_gene000000.t1 Pfam protein_hmm_match 3 88 4.8E-11 + . Name=PF03732;status=T;ID=match$1047423_3_88;date=04-02-2021;
549+ * signature_desc=Retrotransposon gag protein;Target=PF03732 6 96;
550+ */
551+ void processIPRScanGFF3 () throws IOException {
552+ // uncompress the gff3.gz file to a temp file
553+ File tempfile = new File (TEMPIPRSCANFILE );
554+ tempfile .delete ();
555+ BufferedWriter writer = new BufferedWriter (new FileWriter (tempfile ));
556+ BufferedReader reader = GZIPBufferedReader .getReader (getCurrentFile ());
557+ String line = null ;
558+ while ( (line =reader .readLine ())!=null ) {
559+ writer .write (line );
560+ writer .newLine ();
561+ }
562+ writer .close ();
563+ // now load the uncompressed GFF
564+ FeatureList featureList = GFF3Reader .read (TEMPIPRSCANFILE );
565+ for (FeatureI featureI : featureList ) {
566+ String seqname = featureI .seqname ();
567+ Location location = featureI .location ();
568+ String type = featureI .type ();
569+ // feature
570+ Item feature = getFeatureOnProtein (type , seqname , location );
571+ String id = getAttribute (featureI , "ID" );
572+ feature .setAttribute ("primaryIdentifier" , id );
573+ features .put (id , feature );
574+ // source isn't supplied by FeatureI
575+ // feature.setAttribute("source", rec.getSource());
576+ // attributes
577+ String name = getAttribute (featureI , "Name" );
578+ String status = getAttribute (featureI , "status" );
579+ String date = getAttribute (featureI , "date" );
580+ String target = getAttribute (featureI , "Target" );
581+ String signatureDesc = getAttribute (featureI , "signature_desc" );
582+ // accession=Name
583+ feature .setAttribute ("accession" , name );
584+ // status
585+ if (status !=null ) feature .setAttribute ("status" , status );
586+ // date
587+ if (date !=null ) feature .setAttribute ("date" , date );
588+ // target
589+ if (target !=null ) feature .setAttribute ("target" , target );
590+ // signatureDesc
591+ if (signatureDesc !=null ) feature .setAttribute ("signatureDesc" , signatureDesc );
592+ }
593+ }
594+
538595 /**
539596 * Add an OntologyAnnotation with the given identifier to the given feature's collection
540597 * NOTE: GO terms are GOTerm objects.
@@ -625,6 +682,31 @@ void placeFeatureOnSequence(Item feature, String seqname, Location location) thr
625682 }
626683 }
627684
685+ /**
686+ * Place a feature on a protein.
687+ */
688+ Item getFeatureOnProtein (String type , String seqname , Location location ) throws RuntimeException {
689+ Item protein = getProtein (seqname );
690+ Item feature = null ;
691+ if (type .equals ("protein_match" )) {
692+ feature = createItem ("ProteinMatch" );
693+ } else if (type .equals ("protein_hmm_match" )) {
694+ feature = createItem ("ProteinHmmMatch" );
695+ } else {
696+ throw new RuntimeException ("IPRSCAN GFF record type " +type +" is not supported by this loader." );
697+ }
698+ feature .setReference ("protein" , protein );
699+ // reference feature on new IM Location
700+ Item proteinLocation = createItem ("Location" );
701+ proteinLocation .setReference ("feature" , feature );
702+ proteinLocation .setAttribute ("start" , String .valueOf (location .bioStart ()));
703+ proteinLocation .setAttribute ("end" , String .valueOf (location .bioEnd ()));
704+ proteinLocation .setReference ("locatedOn" , protein );
705+ locations .add (proteinLocation );
706+ feature .setReference ("location" , proteinLocation );
707+ return feature ;
708+ }
709+
628710 /**
629711 * Get/add a Gene Item, keyed by primaryIdentifier
630712 */
0 commit comments