Skip to content

Commit 7f47d60

Browse files
author
Sam Hokin
committed
Final 7.0.3.2. Fixed lis-phylotree for new files.
1 parent e891447 commit 7f47d60

8 files changed

Lines changed: 154 additions & 52 deletions

File tree

lis-annotation/src/main/java/org/intermine/bio/dataconversion/AnnotationFileConverter.java

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@
5858
public class AnnotationFileConverter extends DatastoreFileConverter {
5959

6060
private static final Logger LOG = Logger.getLogger(AnnotationFileConverter.class);
61-
private static final String TEMPFILE = "/tmp/annotation.gff3";
61+
private static final String TEMPGENEFILE = "/tmp/gene_models_main.gff3";
62+
private static final String TEMPIPRSCANFILE = "/tmp/iprscan.gff3";
6263

6364
// GFF sourced
6465
Map<String,Item> chromosomes = new HashMap<>();
@@ -142,9 +143,6 @@ public void process(Reader reader) throws IOException {
142143
System.out.println("## Processing "+getCurrentFile().getName());
143144
processGFAFile();
144145
gfaFileExists = true;
145-
} else if (getCurrentFile().getName().endsWith(".pathway.tsv.gz")) {
146-
System.out.println("## Processing "+getCurrentFile().getName());
147-
processPathwayFile();
148146
} else if (getCurrentFile().getName().endsWith(".protein.faa.gz") || getCurrentFile().getName().endsWith(".protein_primary.faa.gz")) {
149147
System.out.println("## Processing "+getCurrentFile().getName());
150148
processProteinFasta();
@@ -157,8 +155,14 @@ public void process(Reader reader) throws IOException {
157155
System.out.println("## Processing "+getCurrentFile().getName());
158156
processMRNAFasta();
159157
mrnaFileExists = true;
158+
} else if (getCurrentFile().getName().endsWith(".pathway.tsv.gz")) {
159+
System.out.println("## Processing "+getCurrentFile().getName());
160+
processPathwayFile();
161+
} else if (getCurrentFile().getName().endsWith(".iprscan.gff3.gz")) {
162+
System.out.println("## Processing "+getCurrentFile().getName());
163+
processIPRScanGFF3();
160164
} else {
161-
System.out.println("## Skipping file "+getCurrentFile().getName());
165+
System.out.println("## - Skipping "+getCurrentFile().getName());
162166
}
163167
}
164168

@@ -170,12 +174,11 @@ public void close() throws ObjectStoreException, RuntimeException {
170174
if (readme==null) {
171175
throw new RuntimeException("README file not found. Aborting.");
172176
}
173-
if (!cdsFileExists) System.err.println("ERROR: cds FASTA file is missing.");
174-
if (!mrnaFileExists) System.err.println("ERROR: mrna FASTA file is missing.");
177+
if (!cdsFileExists && !mrnaFileExists) System.err.println("ERROR: neither mrna nor cds FASTA file is present. One must be.");
175178
if (!proteinFileExists) System.err.println("ERROR: protein FASTA file is missing.");
176179
if (!gfaFileExists) System.err.println("ERROR: gfa file is missing.");
177180
if (!gff3FileExists) System.err.println("ERROR: GFF3 file is missing.");
178-
if (!cdsFileExists || !mrnaFileExists || !proteinFileExists || !gfaFileExists || !gff3FileExists) {
181+
if ((!cdsFileExists && !mrnaFileExists) || !proteinFileExists || !gfaFileExists || !gff3FileExists) {
179182
throw new RuntimeException("Missing required annotation file(s). Aborting.");
180183
}
181184
// set references and collections for objects loaded from FASTAs based on matching identifiers
@@ -376,7 +379,6 @@ void processGFAFile() throws IOException, RuntimeException {
376379
geneFamily.addToCollection("genes", gene);
377380
geneFamily.addToCollection("proteins", protein);
378381
}
379-
br.close();
380382
}
381383

382384
/**
@@ -411,7 +413,7 @@ void processGFF3File() throws IOException, RuntimeException {
411413
throw new RuntimeException("README not read before "+getCurrentFile().getName()+". Aborting.");
412414
}
413415
// uncompress the gff3.gz file to a temp file
414-
File tempfile = new File(TEMPFILE);
416+
File tempfile = new File(TEMPGENEFILE);
415417
tempfile.delete();
416418
BufferedWriter writer = new BufferedWriter(new FileWriter(tempfile));
417419
BufferedReader reader = GZIPBufferedReader.getReader(getCurrentFile());
@@ -422,7 +424,7 @@ void processGFF3File() throws IOException, RuntimeException {
422424
}
423425
writer.close();
424426
// now load the uncompressed GFF
425-
FeatureList featureList = GFF3Reader.read(TEMPFILE);
427+
FeatureList featureList = GFF3Reader.read(TEMPGENEFILE);
426428
for (FeatureI featureI : featureList) {
427429
String seqname = featureI.seqname();
428430
Location location = featureI.location();
@@ -535,6 +537,61 @@ void processGFF3File() throws IOException, RuntimeException {
535537
}
536538
}
537539

540+
/**
541+
* Process an IPRScan GFF file which has Proteins in the sequence column.
542+
* 0 1 2 3 4 5 6 7
543+
* medsa.XinJiangDaYe.gnm1.ann1.RKB9.iprscan.gff3.gz
544+
*
545+
* medsa.XinJiangDaYe.gnm1.ann1.MS_gene000000.t1 PANTHER protein_match 1 463 . + . Name=PTHR10178:SF14;status=T;ID=match$1047424_1_463;date=04-02-2021
546+
* medsa.XinJiangDaYe.gnm1.ann1.MS_gene000008.t1 ProSiteProfiles protein_match 10 264 . + . Name=PS50294;status=T;ID=match$461640_10_264;date=03-02-2021;
547+
* signature_desc=Trp-Asp (WD) repeats circular profile.
548+
* medsa.XinJiangDaYe.gnm1.ann1.MS_gene000000.t1 Pfam protein_hmm_match 3 88 4.8E-11 + . Name=PF03732;status=T;ID=match$1047423_3_88;date=04-02-2021;
549+
* signature_desc=Retrotransposon gag protein;Target=PF03732 6 96;
550+
*/
551+
void processIPRScanGFF3() throws IOException {
552+
// uncompress the gff3.gz file to a temp file
553+
File tempfile = new File(TEMPIPRSCANFILE);
554+
tempfile.delete();
555+
BufferedWriter writer = new BufferedWriter(new FileWriter(tempfile));
556+
BufferedReader reader = GZIPBufferedReader.getReader(getCurrentFile());
557+
String line = null;
558+
while ( (line=reader.readLine())!=null ) {
559+
writer.write(line);
560+
writer.newLine();
561+
}
562+
writer.close();
563+
// now load the uncompressed GFF
564+
FeatureList featureList = GFF3Reader.read(TEMPIPRSCANFILE);
565+
for (FeatureI featureI : featureList) {
566+
String seqname = featureI.seqname();
567+
Location location = featureI.location();
568+
String type = featureI.type();
569+
// feature
570+
Item feature = getFeatureOnProtein(type, seqname, location);
571+
String id = getAttribute(featureI, "ID");
572+
feature.setAttribute("primaryIdentifier", id);
573+
features.put(id, feature);
574+
// source isn't supplied by FeatureI
575+
// feature.setAttribute("source", rec.getSource());
576+
// attributes
577+
String name = getAttribute(featureI, "Name");
578+
String status = getAttribute(featureI, "status");
579+
String date = getAttribute(featureI, "date");
580+
String target = getAttribute(featureI, "Target");
581+
String signatureDesc = getAttribute(featureI, "signature_desc");
582+
// accession=Name
583+
feature.setAttribute("accession", name);
584+
// status
585+
if (status!=null) feature.setAttribute("status", status);
586+
// date
587+
if (date!=null) feature.setAttribute("date", date);
588+
// target
589+
if (target!=null) feature.setAttribute("target", target);
590+
// signatureDesc
591+
if (signatureDesc!=null) feature.setAttribute("signatureDesc", signatureDesc);
592+
}
593+
}
594+
538595
/**
539596
* Add an OntologyAnnotation with the given identifier to the given feature's collection
540597
* NOTE: GO terms are GOTerm objects.
@@ -625,6 +682,31 @@ void placeFeatureOnSequence(Item feature, String seqname, Location location) thr
625682
}
626683
}
627684

685+
/**
686+
* Place a feature on a protein.
687+
*/
688+
Item getFeatureOnProtein(String type, String seqname, Location location) throws RuntimeException {
689+
Item protein = getProtein(seqname);
690+
Item feature = null;
691+
if (type.equals("protein_match")) {
692+
feature = createItem("ProteinMatch");
693+
} else if (type.equals("protein_hmm_match")) {
694+
feature = createItem("ProteinHmmMatch");
695+
} else {
696+
throw new RuntimeException("IPRSCAN GFF record type "+type+" is not supported by this loader.");
697+
}
698+
feature.setReference("protein", protein);
699+
// reference feature on new IM Location
700+
Item proteinLocation = createItem("Location");
701+
proteinLocation.setReference("feature", feature);
702+
proteinLocation.setAttribute("start", String.valueOf(location.bioStart()));
703+
proteinLocation.setAttribute("end", String.valueOf(location.bioEnd()));
704+
proteinLocation.setReference("locatedOn", protein);
705+
locations.add(proteinLocation);
706+
feature.setReference("location", proteinLocation);
707+
return feature;
708+
}
709+
628710
/**
629711
* Get/add a Gene Item, keyed by primaryIdentifier
630712
*/

lis-annotation/src/main/resources/lis-annotation_keys.properties

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
##
2-
## lis-gfa keys
2+
## lis-annotation keys
33
##
44

55
Ontology.key_name=name
@@ -8,26 +8,25 @@ SOTerm.key_name=name
88
OntologyAnnotation.key_subject_term=subject,ontologyTerm
99
DataSource.key_name=name
1010
DataSet.key_name=name
11-
1211
Organism.key_taxonid=taxonId
1312
Strain.key_identifier=identifier
14-
1513
Publication.key_doi=doi
1614

17-
## BioEntity
18-
Protein.key_primaryidentifier=primaryIdentifier
19-
ProteinDomain.key_primaryidentifier=primaryIdentifier
20-
21-
## SequenceFeature
15+
## BioEntity (and SequenceFeature)
2216
Chromosome.key_primaryidentifier=primaryIdentifier
2317
Supercontig.key_primaryidentifier=primaryIdentifier
2418
Gene.key_primaryidentifier=primaryIdentifier
2519
CDS.key_primaryidentifier=primaryIdentifier
2620
CDSRegion.key_primaryidentifier=primaryIdentifier
2721
Exon.key_primaryidentifier=primaryIdentifier
2822
MRNA.key_primaryidentifier=primaryIdentifier
23+
Protein.key_primaryidentifier=primaryIdentifier
24+
ProteinDomain.key_primaryidentifier=primaryIdentifier
25+
ProteinMatch.key_primaryidentifier=primaryIdentifier
26+
ProteinHmmMatch.key_primaryidentifier=primaryIdentifier
2927
Transcript.key_primaryidentifier=primaryIdentifier
3028
NCRNA.key_primaryidentifier=primaryIdentifier
29+
ThreePrimeUTR.key_primaryidentifier=primaryIdentifier
3130
FivePrimeUTR.key_primaryidentifier=primaryIdentifier
3231
LncRNA.key_primaryidentifier=primaryIdentifier
3332
Pseudogene.key_primaryidentifier=primaryIdentifier

lis-genefamily/src/main/java/org/intermine/bio/dataconversion/GeneFamilyFileConverter.java

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import org.intermine.objectstore.ObjectStoreException;
2121
import org.intermine.xml.full.Item;
2222

23+
import org.ncgr.zip.GZIPBufferedReader;
24+
2325
/**
2426
* Load gene family data from LIS datastore files.
2527
*
@@ -39,6 +41,8 @@ public class GeneFamilyFileConverter extends DatastoreFileConverter {
3941
Map<String,Item> proteinDomains = new HashMap<>();
4042
Map<String,Item> geneFamilies = new HashMap<>();
4143

44+
boolean ahrdFileFound = false;
45+
4246
/**
4347
* Create a new GeneFamilyFileConverter
4448
* @param writer the ItemWriter to write out new items
@@ -69,8 +73,9 @@ public void process(Reader reader) throws IOException {
6973
}
7074
dataSet.setReference("dataSource", dataSource);
7175
// process files
72-
if (getCurrentFile().getName().endsWith(".info_annot_ahrd.tsv")) {
73-
processInfoAnnotAhrdFile(reader);
76+
if (getCurrentFile().getName().endsWith(".info_annot_ahrd.tsv.gz")) {
77+
processInfoAnnotAhrdFile();
78+
ahrdFileFound = true;
7479
}
7580
}
7681

@@ -79,6 +84,9 @@ public void process(Reader reader) throws IOException {
7984
*/
8085
@Override
8186
public void close() throws Exception {
87+
if (!ahrdFileFound) {
88+
throw new RuntimeException("File ending in .info_annot_ahrd.tsv.gz not found. Aborting");
89+
}
8290
store(dataSource);
8391
store(dataSet);
8492
store(geneFamilies.values());
@@ -164,9 +172,9 @@ public Item getProteinDomain(String identifier) {
164172
}
165173

166174
/**
167-
* Process an info_annot_ahrd.tsv file which contains gene families and semi-colon separated groups of ontology terms.
168-
* 0 1 2 3 4 5
169-
* lis.genefam.fam1.M65K.info_annot_ahrd.tsv
175+
* Process an info_annot_ahrd.tsv.gz file which contains gene families and semi-colon separated groups of ontology terms.
176+
* 0 1 2 3 4 5 6
177+
* lis.genefam.fam1.M65K.info_annot_ahrd.tsv.gz
170178
* legfed_v1_0.L_LFXSXJ splicing factor 3B subunit 3-like isoform X2 [Glycine max];
171179
* IPR004871 (Cleavage/polyadenylation specificity factor, A subunit, C-terminal);
172180
* GO:0003676 (nucleic acid binding), GO:0005634 (nucleus)
@@ -182,9 +190,9 @@ public Item getProteinDomain(String identifier) {
182190
* SIASPGRGILAIDESNATCGKRLASIGLDNTEVNRQAYRQLLLTTPGLGEYISGAILFEE
183191
* ...
184192
*/
185-
void processInfoAnnotAhrdFile(Reader reader) throws IOException {
193+
void processInfoAnnotAhrdFile() throws IOException {
186194
// spin through the AHRD file lines
187-
BufferedReader br = new BufferedReader(reader);
195+
BufferedReader br = GZIPBufferedReader.getReader(getCurrentFile());
188196
String line = null;
189197
while ((line=br.readLine())!=null) {
190198
if (line.startsWith("#") || line.trim().length()==0) continue; // comment line
-25.1 KB
Binary file not shown.

lis-phylotree/libs/bioinfweb-commons-java-core-3.2.0.jar renamed to lis-phylotree/libs/bioinfweb-commons-java-core-3.4.0.jar

122 KB
Binary file not shown.
610 KB
Binary file not shown.

lis-phylotree/src/main/java/org/intermine/bio/dataconversion/PhylotreeFileConverter.java

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,24 +64,25 @@ public PhylotreeFileConverter(ItemWriter writer, Model model) throws ObjectStore
6464
*/
6565
@Override
6666
public void process(Reader reader) {
67-
// there is no README (yet)
68-
// DataSet
69-
if (dataSetName==null || dataSetUrl==null || dataSetDescription==null) {
70-
throw new RuntimeException("ERROR: dataSetName, dataSetUrl, and dataSetDescription must be set in project.xml.");
71-
}
72-
dataSet = createItem("DataSet");
73-
dataSet.setAttribute("name", dataSetName);
74-
dataSet.setAttribute("url", dataSetUrl);
75-
dataSet.setAttribute("description", dataSetDescription);
76-
if (dataSetLicence!=null) {
77-
dataSet.setAttribute("licence", dataSetLicence);
78-
} else {
79-
dataSet.setAttribute("licence", DatastoreFileConverter.DEFAULT_DATASET_LICENCE);
67+
// there is no README (yet) so hand-roll DataSet
68+
if (dataSet==null) {
69+
if (dataSetName==null || dataSetUrl==null || dataSetDescription==null) {
70+
throw new RuntimeException("ERROR: dataSetName, dataSetUrl, and dataSetDescription must be set in project.xml.");
71+
}
72+
dataSet = createItem("DataSet");
73+
dataSet.setAttribute("name", dataSetName);
74+
dataSet.setAttribute("url", dataSetUrl);
75+
dataSet.setAttribute("description", dataSetDescription);
76+
if (dataSetLicence!=null) {
77+
dataSet.setAttribute("licence", dataSetLicence);
78+
} else {
79+
dataSet.setAttribute("licence", DatastoreFileConverter.DEFAULT_DATASET_LICENCE);
80+
}
81+
dataSet.setReference("dataSource", dataSource);
8082
}
81-
dataSet.setReference("dataSource", dataSource);
8283
try {
8384
processTreeFile();
84-
} catch (IOException ex) {
85+
} catch (Exception ex) {
8586
throw new RuntimeException(ex);
8687
}
8788
}
@@ -125,9 +126,11 @@ Item getPhylonode(Node node, String name) {
125126
if (node.isFeature()) {
126127
phylonode.setAttribute("isLeaf", "true");
127128
phylonode.setAttribute("name", node.label);
128-
// assume Proteins
129-
Item protein = getProtein(node.label);
130-
phylonode.setReference("protein", protein);
129+
// some nodes hold proteins
130+
if (isFullYuck(node.label)) {
131+
Item protein = getProtein(node.label);
132+
phylonode.setReference("protein", protein);
133+
}
131134
}
132135
return phylonode;
133136
}
@@ -170,6 +173,7 @@ void processEdge(Edge e, String identifier) {
170173

171174
/**
172175
* Process a file in the phylotree subdirectory, in Newick format.
176+
* legume.genefam.fam1.M65K.trees_ML_rooted/legfed_v1_0.L_6W30ML
173177
*/
174178
void processTreeFile() throws IOException {
175179
JPhyloIOReaderWriterFactory factory = new JPhyloIOReaderWriterFactory();
@@ -184,13 +188,20 @@ void processTreeFile() throws IOException {
184188
throw new RuntimeException(ex);
185189
}
186190
// create this Phylotree and GeneFamily
187-
String name = getCurrentFile().getName();
191+
// 0 1
192+
// legfed_v1_0.L_6W30ML
193+
String identifier = getCurrentFile().getName();
194+
String[] parts = identifier.split("\\.");
195+
String name = parts[1];
188196
Item phylotree = createItem("Phylotree");
189197
phylotrees.add(phylotree);
198+
// TODO: 5.0.7.3 switch to identifier
199+
// phylotree.setAttribute("identifier", identifier);
190200
phylotree.setAttribute("name", name);
191201
phylotree.setReference("dataSet", dataSet);
192202
Item geneFamily = createItem("GeneFamily");
193203
geneFamilies.add(geneFamily);
204+
// TODO: 5.0.7.3 switch to identifier
194205
geneFamily.setAttribute("name", name);
195206
geneFamily.setReference("phylotree", phylotree);
196207
phylotree.setReference("geneFamily", geneFamily);
@@ -225,6 +236,7 @@ void processTreeFile() throws IOException {
225236
// Indicates a node in a phylogenetic tree or network.
226237
Node n = new Node(event.asNodeEvent());
227238
Item phylonode = getPhylonode(n, name);
239+
if (phylonode==null) return;
228240
phylonode.setReference("tree", phylotree);
229241
phylotree.addToCollection("nodes", phylonode);
230242
if (n.isFeature()) numLeaves++;
@@ -263,15 +275,17 @@ void processTreeFile() throws IOException {
263275
newick.setAttribute("contents", contents);
264276
}
265277

278+
/**
279+
* Return true if the given identifier is in LIS full yuck format.
280+
*/
281+
boolean isFullYuck(String primaryIdentifier) {
282+
return primaryIdentifier.split("\\.").length>=5;
283+
}
284+
266285
/**
267286
* Get/add a Protein Item, keyed by primaryIdentifier
268287
*/
269288
Item getProtein(String primaryIdentifier) {
270-
// check that we've got a full-yuck identifier
271-
String[] parts = primaryIdentifier.split("\\.");
272-
if (parts.length<5) {
273-
throw new RuntimeException("Protein primary identifier in "+getCurrentFile().getName()+" is not LIS format:"+primaryIdentifier);
274-
}
275289
if (proteins.containsKey(primaryIdentifier)) {
276290
return proteins.get(primaryIdentifier);
277291
} else {

0 commit comments

Comments
 (0)