Skip to content

Commit 509b2c5

Browse files
authored
Merge pull request #2329 from broadinstitute/jb-ols-taxon-fallback
Using OLS validation as fallback for extended taxon IDs (SCP-6070)
2 parents ec1a920 + b9a0922 commit 509b2c5

File tree

8 files changed

+104
-14
lines changed

8 files changed

+104
-14
lines changed

app/javascript/lib/validation/ontology-validation.js

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import { metadataSchema } from './shared-validation'
1717
// Ontologies are fetched from the "main" branch of scp-ingest-pipeline
1818
const ONTOLOGY_BASE_URL =
1919
'https://raw.githubusercontent.com/broadinstitute/scp-ingest-pipeline/' +
20-
'development/ingest/validation/ontologies/'
20+
'main/ingest/validation/ontologies/'
2121

2222
/** Quickly retrieve current version cache key for ontologies */
2323
async function fetchOntologyCacheVersion() {
@@ -169,7 +169,7 @@ export function getOntologyBasedProps() {
169169
*/
170170
export function getAcceptedOntologies(key, metadataSchema) {
171171
// E.g. "ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/mondo,https://www.ebi.ac.uk/ols/ontologies/pato"
172-
const olsUrls = metadataSchema.properties[key].ontology
172+
const olsUrls = metadataSchema.properties[key]?.ontology
173173

174174
const acceptedOntologies =
175175
olsUrls?.split(',').map(url => url.split('/').slice(-1)[0].toUpperCase())
@@ -180,3 +180,31 @@ export function getAcceptedOntologies(key, metadataSchema) {
180180

181181
return acceptedOntologies
182182
}
183+
184+
/**
 * Fetch a remote ontology term from OLS for NCBI taxon IDs
 *
 * The ontology name is derived from the prefix of the term ID (the text
 * before the first underscore, lowercased), and the term is requested by its
 * OBO purl IRI.
 *
 * @param {String} termId ontology term ID, e.g. "NCBITaxon_9606"
 * @returns {Promise<Object>} parsed JSON of the ontology term, if found.
 *   Otherwise a placeholder object of the form `{ [termId]: 'Not found' }`;
 *   this is also returned when the request fails, the response status is not
 *   OK, or the response body cannot be parsed as JSON.
 */
export async function fetchOlsOntologyTerm(termId) {
  const noMatch = { [termId]: 'Not found' }
  try {
    const ontologyName = termId.split('_')[0].toLowerCase()
    const purlIri = `http://purl.obolibrary.org/obo/${termId}`
    // purl IRI values must be double-encoded, to match behavior in:
    // https://github.com/broadinstitute/scp-ingest-pipeline/blob/development/ingest/validation/validate_metadata.py#L348
    const termUrl = `https://www.ebi.ac.uk/ols4/api/ontologies/${ontologyName}/` +
      `terms/${encodeURIComponent(encodeURIComponent(purlIri))}?lang=en`
    console.debug(`termUrl: ${termUrl}`)
    const response = await fetch(termUrl)
    // A Response object serializes to "{}", so log the status instead
    console.debug(`OLS response status: ${response.status}`)
    if (!response.ok) {
      return noMatch
    }
    // Await inside the try block, so that a malformed body is handled by the
    // catch below rather than rejecting to the caller
    return await response.json()
  } catch (error) {
    // Best-effort lookup: any failure is reported as "term not found"
    return noMatch
  }
}

app/javascript/lib/validation/shared-validation.js

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
// Ultimately sourced from: scp-ingest-pipeline/schemas
66
import * as _schema from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';
7+
import { fetchOlsOntologyTerm } from '~/lib/validation/ontology-validation'
78

89
export const metadataSchema = _schema
910
export const REQUIRED_CONVENTION_COLUMNS = metadataSchema.required.filter(c => c !== 'CellID')
@@ -323,3 +324,43 @@ export function getLabelSuffixForOntology(indentifier) {
323324
const shortName = getOntologyShortNameLc(indentifier)
324325
return shortName === 'uo' ? '_label' : '__ontology_label'
325326
}
327+
328+
/**
 * Attempt to rectify invalid taxon ID issues, if present
 *
 * Every term ID reported by `findInvalidIds` is looked up via
 * `fetchOlsOntologyTerm`.  An 'ontology:invalid-id' issue is dropped only when
 * that lookup returned a term whose `label` equals the label recorded on the
 * issue itself; all other issues are kept, in their original order.
 *
 * @param {Array} issues validation issues; the 4th element of an issue, when
 *   present, is an object that may carry `subtype`, `id` and `label`
 * @returns {Promise<Array>} `issues` itself when there is nothing to look up,
 *   otherwise a new array without the issues that the lookup resolved
 */
export async function fixTaxonIdIssues(issues) {
  // fallback for any taxon IDs not found
  const invalidIds = Object.keys(findInvalidIds(issues))
  if (invalidIds.length === 0) {
    return issues
  }

  // Promise.all preserves input order, so results pair up with invalidIds by
  // index; no need to recover the ID from the shape of each response
  const olsTerms = await Promise.all(
    invalidIds.map(termId => fetchOlsOntologyTerm(termId))
  )

  const olsLabelsById = new Map()
  invalidIds.forEach((termId, index) => {
    const olsLabel = olsTerms[index]?.label
    // "Not found" placeholders have no label.  Requiring a string here keeps
    // an issue that lacks a label from being dropped by a failed lookup.
    if (typeof olsLabel === 'string') {
      olsLabelsById.set(termId, olsLabel)
    }
  })

  // Compare each issue against its own label, so that an ID used with both a
  // correct and an incorrect label only has the correct usage cleared
  return issues.filter(issue => {
    const details = issue[3]
    const isResolved =
      details?.subtype === 'ontology:invalid-id' &&
      olsLabelsById.has(details.id) &&
      olsLabelsById.get(details.id) === details.label
    return !isResolved
  })
}
353+
354+
355+
/**
 * Find instances of 'ontology:invalid-id' errors for NCBI term IDs
 *
 * An issue qualifies when its 4th element has the subtype
 * 'ontology:invalid-id' and a string `id` that starts with "ncbi"
 * (case-insensitive).  Issues that carry the subtype but no `id` are skipped
 * rather than raising a TypeError.
 *
 * @param {Array} issues validation issues
 * @returns {Object} map of invalid term ID to the label recorded on the issue;
 *   if an ID occurs on several issues, the label of the last one is used
 */
export function findInvalidIds(issues) {
  const invalidIds = {}
  for (const issue of issues) {
    const details = issue?.[3]
    const isInvalidId = details?.subtype === 'ontology:invalid-id'
    if (isInvalidId && typeof details.id === 'string' && details.id.toLowerCase().startsWith('ncbi')) {
      invalidIds[details.id] = details.label
    }
  }
  return invalidIds
}

app/javascript/lib/validation/validate-anndata.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { getOAuthToken } from '~/lib/scp-api'
44
import {
55
validateUnique, validateRequiredMetadataColumns,
66
validateAlphanumericAndUnderscores, getOntologyShortNameLc,
7-
metadataSchema, REQUIRED_CONVENTION_COLUMNS
7+
metadataSchema, REQUIRED_CONVENTION_COLUMNS, fixTaxonIdIssues
88
} from './shared-validation'
99
import { fetchOntologies, getOntologyBasedProps, getAcceptedOntologies } from './ontology-validation'
1010

@@ -131,7 +131,6 @@ export function checkOntologyIdFormat(key, ontologyIds) {
131131
/** Validate author's annotation labels and IDs match those in ontologies */
132132
export async function checkOntologyLabelsAndIds(key, ontologies, groups) {
133133
const [ids, idIndexes, labels, labelIndexes] = groups
134-
135134
const issues = []
136135

137136
// Determine unique (ontology ID, ontology label) pairs
@@ -158,7 +157,7 @@ export async function checkOntologyLabelsAndIds(key, ontologies, groups) {
158157
const msg = `Invalid ontology ID: ${id}`
159158
issues.push([
160159
'error', 'ontology:label-lookup-error', msg,
161-
{ subtype: 'ontology:invalid-id' }
160+
{ subtype: 'ontology:invalid-id', id, label }
162161
])
163162
} else {
164163
const validLabels = ontology[id]
@@ -252,7 +251,7 @@ async function validateOntologyLabelsAndIds(hdf5File) {
252251
}
253252
}
254253

255-
return issues
254+
return await fixTaxonIdIssues(issues)
256255
}
257256

258257

app/javascript/lib/validation/validate-file-content.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ import {
1818
getParsedHeaderLines, parseLine, ParseException,
1919
validateUniqueCellNamesWithinFile, validateMetadataLabelMatches,
2020
validateGroupColumnCounts, timeOutCSFV, validateUnique,
21-
validateRequiredMetadataColumns, validateAlphanumericAndUnderscores
21+
validateRequiredMetadataColumns, validateAlphanumericAndUnderscores,
22+
getOntologyShortNameLc, getLabelSuffixForOntology, fixTaxonIdIssues
2223
} from './shared-validation'
2324
import { parseDifferentialExpressionFile } from './validate-differential-expression'
2425
import { parseAnnDataFile } from './validate-anndata'
2526
import { fetchOntologies, getOntologyBasedProps } from '~/lib/validation/ontology-validation'
26-
import { getOntologyShortNameLc, getLabelSuffixForOntology } from './shared-validation'
2727

2828
/**
2929
* Gzip decompression requires reading the whole file, given the current
@@ -243,7 +243,8 @@ export async function parseMetadataFile(chunker, mimeType, fileOptions) {
243243
// add other line-by-line validations here
244244
}
245245
})
246-
return { issues, delimiter, numColumns: headers[0].length }
246+
const filteredIssues = await fixTaxonIdIssues(issues)
247+
return { issues: filteredIssues, delimiter, numColumns: headers[0].length }
247248
}
248249

249250
/** validate all ontology-based convention terms in a given line */
@@ -293,7 +294,7 @@ export function validateOntologyTerm(prop, ontologyId, label, ontologies, knownE
293294
errorIdentifier = `${ontologyId}-invalid-id`
294295
issue = [
295296
'error', 'ontology:label-lookup-error', msg,
296-
{ subtype: 'ontology:invalid-id' }
297+
{ subtype: 'ontology:invalid-id', id: ontologyId, label }
297298
]
298299
} else {
299300
const validLabels = ontology[ontologyId]

config/application.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class Application < Rails::Application
2929
config.middleware.use Rack::Brotli
3030

3131
# Docker image for file parsing via scp-ingest-pipeline
32-
config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.43.0'
32+
config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.43.1'
3333

3434
# Docker image for image pipeline jobs
3535
config.image_pipeline_docker_image = 'gcr.io/broad-singlecellportal-staging/image-pipeline:0.1.0_c2b090043'

config/initializers/content_security_policy.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
'https://www.googleapis.com', 'https://storage.googleapis.com', 'https://s3.amazonaws.com', 'https://data.broadinstitute.org', 'https://portals.broadinstitute.org',
5858
'https://accounts.google.com', 'https://terra-bard-dev.appspot.com', 'https://terra-bard-alpha.appspot.com', 'https://terra-bard-prod.appspot.com',
5959
'https://rest.ensembl.org', 'https://eutils.ncbi.nlm.nih.gov', 'https://mygene.info', 'https://webservice.wikipathways.org', 'https://o54426.ingest.sentry.io',
60-
'https://igv-genepattern-org.s3.amazonaws.com', 'https://*.soe.ucsc.edu', 'https://raw.githubusercontent.com'
60+
'https://igv-genepattern-org.s3.amazonaws.com', 'https://*.soe.ucsc.edu', 'https://raw.githubusercontent.com', 'https://www.ebi.ac.uk'
6161
]
6262
if ENV['NOT_DOCKERIZED']
6363
# enable connections to live reload server

test/js/lib/validate-file-content.test.js

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ import { render, screen } from '@testing-library/react'
44
import '@testing-library/jest-dom/extend-expect'
55

66
import ValidateFile from 'lib/validation/validate-file'
7-
import { fetchOntologies } from 'lib/validation/ontology-validation'
7+
import { fetchOntologies, fetchOlsOntologyTerm } from 'lib/validation/ontology-validation'
88
import { validateConventionTerms, validateOntologyTerm } from 'lib/validation/validate-file-content'
99
import {
10-
REQUIRED_CONVENTION_COLUMNS, getOntologyShortNameLc, getLabelSuffixForOntology
10+
REQUIRED_CONVENTION_COLUMNS, getOntologyShortNameLc, getLabelSuffixForOntology, fixTaxonIdIssues
1111
} from 'lib/validation/shared-validation'
1212
import { getLogProps } from 'lib/validation/log-validation'
1313
import ValidationMessage from 'components/validation/ValidationMessage'
@@ -642,4 +642,25 @@ describe('validates file contents against minified ontologies', () => {
642642
const uoId = "UO_0000036"
643643
expect("_label").toEqual(getLabelSuffixForOntology(uoId))
644644
})
645+
646+
it('gets remote term from OLS', async () => {
647+
const termId = "NCBITaxon_197152"
648+
const olsTerm = await fetchOlsOntologyTerm(termId)
649+
expect(olsTerm.label).toBe('Cloeon dipterum')
650+
const missingTerm = await fetchOlsOntologyTerm('NCBITaxon_foo')
651+
expect(missingTerm).toMatchObject({ 'NCBITaxon_foo': 'Not found' })
652+
const missingOntology = await fetchOlsOntologyTerm('foo_197152')
653+
expect(missingOntology).toMatchObject({ 'foo_197152': 'Not found' })
654+
})
655+
656+
it('filters out extended ontolgy issues', async () => {
657+
const issues = [
658+
[
659+
'error', 'ontology:label-lookup-error', 'Invalid ontology ID: NCBITaxon_197152',
660+
{ id: 'NCBITaxon_197152', label: 'Cloeon dipterum', subtype: 'ontology:invalid-id' }
661+
]
662+
]
663+
const filtered = await fixTaxonIdIssues(issues)
664+
expect(filtered).toHaveLength(0)
665+
})
645666
})
50 KB
Binary file not shown.

0 commit comments

Comments
 (0)