Skip to content

Commit 355930d

Browse files
authored
Merge pull request #47 from sapporo-wes/feat/tataki-edam-enrichment
feat: enrich RO-Crate output files with EDAM format IDs via tataki
2 parents bde327c + bb9955a commit 355930d

3 files changed

Lines changed: 253 additions & 1 deletion

File tree

docs/ro-crate.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ sapporo-service accepts arbitrary workflow engines and workflow languages via th
4545
| Non-numeric `exit_code` | `FailedActionStatus` set; `exitCode` property omitted |
4646
| Output file disappeared after listing | File skipped; remaining outputs processed normally |
4747
| Missing `run_request.json` | `TypeError` raised with a descriptive message (generation cannot proceed) |
48+
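| tataki Docker unavailable or fails | Enrichment skipped; `encodingFormat` left unchanged (a warning is logged when tataki fails, times out, or emits invalid JSON; nothing is logged when the `docker` binary is absent) |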
4849

4950
## Entity Graph
5051

@@ -174,6 +175,14 @@ For output files with VCF format (`.vcf`, `.vcf.gz`), `vcf-stats` is run in a Do
174175

175176
Output files are automatically annotated with [EDAM ontology](http://edamontology.org/) format identifiers based on file extension. EDAM entities use `@type: "Thing"` as they represent ontology terms rather than web resources. The mapping is defined in `sapporo/ro_crate.py` (`EDAM_MAPPING` dict). Common non-bioinformatics formats (JSON, CSV, TSV, HTML, YAML, Markdown, ZIP, gzip, plain text) are also mapped to their IANA media types.
176177

178+
### tataki Content-Based Format Detection
179+
180+
[tataki](https://github.com/sapporo-wes/tataki) is run in a Docker container (`ghcr.io/sapporo-wes/tataki:latest`) against all output files after the extension-based EDAM detection. tataki detects file formats by inspecting file content (magic bytes, structure analysis) rather than relying on file extensions, covering both bioinformatics formats (BAM, VCF, FASTQ, ...) and common formats (TSV, CSV, JSON, HTML, PDF, PNG, SVG).
181+
182+
When tataki identifies a file's format, the file's `encodingFormat` is replaced with the EDAM ontology entity returned by tataki. Files that tataki cannot identify retain their original `encodingFormat` (extension-based EDAM + MIME type).
183+
184+
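This enrichment enables [tonkaz](https://github.com/sapporo-wes/tonkaz) Level 1-3 file-content comparison on typical workflow outputs. The enrichment is best-effort: if the `docker` binary is not available it is skipped silently, and if tataki fails, times out, or returns output that cannot be parsed, a warning is logged and every file keeps its original `encodingFormat`.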
185+
177186
## API Endpoint
178187

179188
### `GET /runs/{run_id}/ro-crate`
@@ -207,7 +216,8 @@ The generation flow:
207216
4. Build the `CreateAction` with inputs, outputs, logs, and metadata.
208217
5. Run MultiQC in Docker and attach statistics (skipped if Docker is unavailable).
209218
6. Run samtools/vcftools in Docker on applicable output files (skipped if Docker is unavailable).
210-
7. Write `ro-crate-metadata.json` and `README.md` to the run directory.
219+
7. Run tataki in Docker to enrich output files with EDAM format IDs (skipped if Docker is unavailable).
220+
8. Write `ro-crate-metadata.json` and `README.md` to the run directory.
211221

212222
## Validation
213223

sapporo/ro_crate.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
BIOSCHEMAS_COMPUTATIONAL_WORKFLOW = "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"
4545

4646
_STDERR_TAIL_LINES = 20
47+
_TATAKI_IMAGE = "ghcr.io/sapporo-wes/tataki:latest"
4748
_DOCKER_IMAGE_RE = re.compile(
4849
r"(?:^|\s)"
4950
r"((?:[\w.-]+(?::\d+)?/)?[\w.-]+(?:/[\w.-]+)*:[\w.+-]+)"
@@ -1028,6 +1029,77 @@ def add_readme_entity(crate: ROCrate, run_dir: Path, run_id: str) -> None:
10281029
crate.add(file_ins)
10291030

10301031

1032+
# === tataki EDAM enrichment ===
1033+
1034+
1035+
def add_tataki_edam(crate: ROCrate, run_dir: Path) -> None:
    """Enrich output File entities with EDAM format IDs from tataki.

    Runs tataki via Docker against all files in the outputs directory and
    replaces their ``encodingFormat`` with a proper EDAM ontology entity.

    Enrichment is always best-effort and never causes the run to fail:

    * no ``docker`` binary, no outputs directory, or no output files:
      returns without logging;
    * Docker cannot be started, times out, exits non-zero, or prints
      something that is not a JSON object: logs a warning and returns;
    * a file tataki could not identify (missing/empty ``id``), or a path
      that has no entity in the crate: left untouched.

    Args:
        crate: RO-Crate whose output File entities are updated in place.
        run_dir: Run directory that contains the outputs directory.
    """
    if shutil.which("docker") is None:
        return

    outputs_dir = run_dir / RUN_DIR_STRUCTURE["outputs_dir"]
    if not outputs_dir.is_dir():
        return

    rel_paths = [p.relative_to(outputs_dir).as_posix() for p in outputs_dir.glob("**/*") if p.is_file()]
    if not rel_paths:
        return

    cmd = [
        "docker",
        "run",
        "--rm",
        "-v",
        # Docker treats a non-absolute source as a *named volume*, so the
        # bind-mount source must be an absolute path.
        f"{outputs_dir.resolve()}:/work",
        "-w",
        "/work",
        _TATAKI_IMAGE,
        "-f",
        "json",
        "--quiet",
        *[f"/work/{rel}" for rel in rel_paths],
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, check=False, timeout=300)
    except subprocess.TimeoutExpired:
        LOGGER.warning("tataki Docker command timed out after 300s")
        return
    except OSError as exc:
        # e.g. docker removed after the which() check, permission denied, or
        # an argument list too long for the OS when there are many outputs.
        LOGGER.warning("tataki Docker command could not be started: %s", exc)
        return
    if proc.returncode != 0:
        # stderr is arbitrary tool output; never let a decode error escape.
        stderr_head = proc.stderr.decode(errors="replace")[:500]
        LOGGER.warning("tataki Docker command failed (rc=%d): %s", proc.returncode, stderr_head)
        return
    try:
        detections = json.loads(proc.stdout)
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
        LOGGER.warning("Failed to parse tataki JSON output")
        return
    if not isinstance(detections, dict):
        LOGGER.warning("Unexpected tataki JSON output: expected an object, got %s", type(detections).__name__)
        return

    outputs_prefix = Path(RUN_DIR_STRUCTURE["outputs_dir"])
    # One ContextEntity per EDAM ID, shared by every file with that format.
    edam_entities: dict[str, ContextEntity] = {}

    for container_path, detection in detections.items():
        if not isinstance(detection, dict):
            continue
        edam_id = detection.get("id")
        if not isinstance(edam_id, str) or not edam_id:
            # tataki could not identify this file; keep the existing format.
            continue

        # Map the in-container path back to the crate-relative entity ID.
        rel_from_work = container_path.removeprefix("/work/")
        entity = crate.get(str(outputs_prefix / rel_from_work))
        if entity is None:
            continue

        edam_entity = edam_entities.get(edam_id)
        if edam_entity is None:
            edam_label = detection.get("label")
            edam_entity = ContextEntity(
                crate,
                edam_id,
                properties={"@type": "Thing", "name": edam_label or edam_id},
            )
            crate.add(edam_entity)
            edam_entities[edam_id] = edam_entity
        entity["encodingFormat"] = edam_entity
1102+
10311103
# === Entry points ===
10321104

10331105

@@ -1083,6 +1155,7 @@ def generate_ro_crate_metadata(run_dir: Path) -> dict[str, Any]:
10831155
action.append_to("executedBy", sapporo_ins, compact=True)
10841156

10851157
add_readme_entity(crate, run_dir, run_id)
1158+
add_tataki_edam(crate, run_dir)
10861159

10871160
result: dict[str, Any] = crate.metadata.generate()
10881161

tests/unit/test_ro_crate.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
add_file_stats,
2424
add_multiqc_stats,
2525
add_samtools_stats,
26+
add_tataki_edam,
2627
add_vcftools_stats,
2728
add_workflow_entity,
2829
compute_sha256,
@@ -1716,6 +1717,174 @@ def test_generated_by_has_vcftools_software(self, tmp_path: Path, mocker: "Mocke
17161717
assert generated_by is not None
17171718

17181719

1720+
class TestAddTatakiEdam:
    """Tests for ``add_tataki_edam`` (tataki-based EDAM format enrichment).

    Docker is never actually invoked: ``shutil.which`` and ``subprocess.run``
    are patched in every test so the results do not depend on whether the
    machine running the tests has Docker installed.
    """

    _TATAKI_JSON = json.dumps(
        {
            "/work/result.tsv": {
                "id": "http://edamontology.org/format_3475",
                "label": "TSV",
                "decompressed": {"id": None, "label": None},
            },
        }
    ).encode()

    def _make_run_dir_with_outputs(self, tmp_path: Path, output_files: dict[str, str | bytes]) -> Path:
        """Create a run directory containing the given output files."""
        return create_run_dir(
            tmp_path,
            RUN_ID,
            exit_code="0",
            end_time="2024-01-01T00:10:00",
            wf_params="{}",
            output_files=output_files,
            outputs_json=[],
        )

    @staticmethod
    def _find_output_entity(graph: list[dict[str, object]], filename: str) -> dict[str, object] | None:
        """Return the output File entity for *filename* from a JSON-LD graph, or None."""
        from sapporo.config import RUN_DIR_STRUCTURE

        outputs_prefix = RUN_DIR_STRUCTURE["outputs_dir"]
        for node in graph:
            node_id = str(node.get("@id", ""))
            if node_id.startswith(outputs_prefix) and filename in node_id:
                return node
        return None

    def test_skips_when_docker_not_available(self, tmp_path: Path) -> None:
        """Should return without running anything when docker is not found."""
        rd = self._make_run_dir_with_outputs(tmp_path, {"result.tsv": "a\tb\n"})
        crate = create_base_crate()
        with patch("sapporo.ro_crate.shutil.which", return_value=None), patch(
            "sapporo.ro_crate.subprocess.run"
        ) as run_mock:
            add_tataki_edam(crate, rd)
        run_mock.assert_not_called()
        # No EDAM entities added
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_skips_when_outputs_dir_missing(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should not invoke Docker when there is no outputs directory to scan."""
        rd = create_run_dir(tmp_path, RUN_ID, exit_code="0", end_time="2024-01-01T00:10:00", wf_params="{}")
        crate = create_base_crate()
        # Pretend Docker exists so the early return is not the "no docker" one.
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        run_mock = mocker.patch("sapporo.ro_crate.subprocess.run")
        add_tataki_edam(crate, rd)
        run_mock.assert_not_called()
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_skips_when_no_output_files(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should not invoke Docker when the outputs directory is empty."""
        rd = create_run_dir(tmp_path, RUN_ID, exit_code="0", end_time="2024-01-01T00:10:00", wf_params="{}")
        from sapporo.config import RUN_DIR_STRUCTURE

        rd.joinpath(RUN_DIR_STRUCTURE["outputs_dir"]).mkdir(parents=True, exist_ok=True)
        crate = create_base_crate()
        # Pretend Docker exists so the early return is not the "no docker" one.
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        run_mock = mocker.patch("sapporo.ro_crate.subprocess.run")
        add_tataki_edam(crate, rd)
        run_mock.assert_not_called()
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_skips_on_docker_failure(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should log warning and return when Docker command fails."""
        from subprocess import CompletedProcess

        rd = self._make_run_dir_with_outputs(tmp_path, {"result.tsv": "a\tb\n"})
        crate = create_base_crate()
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        mocker.patch(
            "sapporo.ro_crate.subprocess.run",
            return_value=CompletedProcess(args=[], returncode=1, stdout=b"", stderr=b"image not found"),
        )
        logger_mock = mocker.patch("sapporo.ro_crate.LOGGER")
        add_tataki_edam(crate, rd)
        logger_mock.warning.assert_called_once()
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_skips_on_timeout(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should log warning and return when Docker command times out."""
        import subprocess as sp

        rd = self._make_run_dir_with_outputs(tmp_path, {"result.tsv": "a\tb\n"})
        crate = create_base_crate()
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        mocker.patch("sapporo.ro_crate.subprocess.run", side_effect=sp.TimeoutExpired(cmd="docker", timeout=300))
        logger_mock = mocker.patch("sapporo.ro_crate.LOGGER")
        add_tataki_edam(crate, rd)
        logger_mock.warning.assert_called_once()
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_skips_on_invalid_json(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should log warning and return when tataki output is not valid JSON."""
        from subprocess import CompletedProcess

        rd = self._make_run_dir_with_outputs(tmp_path, {"result.tsv": "a\tb\n"})
        crate = create_base_crate()
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        mocker.patch(
            "sapporo.ro_crate.subprocess.run",
            return_value=CompletedProcess(args=[], returncode=0, stdout=b"not json", stderr=b""),
        )
        logger_mock = mocker.patch("sapporo.ro_crate.LOGGER")
        add_tataki_edam(crate, rd)
        logger_mock.warning.assert_called_once()
        edam_entities = [e for e in crate.get_entities() if "edamontology.org" in str(e.id)]
        assert edam_entities == []

    def test_enriches_encoding_format(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should replace encodingFormat with EDAM entity from tataki."""
        from subprocess import CompletedProcess

        rd = self._make_run_dir_with_outputs(tmp_path, {"result.tsv": "col1\tcol2\nval1\tval2\n"})
        mocker.patch("sapporo.ro_crate.shutil.which", return_value="/usr/bin/docker")
        mocker.patch(
            "sapporo.ro_crate.subprocess.run",
            return_value=CompletedProcess(args=[], returncode=0, stdout=self._TATAKI_JSON, stderr=b""),
        )

        jsonld = generate_ro_crate_metadata(rd)
        graph = jsonld["@graph"]

        # Find the output file entity
        file_entity = self._find_output_entity(graph, "result.tsv")
        assert file_entity is not None

        # encodingFormat should be the EDAM entity reference
        enc = file_entity["encodingFormat"]
        assert enc == {"@id": "http://edamontology.org/format_3475"}

        # EDAM entity should exist in the graph
        edam_entity = next((e for e in graph if e.get("@id") == "http://edamontology.org/format_3475"), None)
        assert edam_entity is not None
        assert edam_entity["@type"] == "Thing"
        assert edam_entity["name"] == "TSV"

    def test_skips_undetected_files(self, tmp_path: Path, mocker: "MockerFixture") -> None:
        """Should not modify encodingFormat when tataki returns null id."""
        from subprocess import CompletedProcess

        tataki_null = json.dumps(
            {
                "/work/unknown.bin": {
                    "id": None,
                    "label": None,
                    "decompressed": {"id": None, "label": None},
                },
            }
        ).encode()

        rd = self._make_run_dir_with_outputs(tmp_path, {"unknown.bin": "\x00\x01\x02"})

        # Baseline: Docker unavailable, so no Docker-based step touches the crate.
        which_mock = mocker.patch("sapporo.ro_crate.shutil.which", return_value=None)
        baseline_entity = self._find_output_entity(generate_ro_crate_metadata(rd)["@graph"], "unknown.bin")
        assert baseline_entity is not None

        # Same run, but tataki runs and reports that it could not identify the file.
        which_mock.return_value = "/usr/bin/docker"
        mocker.patch(
            "sapporo.ro_crate.subprocess.run",
            return_value=CompletedProcess(args=[], returncode=0, stdout=tataki_null, stderr=b""),
        )
        file_entity = self._find_output_entity(generate_ro_crate_metadata(rd)["@graph"], "unknown.bin")
        assert file_entity is not None

        # The extension-based encodingFormat (or its absence) must be preserved.
        assert file_entity.get("encodingFormat") == baseline_entity.get("encodingFormat")
1886+
1887+
17191888
class TestAddFileStats:
17201889
def test_skips_when_docker_not_available(self, tmp_path: Path) -> None:
17211890
"""Should return immediately when docker binary is not found."""

0 commit comments

Comments
 (0)