cbg-ethz
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 63 additions & 50 deletions
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 2 deletions
diff --git a/src/sr2silo/process/__init__.py b/src/sr2silo/process/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/src/sr2silo/process/convert.py b/src/sr2silo/process/convert.py
Lines changed: 1 addition & 40 deletions
@@ -152,3 +152,6 @@ results
 
 # Bioinformatics files
 .bai
+
+# References directory (ignore all untracked references)
+resources/references/
@@ -18,47 +18,53 @@
 [![Pyright](https://img.shields.io/badge/type%20checked-pyright-blue.svg)](https://github.com/microsoft/pyright)
 
 ### General Use: Convert Nucleotide Alignment Reads - CIGAR in .BAM to Cleartext JSON
-sr2silo can convert millions of Short-Read nucleotide read in the form of a .bam CIGAR
-alignments to cleartext alignments. Further, it will gracefully extract insertions
-and deletions. Optionally, sr2silo can translate and align each read using [diamond / blastX](https://github.com/bbuchfink/diamond). And again handle insertions and deletions.
+sr2silo can convert millions of Short-Read nucleotide reads in the form of .bam CIGAR
+alignments to cleartext alignments compatible with LAPIS-SILO v0.8.0+. It gracefully extracts insertions
+and deletions. Optionally, sr2silo can translate and align each read using [diamond / blastX](https://github.com/bbuchfink/diamond), handling insertions and deletions in amino acid sequences as well.
 
 Your input `.bam/.sam` with one line as:
-````
-294	163	NC_045512.2	79	60	31S220M	=	197	400	CTCTTGTAGAT	FGGGHHHHLMM	...
-````
+```text
+294 163 NC_045512.2 79  60  31S220M =   197 400 CTCTTGTAGAT FGGGHHHHLMM ...
+```
 
-sr2silo outputs per read a JSON (mock output):
+sr2silo outputs one JSON object per read (compatible with LAPIS-SILO v0.8.0+):
 
-```
+```json
 {
-  "metadata":{
-    "read_id":"AV233803:AV044:2411515907:1:10805:5199:3294",
-      ...
-    },
-    "nucleotideInsertions":{
-                            "main":[10 : ACTG]
-                            },
-    "aminoAcidInsertions":{
-                            "E":[],
-                            ...
-                            "ORF1a":[2323 : TG, 2389 : CA],
-                            ...
-                            "S":[23 : A]
-                            },
-    "alignedNucleotideSequences":
-                                {
-                                  "main":"NNNNNNNNNNNNNNNNNNCGGTTTCGTCCGTGTTGCAGCCG...GTGTCAACATCTTAAAGATGGCACTTGTGNNNNNNNNNNNNNNNNNNNNNNNN"
-                                  },
-    "unalignedNucleotideSequences":{
-                                  "main":"CGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGA...TACAGGTTCGCGACGTGCTCGTGTGAAAGATGGCACTTGTG"
-                                  },
-    "alignedAminoAcidSequences":{
-                "E":"",
-                ...
-                "ORF1a":"...XXXMESLVPGFNEKTHVQLSLPVLQVRVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVXXXXXX...",
-                ...
-                "S":""}
-      }
+  "read_id": "AV233803:AV044:2411515907:1:10805:5199:3294",
+  "sample_id": "A1_05_2024_10_08",
+  "batch_id": "20241024_2411515907",
+  "sampling_date": "2024-10-08",
+  "location_name": "Lugano (TI)",
+  "read_length": "250",
+  "location_code": "05",
+  "main": {
+    "sequence": "CGGTTTCGTCCGTGTTGCAGCCG...GTGTCAACATCTTAAAGATGGCACTTGTG",
+    "insertions": ["10:ACTG", "456:TACG"],
+    "offset": 4545
+  },
+  "unaligned_main": "CGGTTTCGTCCGTGTTGCAGCCGATCATCTAGGT...TACAGGTTCGCGACGTGCTCGTGTGAAAGATGGCACTTGTG",
+  "S": {
+    "sequence": "MESLVPGFNEKTHVQLSLPVLQVRVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGV",
+    "insertions": ["23:A", "145:KLM"],
+    "offset": 78
+  },
+  "ORF1a": {
+    "sequence": "XXXMESLVPGFNEKTHVQLSLPVLQVRVRGFGDSVEEVLSEARQHLKDGTCGLV",
+    "insertions": ["2323:TG", "2389:CA"],
+    "offset": 678
+  },
+  "E": null,
+  "M": null,
+  "N": null,
+  "ORF1b": null,
+  "ORF3a": null,
+  "ORF6": null,
+  "ORF7a": null,
+  "ORF7b": null,
+  "ORF8": null,
+  "ORF10": null
+}
 ```
 
 The total output is handled in an `.ndjson.zst`.
@@ -75,22 +81,29 @@ For detailed information about resource requirements, especially for cluster env
 
 ### Wrangling Short-Read Genomic Alignments for SILO Database
 
-Originally this was started for wargeling short-read genomic alignments for from wastewater-sampling, into a format for easy import into [Loculus](https://github.com/loculus-project/loculus) and its sequence database SILO.
+Originally, this was started for wrangling short-read genomic alignments from wastewater sampling into a format for easy import into [Loculus](https://github.com/loculus-project/loculus) and its sequence database SILO.
 
-sr2silo is designed to process a nucliotide alignments from `.bam` files with metadata, translate and align reads in amino acids, gracefully handling all insertions and deletions and upload the results to the backend [LAPIS-SILO](https://github.com/GenSpectrum/LAPIS-SILO).
+sr2silo is designed to process nucleotide alignments from `.bam` files with metadata, translate and align reads in amino acids, gracefully handle all insertions and deletions, and upload the results to the backend [LAPIS-SILO](https://github.com/GenSpectrum/LAPIS-SILO) v0.8.0+.
 
-For the V-Pipe to Silo implementation we carry through the following metadata:
-```
-  "metadata":{
-    "read_id":"AV233803:AV044:2411515907:1:10805:5199:3294",
-    "sample_id":"A1_05_2024_10_08",
-    "batch_id":"20241024_2411515907",
-    "sampling_date":"2024-10-08",
-    "location_name":"Lugano (TI)",
-    "read_length":"250",
-    "primer_protocol":"v532",
-    "location_code":"5"
-    }
+**New Output Format for LAPIS-SILO v0.8.0+:**
+- Metadata fields are now at the root level (no nested "metadata" object)
+- Genomic segments use a structured format with `sequence`, `insertions`, and `offset` fields
+- The main nucleotide segment is required and contains the primary alignment
+- Gene segments (S, ORF1a, etc.) contain amino acid sequences or `null` if empty
+- Insertions use the format `"position:sequence"` (e.g., `"123:ACGT"`)
+- Unaligned sequences are prefixed with `unaligned_` (e.g., `unaligned_main`)
+
+For the V-Pipe to Silo implementation we include the following metadata fields at the root level:
+```json
+{
+  "read_id": "AV233803:AV044:2411515907:1:10805:5199:3294",
+  "sample_id": "A1_05_2024_10_08",
+  "batch_id": "20241024_2411515907",
+  "sampling_date": "2024-10-08",
+  "location_name": "Lugano (TI)",
+  "read_length": "250",
+  "location_code": "05"
+}
 ```
 
 ### Setting up the repository
 
@@ -1,6 +1,6 @@
 # conda recipe
 {% set name = "sr2silo" %}
-{% set version = "1.1.1" %}
+{% set version = "1.2.0" %}
 
 package:
   name: {{ name|lower }}
 
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "sr2silo"
-version = "1.1.1"
-description = "ETL tool for importing short-read sequencing data into SILO database, powering Loculus."
+version = "1.2.0"
+description = "ETL tool for importing short-read sequencing data into SILO database (v0.8.0+), powering Loculus."
 authors = ["Gordon Julian Koehn <gordon.koehn@dbsse.ethz.ch>"]
 readme = "README.md"
 packages = [{ include = "sr2silo", from = "src" }]
 
@@ -9,7 +9,6 @@
     bam_to_fasta_query,
     bam_to_sam,
     get_gene_set_from_ref,
-    pad_alignment,
     sam_to_bam,
     sort_and_index_bam,
     sort_bam_file,
@@ -37,7 +36,6 @@
     "bam_to_sam",
     "get_gene_set_from_ref",
     "get_gene_set_from_ref",
-    "pad_alignment",
     "sam_to_bam",
     "sort_and_index_bam",
     "sort_bam_file",
 
@@ -5,7 +5,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 import pysam
 
@@ -315,45 +315,6 @@ def parse_cigar(cigar: str) -> List[Tuple[int, str]]:
     ]
 
 
-def pad_alignment(
-    sequence: Union[List[str], str],
-    reference_start: int,
-    reference_length: int,
-    unknown_char: str = "N",
-) -> str:
-    """
-    Pad the sequence to match the reference length.
-
-    This function takes a sequence and pads it with a specified character to align it
-    with a reference sequence of a given length. The padding is added to both the
-    beginning and the end of the sequence as needed.
-
-    Args:
-        sequence (Union[List[str], str]): The sequence to be padded.
-        reference_start (int): The starting position of the reference sequence.
-        reference_length (int): The total length of the reference sequence.
-        unknown_char (str, optional): The character to use for padding. Defaults
-                             to "N" for Nucleotides, choose "X" for Amino Acids.
-
-    Returns:
-        str: The padded sequence as a single string.
-    """
-
-    # Combine the aligned sequence
-    aligned_str = "".join(sequence)
-
-    # Calculate the padding needed for the left and right
-    left_padding = unknown_char * reference_start
-    right_padding = unknown_char * (
-        reference_length - len(aligned_str) - reference_start
-    )
-
-    # Pad the aligned sequence
-    padded_alignment = left_padding + aligned_str + right_padding
-
-    return padded_alignment
-
-
 def sam_to_seq_and_indels(
     seq: str, cigar: str
 ) -> Tuple[str, List[Insertion], List[Tuple[int, int]]]: