kr-colab · stsmall · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/docs/genotype_likelihoods.md b/docs/genotype_likelihoods.md
@@ -0,0 +1,97 @@
+# Genotype-likelihood input (ANGSD beagle)
+
+ReLocator loads ANGSD genotype-likelihood data natively. No preprocessing
+script, no intermediate TSV — the same shape as VCF/zarr/matrix inputs:
+
+```bash
+locator \
+    --gl output.beagle.gz \
+    --bam_list bam.filelist \
+    --sample_data data/sample_data.txt \
+    --out out/gl_run/run1
+```
+
+Or from the Python API:
+
+```python
+from locator.core import Locator
+
+loc = Locator({"sample_data": "data/sample_data.txt"})
+genotypes, samples = loc.load_genotypes(
+    gl="output.beagle.gz",
+    bam_list="bam.filelist",
+    gl_mode="dosage",  # or "full_gl"
+)
+```
+
+## Generating the beagle input
+
+Use ANGSD with `-doGlf 2` to write the beagle file Locator reads:
+
+```bash
+angsd \
+    -bam bam.filelist \
+    -ref reference.fa \
+    -GL 2 \
+    -doGlf 2 \
+    -doMajorMinor 1 \
+    -doMaf 1 \
+    -SNP_pval 1e-6 \
+    -minMapQ 20 \
+    -minQ 20 \
+    -minInd 10 \
+    -minMaf 0.05 \
+    -out output
+```
+
+Pass the exact same `bam.filelist` to both ANGSD and Locator. Sample IDs
+in `sample_data.txt` must match `Path(bam).stem` for each BAM, in the
+same column order ANGSD wrote into the beagle.
+
+## Modes
+
+`--gl_mode dosage` (default) emits one row per kept site holding expected
+dosage under a flat prior:
+
+```
+E[geno] = P(AB) + 2 * P(BB)
+```
+
+`--gl_mode full_gl` emits three rows per kept site (AA / AB / BB),
+preserving genotype uncertainty that the dosage scalar collapses. Useful
+at low coverage where a confidently-homozygous site `(0.9, 0.05, 0.05)`
+and a genuinely-uncertain site `(0.4, 0.4, 0.2)` would otherwise map to
+similar dosage values.
+
+## Site filtering
+
+Hard-coded defaults (matching the previous `scripts/gl_to_locator.py`
+script):
+
+| Threshold | Value | Effect |
+|---|---|---|
+| `gl_missing_threshold` | 0.4 | Sample at site is missing if `max(GL_AA, GL_AB, GL_BB) < 0.4` (near-uniform GL = no information) |
+| `max_missing_frac` | 0.10 | Site dropped if missing fraction across samples exceeds this |
+| `min_maf` | 0.01 | Site dropped if mean-dosage-derived MAF falls below this |
+
+These are not yet surfaced as CLI flags. If your dataset needs different
+thresholds, edit `locator/loaders.py:_load_from_gl` directly or open an
+issue.
+
+## Missing data
+
+Sample-level missingness at a site (per `gl_missing_threshold` above) is
+imputed inside the loader to per-site mean dosage (dosage mode) or per-site
+mean GL triplet (full_gl mode). The output flows through ReLocator's
+continuous-dosage filter path (`filter_dosage_matrix`) with no NaNs.
+
+Do **not** pass `--impute_missing` on the CLI for GL inputs; that flag
+operates on biallelic SNP `allel.GenotypeArray` via `np.random.binomial`
+and is not the right primitive for expected-dosage or GL-triplet inputs.
+
+## Sample ID alignment
+
+Sample IDs are derived from BAM filenames (`Path(bam).stem` for each line
+in `--bam_list`). These must match the `sampleID` column in
+`sample_data.txt`. The standard ReLocator reordering in `sort_samples`
+matches the two and warns on any mismatches.
diff --git a/locator/_gl.py b/locator/_gl.py
@@ -0,0 +1,128 @@
+"""Genotype-likelihood (ANGSD beagle) parsing and dosage encoding helpers.
+
+These are internal helpers used by `locator.loaders.DataLoaderMixin._load_from_gl`.
+The public entry points are `loc.load_genotypes(gl=..., bam_list=..., gl_mode=...)`
+and the `locator --gl ... --bam_list ... --gl_mode {dosage,full_gl}` CLI flags.
+"""
+
+from __future__ import annotations
+
+import gzip
+from pathlib import Path
+
+import numpy as np
+
+
+def sample_ids_from_bam_list(bam_list_path):
+    ids = []
+    with open(bam_list_path) as fh:
+        for line in fh:
+            bam = line.strip()
+            if bam:
+                ids.append(Path(bam).stem)
+    if not ids:
+        raise ValueError(f"No BAM paths found in: {bam_list_path}")
+    return ids
+
+
+def load_beagle(beagle_path):
+    """Load beagle.gz file.
+
+    Returns
+    -------
+    markers : list of str, length n_sites
+    gl_flat : np.ndarray float32, shape (n_sites, n_samples * 3)
+              columns are GL triplets [AA, AB, BB] per sample.
+    """
+    print(f"Loading beagle file: {beagle_path}", flush=True)
+    rows = []
+    markers = []
+    open_fn = gzip.open if str(beagle_path).endswith(".gz") else open
+    with open_fn(beagle_path, "rt") as fh:
+        fh.readline()  # discard header
+        for line in fh:
+            fields = line.rstrip("\n").split("\t")
+            markers.append(fields[0])
+            rows.append(fields[3:])  # skip marker, allele1, allele2
+
+    if not rows:
+        raise ValueError(f"No data rows found in beagle file: {beagle_path}")
+
+    try:
+        gl_flat = np.array(rows, dtype=np.float32)
+    except ValueError as e:
+        raise ValueError(f"Failed to parse beagle GL values as float32. Error: {e}")
+
+    print(f"  Loaded {len(markers)} sites", flush=True)
+    return markers, gl_flat
+
+
+def validate_dimensions(gl_flat, n_samples, beagle_path, bam_list_path):
+    expected_cols = n_samples * 3
+    if gl_flat.shape[1] != expected_cols:
+        raise ValueError(
+            f"Beagle file has {gl_flat.shape[1]} GL columns but expected "
+            f"{expected_cols} ({n_samples} samples × 3 GL values per sample).\n"
+            f"  Beagle: {beagle_path}\n"
+            f"  BAM list: {bam_list_path}\n"
+            f"Ensure --bam_list is the same file used in the ANGSD run."
+        )
+
+
+def reshape_gl(gl_flat, n_samples):
+    """Reshape (n_sites, n_samples*3) to (n_sites, n_samples, 3)."""
+    return gl_flat.reshape(gl_flat.shape[0], n_samples, 3)
+
+
+def expected_dosage(gl):
+    """E[dosage] = P(AB) + 2 * P(BB) under a flat prior."""
+    return gl[:, :, 1] + 2.0 * gl[:, :, 2]
+
+
+def detect_missing(gl, gl_missing_threshold):
+    """Return bool mask (n_sites, n_samples), True = sample missing at site."""
+    return gl.max(axis=2) < gl_missing_threshold
+
+
+def impute_dosage_with_site_mean(dosage, missing_mask):
+    """Impute missing dosage values with site-mean across non-missing samples."""
+    for i in range(dosage.shape[0]):
+        mask = missing_mask[i]
+        if not mask.any():
+            continue
+        present = dosage[i, ~mask]
+        dosage[i, mask] = present.mean() if present.size > 0 else 0.0
+    return dosage
+
+
+def impute_gl_with_site_mean(gl, missing_mask):
+    """Impute missing GL triplets with site-mean triplet across non-missing samples."""
+    for i in range(gl.shape[0]):
+        mask = missing_mask[i]
+        if not mask.any():
+            continue
+        present = gl[i, ~mask, :]
+        if present.size == 0:
+            gl[i, mask, :] = np.array([1.0 / 3, 1.0 / 3, 1.0 / 3], dtype=gl.dtype)
+        else:
+            gl[i, mask, :] = present.mean(axis=0)
+    return gl
+
+
+def filter_sites(dosage, missing_mask, min_maf, max_missing_frac):
+    """Apply site-level filters using imputed dosage and pre-imputation missing mask."""
+    missing_frac = missing_mask.mean(axis=1)
+    pass_missing = missing_frac <= max_missing_frac
+
+    mean_dos = dosage.mean(axis=1)
+    maf = np.minimum(mean_dos, 2.0 - mean_dos) / 2.0
+    pass_maf = maf >= min_maf
+
+    keep = pass_missing & pass_maf
+    reasons = {
+        "max_missing_frac": int((~pass_missing).sum()),
+        "min_maf": int((~pass_maf & pass_missing).sum()),
+        "total_removed": int((~keep).sum()),
+        "total_kept": int(keep.sum()),
+    }
+    return keep, reasons
diff --git a/locator/cli.py b/locator/cli.py
@@ -19,8 +19,8 @@ def parse_args():
         help="tab-delimited matrix of genotype dosage with first column "
         "named 'sampleID'. Accepts both hard-call dosage (integers 0/1/2) "
         "and continuous expected dosage from genotype-likelihood pipelines "
-        "(floats in [0, 2]). For the GL preprocessing workflow, see "
-        "`scripts/gl_to_locator.py --help`. E.g., "
+        "(floats in [0, 2]). For GL inputs, prefer the native --gl / "
+        "--bam_list flags. E.g., "
         "sampleID\\tsite1\\tsite2\\t... "
         "msp1\\t0\\t1\\t... "
         "msp2\\t2\\t0\\t...",
@@ -40,6 +40,28 @@ def parse_args():
         type=float,
         help="Drop microsat alleles below this per-locus frequency. default: 0.01",
     )
+    parser.add_argument(
+        "--gl",
+        help="ANGSD beagle.gz file (from -doGlf 2) containing genotype "
+        "likelihoods. Loaded natively into Locator as expected dosage or "
+        "as full GL triplets (see --gl_mode). Requires --bam_list to derive "
+        "sample IDs in the same order they appear in the beagle columns.",
+    )
+    parser.add_argument(
+        "--bam_list",
+        help="BAM file list used in the ANGSD run (one path per line). "
+        "Sample IDs are derived from Path(bam).stem and must appear in the "
+        "same order as the beagle GL columns. Required when --gl is set.",
+    )
+    parser.add_argument(
+        "--gl_mode",
+        choices=("dosage", "full_gl"),
+        default="dosage",
+        help="GL encoding mode. 'dosage' (default) uses expected dosage "
+        "E[geno] = P(AB) + 2*P(BB); 'full_gl' uses three rows per site "
+        "(AA / AB / BB) preserving genotype uncertainty. Ignored unless "
+        "--gl is set.",
+    )
     parser.add_argument(
         "--sample_data",
         help="tab-delimited text file with columns\
@@ -230,6 +252,9 @@ def main():  # noqa: C901
         matrix=args.matrix,
         microsat=args.microsat,
         microsat_min_allele_freq=args.microsat_maf,
+        gl=args.gl,
+        bam_list=args.bam_list,
+        gl_mode=args.gl_mode,
     )
     sample_data, locs = loc.sort_samples(samples, args.sample_data)
 

diff --git a/locator/data/filters.py b/locator/data/filters.py
@@ -172,8 +172,8 @@ def filter_dosage_matrix(
     if np.isnan(dosage).any():
         raise ValueError(
             "dosage matrix contains NaN values; impute upstream "
-            "(e.g. via gl_to_locator.py site-mean fill) before passing "
-            "to ReLocator."
+            "before passing to ReLocator. For GL inputs, use the native "
+            "loader (load_genotypes(gl=..., bam_list=...))."
         )
     n_sites, n_samples = dosage.shape
     mean_dosage = dosage.mean(axis=1)