Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions docs/genotype_likelihoods.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Genotype-likelihood input (ANGSD beagle)

ReLocator loads ANGSD genotype-likelihood data natively. No preprocessing
script, no intermediate TSV — the same shape as VCF/zarr/matrix inputs:

```bash
locator \
--gl output.beagle.gz \
--bam_list bam.filelist \
--sample_data data/sample_data.txt \
--out out/gl_run/run1
```

Or from the Python API:

```python
from locator.core import Locator

loc = Locator({"sample_data": "data/sample_data.txt"})
genotypes, samples = loc.load_genotypes(
gl="output.beagle.gz",
bam_list="bam.filelist",
gl_mode="dosage", # or "full_gl"
)
```

## Generating the beagle input

Use ANGSD with `-doGlf 2` to write the beagle file Locator reads:

```bash
angsd \
-bam bam.filelist \
-ref reference.fa \
-GL 2 \
-doGlf 2 \
-doMajorMinor 1 \
-doMaf 1 \
-SNP_pval 1e-6 \
-minMapQ 20 \
-minQ 20 \
-minInd 10 \
-minMaf 0.05 \
-out output
```

Pass the exact same `bam.filelist` to both ANGSD and Locator. Sample IDs
in `sample_data.txt` must match `Path(bam).stem` for each BAM, in the
same column order ANGSD wrote into the beagle.

## Modes

`--gl_mode dosage` (default) emits one row per kept site holding expected
dosage under a flat prior:

```
E[geno] = P(AB) + 2 * P(BB)
```

`--gl_mode full_gl` emits three rows per kept site (AA / AB / BB),
preserving genotype uncertainty that the dosage scalar collapses. Useful
at low coverage where a confidently-homozygous site `(0.9, 0.05, 0.05)`
and a genuinely-uncertain site `(0.4, 0.4, 0.2)` would otherwise map to
similar dosage values.

## Site filtering

Hard-coded defaults (matching the previous `scripts/gl_to_locator.py`
script):

| Threshold | Value | Effect |
|---|---|---|
| `gl_missing_threshold` | 0.4 | Sample at site is missing if `max(GL_AA, GL_AB, GL_BB) < 0.4` (near-uniform GL = no information) |
| `max_missing_frac` | 0.10 | Site dropped if missing fraction across samples exceeds this |
| `min_maf` | 0.01 | Site dropped if mean-dosage-derived MAF falls below this |

These are not yet surfaced as CLI flags. If your dataset needs different
thresholds, edit `locator/loaders.py:_load_from_gl` directly or open an
issue.

## Missing data

Sample-level missingness at a site (per `gl_missing_threshold` above) is
imputed inside the loader to per-site mean dosage (dosage mode) or per-site
mean GL triplet (full_gl mode). The output flows through ReLocator's
continuous-dosage filter path (`filter_dosage_matrix`) with no NaNs.

Do **not** pass `--impute_missing` on the CLI for GL inputs; that flag
operates on biallelic SNP `allel.GenotypeArray` via `np.random.binomial`
and is not the right primitive for expected-dosage or GL-triplet inputs.

## Sample ID alignment

Sample IDs are derived from BAM filenames (`Path(bam).stem` for each line
in `--bam_list`). These must match the `sampleID` column in
`sample_data.txt`. The standard ReLocator reordering in `sort_samples`
matches the two and warns on any mismatches.
128 changes: 128 additions & 0 deletions locator/_gl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Genotype-likelihood (ANGSD beagle) parsing and dosage encoding helpers.

These are internal helpers used by `locator.loaders.DataLoaderMixin._load_from_gl`.
The public entry points are `loc.load_genotypes(gl=..., bam_list=..., gl_mode=...)`
and the `locator --gl ... --bam_list ... --gl_mode {dosage,full_gl}` CLI flags.
"""

from __future__ import annotations

import gzip
from pathlib import Path

import numpy as np


def sample_ids_from_bam_list(bam_list_path):
ids = []
with open(bam_list_path) as fh:
for line in fh:
bam = line.strip()
if bam:
ids.append(Path(bam).stem)
if not ids:
raise ValueError(f"No BAM paths found in: {bam_list_path}")
return ids


def load_beagle(beagle_path):
"""Load beagle.gz file.

Returns
-------
markers : list of str, length n_sites
gl_flat : np.ndarray float32, shape (n_sites, n_samples * 3)
columns are GL triplets [AA, AB, BB] per sample.
"""
print(f"Loading beagle file: {beagle_path}", flush=True)
rows = []
markers = []
open_fn = gzip.open if str(beagle_path).endswith(".gz") else open
with open_fn(beagle_path, "rt") as fh:
fh.readline() # discard header
for line in fh:
fields = line.rstrip("\n").split("\t")
markers.append(fields[0])
rows.append(fields[3:]) # skip marker, allele1, allele2

if not rows:
raise ValueError(f"No data rows found in beagle file: {beagle_path}")

try:
gl_flat = np.array(rows, dtype=np.float32)
except ValueError as e:
raise ValueError(f"Failed to parse beagle GL values as float32. Error: {e}")

print(f" Loaded {len(markers)} sites", flush=True)
return markers, gl_flat


def validate_dimensions(gl_flat, n_samples, beagle_path, bam_list_path):
expected_cols = n_samples * 3
if gl_flat.shape[1] != expected_cols:
raise ValueError(
f"Beagle file has {gl_flat.shape[1]} GL columns but expected "
f"{expected_cols} ({n_samples} samples × 3 GL values per sample).\n"
f" Beagle: {beagle_path}\n"
f" BAM list: {bam_list_path}\n"
f"Ensure --bam_list is the same file used in the ANGSD run."
)


def reshape_gl(gl_flat, n_samples):
"""Reshape (n_sites, n_samples*3) to (n_sites, n_samples, 3)."""
return gl_flat.reshape(gl_flat.shape[0], n_samples, 3)


def expected_dosage(gl):
"""E[dosage] = P(AB) + 2 * P(BB) under a flat prior."""
return gl[:, :, 1] + 2.0 * gl[:, :, 2]


def detect_missing(gl, gl_missing_threshold):
"""Return bool mask (n_sites, n_samples), True = sample missing at site."""
return gl.max(axis=2) < gl_missing_threshold


def impute_dosage_with_site_mean(dosage, missing_mask):
"""Impute missing dosage values with site-mean across non-missing samples."""
for i in range(dosage.shape[0]):
mask = missing_mask[i]
if not mask.any():
continue
present = dosage[i, ~mask]
dosage[i, mask] = present.mean() if present.size > 0 else 0.0
return dosage


def impute_gl_with_site_mean(gl, missing_mask):
"""Impute missing GL triplets with site-mean triplet across non-missing samples."""
for i in range(gl.shape[0]):
mask = missing_mask[i]
if not mask.any():
continue
present = gl[i, ~mask, :]
if present.size == 0:
gl[i, mask, :] = np.array([1.0 / 3, 1.0 / 3, 1.0 / 3], dtype=gl.dtype)
else:
gl[i, mask, :] = present.mean(axis=0)
return gl


def filter_sites(dosage, missing_mask, min_maf, max_missing_frac):
"""Apply site-level filters using imputed dosage and pre-imputation missing mask."""
missing_frac = missing_mask.mean(axis=1)
pass_missing = missing_frac <= max_missing_frac

mean_dos = dosage.mean(axis=1)
maf = np.minimum(mean_dos, 2.0 - mean_dos) / 2.0
pass_maf = maf >= min_maf

keep = pass_missing & pass_maf
reasons = {
"max_missing_frac": int((~pass_missing).sum()),
"min_maf": int((~pass_maf & pass_missing).sum()),
"total_removed": int((~keep).sum()),
"total_kept": int(keep.sum()),
}
return keep, reasons
29 changes: 27 additions & 2 deletions locator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ def parse_args():
help="tab-delimited matrix of genotype dosage with first column "
"named 'sampleID'. Accepts both hard-call dosage (integers 0/1/2) "
"and continuous expected dosage from genotype-likelihood pipelines "
"(floats in [0, 2]). For the GL preprocessing workflow, see "
"`scripts/gl_to_locator.py --help`. E.g., "
"(floats in [0, 2]). For GL inputs, prefer the native --gl / "
"--bam_list flags. E.g., "
"sampleID\\tsite1\\tsite2\\t... "
"msp1\\t0\\t1\\t... "
"msp2\\t2\\t0\\t...",
Expand All @@ -40,6 +40,28 @@ def parse_args():
type=float,
help="Drop microsat alleles below this per-locus frequency. default: 0.01",
)
parser.add_argument(
"--gl",
help="ANGSD beagle.gz file (from -doGlf 2) containing genotype "
"likelihoods. Loaded natively into Locator as expected dosage or "
"as full GL triplets (see --gl_mode). Requires --bam_list to derive "
"sample IDs in the same order they appear in the beagle columns.",
)
parser.add_argument(
"--bam_list",
help="BAM file list used in the ANGSD run (one path per line). "
"Sample IDs are derived from Path(bam).stem and must appear in the "
"same order as the beagle GL columns. Required when --gl is set.",
)
parser.add_argument(
"--gl_mode",
choices=("dosage", "full_gl"),
default="dosage",
help="GL encoding mode. 'dosage' (default) uses expected dosage "
"E[geno] = P(AB) + 2*P(BB); 'full_gl' uses three rows per site "
"(AA / AB / BB) preserving genotype uncertainty. Ignored unless "
"--gl is set.",
)
parser.add_argument(
"--sample_data",
help="tab-delimited text file with columns\
Expand Down Expand Up @@ -230,6 +252,9 @@ def main(): # noqa: C901
matrix=args.matrix,
microsat=args.microsat,
microsat_min_allele_freq=args.microsat_maf,
gl=args.gl,
bam_list=args.bam_list,
gl_mode=args.gl_mode,
)
sample_data, locs = loc.sort_samples(samples, args.sample_data)

Expand Down
4 changes: 2 additions & 2 deletions locator/data/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ def filter_dosage_matrix(
if np.isnan(dosage).any():
raise ValueError(
"dosage matrix contains NaN values; impute upstream "
"(e.g. via gl_to_locator.py site-mean fill) before passing "
"to ReLocator."
"before passing to ReLocator. For GL inputs, use the native "
"loader (load_genotypes(gl=..., bam_list=...))."
)
n_sites, n_samples = dosage.shape
mean_dosage = dosage.mean(axis=1)
Expand Down
Loading
Loading