Skip to content

Commit f5a52ee

Browse files
author
Nick Harding
authored
Merge pull request #28 from hardingnj/add_zarr
Add zarr reading functionality
2 parents 5ba1a3d + 3dfcc69 commit f5a52ee

File tree

4 files changed

+43
-5
lines changed

4 files changed

+43
-5
lines changed

bin/xpclr

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def main():
1818
psr.add_argument('--out', "-O", required=True, help='output file')
1919

2020
psr.add_argument('--format', "-F", required=False, default="vcf",
21-
help='input expected. One of "vcf" (default), "hdf5", or "txt"')
21+
help='input expected. One of "vcf" (default), "hdf5", "zarr" or "txt"')
2222

2323
# data inputs for hdf5/VCF format:
2424
psr.add_argument('--input', '-I', required=False, help='input file vcf or hdf5',
@@ -113,6 +113,17 @@ def main():
113113
gdistkey=args.gdistkey)
114114
logging.info("HDF5 loading complete")
115115

116+
elif args.format == 'zarr':
117+
118+
logging.info("Loading zarr")
119+
g1, g2, positions, genetic_dist = xpclr.util.load_zarr_data(
120+
args.input.strip(),
121+
chromosome,
122+
args.samplesA,
123+
args.samplesB,
124+
gdistkey=args.gdistkey)
125+
logging.info("zarr loading complete")
126+
116127
elif args.format == 'txt':
117128
# else if mode is text
118129
logging.info("Loading TXT")

xpclr/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.0"
1+
__version__ = "1.1.0"
22

33
from xpclr import methods
44
from xpclr import util

xpclr/methods.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,9 @@ def determine_weights(genotypes, ldcutoff, isphased=False):
265265

266266
# nans are possible, but rare, ie where only alts in A are at positions
267267
# missing in B. We consider these sites in LD and they are dropped.
268-
ld = allel.stats.ld.rogers_huff_r(d[:], fill=1.0)
268+
ld = allel.stats.ld.rogers_huff_r(d[:])
269269

270-
above_cut = squareform(ld**2) > ldcutoff
270+
above_cut = (squareform(ld**2) > ldcutoff) | (squareform(np.isnan(ld)))
271271

272272
# add one as self ld reported as 0
273273
return 1/(1 + np.sum(above_cut, axis=1))

xpclr/util.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import pandas as pd
2-
import h5py
32
import allel
43
import numpy as np
54
import logging
@@ -9,6 +8,8 @@
98
# FUNCTIONS
109
def load_hdf5_data(hdf5_fn, chrom, s1, s2, gdistkey=None):
1110

11+
import hdf5
12+
1213
samples1 = get_sample_ids(s1)
1314
samples2 = get_sample_ids(s2)
1415

@@ -30,6 +31,32 @@ def load_hdf5_data(hdf5_fn, chrom, s1, s2, gdistkey=None):
3031
return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
3132

3233

34+
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
35+
36+
import zarr
37+
38+
samples1 = get_sample_ids(s1)
39+
samples2 = get_sample_ids(s2)
40+
41+
zfh = zarr.open_group(zarr_fn, mode="r")[chrom]
42+
43+
samples_x = zfh["samples"][:]
44+
sample_name = [sid.decode() for sid in samples_x.tolist()]
45+
46+
idx1 = np.array([sample_name.index(sid) for sid in samples1])
47+
idx2 = np.array([sample_name.index(sid) for sid in samples2])
48+
49+
g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])
50+
51+
pos = allel.SortedIndex(zfh["variants"]["POS"][:])
52+
if gdistkey is not None:
53+
gdist = h5fh["variants"][gdistkey][:]
54+
else:
55+
gdist = None
56+
57+
return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
58+
59+
3360
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
3461

3562
tbl = pd.read_csv(mapfn, sep=" ",

0 commit comments

Comments
 (0)