Merge pull request #28 from hardingnj/add_zarr

Nick Harding · web-flow · commit f5a52eea9660 · 2019-01-08T16:46:51.000Z
Add zarr reading functionality
diff --git a/bin/xpclr b/bin/xpclr
@@ -18,7 +18,7 @@ def main():
     psr.add_argument('--out', "-O", required=True, help='output file')
 
     psr.add_argument('--format', "-F", required=False, default="vcf",
-                     help='input expected. One of "vcf" (default), "hdf5", or "txt"')
+                     help='input expected. One of "vcf" (default), "hdf5", "zarr" or "txt"')
 
     # data inputs for hdf5/VCF format:
     psr.add_argument('--input', '-I', required=False, help='input file vcf or hdf5',
@@ -113,6 +113,17 @@ def main():
             gdistkey=args.gdistkey)
         logging.info("HDF5 loading complete")
 
+    elif args.format == 'zarr':
+
+        logging.info("Loading zarr")
+        g1, g2, positions, genetic_dist = xpclr.util.load_zarr_data(
+            args.input.strip(),
+            chromosome,
+            args.samplesA,
+            args.samplesB,
+            gdistkey=args.gdistkey)
+        logging.info("zarr loading complete")
+
     elif args.format == 'txt':
         # else if mode is text
         logging.info("Loading TXT")
diff --git a/xpclr/__init__.py b/xpclr/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 
 from xpclr import methods
 from xpclr import util
diff --git a/xpclr/methods.py b/xpclr/methods.py
@@ -265,9 +265,9 @@ def determine_weights(genotypes, ldcutoff, isphased=False):
 
     # nans are possible, but rare, ie where only alts in A are at positions
     # missing in B. We consider these sites in LD and they are dropped.
-    ld = allel.stats.ld.rogers_huff_r(d[:], fill=1.0)
+    ld = allel.stats.ld.rogers_huff_r(d[:])
 
-    above_cut = squareform(ld**2) > ldcutoff
+    above_cut = (squareform(ld**2) > ldcutoff) | (squareform(np.isnan(ld)))
 
     # add one as self ld reported as 0
     return 1/(1 + np.sum(above_cut, axis=1))
diff --git a/xpclr/util.py b/xpclr/util.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import h5py
 import allel
 import numpy as np
 import logging
@@ -9,6 +8,8 @@
 # FUNCTIONS
 def load_hdf5_data(hdf5_fn, chrom, s1, s2, gdistkey=None):
 
+    import hdf5
+
     samples1 = get_sample_ids(s1)
     samples2 = get_sample_ids(s2)
 
@@ -30,6 +31,32 @@ def load_hdf5_data(hdf5_fn, chrom, s1, s2, gdistkey=None):
     return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
 
 
+def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
+    """Load genotypes, positions and genetic distances from a zarr store."""
+    import zarr
+
+    samples1 = get_sample_ids(s1)
+    samples2 = get_sample_ids(s2)
+
+    # Only the group stored under the requested chromosome key is read.
+    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]
+    samples_x = zfh["samples"][:]
+    # Sample IDs may come back as bytes or as str; decode only bytes.
+    sample_name = [sid.decode() if isinstance(sid, bytes) else sid
+                   for sid in samples_x.tolist()]
+    # list.index raises ValueError when a requested sample is not present.
+    idx1 = np.array([sample_name.index(sid) for sid in samples1])
+    idx2 = np.array([sample_name.index(sid) for sid in samples2])
+
+    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])
+    pos = allel.SortedIndex(zfh["variants"]["POS"][:])
+
+    # Genetic distances are read from this zarr group's "variants" subgroup.
+    gdist = zfh["variants"][gdistkey][:] if gdistkey is not None else None
+    # Returns (s1 genotypes, s2 genotypes, positions, gdist or None).
+    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
+
+
 def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
 
     tbl = pd.read_csv(mapfn, sep=" ",