improved loom interface for sparse matrices

falexwolf · falexwolf · commit 8b20571c26ea · 2018-03-19T17:39:56.000+01:00
diff --git a/anndata/readwrite/read.py b/anndata/readwrite/read.py
@@ -84,20 +84,20 @@ def read_umi_tools(filename: Union[Path, str]) -> AnnData:
     # import gzip to read a gzipped file :-)
     import gzip
     from pandas import DataFrame
-    
+
     dod = {}  # this will contain basically everything
     fh = gzip.open(filename)
     header = fh.readline()  # read the first line
-    
+
     for line in fh:
         t = line.decode('ascii').split('\t')  # gzip read bytes, hence the decoding
         try:
             dod[t[1]].update({t[0]:int(t[2])})
         except KeyError:
             dod[t[1]] = {t[0]:int(t[2])}
-    
+
     df = DataFrame.from_dict(dod, orient='index')  # build the matrix
-    df.fillna(value = 0., inplace=True)  # many NaN, replace with zeros
+    df.fillna(value=0., inplace=True)  # many NaN, replace with zeros
     return AnnData(np.array(df), {'obs_names': df.index}, {'var_names': df.columns})
 
 
@@ -138,28 +138,39 @@ def read_hdf(filename: Union[Path, str], key: str) -> AnnData:
     return adata
 
 
-def read_loom(filename: Union[Path, str]) -> AnnData:
+def read_loom(filename: Union[Path, str], sparse=False) -> AnnData:
     """Read `.loom`-formatted hdf5 file.
 
+    This reads the whole file into memory.
+
+    The data matrix is read as a dense array unless you explicitly pass
+    `sparse=True`.
+
     Parameters
     ----------
     filename : `str`
         The filename.
+    sparse : `bool`, optional (default: `False`)
+        Whether to read the data matrix as a sparse matrix.
 
     Returns
     -------
     An :class:`~anndata.AnnData` object.
     """
     filename = str(filename)  # allow passing pathlib.Path objects
     from loompy import connect
-    lc = connect(filename, 'r')
-    with h5py.File(filename, 'r') as f:
-        X = f['matrix'][()]
-    adata = AnnData(
-        X.T,
-        obs=dict(lc.col_attrs),  # not ideal: make the generator a dict...
-        var=dict(lc.row_attrs))
-    lc.close()
+    if sparse:
+        with connect(filename, 'r') as lc:
+            X = lc.sparse()
+    else:
+        with h5py.File(filename, 'r') as f:
+            X = f['matrix'][()]
+    with connect(filename, 'r') as lc:
+        adata = AnnData(
+            X.T,
+            obs=dict(lc.col_attrs),  # not ideal: make the generator a dict...
+            var=dict(lc.row_attrs))
+        lc.close()
     return adata
 
 
diff --git a/anndata/readwrite/write.py b/anndata/readwrite/write.py
@@ -66,8 +66,10 @@ def write_loom(filename: Union[Path, str], adata: AnnData):
     if issparse(X):
         logg.info(
             '... writing to \'.loom\' file densifies sparse matrix')
-        X = X.toarray()
+        X = X.tocoo()
     from loompy import create
+    if os.path.exists(filename):
+        os.remove(filename)
     create(filename, X, row_attrs=row_attrs, col_attrs=col_attrs)
 
 
diff --git a/anndata/tests/readwrite.py b/anndata/tests/readwrite.py
@@ -61,16 +61,16 @@ def test_readwrite_h5ad():
 
 
 def test_readwrite_loom():
-    for typ in [np.array, csr_matrix]:
+    for i, typ in enumerate([np.array, csr_matrix]):
         X = typ(X_list)
         adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
         adata.write_loom('./test.loom')
-        adata = ad.read_loom('./test.loom')
+        adata = ad.read_loom('./test.loom', sparse=(i == 1))
         if isinstance(X, np.ndarray):
             assert np.allclose(adata.X, X)
         else:
             # TODO: this should not be necessary
-            assert np.allclose(adata.X, X.toarray())
+            assert np.allclose(adata.X.toarray(), X.toarray())
 
 
 def test_read_csv():