Skip to content

Commit f3629f4

Browse files
committed
Made text file reading much more robust.
Fixed comment colnames
1 parent 19cd249 commit f3629f4

File tree

3 files changed

+31
-7
lines changed

3 files changed

+31
-7
lines changed

anndata/readwrite/read.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -197,22 +197,33 @@ def read_text(filename, delimiter=None, first_column_names=None, dtype='float32'
197197
return _read_text(filename, delimiter, first_column_names, dtype)
198198

199199

200+
def iter_lines(file_like):
    """Yield the non-empty lines of ``file_like`` without trailing line breaks.

    Only trailing carriage-return and newline characters are removed from
    each item; any other whitespace is kept, so a line made up solely of
    spaces or tabs is still yielded. Items that are empty after stripping
    are skipped. Lines are produced lazily, in input order.
    """
    without_breaks = (raw.rstrip('\r\n') for raw in file_like)
    # filter(None, ...) drops exactly the falsy items, i.e. the empty strings.
    yield from filter(None, without_breaks)
206+
207+
200208
def _read_text(f, delimiter, first_column_names, dtype) -> AnnData:
201-
header = ''
209+
comments = []
202210
data = []
203-
lines = (l.rstrip('\r\n') for l in f)
211+
lines = iter_lines(f)
204212
col_names = []
205213
row_names = []
206214
# read header and column names
207215
for line in lines:
208216
if line.startswith('#'):
209-
header += line
217+
comment = line.lstrip('# ')
218+
if comment:
219+
comments.append(comment)
210220
else:
211221
if delimiter is not None and delimiter not in line:
212222
raise ValueError('Did not find delimiter "{}" in first line.'
213223
.format(delimiter))
214224
line_list = line.split(delimiter)
215-
if not is_float(line_list[0]):
225+
# the first column might be row names, so check the last
226+
if not is_float(line_list[-1]):
216227
col_names = line_list
217228
# logg.msg(' assuming first line in file stores column names', v=4)
218229
else:
@@ -225,9 +236,9 @@ def _read_text(f, delimiter, first_column_names, dtype) -> AnnData:
225236
break
226237
if not col_names:
227238
# try reading col_names from the last comment line
228-
if len(header) > 0:
239+
if len(comments) > 0:
229240
# logg.msg(' assuming last comment line stores variable names', v=4)
230-
col_names = np.array(header.split('\n')[-2].strip('#').split())
241+
col_names = np.array(comments[-1].split())
231242
# just numbers as col_names
232243
else:
233244
# logg.msg(' did not find column names in file', v=4)
@@ -269,7 +280,7 @@ def _read_text(f, delimiter, first_column_names, dtype) -> AnnData:
269280
# a lot of memory and CPU time
270281
if data[0].size != data[-1].size:
271282
raise ValueError(
272-
'length of first line {} is different from length of last line {}'
283+
'length of first line ({}) is different from length of last line ({})'
273284
.format(data[0].size, data[-1].size))
274285
data = np.array(data, dtype=dtype)
275286
# logg.msg(' constructed array from list of list', t=True, v=4)

anndata/tests/adata-comments.tsv

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# A regular comment
2+
# The next comment is actually colnames
3+
# c1 c2
4+
r1 1.0 0.0
5+
r2 3.0 0.0
6+
r3 5.0 6.0

anndata/tests/readwrite.py

+7
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,13 @@ def test_read_csv():
8080
assert adata.X.tolist() == X_list
8181

8282

83+
def test_read_tsv():
    """Read the tab-separated fixture ``adata-comments.tsv`` via ``read_text``.

    The fixture starts with comment lines, the last of which (``# c1 c2``)
    is expected to supply the variable names; the first column of each data
    row is expected to supply the observation names.
    """
    tsv_path = HERE / 'adata-comments.tsv'
    parsed = ad.read_text(tsv_path, '\t')
    expected_obs = ['r1', 'r2', 'r3']
    expected_var = ['c1', 'c2']
    assert parsed.obs_names.tolist() == expected_obs
    assert parsed.var_names.tolist() == expected_var
    assert parsed.X.tolist() == X_list
88+
89+
8390
def test_write_csv():
8491
for typ in [np.array, csr_matrix]:
8592
X = typ(X_list)

0 commit comments

Comments
 (0)