Skip to content

Commit 0fa8af7

Browse files
Read specific columns as str (#52)
* Read specific columns as str * merge with current main and add test * fix function signature --------- Co-authored-by: Alister Burt <alisterburt@gmail.com>
1 parent 6b50943 commit 0fa8af7

File tree

3 files changed

+109
-38
lines changed

3 files changed

+109
-38
lines changed

src/starfile/functions.py

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING, Dict, List, Union
3+
from typing import TYPE_CHECKING, Dict, List, Union, Optional
44

55
if TYPE_CHECKING:
66
import pandas as pd
@@ -15,15 +15,32 @@
1515
from os import PathLike
1616

1717

18-
def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = False):
19-
"""
20-
Read a star file into a pandas dataframe or dict of pandas dataframes
18+
def read(
19+
filename: PathLike,
20+
read_n_blocks: Optional[int] = None,
21+
always_dict: bool = False,
22+
parse_as_string: List[str] = []
23+
) -> Union[DataBlock, Dict[DataBlock]]:
24+
"""Read data from a STAR file.
2125
22-
default behaviour in the case of only one data block being present in the STAR file is to
23-
return only a dataframe, this can be changed by setting 'always_dict=True'
24-
"""
26+
Basic data blocks are read as dictionaries. Loop blocks are read as pandas
27+
dataframes. When multiple data blocks are present a dictionary of datablocks is
28+
returned. When a single datablock is present only the block is returned by default.
29+
To force returning a dictionary even when only one datablock is present set
30+
`always_dict=True`.
2531
26-
parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
32+
Parameters
33+
----------
34+
filename: PathLike
35+
File from which to read data.
36+
read_n_blocks: int | None
37+
Limit reading the file to the first n data blocks.
38+
always_dict: bool
39+
Always return a dictionary, even when only a single data block is present.
40+
parse_as_string: list[str]
41+
A list of keys or column names which will not be coerced to numeric values.
42+
"""
43+
parser = StarParser(filename, n_blocks_to_read=read_n_blocks, parse_as_string=parse_as_string)
2744
if len(parser.data_blocks) == 1 and always_dict is False:
2845
return list(parser.data_blocks.values())[0]
2946
else:
@@ -38,9 +55,24 @@ def write(
3855
na_rep: str = '<NA>',
3956
quote_character: str = '"',
4057
quote_all_strings: bool = False,
41-
**kwargs,
58+
**kwargs
4259
):
43-
"""Write data blocks as STAR files."""
60+
"""Write data to disk in the STAR format.
61+
62+
Parameters
63+
----------
64+
data: DataBlock | Dict[str, DataBlock] | List[DataBlock]
65+
Data to be saved to file. DataBlocks are dictionaries or dataframes.
66+
If a dictionary of datablocks is passed, the keys will be the data block names.
67+
filename: PathLike
68+
Path where the file will be saved.
69+
float_format: str
70+
Float format string which will be passed to pandas.
71+
sep: str
72+
Separator between values, will be passed to pandas.
73+
na_rep: str
74+
Representation of null values, will be passed to pandas.
75+
"""
4476
StarWriter(
4577
data,
4678
filename=filename,

src/starfile/parser.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,28 @@
88
import numpy as np
99
import pandas as pd
1010
from pathlib import Path
11-
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple
11+
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple, List
1212

1313
from starfile.typing import DataBlock
1414

1515
if TYPE_CHECKING:
1616
from os import PathLike
1717

18-
def _apply_numeric(col: pd.Series) -> pd.Series:
19-
try:
20-
return pd.to_numeric(col)
21-
except ValueError:
22-
return col
2318

2419
class StarParser:
2520
filename: Path
2621
n_lines_in_file: int
2722
n_blocks_to_read: int
2823
current_line_number: int
2924
data_blocks: Dict[DataBlock]
30-
31-
def __init__(self, filename: PathLike, n_blocks_to_read: Optional[int] = None):
25+
parse_as_string: List[str]
26+
27+
def __init__(
28+
self,
29+
filename: PathLike,
30+
n_blocks_to_read: Optional[int] = None,
31+
parse_as_string: List[str] = [],
32+
):
3233
# set filename, with path checking
3334
filename = Path(filename)
3435
if not filename.exists():
@@ -39,6 +40,7 @@ def __init__(self, filename: PathLike, n_blocks_to_read: Optional[int] = None):
3940
self.data_blocks = {}
4041
self.n_lines_in_file = count_lines(self.filename)
4142
self.n_blocks_to_read = n_blocks_to_read
43+
self.parse_as_string = parse_as_string
4244

4345
# parse file
4446
self.current_line_number = 0
@@ -78,7 +80,15 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
7880
break
7981
elif self.current_line.startswith('_'): # '_foo bar'
8082
k, v = shlex.split(self.current_line)
81-
block[k[1:]] = numericise(v)
83+
column_name = k[1:]
84+
parse_column_as_string = (
85+
self.parse_as_string is not None
86+
and any(column_name == col for col in self.parse_as_string)
87+
)
88+
if parse_column_as_string is True:
89+
block[column_name] = v
90+
else:
91+
block[column_name] = numericise(v)
8292
self.current_line_number += 1
8393
return block
8494

@@ -108,18 +118,27 @@ def _parse_loop_block(self) -> pd.DataFrame:
108118
n_cols = len(loop_column_names)
109119
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
110120
else:
121+
column_name_to_index = {col: idx for idx, col in enumerate(loop_column_names)}
111122
df = pd.read_csv(
112123
StringIO(loop_data.replace("'", '"')),
113124
delimiter=r'\s+',
114125
header=None,
115126
comment='#',
116-
keep_default_na=False
127+
dtype={column_name_to_index[k]: str for k in self.parse_as_string if k in loop_column_names},
128+
keep_default_na=False,
129+
engine='c',
117130
)
131+
df.columns = loop_column_names
132+
133+
# Numericise all columns in temporary copy
118134
df_numeric = df.apply(_apply_numeric)
119-
# Replace columns that are all NaN with the original string columns
135+
136+
# Replace columns that are all NaN with the original columns
120137
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]
121-
df = df_numeric
122-
df.columns = loop_column_names
138+
139+
# Replace columns that should be strings
140+
for col in df.columns:
141+
df[col] = df_numeric[col] if col not in self.parse_as_string else df[col]
123142
return df
124143

125144

@@ -150,3 +169,10 @@ def numericise(value: str) -> Union[str, int, float]:
150169
# If it's not a float either, leave it as a string
151170
value = value
152171
return value
172+
173+
174+
def _apply_numeric(col: pd.Series) -> pd.Series:
175+
try:
176+
return pd.to_numeric(col)
177+
except ValueError:
178+
return col

tests/test_parsing.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -243,34 +243,47 @@ def test_empty_loop_block():
243243
assert len(parser.data_blocks) == 1
244244

245245

246-
247-
@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote),
248-
('"',basic_double_quote),
249-
])
250-
def test_quote_basic(quote_character,filename):
246+
@pytest.mark.parametrize("quote_character, filename", [("'", basic_single_quote),
247+
('"', basic_double_quote),
248+
])
249+
def test_quote_basic(quote_character, filename):
251250
parser = StarParser(filename)
252251
assert len(parser.data_blocks) == 1
253252
assert parser.data_blocks['']['no_quote_string'] == "noquote"
254253
assert parser.data_blocks['']['quote_string'] == "quote string"
255254
assert parser.data_blocks['']['whitespace_string'] == " "
256255
assert parser.data_blocks['']['empty_string'] == ""
257256

258-
@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote),
259-
('"',loop_double_quote),
260-
])
261-
def test_quote_loop(quote_character,filename):
257+
258+
@pytest.mark.parametrize("quote_character, filename", [("'", loop_single_quote),
259+
('"', loop_double_quote),
260+
])
261+
def test_quote_loop(quote_character, filename):
262262
import math
263263
parser = StarParser(filename)
264264
assert len(parser.data_blocks) == 1
265-
assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
266-
assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
267-
assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
268-
assert parser.data_blocks[''].loc[0,'empty_string'] == ""
265+
assert parser.data_blocks[''].loc[0, 'no_quote_string'] == "noquote"
266+
assert parser.data_blocks[''].loc[0, 'quote_string'] == "quote string"
267+
assert parser.data_blocks[''].loc[0, 'whitespace_string'] == " "
268+
assert parser.data_blocks[''].loc[0, 'empty_string'] == ""
269269

270270
assert parser.data_blocks[''].dtypes['number_and_string'] == object
271271
assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64'
272272
assert parser.data_blocks[''].dtypes['number'] == 'float64'
273273
assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object
274274

275-
assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty'])
276-
assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == ''
275+
assert math.isnan(parser.data_blocks[''].loc[1, 'number_and_empty'])
276+
assert parser.data_blocks[''].loc[0, 'empty_string_and_normal_string'] == ''
277+
278+
279+
def test_parse_as_string():
280+
parser = StarParser(postprocess, parse_as_string=['rlnFinalResolution', 'rlnResolution'])
281+
282+
# check 'rlnFinalResolution' is parsed as string in general (basic) block
283+
block = parser.data_blocks['general']
284+
assert type(block['rlnFinalResolution']) == str
285+
286+
# check 'rlnResolution' is parsed as string in fsc (loop) block
287+
df = parser.data_blocks['fsc']
288+
assert df['rlnResolution'].dtype == 'object'
289+

0 commit comments

Comments
 (0)