Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions odo/backends/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import datashape

from datashape import discover, Record, Option
from datashape.predicates import isrecord
from datashape.predicates import isrecord, isdimension
from datashape.dispatch import dispatch

from ..compatibility import unicode, PY2
Expand Down Expand Up @@ -140,18 +140,26 @@ class CSV(object):
If the csv file has a header or not
encoding : str (default utf-8)
File encoding
user_dshape: datashape or string representation
user specified datashape
kwargs : other...
Various choices about dialect
"""
canonical_extension = 'csv'

def __init__(self, path, has_header=None, encoding='utf-8',
             sniff_nbytes=10000, user_dshape=None, **kwargs):
    """Store the CSV location, parsing options and optional fixed dshape.

    Parameters
    ----------
    path : str
        Location of the csv file.
    has_header : bool, optional
        If the csv file has a header or not.
    encoding : str (default utf-8)
        File encoding; a falsy value falls back to 'utf-8'.
    sniff_nbytes : int (default 10000)
        Stored as ``_sniff_nbytes``.
    user_dshape : datashape or string representation, optional
        User specified datashape.  A string is parsed with
        ``datashape.dshape``.  Its measure must be a Record.
    kwargs : other...
        Various choices about dialect.

    Raises
    ------
    TypeError
        If ``user_dshape`` is given and its measure is not a Record.
    """
    self.path = path
    self._has_header = has_header
    self.encoding = encoding or 'utf-8'
    self._kwargs = kwargs
    self._sniff_nbytes = sniff_nbytes
    # Always define ``_dshape`` so code that reads it (e.g. ``if c._dshape``)
    # does not hit AttributeError when no user dshape was supplied.
    self._dshape = None
    if user_dshape:
        if isinstance(user_dshape, (str, unicode)):
            user_dshape = datashape.dshape(user_dshape)
        if not isrecord(user_dshape.measure):
            raise TypeError('Please provide a Record dshape for the csv')
        self._dshape = user_dshape

def _sniff_dialect(self, path):
kwargs = self._kwargs
Expand Down Expand Up @@ -330,6 +338,9 @@ def _():

@discover.register(CSV)
def discover_csv(c, nrows=1000, **kwargs):
if c._dshape:
return c._dshape
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we could add an ensure_consistent_dshape default arg (default False); if set to True, then the c._dshape would be tested against the dshape of the df loaded below to ensure the user-specified dshape is compatible with the discovered dshape.


df = csv_to_dataframe(c, nrows=nrows, **kwargs)
df = coerce_datetimes(df)

Expand Down
11 changes: 11 additions & 0 deletions odo/backends/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,17 @@ def test_discover_with_dotted_names():
assert dshape == datashape.dshape('var * {"a.b": int64, "c.d": int64}')
assert dshape.measure.names == [u'a.b', u'c.d']

def test_discover_csv_with_fixed_dshape():
with filetext('name,val\nAlice,1\n,0\nBob,2') as fn:
ds = datashape.dshape('var * {name: string, val: float64}')
csv1 = CSV(fn, user_dshape=ds)
ds1 = discover(csv1)
assert ds1 == ds
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add a test that verifies that a passed-in datashape overrides the datashape that would otherwise be discovered when none is passed in.

Perhaps a CSV file like:

a,b
1,1.0
 , 
2,2.0

And an overridden dshape like var * {a: ?int32, b: ?int64}.

csv2 = CSV(fn, has_header=True)
ds2 = discover(csv2)
assert ds1 != ds2



try:
unichr
Expand Down