Skip to content

Commit 7249bfe

Browse files
committed
Fixes #31: Parse escaped quotes in field
The parser considered any quote to be terminal. Only unescaped quotes should be treated as terminal.
1 parent 527f5bd commit 7249bfe

File tree

1 file changed

+15
-17
lines changed

1 file changed

+15
-17
lines changed

src/Data/DataFrame/IO/CSV.hs

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import Data.DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, c
3030
import Data.DataFrame.Internal.DataFrame (DataFrame(..))
3131
import Data.DataFrame.Internal.Parsing
3232
import Data.DataFrame.Operations.Typing
33+
import Data.Foldable (fold)
3334
import Data.Function (on)
3435
import Data.IORef
3536
import Data.Maybe
@@ -184,27 +185,24 @@ field c =
184185
<?> "field"
185186
{-# INLINE field #-}
186187

188+
unquotedTerminators :: Char -> S.Set Char
189+
unquotedTerminators sep = S.fromList [sep, '\n', '\r', '"']
190+
187191
unquotedField :: Char -> Parser T.Text
188192
unquotedField sep =
189-
takeWhile nonTerminal <?> "unquoted field"
190-
where nonTerminal = (`S.notMember` S.fromList [sep, '\n', '\r', '"'])
193+
takeWhile (not . (`S.member` terminators)) <?> "unquoted field"
194+
where terminators = unquotedTerminators sep
191195
{-# INLINE unquotedField #-}
192196

193-
insideQuotes :: Parser T.Text
194-
insideQuotes =
195-
T.append <$> takeWhile (/= '"')
196-
<*> (T.concat <$> many (T.cons <$> dquotes <*> insideQuotes))
197-
<?> "inside of double quotes"
198-
where
199-
dquotes =
200-
string "\"\"" >> return '"'
201-
<?> "paired double quotes"
202-
{-# INLINE insideQuotes #-}
203-
204197
quotedField :: Parser T.Text
205-
quotedField =
206-
char '"' *> insideQuotes <* char '"'
207-
<?> "quoted field"
198+
quotedField = char '"' *> contents <* char '"' <?> "quoted field"
199+
where
200+
contents = fold <$> many (unquote <|> unescape)
201+
where
202+
unquote = takeWhile1 (notInClass "\"\\")
203+
unescape = char '\\' *> do
204+
T.singleton <$> do
205+
char '\\' <|> char '"'
208206
{-# INLINE quotedField #-}
209207

210208
lineEnd :: Parser ()
@@ -226,7 +224,7 @@ countRows c path = withFile path ReadMode $! go 0 ""
226224
Fail unconsumed ctx er -> do
227225
erpos <- hTell h
228226
fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
229-
<> show er <> "; context: " <> show ctx
227+
<> show er <> "; context: " <> show ctx <> " " <> show unconsumed
230228
Partial c -> do
231229
fail $ "Partial handler is called; n = " <> show n
232230
Done (unconsumed :: T.Text) _ ->

0 commit comments

Comments
 (0)