Skip to content

Commit d067a36

Browse files
authored
Merge pull request #295 from wynnw/hotfix/ignore-carriage-returns
Fix for handling cells with windows-style newline \r\n characters
2 parents fe96a98 + 5a6f677 commit d067a36

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

xlsx2csv.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,7 @@ def __init__(self):
598598
def parse(self, filehandle):
599599
self.parser = xml.parsers.expat.ParserCreate()
600600
self.parser.CharacterDataHandler = self.handleCharData
601+
self.parser.buffer_text = True
601602
self.parser.StartElementHandler = self.handleStartElement
602603
self.parser.EndElementHandler = self.handleEndElement
603604
self.parser.ParseFile(filehandle)
@@ -645,6 +646,9 @@ def handleEndElement(self, name):
645646
self.rPh = False
646647

647648

649+
XMLPARSER_WINDOWS_NEWLINE_STR = "_x000D_\n"
650+
651+
648652
class Sheet:
649653
def __init__(self, workbook, sharedString, styles, filehandle):
650654
self.py3 = sys.version_info[0] == 3
@@ -827,11 +831,20 @@ def handleCharData(self, data):
827831
if self.colType == "s": # shared string
828832
format_type = "string"
829833
self.data = self.sharedStrings[int(data)]
834+
835+
# Handle cell string data that has \r\n by changing the value that expat uses for the \r to an empty string.
836+
# This happens a lot with older versions of excel, and the character conversion is happening inside expat.
837+
if self.data.find(XMLPARSER_WINDOWS_NEWLINE_STR) > -1:
838+
self.data = self.data.replace(XMLPARSER_WINDOWS_NEWLINE_STR, "\n")
830839
elif self.colType == "b": # boolean
831840
format_type = "boolean"
832841
self.data = (int(data) == 1 and "TRUE") or (int(data) == 0 and "FALSE") or data
833842
elif self.colType == "str" or self.colType == "inlineStr":
834843
format_type = "string"
844+
845+
# Again, check for the \r\n conversion and apply the same replacement hack
846+
if data.find(XMLPARSER_WINDOWS_NEWLINE_STR) > -1:
847+
self.data = self.data.replace(XMLPARSER_WINDOWS_NEWLINE_STR, "\n")
835848
elif self.s_attr:
836849
s = int(self.s_attr)
837850

0 commit comments

Comments
 (0)