@@ -598,6 +598,7 @@ def __init__(self):
598598 def parse (self , filehandle ):
599599 self .parser = xml .parsers .expat .ParserCreate ()
600600 self .parser .CharacterDataHandler = self .handleCharData
601+ self .parser .buffer_text = True
601602 self .parser .StartElementHandler = self .handleStartElement
602603 self .parser .EndElementHandler = self .handleEndElement
603604 self .parser .ParseFile (filehandle )
@@ -645,6 +646,9 @@ def handleEndElement(self, name):
645646 self .rPh = False
646647
647648
649+ XMLPARSER_WINDOWS_NEWLINE_STR = "_x000D_\n "
650+
651+
648652class Sheet :
649653 def __init__ (self , workbook , sharedString , styles , filehandle ):
650654 self .py3 = sys .version_info [0 ] == 3
@@ -827,11 +831,20 @@ def handleCharData(self, data):
827831 if self .colType == "s" : # shared string
828832 format_type = "string"
829833 self .data = self .sharedStrings [int (data )]
834+
835+ # Handle cell string data that contains \r\n: the \r arrives as an escaped marker, so replace the marker-plus-newline sequence (XMLPARSER_WINDOWS_NEWLINE_STR) with a plain newline.
836+ # This happens a lot with older versions of Excel; the character conversion happens inside expat.
837+ if self .data .find (XMLPARSER_WINDOWS_NEWLINE_STR ) > - 1 :
838+ self .data = self .data .replace (XMLPARSER_WINDOWS_NEWLINE_STR , "\n " )
830839 elif self .colType == "b" : # boolean
831840 format_type = "boolean"
832841 self .data = (int (data ) == 1 and "TRUE" ) or (int (data ) == 0 and "FALSE" ) or data
833842 elif self .colType == "str" or self .colType == "inlineStr" :
834843 format_type = "string"
844+
845+ # Again, check for the escaped \r\n sequence and apply the same replacement hack
846+ if data .find (XMLPARSER_WINDOWS_NEWLINE_STR ) > - 1 :
847+ self .data = self .data .replace (XMLPARSER_WINDOWS_NEWLINE_STR , "\n " )
835848 elif self .s_attr :
836849 s = int (self .s_attr )
837850
0 commit comments