Merge pull request #295 from wynnw/hotfix/ignore-carriage-returns

dilshod · web-flow · commit d067a36b1ec1 · 2025-03-03T23:03:59.000+05:00
Fix handling of cells containing Windows-style newline (\r\n) characters
diff --git a/xlsx2csv.py b/xlsx2csv.py
@@ -598,6 +598,7 @@ def __init__(self):
     def parse(self, filehandle):
         self.parser = xml.parsers.expat.ParserCreate()
         self.parser.CharacterDataHandler = self.handleCharData
+        self.parser.buffer_text = True
         self.parser.StartElementHandler = self.handleStartElement
         self.parser.EndElementHandler = self.handleEndElement
         self.parser.ParseFile(filehandle)
@@ -645,6 +646,9 @@ def handleEndElement(self, name):
             self.rPh = False
 
 
+XMLPARSER_WINDOWS_NEWLINE_STR = "_x000D_\n"
+
+
 class Sheet:
     def __init__(self, workbook, sharedString, styles, filehandle):
         self.py3 = sys.version_info[0] == 3
@@ -827,11 +831,20 @@ def handleCharData(self, data):
             if self.colType == "s":  # shared string
                 format_type = "string"
                 self.data = self.sharedStrings[int(data)]
+
+                # Handle shared-string cell data containing \r\n: the \r shows up as the literal "_x000D_", so replace "_x000D_\n" with a plain "\n".
+                # This happens a lot with older versions of Excel; the character conversion reportedly happens inside expat.
+                if self.data.find(XMLPARSER_WINDOWS_NEWLINE_STR) > -1:
+                    self.data = self.data.replace(XMLPARSER_WINDOWS_NEWLINE_STR, "\n")
             elif self.colType == "b":  # boolean
                 format_type = "boolean"
                 self.data = (int(data) == 1 and "TRUE") or (int(data) == 0 and "FALSE") or data
             elif self.colType == "str" or self.colType == "inlineStr":
                 format_type = "string"
+
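+                # Again, check for the "_x000D_\n" form of \r\n and apply the same replacement hack. NOTE(review): the check reads `data` but the replace acts on `self.data` - confirm both hold the same text here.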
+                if data.find(XMLPARSER_WINDOWS_NEWLINE_STR) > -1:
+                    self.data = self.data.replace(XMLPARSER_WINDOWS_NEWLINE_STR, "\n")
             elif self.s_attr:
                 s = int(self.s_attr)