allow latin1 encoding for csv files (#1203)

amaloney · ahuang11 · web-flow · commit 6ca75510a022 · 2025-05-12T19:01:01.000Z
Co-authored-by: Andrew <15331990+ahuang11@users.noreply.github.com>
diff --git a/lumen/ai/controls.py b/lumen/ai/controls.py
@@ -15,6 +15,7 @@
 )
 
 from ..sources.duckdb import DuckDBSource
+from ..util import detect_file_encoding
 from .memory import _Memory, memory
 
 TABLE_EXTENSIONS = ("csv", "parquet", "parq", "json", "xlsx", "geojson", "wkt", "zip")
@@ -197,7 +198,8 @@ def _generate_media_controls(self, event):
             self._upload_tabs.clear()
             self._media_controls.clear()
             for filename, file in self._file_input.value.items():
-                file_obj = io.BytesIO(file) if isinstance(file, bytes) else io.StringIO(file)
+                encoding = detect_file_encoding(file_obj=file)
+                file_obj = io.BytesIO(file.decode(encoding).encode("utf-8")) if isinstance(file, bytes) else io.StringIO(file)
                 if filename.lower().endswith(TABLE_EXTENSIONS):
                     table_controls = TableControls(
                         file_obj,
diff --git a/lumen/ai/ui.py b/lumen/ai/ui.py
@@ -32,7 +32,7 @@
 from ..sources import Source
 from ..sources.duckdb import DuckDBSource
 from ..transforms.sql import SQLLimit
-from ..util import log
+from ..util import detect_file_encoding, log
 from .agents import (
     AnalysisAgent, AnalystAgent, ChatAgent, DocumentListAgent, SourceAgent,
     SQLAgent, TableListAgent, VegaLiteAgent,
@@ -257,7 +257,8 @@ def _resolve_data(self, data: DataT | list[DataT] | None):
                 if src.endswith(('.parq', '.parquet')):
                     table = f"read_parquet('{src}')"
                 elif src.endswith(".csv"):
-                    table = f"read_csv('{src}')"
+                    encoding = detect_file_encoding(file_obj=src)
+                    table = f"read_csv('{src}', encoding='{encoding}')"
                 elif src.endswith(".json"):
                     table = f"read_json_auto('{src}')"
                 else:
diff --git a/lumen/tests/transforms/test_sql.py b/lumen/tests/transforms/test_sql.py
@@ -109,6 +109,13 @@ def test_sql_comments():
     assert result == expected
 
 
+def test_add_encoding_to_read_csv():
+    # A bare READ_CSV call should come back with an explicit encoding argument.
+    parsed: list = sqlglot.parse("READ_CSV('data/life-expectancy.csv')")
+    rewritten = SQLTransform(identify=True)._add_encoding_to_read_csv(parsed[0])
+    assert rewritten.sql() == "READ_CSV('data/life-expectancy.csv', encoding='utf-8')"
+
+
 def test_sql_error_level():
     with pytest.raises(
         sqlglot.errors.ParseError, match="Expected table name but got"
diff --git a/lumen/transforms/sql.py b/lumen/transforms/sql.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import datetime as dt
+import pathlib
 import re
 
+from copy import deepcopy
 from typing import ClassVar
 
 import param  # type: ignore
@@ -11,12 +13,13 @@
 from sqlglot import parse
 from sqlglot.expressions import (
     LT, Column, Expression, Identifier, Literal as SQLLiteral, Max, Min, Null,
-    Select, Star, Table, TableSample, and_, func, or_, replace_placeholders,
-    replace_tables, select,
+    ReadCSV, Select, Star, Table, TableSample, and_, func, or_,
+    replace_placeholders, replace_tables, select,
 )
 from sqlglot.optimizer import optimize
 
 from ..config import SOURCE_TABLE_SEPARATOR
+from ..util import detect_file_encoding
 from .base import Transform
 
 
@@ -140,6 +143,30 @@ def parse_sql(self, sql_in: str) -> Expression:
         expression = expressions[0]
         return expression
 
+    def _add_encoding_to_read_csv(self, expression: Expression) -> Expression:
+        """
+        Add the detected file encoding to a DuckDB ``READ_CSV`` call.
+
+        Parameters
+        ----------
+        expression : Expression
+            An sqlglot expression object; it is deep-copied, never mutated.
+
+        Returns
+        -------
+        Expression
+            The copy, with ``encoding='...'`` appended after a ``.csv`` path literal.
+        """
+        expr = deepcopy(expression)  # all edits below are applied to this copy
+        if isinstance(expr, ReadCSV):  # NOTE(review): only a root-level ReadCSV is rewritten
+            read_csv = expr.find(ReadCSV) or ReadCSV()
+            literal = read_csv.find(SQLLiteral) or SQLLiteral()  # presumably the file path argument
+            if pathlib.Path(literal.this).suffix.lower() == ".csv" and "encoding" not in literal.this:  # checks the path text only
+                encoding = detect_file_encoding(file_obj=literal.this)
+                expr.find(ReadCSV).find(SQLLiteral).replace(Identifier(this=f"'{literal.this}', encoding='{encoding}'", is_string=literal.is_string))  # one node holding the quoted path plus the encoding keyword text
+
+        return expr
+
     def to_sql(self, expression: Expression) -> str:
         """
         Convert sqlglot expression back to SQL string.
@@ -157,6 +184,8 @@ def to_sql(self, expression: Expression) -> str:
         if self.optimize:
             expression = optimize(expression, dialect=self.read)
 
+        expression = self._add_encoding_to_read_csv(expression=expression)
+
         return expression.sql(
             comments=self.comments,
             dialect=self.write,
@@ -208,10 +237,12 @@ def apply(self, sql_in: str) -> str:
         sql_template = re.sub(r'\{(\w+)\}', r':\1', sql_in)
         expression = self.parse_sql(sql_template)
         if self.parameters:
-            parameters = {
-                k: Identifier(this=v, quoted=self.identify) if isinstance(v, str) else v
-                for k, v in self.parameters.items()
-            }
+            parameters = {}
+            for k, v in self.parameters.items():
+                if isinstance(v, str):
+                    parameters[k] = Identifier(this=v, quoted=self.identify)
+                else:
+                    parameters[k] = v
             replaced_expression = replace_placeholders(expression, **parameters)
         return self.to_sql(replaced_expression,)
 
diff --git a/lumen/util.py b/lumen/util.py
@@ -2,16 +2,19 @@
 
 import datetime as dt
 import importlib
+import io
 import os
 import re
 import sys
 import unicodedata
 
 from functools import partial, wraps
 from logging import getLogger
+from pathlib import Path
 from subprocess import check_output
 
 import bokeh
+import chardet
 import pandas as pd
 import panel as pn
 import param
@@ -349,3 +352,48 @@ def slugify(value, allow_unicode=False) -> str:
         )
     value = re.sub(r"[^\w\s-]", "", value.lower())
     return re.sub(r"[-\s]+", "-", value).strip("-_")
+
+
+def detect_file_encoding(file_obj: Path | str | bytes | io.BytesIO | io.StringIO) -> str:
+    """
+    Detects the given file object's encoding.
+
+    Parameters
+    ----------
+    file_obj : Path | str | bytes | io.BytesIO | io.StringIO
+        Path to a file, raw file contents (bytes or str) or an in-memory
+        buffer. A string naming an existing file is read from disk; any
+        other string is treated as the contents themselves.
+
+    Returns
+    -------
+    str
+        Lower-cased encoding name; "utf-8" when nothing can be detected.
+    """
+    if isinstance(file_obj, str):
+        try:
+            if Path(file_obj).is_file():
+                file_obj = Path(file_obj)
+        except OSError:
+            # Not usable as a path (e.g. too long), so it is file contents.
+            pass
+
+    if isinstance(file_obj, Path):
+        data = file_obj.read_bytes()
+    elif isinstance(file_obj, (io.BytesIO, io.StringIO)):
+        # getvalue() returns the whole buffer without moving its position.
+        data = file_obj.getvalue()
+    else:
+        data = file_obj
+    if isinstance(data, str):
+        data = data.encode()
+
+    # chardet reports None when it cannot decide (e.g. empty input).
+    encoding = (chardet.detect(data)["encoding"] if data else None) or "utf-8"
+
+    if encoding == "ISO-8859-1":
+        encoding = "latin-1"
+    elif encoding == "ascii":
+        encoding = "utf-8"
+
+    return encoding.lower()
diff --git a/pixi.toml b/pixi.toml
@@ -21,6 +21,7 @@ bq-dev = ["py313", "ai", "ai-local", "ai-llama", "bigquery", "lint", "sql", "tes
 
 [dependencies]
 bokeh = "*"
+chardet = "*"
 holoviews = ">=1.17.0"
 hvplot = "*"
 intake = "<2"
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,7 +48,10 @@ HoloViz = "https://holoviz.org/"
 [project.optional-dependencies]
 tests = ['pytest', 'pytest-rerunfailures', 'pytest-asyncio']
 sql = ['duckdb', 'intake-sql', 'sqlalchemy']
-ai = ['griffe', 'nbformat', 'duckdb', 'pyarrow', 'instructor >=1.6.4', 'pydantic >=2.8.0', 'pydantic-extra-types', 'panel-graphic-walker[kernel] >=0.6.4', 'markitdown', 'semchunk', 'tiktoken']
+ai = [
+    'griffe', 'nbformat', 'duckdb', 'pyarrow', 'instructor >=1.6.4', 'pydantic >=2.8.0', 'pydantic-extra-types', 'panel-graphic-walker[kernel] >=0.6.4',
+    'markitdown', 'semchunk', 'tiktoken', 'chardet',
+]
 ai-local = ['lumen[ai]', 'huggingface_hub']
 ai-openai = ['lumen[ai]', 'openai']
 ai-mistralai = ['lumen[ai]', 'mistralai']