Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
436 changes: 212 additions & 224 deletions examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,16 @@ Tracker = "https://github.com/bedrock-engineer/bedrock-ge/issues"

[dependency-groups]
dev = [
"duckdb>=1.2.2",
"frictionless[excel]>=4.40.8",
"jupyter>=1.1.1",
"marimo>=0.12.5",
"marimo[recommended]>=0.13.11",
"mypy>=1.11.2",
"nbconvert>=7.16.6",
"pandas-stubs>=2.2.2.240807",
"ruff>=0.6.7",
"sqlglot>=26.12.1",
]

tests = [
"folium>=0.17.0",
"mapclassify>=2.8.1",
"marimo>=0.13.11",
"matplotlib>=3.9.2",
"pytest>=8.3.3",
]
Expand Down
4 changes: 2 additions & 2 deletions sandbox/data_validation/try_pandera.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "efd86e9f",
"metadata": {},
"outputs": [],
Expand All @@ -12,7 +12,7 @@
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"import pandera as pa\n",
"import pandera.pandas as pa\n",
"from pandera.typing import DataFrame, Series"
]
},
Expand Down
74 changes: 36 additions & 38 deletions src/bedrock_ge/gi/ags/read.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import annotations

import codecs
import io
from contextlib import contextmanager, nullcontext
from io import TextIOBase
from pathlib import Path
from typing import IO, Any, ContextManager, Dict, List

Expand All @@ -20,21 +20,20 @@ def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:

Args:
source (str | Path | IO[str] | IO[bytes] | bytes): The source to detect encoding from.
- str: Treated as a file path if it exists, otherwise as text (returns `DEFAULT_ENCODING`)
- Path: File path to read and detect encoding
- str or Path: File path.
- IO[str]: Already decoded text stream (returns `DEFAULT_ENCODING`)
- IO[bytes]: Binary stream to detect encoding from
- bytes: Binary data to detect encoding from

Returns:
str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', etc.)
str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'ascii', etc.)

Raises:
TypeError: If the source type is unsupported
FileNotFoundError: If a file path doesn't exist
"""
# Set number of bytes to read for detection and required confidence
SAMPLE_SIZE = 10000
SAMPLE_SIZE = 1_000_000
REQUIRED_CONFIDENCE = 0.7

def _detect_from_bytes(data: bytes) -> str:
Expand All @@ -47,6 +46,9 @@ def _detect_from_bytes(data: bytes) -> str:
if not encoding or confidence < REQUIRED_CONFIDENCE:
return DEFAULT_ENCODING

if encoding.lower() == "ascii":
return "utf-8"

return encoding

def _read_from_path(path: Path):
Expand Down Expand Up @@ -79,18 +81,17 @@ def _read_from_path(path: Path):
# IO[str] object
if hasattr(source, "encoding"):
if source.encoding:
# Could be `None`
# Could be `None`, e.g. io.StringIO has an encoding attribute which is None.
return source.encoding
else:
return DEFAULT_ENCODING

# IO[bytes]
if isinstance(source, io.BytesIO):
original_position = source.tell()
if isinstance(source, io.BufferedIOBase):
try:
original_position = source.tell()
source.seek(0)
sample = source.read(SAMPLE_SIZE)
encoding = _detect_from_bytes(sample)
if isinstance(sample, bytes):
encoding = _detect_from_bytes(sample)
else:
Expand All @@ -105,9 +106,9 @@ def _read_from_path(path: Path):
raise TypeError(f"Unsupported input type for encoding detection: {type(source)}")


def read_ags_source(
def open_ags_source(
source: str | Path | IO[str] | IO[bytes] | bytes, encoding=None
) -> ContextManager[TextIOBase]:
) -> ContextManager[io.TextIOBase]:
"""Opens or wraps a given source for reading AGS (text-based) data.

Args:
Expand All @@ -124,41 +125,42 @@ def read_ags_source(
Raises:
TypeError: If the source type is unsupported or binary streams are not decoded.
"""
try:
codecs.lookup(encoding)
except LookupError:
raise ValueError(f"Unsupported encoding: {encoding}")

@contextmanager
def string_source(content: str):
string_io = io.StringIO(content)
def _bytes_source(bytes_content: bytes):
string_io = io.StringIO(bytes_content.decode(encoding))
try:
yield string_io
finally:
string_io.close()

if isinstance(source, str):
if isinstance(source, (str, Path)):
path = Path(source)
if path.exists() and path.is_file():
return open(path, "r", encoding=encoding)
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")

elif isinstance(source, Path):
if source.exists() and source.is_file():
return open(source, "r", encoding=encoding)
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")

elif isinstance(source, bytes):
return string_source(source.decode(encoding))
elif isinstance(source, io.TextIOBase):
source.seek(0)
return nullcontext(source)

elif isinstance(source, io.BytesIO):
return string_source(source.getvalue().decode(encoding))
elif isinstance(source, io.BufferedIOBase):
text_stream = io.TextIOWrapper(source, encoding=encoding)
text_stream.seek(0)
return nullcontext(text_stream)

elif hasattr(source, "read"):
# reset the cursor to the beginning
try:
source.seek(0)
except (AttributeError, io.UnsupportedOperation):
pass
return nullcontext(source)
elif isinstance(source, bytes):
return _bytes_source(source)

raise TypeError(f"Unsupported input type: {type(source)}")
else:
raise TypeError(
f"Unsupported source type: {type(source)}. "
"Expected str, Path, IO[str], IO[bytes], or bytes."
)


def ags_to_dfs(
Expand All @@ -179,15 +181,11 @@ def ags_to_dfs(
Dict[str, pd.DataFrame]: A dictionary where keys represent AGS group
names with corresponding DataFrames for the corresponding group data.
"""
# if bytes are provided, convert to IO[bytes] to be file-like
if isinstance(source, bytes):
source = io.BytesIO(source)

if not encoding:
encoding = detect_encoding(source)

# Get first non-blank line, `None` if all lines are blank
with read_ags_source(source, encoding=encoding) as f:
with open_ags_source(source, encoding=encoding) as f:
first_line = next((line.strip() for line in f if line.strip()), None)

if first_line:
Expand Down Expand Up @@ -239,7 +237,7 @@ def ags3_to_dfs(
headers: List[str] = ["", "", ""]
group_data: List[List[Any]] = [[], [], []]

with read_ags_source(source, encoding=encoding) as file:
with open_ags_source(source, encoding=encoding) as file:
for i, line in enumerate(file):
line = line.strip()
last_line_type = line_type
Expand Down Expand Up @@ -333,7 +331,7 @@ def ags4_to_dfs(
object that represents an AGS4 file.

Returns:
Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
represents a group name from AGS 4 data, and the corresponding value is a
pandas DataFrame containing the data for that group.
"""
Expand Down
2 changes: 1 addition & 1 deletion src/bedrock_ge/gi/ags/schemas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pandera as pa
import pandera.pandas as pa
from pandera.typing import Series


Expand Down
2 changes: 1 addition & 1 deletion src/bedrock_ge/gi/ags/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Dict

import pandas as pd
import pandera as pa
import pandera.pandas as pa
from pandera.typing import DataFrame
from pyproj import CRS

Expand Down
2 changes: 2 additions & 0 deletions src/bedrock_ge/gi/gis_geometry.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from typing import Dict, Tuple, Union

import geopandas as gpd
Expand Down
2 changes: 1 addition & 1 deletion src/bedrock_ge/gi/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import Optional

import pandera as pa
import pandera.pandas as pa
from pandera.typing import Series
from pandera.typing.geopandas import GeoSeries

Expand Down
8 changes: 4 additions & 4 deletions tests/test_bedrock_ge/gi/test_ags.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ def test_detect_encoding():
ags4_bio = io.BytesIO(ags4_byte)

sources = {
ags3: ags3_encoding,
ags3: default_encoding,
ags4: ags4_encoding,
ags3_path: ags3_encoding,
ags3_path: default_encoding,
ags4_path: ags4_encoding,
ags3_byte: ags3_encoding,
ags3_byte: default_encoding,
ags4_byte: ags4_encoding,
ags3_sio: default_encoding,
ags4_sio: default_encoding,
ags3_bio: ags3_encoding,
ags3_bio: default_encoding,
ags4_bio: ags4_encoding,
}
for source, expected in sources.items():
Expand Down
10 changes: 7 additions & 3 deletions tests/test_examples/test_hk_kaitak_ags3_to_brgi_geodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shutil
import sqlite3
import subprocess
import sys
from pathlib import Path
from tempfile import TemporaryDirectory

Expand Down Expand Up @@ -39,13 +40,16 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):
# TODO: implement logging
# NOTE: The env (environment variables) and encoding are required for running
# the notebook as a script from both Windows and Linux. Without => UnicodeDecodeError
# NOTE: `uvx uv run` runs the marimo notebook as a script in a temporary environment,
# NOTE: `(uvx) uv run` runs the marimo notebook as a script in a temporary environment,
# with the Python version and dependencies specified in the PEP 723 inline script metadata.
# The issue with this approach is that it uses the latest version of bedrock-ge,
# rather than the current code in this repo.
env = os.environ.copy()
env["PYTHONIOENCODING"] = "utf-8"
result = subprocess.run(
# ["uvx", "uv", "run", "--no-project", "--no-cache", str(notebook_path)],
["uv", "run", str(notebook_path)],
# ["uv", "run", str(notebook_path)],
[sys.executable, str(notebook_path)],
check=False,
capture_output=True,
text=True,
Expand All @@ -55,7 +59,7 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):

# Check that the script ran successfully
assert result.returncode == 0, (
f"📛 Running `uvx run marimo notebook.py` failed with code {result.returncode}\n"
f"📛 Running `python notebook.py` failed with code {result.returncode}\n"
f"📄 STDOUT:\n{result.stdout}\n"
f"⚠️ STDERR:\n{result.stderr}"
)
Expand Down
Loading