Skip to content

Commit d1b1f8e

Browse files
committed
Update open_ags_source to also work with zip files
1 parent d76777e commit d1b1f8e

File tree

2 files changed

+32
-36
lines changed

2 files changed

+32
-36
lines changed

examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py

Lines changed: 2 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,10 @@ def _():
2929

3030
import io
3131
import platform
32-
import re
3332
import sys
3433
import zipfile
3534
from pathlib import Path
3635

37-
import chardet
3836
import folium
3937
import geopandas as gpd
4038
import mapclassify
@@ -60,7 +58,6 @@ def _():
6058
ags3_db_to_no_gis_brgi_db,
6159
ags_to_dfs,
6260
calculate_gis_geometry,
63-
chardet,
6461
check_brgi_database,
6562
check_no_gis_brgi_database,
6663
concatenate_databases,
@@ -79,7 +76,6 @@ def _():
7976
def _(
8077
ags3_db_to_no_gis_brgi_db,
8178
ags_to_dfs,
82-
chardet,
8379
check_no_gis_brgi_database,
8480
concatenate_databases,
8581
zipfile,
@@ -94,11 +90,8 @@ def zip_of_ags3s_to_bedrock_gi_database(zip, crs):
9490
if file_name.lower().endswith(".ags"):
9591
print(f"\n🖥️ Processing {file_name} ...")
9692
with zip_ref.open(file_name) as ags3_file:
97-
ags3_data = ags3_file.read()
98-
detected_encoding = chardet.detect(ags3_data)["encoding"]
99-
ags3_data = ags3_data.decode(detected_encoding)
100-
# Convert content of a single AGS 3 file to a Dictionary of pandas dataframes (a database)
101-
ags3_db = ags_to_dfs(ags3_data)
93+
# Convert content of a single AGS 3 file to a Dictionary of pandas dataframes (a database)
94+
ags3_db = ags_to_dfs(ags3_file)
10295
report_no = file_name.split("/")[0]
10396
ags3_db["PROJ"]["REPORT_NO"] = int(report_no)
10497
project_uid = f"{ags3_db['PROJ']['PROJ_ID'].iloc[0]}_{file_name}"

src/bedrock_ge/gi/ags/read.py

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import codecs
44
import io
5-
from contextlib import contextmanager
5+
from contextlib import contextmanager, nullcontext
66
from pathlib import Path
77
from typing import IO, Any, ContextManager, Dict, List
88

@@ -33,7 +33,7 @@ def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:
3333
FileNotFoundError: If a file path doesn't exist
3434
"""
3535
# Set number of bytes to read for detection and required confidence
36-
SAMPLE_SIZE = 10_000
36+
SAMPLE_SIZE = 1_000_000
3737
REQUIRED_CONFIDENCE = 0.7
3838

3939
def _detect_from_bytes(data: bytes) -> str:
@@ -46,6 +46,9 @@ def _detect_from_bytes(data: bytes) -> str:
4646
if not encoding or confidence < REQUIRED_CONFIDENCE:
4747
return DEFAULT_ENCODING
4848

49+
if encoding.lower() == "ascii":
50+
return "utf-8"
51+
4952
return encoding
5053

5154
def _read_from_path(path: Path):
@@ -89,7 +92,6 @@ def _read_from_path(path: Path):
8992
original_position = source.tell()
9093
source.seek(0)
9194
sample = source.read(SAMPLE_SIZE)
92-
encoding = _detect_from_bytes(sample)
9395
if isinstance(sample, bytes):
9496
encoding = _detect_from_bytes(sample)
9597
else:
@@ -104,9 +106,9 @@ def _read_from_path(path: Path):
104106
raise TypeError(f"Unsupported input type for encoding detection: {type(source)}")
105107

106108

107-
def read_ags_source(
109+
def open_ags_source(
108110
source: str | Path | IO[str] | IO[bytes] | bytes, encoding=None
109-
) -> ContextManager[TextIOBase]:
111+
) -> ContextManager[io.TextIOBase]:
110112
"""Opens or wraps a given source for reading AGS (text-based) data.
111113
112114
Args:
@@ -123,41 +125,42 @@ def read_ags_source(
123125
Raises:
124126
TypeError: If the source type is unsupported or binary streams are not decoded.
125127
"""
128+
try:
129+
codecs.lookup(encoding)
130+
except LookupError:
131+
raise ValueError(f"Unsupported encoding: {encoding}")
126132

127133
@contextmanager
128-
def string_source(content: str):
129-
string_io = io.StringIO(content)
134+
def _bytes_source(bytes_content: bytes):
135+
string_io = io.StringIO(bytes_content.decode(encoding))
130136
try:
131137
yield string_io
132138
finally:
133139
string_io.close()
134140

135-
if isinstance(source, str):
141+
if isinstance(source, (str, Path)):
136142
path = Path(source)
137143
if path.exists() and path.is_file():
138144
return open(path, "r", encoding=encoding)
139145
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
140146

141-
elif isinstance(source, Path):
142-
if source.exists() and source.is_file():
143-
return open(source, "r", encoding=encoding)
144-
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
145-
146-
elif isinstance(source, bytes):
147-
return string_source(source.decode(encoding))
147+
elif isinstance(source, io.TextIOBase):
148+
source.seek(0)
149+
return nullcontext(source)
148150

149-
elif isinstance(source, io.BytesIO):
150-
return string_source(source.getvalue().decode(encoding))
151+
elif isinstance(source, io.BufferedIOBase):
152+
text_stream = io.TextIOWrapper(source, encoding=encoding)
153+
text_stream.seek(0)
154+
return nullcontext(text_stream)
151155

152-
elif hasattr(source, "read"):
153-
# reset the cursor to the beginning
154-
try:
155-
source.seek(0)
156-
except (AttributeError, io.UnsupportedOperation):
157-
pass
158-
return nullcontext(source)
156+
elif isinstance(source, bytes):
157+
return _bytes_source(source)
159158

160-
raise TypeError(f"Unsupported input type: {type(source)}")
159+
else:
160+
raise TypeError(
161+
f"Unsupported source type: {type(source)}. "
162+
"Expected str, Path, IO[str], IO[bytes], or bytes."
163+
)
161164

162165

163166
def ags_to_dfs(
@@ -182,7 +185,7 @@ def ags_to_dfs(
182185
encoding = detect_encoding(source)
183186

184187
# Get first non-blank line, `None` if all lines are blank
185-
with read_ags_source(source, encoding=encoding) as f:
188+
with open_ags_source(source, encoding=encoding) as f:
186189
first_line = next((line.strip() for line in f if line.strip()), None)
187190

188191
if first_line:
@@ -234,7 +237,7 @@ def ags3_to_dfs(
234237
headers: List[str] = ["", "", ""]
235238
group_data: List[List[Any]] = [[], [], []]
236239

237-
with read_ags_source(source, encoding=encoding) as file:
240+
with open_ags_source(source, encoding=encoding) as file:
238241
for i, line in enumerate(file):
239242
line = line.strip()
240243
last_line_type = line_type

0 commit comments

Comments
 (0)