22
33import codecs
44import io
5- from contextlib import contextmanager
5+ from contextlib import contextmanager , nullcontext
66from pathlib import Path
77from typing import IO , Any , ContextManager , Dict , List
88
@@ -33,7 +33,7 @@ def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:
3333 FileNotFoundError: If a file path doesn't exist
3434 """
3535 # Set number of bytes to read for detection and required confidence
36- SAMPLE_SIZE = 10_000
36+ SAMPLE_SIZE = 1_000_000
3737 REQUIRED_CONFIDENCE = 0.7
3838
3939 def _detect_from_bytes (data : bytes ) -> str :
@@ -46,6 +46,9 @@ def _detect_from_bytes(data: bytes) -> str:
4646 if not encoding or confidence < REQUIRED_CONFIDENCE :
4747 return DEFAULT_ENCODING
4848
49+ if encoding .lower () == "ascii" :
50+ return "utf-8"
51+
4952 return encoding
5053
5154 def _read_from_path (path : Path ):
@@ -89,7 +92,6 @@ def _read_from_path(path: Path):
8992 original_position = source .tell ()
9093 source .seek (0 )
9194 sample = source .read (SAMPLE_SIZE )
92- encoding = _detect_from_bytes (sample )
9395 if isinstance (sample , bytes ):
9496 encoding = _detect_from_bytes (sample )
9597 else :
@@ -104,9 +106,9 @@ def _read_from_path(path: Path):
104106 raise TypeError (f"Unsupported input type for encoding detection: { type (source )} " )
105107
106108
107- def read_ags_source (
109+ def open_ags_source (
108110 source : str | Path | IO [str ] | IO [bytes ] | bytes , encoding = None
109- ) -> ContextManager [TextIOBase ]:
111+ ) -> ContextManager [io . TextIOBase ]:
110112 """Opens or wraps a given source for reading AGS (text-based) data.
111113
112114 Args:
@@ -123,41 +125,42 @@ def read_ags_source(
123125 Raises:
124126 TypeError: If the source type is unsupported or binary streams are not decoded.
125127 """
128+ try :
129+ codecs .lookup (encoding )
130+ except LookupError :
131+ raise ValueError (f"Unsupported encoding: { encoding } " )
126132
127133 @contextmanager
128- def string_source ( content : str ):
129- string_io = io .StringIO (content )
134+ def _bytes_source ( bytes_content : bytes ):
135+ string_io = io .StringIO (bytes_content . decode ( encoding ) )
130136 try :
131137 yield string_io
132138 finally :
133139 string_io .close ()
134140
135- if isinstance (source , str ):
141+ if isinstance (source , ( str , Path ) ):
136142 path = Path (source )
137143 if path .exists () and path .is_file ():
138144 return open (path , "r" , encoding = encoding )
139145 raise FileNotFoundError (f"Path does not exist or is not a file: { source } " )
140146
141- elif isinstance (source , Path ):
142- if source .exists () and source .is_file ():
143- return open (source , "r" , encoding = encoding )
144- raise FileNotFoundError (f"Path does not exist or is not a file: { source } " )
145-
146- elif isinstance (source , bytes ):
147- return string_source (source .decode (encoding ))
147+ elif isinstance (source , io .TextIOBase ):
148+ source .seek (0 )
149+ return nullcontext (source )
148150
149- elif isinstance (source , io .BytesIO ):
150- return string_source (source .getvalue ().decode (encoding ))
151+ elif isinstance (source , io .BufferedIOBase ):
152+ text_stream = io .TextIOWrapper (source , encoding = encoding )
153+ text_stream .seek (0 )
154+ return nullcontext (text_stream )
151155
152- elif hasattr (source , "read" ):
153- # reset the cursor to the beginning
154- try :
155- source .seek (0 )
156- except (AttributeError , io .UnsupportedOperation ):
157- pass
158- return nullcontext (source )
156+ elif isinstance (source , bytes ):
157+ return _bytes_source (source )
159158
160- raise TypeError (f"Unsupported input type: { type (source )} " )
159+ else :
160+ raise TypeError (
161+ f"Unsupported source type: { type (source )} . "
162+ "Expected str, Path, IO[str], IO[bytes], or bytes."
163+ )
161164
162165
163166def ags_to_dfs (
@@ -182,7 +185,7 @@ def ags_to_dfs(
182185 encoding = detect_encoding (source )
183186
184187 # Get first non-blank line, `None` if all lines are blank
185- with read_ags_source (source , encoding = encoding ) as f :
188+ with open_ags_source (source , encoding = encoding ) as f :
186189 first_line = next ((line .strip () for line in f if line .strip ()), None )
187190
188191 if first_line :
@@ -234,7 +237,7 @@ def ags3_to_dfs(
234237 headers : List [str ] = ["" , "" , "" ]
235238 group_data : List [List [Any ]] = [[], [], []]
236239
237- with read_ags_source (source , encoding = encoding ) as file :
240+ with open_ags_source (source , encoding = encoding ) as file :
238241 for i , line in enumerate (file ):
239242 line = line .strip ()
240243 last_line_type = line_type
0 commit comments