feat: add input cache parameter

Allob · Allob · commit c48849ee9c7b · 2026-02-04T16:57:49.000+01:00
diff --git a/README.md b/README.md
@@ -50,17 +50,27 @@ Following environment variables could be used for the configuration:
 |`DIAL_LOG_PARSER_DATE`| optional | Date to process logs for (default: yesterday) |
 |`DIAL_LOG_PARSER_DEBUG`| optional | Enables debug logging |
 |`DIAL_LOG_PARSER_FILENAME_REGEX`| optional | Allows to override the regex to match log file names (default: `date=(\d{4}-\d{2}-\d{2})(\d+)-(\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).log(.gz)?`) |
-|`DIAL_LOG_PARSER_INPUT_COMPRESSION`| optional | Compression type for input log files. Possible values: 'detect' - detect compression from file extension (default), 'none' - no compression, or well known compression types [supported by pyarrow](https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.open_input_stream) (like 'gzip'). |
+|`DIAL_LOG_PARSER_INPUT_COMPRESSION`| optional | Compression type for input log files. Possible values: <br/> `infer` - infer compression from file extension (default), <br/> `none` - no compression, <br/> or well known compression types [supported by fsspec](https://filesystem-spec.readthedocs.io/en/latest/features.html#transparent-text-mode-and-compression) (like `gzip`). |
+|`DIAL_LOG_PARSER_INPUT_CACHE`| optional | Cache type for input filesystem. Possible values: <br/> `default` - use default caching behavior (default), <br/> `none` - disable caching, <br/> or cache types supported by fsspec (like `readahead`, `bytes`, etc.). <br/> See https://filesystem-spec.readthedocs.io/en/latest/api.html#read-buffering and specific filesystem documentation for details. |
 
 ### Storage specific environment variables
 
 Specific storage implementations may require additional environment variables to be set.
 
-For example, for S3, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY may be required. See https://s3fs.readthedocs.io/en/latest/#credentials
+For example, for S3, `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` may be required. See https://s3fs.readthedocs.io/en/latest/#credentials
 
 Fsspec compatible implementations should be supported (may require to install the extra packages to the docker).
 Check the list [Built-in Fsspec Implementations](https://filesystem-spec.readthedocs.io/en/latest/api.html#implementations) and [Other Known Fsspec Implementations](https://filesystem-spec.readthedocs.io/en/latest/api.html#external-implementations) for more details.
 
+#### Azure Blob Storage
+
+For Azure Blob Storage, see [adlfs documentation](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials) for the list of required environment variables.
+
+**Note**: `AZURE_STORAGE_ANON` should be explicitly set to `false` to use authenticated access. The default value in the adlfs library is `true`, which may lead to authentication issues when accessing private blobs.
+
+If you store the logs compressed as `.log.gz` and the `Content-Encoding` header for the blob is set to `gzip`, you may encounter an issue where adlfs returns the decompressed file content but reports the size of the compressed file. This confuses the caching and decompression logic in fsspec and may lead to an error when the parser tries to read the file content.
+
+To work around this issue, set `DIAL_LOG_PARSER_INPUT_COMPRESSION=none` to explicitly disable decompression in the parser even if the file name ends with `.gz`, and set `DIAL_LOG_PARSER_INPUT_CACHE=none` to disable caching and avoid the file size mismatch. This way the parser reads the file content as is, without trying to decompress or cache it.
 
 ### Command-line arguments
 ```
@@ -77,10 +87,17 @@ Options:
                             [env var: DIAL_LOG_PARSER_FILENAME_REGEX; default: date=(\d{4}-\d{2}-\d{2})(\d+)-(\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).log(.gz)?]
   --input-compression TEXT  Compression type for input log files.
                             Possible values:
-                            'detect' - detect compression from file extension (default),
+                            'infer' - infer compression from file extension (default),
                             'none' - no compression,
-                            or well known compression types supported by pyarrow (like 'gzip').
-                            [env var: DIAL_LOG_PARSER_INPUT_COMPRESSION; default: detect]
+                            or well known compression types supported by fsspec (like 'gzip').
+                            [env var: DIAL_LOG_PARSER_INPUT_COMPRESSION; default: infer]
+  --input-cache TEXT        Cache type for input filesystem. Possible values:
+                            'default' - use default caching behavior (default),
+                            'none' - disable caching,
+                            or cache types supported by fsspec (like 'readahead', 'bytes', etc.).
+                            See https://filesystem-spec.readthedocs.io/en/latest/api.html#read-buffering and specific filesystem documentation
+                            for details.
+                            [env var: DIAL_LOG_PARSER_INPUT_CACHE; default: default]
   --help                    Show this message and exit.
 ```
 
diff --git a/src/aidial_log_parser/parse_logs.py b/src/aidial_log_parser/parse_logs.py
@@ -27,7 +27,11 @@
 
 DEFAULT_FILENAME_REGEX_COMPILED = re.compile(DEFAULT_FILENAME_REGEX)
 
-DEFAULT_INPUT_COMPRESSION = "detect"
+DEFAULT_INPUT_COMPRESSION = "infer"
+
+DEFAULT_READ_KWARGS = {
+    "compression": DEFAULT_INPUT_COMPRESSION,
+}
 
 DEPLOYMENT_FIELD_NAME = "deployment"
 
@@ -155,8 +159,8 @@ def from_file_path(
 
 def read_data(
     input_files: list[InputFile],
-    filesystem,
-    compression: str | None = DEFAULT_INPUT_COMPRESSION,
+    filesystem: fsspec.AbstractFileSystem,
+    **kwargs,
 ):
     read_options = pj.ReadOptions(
         use_threads=False,
@@ -165,17 +169,15 @@ def read_data(
     with click.progressbar(input_files, label="Reading files", show_pos=True) as files:
         for input_file in files:
             logging.info(f"Reading file {input_file.path}")
-            with filesystem.open_input_stream(
-                input_file.path, compression=compression
-            ) as f:
+            with filesystem.open(input_file.path, mode="rb", **kwargs) as f:
                 table = pj.read_json(f, read_options)
                 for batch in table.to_batches():
                     yield input_file.date, batch
 
 
 def list_files(
     input_dir: str,
-    filesystem,
+    filesystem: fs.PyFileSystem,
     log_date: pa.Date32Scalar,
     filename_regex: re.Pattern,
 ) -> list[InputFile]:
@@ -389,7 +391,7 @@ def parse_logs(
     output_dir: str,
     date: pa.Date32Scalar,
     filename_regex: re.Pattern = DEFAULT_FILENAME_REGEX_COMPILED,
-    input_compression: str | None = DEFAULT_INPUT_COMPRESSION,
+    read_kwargs: dict = DEFAULT_READ_KWARGS,
 ):
     in_fs_fsspec, input_dir_path = fsspec.url_to_fs(input_dir)
     in_fs = fs.PyFileSystem(fs.FSSpecHandler(in_fs_fsspec))
@@ -407,8 +409,8 @@ def parse_logs(
 
     input_batches_iter = read_data(
         input_files,
-        filesystem=in_fs,
-        compression=input_compression,
+        filesystem=in_fs_fsspec,
+        **read_kwargs,
     )
     logging.debug(f"Output schema: {OUT_SCHEMA}")
 
@@ -436,6 +438,21 @@ def parse_compression_param(
     return value
 
 
+def parse_cache_param(
+    ctx: click.Context,
+    param: click.Parameter,
+    value: str,
+) -> dict:
+    # Returns a dict (not a plain value) so that "use the default" can be
+    # expressed as an absent key: "default" (any letter case) yields {}, leaving
+    # the choice of cache to the filesystem implementation, while any other
+    # value, including "none", is forwarded unchanged as cache_type.
+    if value.lower() == "default":
+        return {}
+
+    return {"cache_type": value}
+
+
 @click.command()
 @click.option(
     "-i",
@@ -482,16 +499,39 @@ def parse_compression_param(
     "--input-compression",
     type=str,
     help="""Compression type for input log files. Possible values:
-        'detect' - detect compression from file extension (default),
+        'infer' - infer compression from file extension (default),
         'none' - no compression,
-        or well known compression types supported by pyarrow (like 'gzip').
+        or well known compression types supported by fsspec (like 'gzip').
         """,
     callback=parse_compression_param,
     default=DEFAULT_INPUT_COMPRESSION,
     show_default=True,
     show_envvar=True,
 )
-def main(input, output, date, debug, filename_regex, input_compression):
+@click.option(
+    "--input-cache",
+    type=str,
+    help="""Cache type for input filesystem. Possible values:
+        'default' - use default caching behavior (default),
+        'none' - disable caching,
+        or cache types supported by fsspec (like 'readahead', 'bytes', etc.).
+        See https://filesystem-spec.readthedocs.io/en/latest/api.html#read-buffering
+        and specific filesystem documentation for details.
+    """,
+    default="default",
+    show_default=True,
+    show_envvar=True,
+    callback=parse_cache_param,
+)
+def main(
+    input: str,
+    output: str,
+    date: datetime.datetime,
+    debug: bool,
+    filename_regex: str,
+    input_compression: str,
+    input_cache: dict,
+):
     """Parse dial log files and repack it to parquet dataset."""
     if debug:
         logging.getLogger().setLevel(logging.DEBUG)
@@ -505,7 +545,10 @@ def main(input, output, date, debug, filename_regex, input_compression):
         output,
         pa.scalar(date, type=pa.date32()),
         filename_regex_compiled,
-        input_compression,
+        read_kwargs={
+            "compression": input_compression,
+            **input_cache,
+        },
     )