Eventual-Inc
diff --git a/‎Cargo.lock‎
Lines changed: 3 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎daft/daft/__init__.pyi‎
Lines changed: 4 additions & 0 deletions b/‎daft/daft/__init__.pyi‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎daft/dataframe/dataframe.py‎
Lines changed: 29 additions & 0 deletions b/‎daft/dataframe/dataframe.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎daft/execution/metadata.py‎
Lines changed: 10 additions & 0 deletions b/‎daft/execution/metadata.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎daft/io/_csv.py‎
Lines changed: 5 additions & 0 deletions b/‎daft/io/_csv.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎daft/io/_parquet.py‎
Lines changed: 12 additions & 1 deletion b/‎daft/io/_parquet.py‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎daft/io/iceberg/_iceberg.py‎
Lines changed: 7 additions & 1 deletion b/‎daft/io/iceberg/_iceberg.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎daft/io/iceberg/iceberg_scan.py‎
Lines changed: 6 additions & 1 deletion b/‎daft/io/iceberg/iceberg_scan.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎docs/SUMMARY.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/SUMMARY.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/connectors/generic-file-source-options.md‎
Lines changed: 122 additions & 0 deletions b/‎docs/connectors/generic-file-source-options.md‎
Lines changed: 122 additions & 0 deletions
@@ -282,6 +282,7 @@ class ParquetSourceConfig:
         field_id_mapping: dict[int, PyField] | None = None,
         row_groups: list[list[int]] | None = None,
         chunk_size: int | None = None,
+        ignore_corrupt_files: bool = False,
     ): ...
 
 class CsvSourceConfig:
@@ -308,6 +309,7 @@ class CsvSourceConfig:
         comment: str | None,
         buffer_size: int | None = None,
         chunk_size: int | None = None,
+        ignore_corrupt_files: bool = False,
     ): ...
 
 class JsonSourceConfig:
@@ -2420,6 +2422,8 @@ class PyExecutionStats:
     def query_plan(self) -> str | None: ...
     def encode(self) -> bytes: ...
     def to_recordbatch(self) -> PyRecordBatch: ...
+    @property
+    def skipped_corrupt_files(self) -> list[tuple[str, str, bool]]: ...
 
 class PyResultReceiver:
     def __aiter__(self) -> PyResultReceiver: ...
 
@@ -204,6 +204,26 @@ def metrics(self) -> RecordBatch | None:
         else:
             return self._metadata.to_recordbatch() if self._metadata else None
 
+    @property
+    def skipped_corrupt_files(self) -> list[tuple[str, str, bool]]:
+        """Files skipped during the last execution due to ignore_corrupt_files=True.
+
+        Returns a list of ``(path, reason, partial)`` tuples. ``partial`` is ``True``
+        when some batches were already emitted before corruption was detected (the file
+        was not fully skipped). Raises ``ValueError`` if accessed before ``.collect()``.
+
+        Example::
+
+            df = daft.read_parquet("s3://bucket/data/", ignore_corrupt_files=True)
+            df.collect()
+            for path, reason, partial in df.skipped_corrupt_files:
+                tag = " (partial)" if partial else ""
+                print(f"Skipped{tag} {path}: {reason}")
+        """
+        if self._result_cache is None:
+            raise ValueError("skipped_corrupt_files is not available until the DataFrame has been collected")
+        return self._metadata.skipped_corrupt_files if self._metadata else []
+
     def pipe(
         self,
         function: Callable[Concatenate["DataFrame", P], T],
@@ -5359,6 +5379,15 @@ def _materialize_results(self) -> None:
             assert result is not None
             result.wait()
             self._metadata.write_mermaid()
+            skipped = self._metadata.skipped_corrupt_files if self._metadata else []
+            if skipped:
+                paths = "\n".join(f"  - {path}{' (partial)' if partial else ''}" for path, _, partial in skipped)
+                logger.warning(
+                    "%d file(s) were skipped due to corruption or being missing "
+                    "(ignore_corrupt_files=True). Use df.skipped_corrupt_files for details.\n%s",
+                    len(skipped),
+                    paths,
+                )
 
     @DataframePublicAPI
     def collect(self, num_preview_rows: int | None = 8) -> "DataFrame":
 
@@ -102,6 +102,16 @@ def encode(self) -> bytes:
     def to_recordbatch(self) -> RecordBatch:
         return RecordBatch._from_pyrecordbatch(self._py.to_recordbatch())
 
+    @property
+    def skipped_corrupt_files(self) -> list[tuple[str, str, bool]]:
+        """Files skipped during execution due to ignore_corrupt_files=True.
+
+        Returns the ``(path, reason, partial)`` tuples exposed by the wrapped
+        ``self._py`` object. ``partial=True`` means some batches were already
+        emitted before corruption was detected; the file was not fully skipped.
+        """
+        return self._py.skipped_corrupt_files
+
     def _plan_to_mermaid_string(self) -> str:
         """Convert query_plan dict to mermaid diagram string (bottom-up)."""
         metrics = {int(item["id"]): item["stats"] for item in self.to_recordbatch().to_pylist()}
 
@@ -35,6 +35,7 @@ def read_csv(
     io_config: IOConfig | None = None,
     file_path_column: str | None = None,
     hive_partitioning: bool = False,
+    ignore_corrupt_files: bool = False,
     _buffer_size: int | None = None,
     _chunk_size: int | None = None,
     checkpoint: "CheckpointConfig | None" = None,
@@ -57,6 +58,9 @@ def read_csv(
         checkpoint: Optional :class:`daft.CheckpointConfig` for progress tracking across runs. Bundles the
             checkpoint store, the source key column (``on=``), and optional anti-join tuning. Rows whose key
             already exists in the store are skipped on re-run. Requires the Ray runner.
+        ignore_corrupt_files: If True, corrupt or unreadable CSV files are silently skipped instead
+            of raising an error. Skipped files are recorded in ``df.skipped_corrupt_files`` after collection.
+            Defaults to False.
 
     Returns:
         DataFrame: parsed DataFrame
@@ -93,6 +97,7 @@ def read_csv(
         allow_variable_columns=allow_variable_columns,
         buffer_size=_buffer_size,
         chunk_size=_chunk_size,
+        ignore_corrupt_files=ignore_corrupt_files,
     )
     file_format_config = FileFormatConfig.from_csv_config(csv_config)
     storage_config = StorageConfig(True, io_config)
 
@@ -30,6 +30,7 @@ def read_parquet(
     file_path_column: str | None = None,
     hive_partitioning: bool = False,
     coerce_int96_timestamp_unit: str | TimeUnit | None = None,
+    ignore_corrupt_files: bool = False,
     _multithreaded_io: bool | None = None,
     _chunk_size: int | None = None,  # A hidden parameter for testing purposes.
     checkpoint: "CheckpointConfig | None" = None,
@@ -45,6 +46,11 @@ def read_parquet(
         file_path_column: Include the source path(s) as a column with this name. Defaults to None.
         hive_partitioning: Whether to infer hive_style partitions from file paths and include them as columns in the Dataframe. Defaults to False.
         coerce_int96_timestamp_unit: TimeUnit to coerce Int96 TimeStamps to. e.g.: [ns, us, ms], Defaults to None.
+        ignore_corrupt_files: If True, corrupt or unreadable Parquet files are silently skipped
+            instead of raising an error. Skipped files are recorded in ``df.skipped_corrupt_files`` after
+            collection. Only file-specific problems (bad magic bytes, truncated footer, corrupt
+            row-group data, or a file that went missing after listing) are ignored; network errors and permission errors are still raised.
+            Defaults to False.
         _multithreaded_io: Whether to use multithreading for IO threads. Setting this to False can be helpful in reducing
             the amount of system resources (number of connections and thread contention) when running in the Ray runner.
             Defaults to None, which will let Daft decide based on the runner it is currently using.
@@ -94,7 +100,12 @@ def read_parquet(
         raise ValueError("row_groups are only supported when reading multiple non-globbed/wildcarded files")
 
     file_format_config = FileFormatConfig.from_parquet_config(
-        ParquetSourceConfig(coerce_int96_timestamp_unit=pytimeunit, row_groups=row_groups, chunk_size=_chunk_size)
+        ParquetSourceConfig(
+            coerce_int96_timestamp_unit=pytimeunit,
+            row_groups=row_groups,
+            chunk_size=_chunk_size,
+            ignore_corrupt_files=ignore_corrupt_files,
+        )
     )
     storage_config = StorageConfig(multithreaded_io, io_config)
 
 
@@ -115,6 +115,7 @@ def read_iceberg(
     tag: str | None = None,
     io_config: IOConfig | None = None,
     checkpoint: "CheckpointConfig | None" = None,
+    ignore_corrupt_files: bool = False,
 ) -> DataFrame:
     """Create a DataFrame from an Iceberg table.
 
@@ -130,6 +131,9 @@ def read_iceberg(
         checkpoint: Optional :class:`daft.CheckpointConfig` for progress tracking across runs. Bundles the
             checkpoint store, the source key column (``on=``), and optional anti-join tuning. Rows whose key
             already exists in the store are skipped on re-run. Requires the Ray runner.
+        ignore_corrupt_files (bool): If True, silently skip corrupt or unreadable data files
+            instead of raising an error. Skipped files are recorded in ``df.skipped_corrupt_files``
+            after collection. Defaults to False.
 
     Returns:
         DataFrame: a DataFrame with the schema converted from the specified Iceberg table
@@ -175,7 +179,9 @@ def read_iceberg(
     multithreaded_io = runners.get_or_create_runner().name != "ray"
     storage_config = StorageConfig(multithreaded_io, io_config)
 
-    iceberg_operator = IcebergScanOperator(table, snapshot_id=snapshot_id, storage_config=storage_config)
+    iceberg_operator = IcebergScanOperator(
+        table, snapshot_id=snapshot_id, storage_config=storage_config, ignore_corrupt_files=ignore_corrupt_files
+    )
 
     handle = ScanOperatorHandle.from_python_scan_operator(iceberg_operator)
     builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
 
@@ -87,6 +87,7 @@ def __init__(
         iceberg_table: Table,
         snapshot_id: int | None,
         storage_config: StorageConfig,
+        ignore_corrupt_files: bool = False,
     ) -> None:
         super().__init__()
         iceberg_schema = (
@@ -96,6 +97,7 @@ def __init__(
         self._iceberg_schema = iceberg_schema
         self._snapshot_id = snapshot_id
         self._storage_config = storage_config
+        self._ignore_corrupt_files = ignore_corrupt_files
         self._field_id_mapping = visit(iceberg_schema, SchemaFieldIdMappingVisitor())
         self._schema = convert_iceberg_schema(iceberg_schema)
         self._partition_keys = iceberg_partition_spec_to_fields(iceberg_schema, self._iceberg_table.spec())
@@ -200,7 +202,10 @@ def _create_regular_scan_tasks(self, pushdowns: PyPushdowns) -> Iterator[ScanTas
             file_format = file.file_format
             if file_format == "PARQUET":
                 file_format_config = FileFormatConfig.from_parquet_config(
-                    ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+                    ParquetSourceConfig(
+                        field_id_mapping=self._field_id_mapping,
+                        ignore_corrupt_files=self._ignore_corrupt_files,
+                    )
                 )
             else:
                 # TODO: Support ORC and AVRO when we can read it
 
@@ -33,6 +33,7 @@
         * [Common Crawl](datasets/common-crawl.md)
     * Data Connectors
         * [Connectors](connectors/index.md)
+        * [Generic File Source Options](connectors/generic-file-source-options.md)
         * [Custom Connectors](connectors/custom.md)
         * [Custom Catalogs](connectors/custom-catalogs.md)
         * [AWS Glue](connectors/glue.md)
 
@@ -0,0 +1,122 @@
+# Generic File Source Options
+
+These options apply to `read_parquet`, `read_csv`, and `read_iceberg`. They are not tied to any single connector or format. Other readers (`read_json`, `read_warc`, `read_text`) do not support these options.
+
+## Ignoring Corrupt Files
+
+When reading large collections of files, some files may be unreadable — corrupt, truncated, or deleted between the time Daft lists them and the time it reads them. By default, Daft raises an error and halts the query. The `ignore_corrupt_files` option changes that behavior: qualifying files are silently skipped and the query continues with the remaining data.
+
+### Enabling `ignore_corrupt_files`
+
+Pass `ignore_corrupt_files=True` to any of the supported reader functions:
+
+```python
+import daft
+
+# Parquet / CSV (glob-based)
+df = daft.read_parquet("s3://my-bucket/data/**/*.parquet", ignore_corrupt_files=True)
+df = daft.read_csv("s3://my-bucket/data/**/*.csv", ignore_corrupt_files=True)
+
+# Iceberg
+import pyiceberg
+table = pyiceberg.table.StaticTable.from_metadata("s3://bucket/iceberg/metadata.json")
+df = daft.read_iceberg(table, ignore_corrupt_files=True)
+
+df.collect()
+```
+
+### What counts as "corrupt"
+
+Daft skips a file when it encounters a problem that is specific to the file itself and cannot be resolved by retrying:
+
+| Category | Examples |
+|---|---|
+| **Invalid format** | Bad Parquet magic bytes, truncated footer, mismatched row/column counts |
+| **Corrupt data** | Unreadable row group, invalid CSV encoding, wrong field count in a row |
+| **Missing file** | File deleted between listing and reading (e.g. concurrent compaction or partition overwrite) |
+
+Daft does **not** skip files for problems that lie outside the file itself. Transient infrastructure failures can and should be retried, and access problems need to be fixed at the source:
+
+| Category | Examples |
+|---|---|
+| **Network errors** | Connection reset, read timeout, throttled I/O |
+| **Permission errors** | Access denied, insufficient credentials |
+
+This distinction matters. Silently skipping a file because of a permission error would mask a misconfiguration that needs human attention.
+
+### Observability: knowing what was skipped
+
+`ignore_corrupt_files` is designed around the principle that **errors should be visible, not hidden**. Daft provides two complementary observability mechanisms.
+
+#### Python warning logs
+
+Daft emits a `WARNING`-level log message for every skipped file, including the file path and the reason:
+
+```
+WARNING daft.io - Skipping corrupt Parquet file s3://my-bucket/data/bad.parquet: ...
+WARNING daft.io - Skipping corrupt CSV chunk in s3://my-bucket/data/partial.csv: ...
+```
+
+You can see these with standard Python logging:
+
+```python
+import logging
+logging.basicConfig(level=logging.WARNING)
+```
+
+#### `df.skipped_corrupt_files` — programmatic access
+
+After materializing the dataframe with `.collect()`, the `skipped_corrupt_files` property returns the list of skipped files as `(path, reason, partial)` tuples, so your pipeline code can act on them:
+
+```python
+df = daft.read_parquet("s3://my-bucket/data/**/*.parquet", ignore_corrupt_files=True)
+df.collect()
+
+skipped = df.skipped_corrupt_files  # list[tuple[str, str, bool]]
+for path, reason, partial in skipped:
+    tag = " (partial)" if partial else ""
+    print(f"Skipped{tag}: {path}\n  Reason: {reason}")
+```
+
+Each entry is a `(path, reason, partial)` tuple. When `partial` is `True`, some batches from the file were already emitted before the corruption was detected — the file was not fully skipped. This can happen when corruption appears in a later row group.
+
+`skipped_corrupt_files` is available after calling `.collect()` on the dataframe. Other execution methods such as `.count_rows()` do not populate this property, because they operate on an internal dataframe rather than materializing the original one.
+
+### Handling skipped files in production
+
+Because `skipped_corrupt_files` is plain Python data, you can plug it directly into your existing alerting or data-quality workflows:
+
+```python
+import daft
+
+df = daft.read_parquet("s3://my-bucket/nightly/**/*.parquet", ignore_corrupt_files=True)
+df.collect().write_parquet("s3://my-bucket/processed/")  # collect first: only .collect() populates skipped_corrupt_files
+
+skipped = df.skipped_corrupt_files
+if skipped:
+    # Option 1: send an alert
+    send_alert(f"{len(skipped)} file(s) skipped during nightly run", details=skipped)
+
+    # Option 2: push to a dead-letter queue for later reprocessing
+    for path, reason, partial in skipped:
+        dead_letter_queue.put({"path": path, "reason": reason, "partial": partial, "run": TODAY})
+```
+
+This pattern — **errors visible, impact contained, tooling to fix** — lets automated batch jobs complete reliably while still surfacing problems for human review.
+
+!!! warning "Do not use `ignore_corrupt_files` as a catch-all"
+    This option is designed for files that are genuinely unreadable. It should not be used to suppress transient I/O errors (network issues, throttling) — Daft already retries those automatically. If you find yourself needing `ignore_corrupt_files` for a large fraction of your files, investigate the root cause rather than silencing the errors.
+
+### Supported formats
+
+| Format | File-level skip | Within-file error skip |
+|---|---|---|
+| Parquet (`read_parquet`) | Yes (bad footer, wrong magic bytes, file too small) | Yes (corrupt row group data) |
+| CSV (`read_csv`) | Yes (unreadable file, truncated) | Yes (bad encoding, wrong field count in chunk) |
+| Iceberg (`read_iceberg`) | Yes (data files go through the Rust Parquet reader) | Yes |
+
+!!! note "Iceberg delete files"
+    Corruption in Iceberg *delete files* is not covered. If a delete file is unreadable, Daft will raise an error regardless of `ignore_corrupt_files`. Delete files are small metadata structures and corruption there generally indicates a more serious catalog inconsistency.
+
+!!! note "Count pushdown"
+    When `ignore_corrupt_files` is enabled for Parquet, count pushdown is disabled. This means `df.count()` will read all row-group data instead of using the metadata-only optimization, which may be slower on large datasets.