Eventual-Inc
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi
Lines changed: 1 addition & 1 deletion
diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py
Lines changed: 6 additions & 6 deletions
diff --git a/daft/execution/metadata.py b/daft/execution/metadata.py
Lines changed: 2 additions & 2 deletions
diff --git a/daft/io/_csv.py b/daft/io/_csv.py
Lines changed: 1 addition & 1 deletion
diff --git a/daft/io/_parquet.py b/daft/io/_parquet.py
Lines changed: 1 addition & 1 deletion
diff --git a/docs/connectors/ignore-corrupt-files.md b/docs/connectors/ignore-corrupt-files.md
Lines changed: 6 additions & 6 deletions
diff --git a/src/daft-csv/src/read.rs b/src/daft-csv/src/read.rs
Lines changed: 5 additions & 5 deletions
diff --git a/src/daft-distributed/src/statistics/mod.rs b/src/daft-distributed/src/statistics/mod.rs
Lines changed: 8 additions & 8 deletions
diff --git a/src/daft-local-execution/src/pipeline.rs b/src/daft-local-execution/src/pipeline.rs
Lines changed: 3 additions & 3 deletions
diff --git a/src/daft-local-execution/src/sources/scan_task.rs b/src/daft-local-execution/src/sources/scan_task.rs
Lines changed: 11 additions & 11 deletions
@@ -2298,7 +2298,7 @@ class PyExecutionStats:
     def encode(self) -> bytes: ...
     def to_recordbatch(self) -> PyRecordBatch: ...
     @property
-    def skipped_files(self) -> list[tuple[str, str]]: ...
+    def skipped_corrupt_files(self) -> list[tuple[str, str]]: ...
 
 class PyResultReceiver:
     def __aiter__(self) -> PyResultReceiver: ...
 
@@ -186,7 +186,7 @@ def metrics(self) -> RecordBatch | None:
             return self._metadata.to_recordbatch() if self._metadata else None
 
     @property
-    def skipped_files(self) -> list[tuple[str, str]]:
+    def skipped_corrupt_files(self) -> list[tuple[str, str]]:
         """Files skipped during the last execution due to ignore_corrupt_files=True.
 
         Returns a list of ``(path, reason)`` tuples for every file that was skipped.
@@ -196,12 +196,12 @@ def skipped_files(self) -> list[tuple[str, str]]:
 
             df = daft.read_parquet("s3://bucket/data/", ignore_corrupt_files=True)
             df.collect()
-            for path, reason in df.skipped_files:
+            for path, reason in df.skipped_corrupt_files:
                 print(f"Skipped {path}: {reason}")
         """
         if self._result_cache is None:
-            raise ValueError("skipped_files is not available until the DataFrame has been collected")
-        return self._metadata.skipped_files if self._metadata else []
+            raise ValueError("skipped_corrupt_files is not available until the DataFrame has been collected")
+        return self._metadata.skipped_corrupt_files if self._metadata else []
 
     def pipe(
         self,
@@ -4884,12 +4884,12 @@ def _materialize_results(self) -> None:
             assert result is not None
             result.wait()
             self._metadata.write_mermaid()
-            skipped = self._metadata.skipped_files if self._metadata else []
+            skipped = self._metadata.skipped_corrupt_files if self._metadata else []
             if skipped:
                 paths = "\n".join(f"  - {path}" for path, _ in skipped)
                 logger.warning(
                     "%d file(s) were skipped due to corruption or being missing "
-                    "(ignore_corrupt_files=True). Use df.skipped_files for details.\n%s",
+                    "(ignore_corrupt_files=True). Use df.skipped_corrupt_files for details.\n%s",
                     len(skipped),
                     paths,
                 )
 
@@ -103,12 +103,12 @@ def to_recordbatch(self) -> RecordBatch:
         return RecordBatch._from_pyrecordbatch(self._py.to_recordbatch())
 
     @property
-    def skipped_files(self) -> list[tuple[str, str]]:
+    def skipped_corrupt_files(self) -> list[tuple[str, str]]:
         """Files skipped during execution due to ignore_corrupt_files=True.
 
         Returns a list of (path, reason) tuples for every file that was skipped.
         """
-        return self._py.skipped_files
+        return self._py.skipped_corrupt_files
 
     def _plan_to_mermaid_string(self) -> str:
         """Convert query_plan dict to mermaid diagram string (bottom-up)."""
 
@@ -50,7 +50,7 @@ def read_csv(
         file_path_column: Include the source path(s) as a column with this name. Defaults to None.
         hive_partitioning: Whether to infer hive_style partitions from file paths and include them as columns in the Dataframe. Defaults to False.
         ignore_corrupt_files: If True, corrupt or unreadable CSV files are silently skipped instead
-            of raising an error. Skipped files are recorded in ``df.skipped_files`` after collection.
+            of raising an error. Skipped files are recorded in ``df.skipped_corrupt_files`` after collection.
             Defaults to False.
 
     Returns:
 
@@ -41,7 +41,7 @@ def read_parquet(
         hive_partitioning: Whether to infer hive_style partitions from file paths and include them as columns in the Dataframe. Defaults to False.
         coerce_int96_timestamp_unit: TimeUnit to coerce Int96 TimeStamps to. e.g.: [ns, us, ms], Defaults to None.
         ignore_corrupt_files: If True, corrupt or unreadable Parquet files are silently skipped
-            instead of raising an error. Skipped files are recorded in ``df.skipped_files`` after
+            instead of raising an error. Skipped files are recorded in ``df.skipped_corrupt_files`` after
             collection. Only genuine format errors (bad magic bytes, truncated footer, corrupt
             row-group data) are ignored; network errors and permission errors are still raised.
             Defaults to False.
 
@@ -60,32 +60,32 @@ import logging
 logging.basicConfig(level=logging.WARNING)
 ```
 
-### `df.skipped_files` — programmatic access
+### `df.skipped_corrupt_files` — programmatic access
 
-After calling `.collect()`, the `skipped_files` property returns the list of skipped `(path, reason)` pairs as structured data, so your pipeline code can act on them:
+After calling `.collect()`, the `skipped_corrupt_files` property returns the list of skipped `(path, reason)` pairs as structured data, so your pipeline code can act on them:
 
 ```python
 df = daft.read_parquet("s3://my-bucket/data/**/*.parquet", ignore_corrupt_files=True)
 df.collect()
 
-skipped = df.skipped_files  # list[tuple[str, str]]
+skipped = df.skipped_corrupt_files  # list[tuple[str, str]]
 for path, reason in skipped:
     print(f"Skipped: {path}\n  Reason: {reason}")
 ```
 
-`skipped_files` is available after any action that triggers execution (`.collect()`, `.write_parquet()`, etc.).
+`skipped_corrupt_files` is available after any action that triggers execution (`.collect()`, `.write_parquet()`, etc.).
 
 ## Handling skipped files in production
 
-Because `skipped_files` is plain Python data, you can plug it directly into your existing alerting or data-quality workflows:
+Because `skipped_corrupt_files` is plain Python data, you can plug it directly into your existing alerting or data-quality workflows:
 
 ```python
 import daft
 
 df = daft.read_parquet("s3://my-bucket/nightly/**/*.parquet", ignore_corrupt_files=True)
 df.write_parquet("s3://my-bucket/processed/")
 
-skipped = df.skipped_files
+skipped = df.skipped_corrupt_files
 if skipped:
     # Option 1: send an alert
     send_alert(f"{len(skipped)} file(s) skipped during nightly run", details=skipped)
 
@@ -30,7 +30,7 @@ use tokio_util::io::StreamReader;
 
 use crate::{CsvConvertOptions, CsvParseOptions, CsvReadOptions, metadata::read_csv_schema_single};
 
-type SkippedFilesCollector = Option<std::sync::Arc<std::sync::Mutex<Vec<(String, String)>>>>;
+type SkippedCorruptFilesCollector = Option<std::sync::Arc<std::sync::Mutex<Vec<(String, String)>>>>;
 
 trait ByteRecordChunkStream: Stream<Item = super::Result<Vec<csv_async::ByteRecord>>> {}
 impl<S> ByteRecordChunkStream for S where S: Stream<Item = super::Result<Vec<csv_async::ByteRecord>>>
@@ -171,7 +171,7 @@ pub async fn stream_csv(
     io_stats: Option<IOStatsRef>,
     max_chunks_in_flight: Option<usize>,
     ignore_corrupt_files: bool,
-    skipped_files: SkippedFilesCollector,
+    skipped_corrupt_files: SkippedCorruptFilesCollector,
 ) -> DaftResult<BoxStream<'static, DaftResult<RecordBatch>>> {
     let (source_type, _) = parse_url(&uri)?;
     let is_compressed = CompressionCodec::from_uri(&uri).is_some();
@@ -207,10 +207,10 @@ pub async fn stream_csv(
                 // Level 2: filter per-chunk errors that indicate format corruption
                 // (e.g. bad encoding or wrong field count discovered mid-stream).
                 let uri_for_warn = uri.clone();
-                let skipped_files_inner = skipped_files.clone();
+                let skipped_corrupt_files_inner = skipped_corrupt_files.clone();
                 let filtered = stream.filter_map(move |result| {
                     let uri_w = uri_for_warn.clone();
-                    let skipped = skipped_files_inner.clone();
+                    let skipped = skipped_corrupt_files_inner.clone();
                     futures::future::ready(match result {
                         Ok(batch) => Some(Ok(batch)),
                         Err(ref e) if is_csv_corrupt(e) => {
@@ -233,7 +233,7 @@ pub async fn stream_csv(
         // Level 1: file-open / schema-inference errors (format error or truncated file).
         Err(ref e) if ignore_corrupt_files && is_csv_corrupt(e) => {
             log::warn!("Skipping unreadable/corrupt CSV file {uri}: {e}");
-            if let Some(ref collector) = skipped_files
+            if let Some(ref collector) = skipped_corrupt_files
                 && let Ok(mut v) = collector.lock()
             {
                 v.push((uri, e.to_string()));
 
@@ -100,7 +100,7 @@ pub type StatisticsManagerRef = Arc<StatisticsManager>;
 pub struct StatisticsManager {
     runtime_node_managers: Arc<HashMap<NodeID, RuntimeNodeManager>>,
     subscribers: Mutex<Vec<Box<dyn StatisticsSubscriber>>>,
-    skipped_files: Mutex<Vec<(String, String)>>,
+    skipped_corrupt_files: Mutex<Vec<(String, String)>>,
 }
 
 impl StatisticsManager {
@@ -136,17 +136,17 @@ impl StatisticsManager {
         Ok(Arc::new(Self {
             runtime_node_managers,
             subscribers: Mutex::new(subscribers),
-            skipped_files: Mutex::new(vec![]),
+            skipped_corrupt_files: Mutex::new(vec![]),
         }))
     }
 
     pub fn handle_event(&self, event: TaskEvent) -> DaftResult<()> {
         // Accumulate skipped files from completed tasks so they are available in export_metrics().
         if let TaskEvent::Completed { ref stats, .. } = event
-            && !stats.skipped_files.is_empty()
-            && let Ok(mut v) = self.skipped_files.lock()
+            && !stats.skipped_corrupt_files.is_empty()
+            && let Ok(mut v) = self.skipped_corrupt_files.lock()
         {
-            v.extend(stats.skipped_files.iter().cloned());
+            v.extend(stats.skipped_corrupt_files.iter().cloned());
         }
 
         for node_id in &event.context().node_ids {
@@ -173,11 +173,11 @@ impl StatisticsManager {
             .values()
             .map(RuntimeNodeManager::export_snapshot)
             .collect();
-        let skipped_files = self
-            .skipped_files
+        let skipped_corrupt_files = self
+            .skipped_corrupt_files
             .lock()
             .map(|v| v.clone())
             .unwrap_or_default();
-        ExecutionStats::new("".into(), nodes).with_skipped_files(skipped_files)
+        ExecutionStats::new("".into(), nodes).with_skipped_corrupt_files(skipped_corrupt_files)
     }
 }
@@ -263,7 +263,7 @@ pub struct BuilderContext {
     pub meter: Meter,
     context: HashMap<String, String>,
     shuffle_server: Option<(Arc<ShuffleFlightServer>, String)>,
-    pub skipped_files: std::sync::Arc<std::sync::Mutex<Vec<(String, String)>>>,
+    pub skipped_corrupt_files: std::sync::Arc<std::sync::Mutex<Vec<(String, String)>>>,
 }
 
 impl BuilderContext {
@@ -283,7 +283,7 @@ impl BuilderContext {
             meter,
             context,
             shuffle_server,
-            skipped_files: std::sync::Arc::new(std::sync::Mutex::new(Vec::new())),
+            skipped_corrupt_files: std::sync::Arc::new(std::sync::Mutex::new(Vec::new())),
         }
     }
 
@@ -400,7 +400,7 @@ fn physical_plan_to_pipeline(
                 pushdowns.clone(),
                 schema.clone(),
                 cfg,
-                Some(ctx.skipped_files.clone()),
+                Some(ctx.skipped_corrupt_files.clone()),
             );
             SourceNode::new(
                 Box::new(scan_task_source),
 
@@ -30,15 +30,15 @@ use crate::{
     },
 };
 
-type SkippedFilesCollector = Option<Arc<std::sync::Mutex<Vec<(String, String)>>>>;
+type SkippedCorruptFilesCollector = Option<Arc<std::sync::Mutex<Vec<(String, String)>>>>;
 
 pub struct ScanTaskSource {
     receiver: UnboundedReceiver<(InputId, Vec<ScanTaskRef>)>,
     source_config: Option<Arc<SourceConfig>>,
     pushdowns: Pushdowns,
     schema: SchemaRef,
     num_parallel_tasks: usize,
-    skipped_files: SkippedFilesCollector,
+    skipped_corrupt_files: SkippedCorruptFilesCollector,
 }
 
 impl ScanTaskSource {
@@ -48,7 +48,7 @@ impl ScanTaskSource {
         pushdowns: Pushdowns,
         schema: SchemaRef,
         cfg: &DaftExecutionConfig,
-        skipped_files: SkippedFilesCollector,
+        skipped_corrupt_files: SkippedCorruptFilesCollector,
     ) -> Self {
         let num_cpus = get_compute_pool_num_threads();
         let num_parallel_tasks = if cfg.scantask_max_parallel > 0 {
@@ -62,7 +62,7 @@ impl ScanTaskSource {
             pushdowns,
             schema,
             num_parallel_tasks,
-            skipped_files,
+            skipped_corrupt_files,
         }
     }
 
@@ -75,7 +75,7 @@ impl ScanTaskSource {
         chunk_size: usize,
         schema: SchemaRef,
         maintain_order: bool,
-        skipped_files: SkippedFilesCollector,
+        skipped_corrupt_files: SkippedCorruptFilesCollector,
     ) -> common_runtime::RuntimeTask<DaftResult<()>> {
         let io_runtime = get_io_runtime(true);
 
@@ -123,7 +123,7 @@ impl ScanTaskSource {
                         chunk_size,
                         sender,
                         input_id,
-                        skipped_files.clone(),
+                        skipped_corrupt_files.clone(),
                     ));
                 }
 
@@ -241,7 +241,7 @@ impl Source for ScanTaskSource {
             chunk_size,
             self.schema.clone(),
             maintain_order,
-            self.skipped_files.clone(),
+            self.skipped_corrupt_files.clone(),
         );
         let result_stream = output_receiver.into_stream().map(Ok);
         let combined_stream = combine_stream(result_stream, processor_task.map(|x| x?));
@@ -494,7 +494,7 @@ async fn forward_scan_task_stream(
     chunk_size: usize,
     sender: ScanTaskOutputSender,
     input_id: InputId,
-    skipped_files: SkippedFilesCollector,
+    skipped_corrupt_files: SkippedCorruptFilesCollector,
 ) -> DaftResult<InputId> {
     let schema = scan_task.materialized_schema();
     let mut stream = stream_scan_task(
@@ -503,7 +503,7 @@ async fn forward_scan_task_stream(
         delete_map,
         maintain_order,
         chunk_size,
-        skipped_files,
+        skipped_corrupt_files,
     )
     .await?;
     let mut has_data = false;
@@ -557,7 +557,7 @@ async fn stream_scan_task(
     delete_map: Option<Arc<HashMap<String, Vec<i64>>>>,
     maintain_order: bool,
     chunk_size: usize,
-    skipped_files: SkippedFilesCollector,
+    skipped_corrupt_files: SkippedCorruptFilesCollector,
 ) -> DaftResult<impl Stream<Item = DaftResult<MicroPartition>> + Send> {
     let pushdown_columns = scan_task
         .pushdowns
@@ -613,7 +613,7 @@ async fn stream_scan_task(
         delete_map,
         maintain_order,
         chunk_size,
-        skipped_files,
+        skipped_corrupt_files,
     )
     .await?;