Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
b422b80
webdataset schema finished, test cases not working
valerie-cal Mar 10, 2025
8184a1e
Merge branch '2.0' of https://github.com/nsaadhvi/codebase-deltacat i…
valerie-cal Mar 13, 2025
44e32a8
wds schema pytest and wds schema logic
valerie-cal Mar 13, 2025
fe348f8
edits to building wds schema
valerie-cal Mar 15, 2025
f8bf30f
Initial implementation of from_webdataset
valerie-cal Mar 31, 2025
7c5df3e
demo to load web data set
nsaadhvi Apr 3, 2025
7c8f880
fixed error with reading tar files and added sample tar files
valerie-cal Apr 3, 2025
dc348f3
finished pytests
valerie-cal Apr 4, 2025
1ee3ba2
Merge branch 'wds-schema' of https://github.com/nsaadhvi/codebase-del…
nsaadhvi Apr 4, 2025
0111b62
inconsistent jsons pytest added
valerie-cal Apr 4, 2025
124a3f6
comment out failing test
valerie-cal Apr 4, 2025
6ebcf18
restored failing test case
valerie-cal Apr 7, 2025
a843e07
Merge branch 'wds-schema' of https://github.com/nsaadhvi/codebase-del…
nsaadhvi Apr 7, 2025
5a060ac
create datasets in tmp_path
valerie-cal Apr 7, 2025
d8af0b4
added optional user batch_size input
nsaadhvi Apr 24, 2025
79c5cda
Merge branch 'wds-schema' of https://github.com/nsaadhvi/codebase-del…
nsaadhvi Apr 24, 2025
3511e5f
bird classification web data set demo
nsaadhvi Apr 24, 2025
4e642a0
webdataset schema finished, test cases not working
valerie-cal Mar 10, 2025
925fa67
wds schema pytest and wds schema logic
valerie-cal Mar 13, 2025
3c6ced0
edits to building wds schema
valerie-cal Mar 15, 2025
4900233
Initial implementation of from_webdataset
valerie-cal Mar 31, 2025
0e74062
fixed error with reading tar files and added sample tar files
valerie-cal Apr 3, 2025
d383439
finished pytests
valerie-cal Apr 4, 2025
a7e5b32
inconsistent jsons pytest added
valerie-cal Apr 4, 2025
2accfe9
comment out failing test
valerie-cal Apr 4, 2025
4b239e6
restored failing test case
valerie-cal Apr 7, 2025
ccc5440
create datasets in tmp_path
valerie-cal Apr 7, 2025
c328f2b
add image binary column to dataset and schema
valerie-cal May 1, 2025
acbddd4
resolved merge conflicts with 2.0 branch, wds tests passing
nsaadhvi May 1, 2025
1795a19
Throw error for image binary and batch row mismatch
nsaadhvi May 1, 2025
d154e17
edit demo to process images with binary column
nsaadhvi May 2, 2025
5fc1ca3
normalize function and media instead of image
valerie-cal Jun 15, 2025
0a2aaf1
merge
valerie-cal Jun 15, 2025
6bd5f1f
cleanup changes
valerie-cal Jun 15, 2025
5aac9c1
Update wds_demo.py with comments
NeeralBhalgat Jun 16, 2025
ca0f846
Add data read/write tests and remove internal attribute tests
valerie-cal Aug 17, 2025
0e3623d
Merge remote-tracking branch 'origin/2.0' into wds-schema
valerie-cal Aug 17, 2025
860a9d2
Fix accidental executable bit changes
valerie-cal Aug 17, 2025
784b650
Fix accidental executable bit changes
valerie-cal Aug 17, 2025
b8e819d
Manually resolve remaining merge conflicts
valerie-cal Aug 17, 2025
6a5b7df
Addressing comments, to be continued
valerie-cal Aug 18, 2025
ec52d2f
Improvements, to be continued
valerie-cal Aug 18, 2025
b5d638d
Cleaned and upgraded webdataset test suites
025rhu Aug 18, 2025
0f270bd
Removed static tar files used in old WebDataset test suite
025rhu Aug 18, 2025
1be6b29
Removed unnecessary set up code in TestFromWebDataset
025rhu Aug 18, 2025
787cf3f
Using WDS reader for tests, added intra-batch schema merge handling l…
025rhu Aug 21, 2025
c654e68
Make a WebDatasetReader class for Dataset's from_webdataset, and adde…
025rhu Sep 3, 2025
f3401d2
Upgraded inconsistent schema handling test, and added non-lossy promo…
025rhu Sep 3, 2025
90ae6e0
Cleaned code and update a doc string
025rhu Sep 3, 2025
bb86af5
Passing linter
025rhu Sep 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 33 additions & 33 deletions deltacat/examples/experimental/rivulet/pytorch_demo.ipynb
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"id": "2fb18b4d46a9548",
"metadata": {},
"source": [
"# PyTorch Demo: Sentiment Analysis and Question Detection with Rivulet Dataset\n",
"\n",
Expand All @@ -13,14 +14,16 @@
"- **Pytorch Integration:** Easily allows passing of data between pytorch models and transformers.\n",
"- **Non-Destructive Transformation:** Transforms the data (e.g., adding sentiment and question classification) without modifying the original dataset.\n",
"- **Exporting Data:** Exports the modified dataset to supported formats such as Parquet and JSON for further analysis."
],
"id": "2fb18b4d46a9548"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"cell_type": "code",
"outputs": [],
"source": [
"import torch\n",
"from typing import List\n",
Expand All @@ -29,14 +32,14 @@
"import pathlib\n",
"import pyarrow as pa\n",
"import pyarrow.csv as csv"
],
"id": "initial_id",
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"id": "51a2ddaed83da5f3",
"metadata": {},
"outputs": [],
"source": [
"# Load tokenizer and model for sentiment analysis\n",
"sentiment_tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-finetuned-sst-2-english\")\n",
Expand All @@ -46,14 +49,14 @@
"question_tokenizer = AutoTokenizer.from_pretrained(\"shahrukhx01/question-vs-statement-classifier\")\n",
"question_model = AutoModelForSequenceClassification.from_pretrained(\"shahrukhx01/question-vs-statement-classifier\")\n",
"question_model.eval()"
],
"id": "51a2ddaed83da5f3",
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"id": "b74792a57b9b28c1",
"metadata": {},
"outputs": [],
"source": [
"# Create a rivulet dataset using the CSV file\n",
"cwd = pathlib.Path.cwd()\n",
Expand All @@ -65,29 +68,29 @@
" merge_keys=\"msg_id\"\n",
")\n",
"ds.print(num_records=10)"
],
"id": "b74792a57b9b28c1",
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"id": "1b90411fd69378e9",
"metadata": {},
"outputs": [],
"source": [
"# define a new schema with fields for pytorch classification\n",
"ds.add_fields([\n",
" (\"msg_id\", dc.Datatype.int64()),\n",
" (\"sentiment\", dc.Datatype.float()),\n",
" (\"is_question\", dc.Datatype.float())\n",
"], schema_name=\"message_classifier\", merge_keys=[\"msg_id\"])"
],
"id": "1b90411fd69378e9",
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"id": "587f17e09e5d306a",
"metadata": {},
"outputs": [],
"source": [
"# compute classification values and update records in dataset\n",
"def compute_sentiments(batch: pa.RecordBatch) -> List[float]:\n",
Expand Down Expand Up @@ -134,21 +137,18 @@
"\n",
"dataset_writer.flush()\n",
"print(\"Sentiment and is_question values have been computed and updated in the dataset.\")"
],
"id": "587f17e09e5d306a",
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"id": "8ef2dd2a1bc4e66a",
"metadata": {},
"outputs": [],
"source": [
"# export to a supported format (JSON, PARQUET, FEATHER)\n",
"ds.export(file_uri=\"./output.json\", format=\"json\")"
],
"id": "8ef2dd2a1bc4e66a",
"outputs": [],
"execution_count": null
]
}
],
"metadata": {
Expand Down
101 changes: 101 additions & 0 deletions deltacat/examples/experimental/rivulet/wds_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import torch
from deltacat.storage.rivulet import Dataset
import pyarrow as pa
from typing import List
from PIL import Image
import io
from deltacat.storage.rivulet.schema.schema import Datatype
from transformers import AutoImageProcessor, AutoModelForImageClassification


# Alternative sample archive (ImageNet shard) — swap in to run on real photos.
# tar_path = "deltacat/tests/test_utils/resources/imagenet1k-train-0000.tar"
tar_path = "deltacat/tests/test_utils/resources/nestedjson.tar"

# Build a rivulet Dataset directly from the WebDataset tar file; the schema
# is inferred from the samples inside the archive.
ds = Dataset.from_webdataset(
    name="bird_species_test",  # Name of the dataset
    file_uri=tar_path,  # Location of the tar file
    merge_keys="filename",  # Merge batches using the 'filename' key
)

# Print the available fields in the dataset
print(ds.fields)

# Load the image processor and classification model from HuggingFace
processor = AutoImageProcessor.from_pretrained("chriamue/bird-species-classifier")
model = AutoModelForImageClassification.from_pretrained(
    "chriamue/bird-species-classifier"
)
model.eval()  # inference mode: disables dropout / batch-norm updates


# Function to classify bird species from a record batch
def compute_bird_species(batch: pa.RecordBatch) -> List[str]:
    """Classify the bird species for every decodable image in *batch*.

    Decodes the ``image_binary`` column into RGB PIL images, runs all of
    them through the HuggingFace classifier in a single forward pass, and
    returns the predicted human-readable labels.

    NOTE(review): images whose bytes fail to decode are skipped, so the
    returned list can be shorter than the batch — confirm callers handle
    a shorter result.
    """
    # Decode the raw image bytes; log and skip any corrupt entries.
    decoded_images = []
    for img_binary in batch.column("image_binary").to_pylist():
        try:
            decoded_images.append(Image.open(io.BytesIO(img_binary)).convert("RGB"))
        except Exception as e:
            print(f"Error reading image: {e}")

    # Nothing decodable: no predictions to make.
    if not decoded_images:
        return []

    # Preprocess and classify the whole batch in one forward pass.
    model_inputs = processor(images=decoded_images, return_tensors="pt")
    with torch.no_grad():  # inference only; gradients are not needed
        logits = model(**model_inputs).logits

    # Map the argmax indices to human-readable class labels.
    predicted_ids = torch.argmax(logits, dim=1).tolist()
    return [model.config.id2label[idx] for idx in predicted_ids]


# Add new fields to the dataset: filename and predicted bird species
ds.add_fields(
    [
        ("filename", Datatype.string()),  # String type for filename
        ("bird_species", Datatype.string()),  # String type for predicted label
    ],
    schema_name="bird_species_classifier",
    merge_keys=["filename"],
)  # Schema name and merge key

# Initialize writer to store output under the new schema
dataset_writer = ds.writer(schema_name="bird_species_classifier")

# Iterate over each Arrow batch in the dataset
for batch in ds.scan().to_arrow():
    print(batch)  # Print the batch contents
    filenames = batch.column("filename").to_pylist()  # Extract filenames
    bird_labels = compute_bird_species(batch)  # Run classification on batch

    rows_to_write = []  # Prepare rows to be written
    if bird_labels:
        # Create a list of dictionaries combining filename and predicted species.
        # NOTE(review): compute_bird_species drops images that fail to decode,
        # so bird_labels can be shorter than filenames; zip() then silently
        # pairs labels with the wrong files — confirm all inputs decode.
        rows_to_write = [
            {"filename": fname, "bird_species": bird_species}
            for fname, bird_species in zip(filenames, bird_labels)
        ]
        print("ROWS", rows_to_write)  # Print rows to be written
    dataset_writer.write(rows_to_write)  # Write the output to dataset

dataset_writer.flush()  # Persist all buffered rows to storage

# Export the results to a local JSON file
ds.export(file_uri="./bird_classification_species_predictions.json", format="json")

print("Bird species classification complete.")
78 changes: 75 additions & 3 deletions deltacat/experimental/storage/rivulet/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
from deltacat.experimental.storage.rivulet.reader.query_expression import (
QueryExpression,
)

from deltacat.experimental.storage.rivulet.reader.webdataset_reader import (
WebDatasetReader,
)
from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
MemtableDatasetWriter,
Expand Down Expand Up @@ -479,6 +481,77 @@ def from_json(

return dataset

@classmethod
def from_webdataset(
    cls,
    name: str,
    file_uri: str,
    merge_keys: str | Iterable[str] = None,
    metadata_uri: Optional[str] = None,
    schema_mode: str = "union",
    batch_size: Optional[int] = 1,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    namespace: str = DEFAULT_NAMESPACE,
) -> "Dataset":
    """
    Create a Dataset from a single webdataset tar file.

    TODO: Add support for reading directories with multiple WDS files.

    Args:
        name: Unique identifier for the dataset.
        file_uri: Path to a single webdataset file.
        merge_keys: Fields to specify as merge keys for future 'zipper merge'
            operations on the dataset.
        metadata_uri: Base URI for the dataset, where dataset metadata is
            stored. If not specified, will be placed in
            ${dirname(file_uri)}/riv-meta on the data file's filesystem.
        schema_mode: Currently ignored as this is for a single file.
        batch_size: Number of samples per record batch read from the tar file.
        filesystem: Optional explicit pyarrow filesystem; inferred from the
            URIs when omitted.
        namespace: Namespace the dataset is registered under.

    Returns:
        Dataset: New dataset instance with the schema automatically inferred
        from the tar file.

    Raises:
        ValueError: If file_uri and metadata_uri resolve to different
            filesystem types.
    """
    # TODO: integrate this with filesystem from deltacat catalog
    file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
    if metadata_uri is None:
        metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
        # Bug fix: the default metadata location lives beside the data file,
        # so it shares the data file's filesystem. Previously metadata_fs was
        # left unbound on this branch, raising NameError in the check below.
        metadata_fs = file_fs
    else:
        metadata_uri, metadata_fs = FileStore.filesystem(
            metadata_uri, filesystem=filesystem
        )

    # TODO: when integrating deltacat consider if we can support multiple filesystems
    if file_fs.type_name != metadata_fs.type_name:
        raise ValueError(
            "File URI and metadata URI must be on the same filesystem."
        )

    # Read the WebDataset into a PyArrow Table
    wds_parser = WebDatasetReader(
        name=name,
        file_uri=file_uri,
        merge_keys=merge_keys,
        schema_mode=schema_mode,
        batch_size=batch_size,
        namespace=namespace,
    )
    pyarrow_table = wds_parser.to_pyarrow()

    # Derive the rivulet schema from the Arrow schema of the parsed table.
    dataset_schema = Schema.from_pyarrow(
        pyarrow_table.schema, merge_keys=merge_keys
    )

    # Create the Dataset and write the parsed batches into it.
    dataset = cls(
        dataset_name=name,
        metadata_uri=metadata_uri,
        schema=dataset_schema,
        filesystem=file_fs,
        namespace=namespace,
    )

    writer = dataset.writer()
    writer.write(pyarrow_table.to_batches())
    writer.flush()
    return dataset

@classmethod
def from_csv(
cls,
Expand Down Expand Up @@ -522,7 +595,7 @@ def from_csv(
)

# Read the CSV file into a PyArrow Table
table = pyarrow.csv.read_csv(file_uri, filesystem=file_fs)
table = pyarrow.csv.read_csv(file_uri)
pyarrow_schema = table.schema

# Create the dataset schema
Expand Down Expand Up @@ -718,7 +791,6 @@ def writer(
:return: new dataset writer with a schema at the conjunction of the given schemas
"""
schema_name = schema_name or ALL

return MemtableDatasetWriter(
self._file_provider, self.schemas[schema_name], self._locator, file_format
)
Expand Down
Loading
Loading