Add an EncodingFormat for .arff files (#835)

ccl-core · web-flow · commit aa6d9e6485ff · 2025-04-01T13:47:49.000+02:00
diff --git a/datasets/1.0/credit-g/metadata.json b/datasets/1.0/credit-g/metadata.json
diff --git a/datasets/1.0/credit-g/output/data-file-description.jsonl b/datasets/1.0/credit-g/output/data-file-description.jsonl
@@ -0,0 +1,2 @@
+{"features/0-checking_status": "<0", "features/1-duration": 6.0, "features/2-credit_history": "critical/other existing credit", "features/3-purpose": "radio/tv", "features/4-credit_amount": 1169.0, "features/5-savings_status": "no known savings", "features/6-employment": ">=7", "features/7-installment_commitment": 4.0, "features/8-personal_status": "male single", "features/9-other_parties": "none", "features/10-residence_since": 4.0, "features/11-property_magnitude": "real estate", "features/12-age": 67.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 2.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "yes", "features/19-foreign_worker": "yes", "features/20-class": "good"}
+{"features/0-checking_status": "0<=X<200", "features/1-duration": 48.0, "features/2-credit_history": "existing paid", "features/3-purpose": "radio/tv", "features/4-credit_amount": 5951.0, "features/5-savings_status": "<100", "features/6-employment": "1<=X<4", "features/7-installment_commitment": 2.0, "features/8-personal_status": "female div/dep/mar", "features/9-other_parties": "none", "features/10-residence_since": 2.0, "features/11-property_magnitude": "real estate", "features/12-age": 22.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 1.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "none", "features/19-foreign_worker": "yes", "features/20-class": "bad"}
diff --git a/docs/croissant-spec-draft.md b/docs/croissant-spec-draft.md
@@ -622,8 +622,8 @@ Most of the important properties needed to describe a `FileObject` are defined i
   <tr>
     <td><a href="https://schema.org/encodingFormat">sc:encodingFormat</a></td>
     <td><a href="http://schema.org/Text">Text</a></td>
-    <td>ONE</td>
-    <td>The format of the file, given as a mime type.</td>
+    <td>MANY</td>
+    <td>The formats of the file, given as MIME types. Unregistered or niche encoding and file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia/Wikidata entry.</td>
   </tr>
   <tr>
     <td><a href="https://schema.org/sameAs">sc:sameAs</a></td>
diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -179,7 +179,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
     SCHEMA_ORG_DATE_PUBLISHED: "date_published",
     SCHEMA_ORG_DESCRIPTION: "description",
     SCHEMA_ORG_DISTRIBUTION: "distribution",
-    SCHEMA_ORG_ENCODING_FORMAT: "encoding_format",
+    SCHEMA_ORG_ENCODING_FORMAT: "encoding_formats",
     SCHEMA_ORG_KEYWORDS: "keywords",
     SCHEMA_ORG_LICENSE: "license",
     SCHEMA_ORG_MD5: "md5",
@@ -209,9 +209,14 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
 class EncodingFormat:
     """Supported MIME Types in Croissant.
 
+    Unregistered or niche encoding and file formats can be indicated instead via the most
+    appropriate URL, e.g. defining Web page or a Wikipedia/Wikidata entry.
+    Supersedes fileFormat.
+
     We inherit the wrong naming `encodingFormat` from https://schema.org/encodingFormat.
     """
 
+    ARFF = "https://ml.cms.waikato.ac.nz/weka/arff.html"
     CSV = "text/csv"
     GIT = "git+https"
     JPG = "image/jpeg"
diff --git a/python/mlcroissant/mlcroissant/_src/core/optional.py b/python/mlcroissant/mlcroissant/_src/core/optional.py
@@ -91,6 +91,11 @@ def librosa(cls) -> types.ModuleType:  # pylint: disable=invalid-name
         """Cached librosa module."""
         return _try_import("librosa", package_name="librosa")
 
+    @cached_class_property
+    def scipy(cls) -> types.ModuleType:  # pylint: disable=invalid-name
+        """Cached scipy module (optional; used to read ARFF files)."""
+        return _try_import("scipy", package_name="scipy")
+
     @cached_class_property
     def torchdata_datapipes(cls) -> types.ModuleType:
         """Cached torchdata module."""
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py b/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py
@@ -82,9 +82,9 @@ def _add_operations_for_file_object(
         operation = first_operation
         # Extract the file if needed
         if (
-            should_extract(node.encoding_format)
+            should_extract(node.encoding_formats)
             and isinstance(successor, (FileObject, FileSet))
-            and not should_extract(successor.encoding_format)
+            and not should_extract(successor.encoding_formats)
         ):
             operation = operation >> Extract(operations=operations, node=node)
         if isinstance(successor, FileSet):
@@ -93,7 +93,7 @@ def _add_operations_for_file_object(
                 >> FilterFiles(operations=operations, node=successor)
                 >> Concatenate(operations=operations, node=successor)
             )
-        if node.encoding_format and not should_extract(node.encoding_format):
+        if node.encoding_formats and not should_extract(node.encoding_formats):
             fields = tuple([
                 field for field in node.recursive_successors if isinstance(field, Field)
             ])
@@ -192,7 +192,10 @@ def from_nodes(cls, ctx: Context, metadata: Node) -> "OperationGraph":
         operations = Operations()
         for node in nx.topological_sort(ctx.graph):
             if isinstance(node, FileObject):
-                if node.encoding_format == EncodingFormat.GIT:
+                if (
+                    node.encoding_formats
+                    and EncodingFormat.GIT in node.encoding_formats
+                ):
                     _add_operations_for_git(operations, node, ctx.folder)
                 else:
                     _add_operations_for_file_object(operations, node, ctx.folder)
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/download.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/download.py
@@ -223,7 +223,10 @@ def call(self, *args) -> Path:
         del args  # unused
         filepath = get_download_filepath(self.node)
         if not filepath.exists():
-            if self.node.encoding_format == EncodingFormat.GIT:
+            if (
+                self.node.encoding_formats
+                and EncodingFormat.GIT in self.node.encoding_formats
+            ):
                 self._download_from_git(filepath)
             else:
                 self._download_from_http(filepath)
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/extract.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/extract.py
@@ -17,10 +17,12 @@
 from mlcroissant._src.structure_graph.nodes.file_object import FileObject
 
 
-def should_extract(encoding_format: str | None) -> bool:
+def should_extract(encoding_formats: list[str] | None) -> bool:
-    """Whether the encoding format should be extracted (zip or tar)."""
+    """Whether any of the encoding formats requires extraction (zip or tar)."""
+    if not encoding_formats:
+        return False
     return (
-        encoding_format == EncodingFormat.TAR or encoding_format == EncodingFormat.ZIP
+        EncodingFormat.TAR in encoding_formats or EncodingFormat.ZIP in encoding_formats
     )
 
 
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -91,7 +91,7 @@ def test_readfield_with_subfields():
                 id="file_id",
                 content_url=csv_file,
                 sha256="None",
-                encoding_format="text/csv",
+                encoding_formats=["text/csv"],
             )
         ]
         fields = [
@@ -244,7 +244,7 @@ def test_extract_lines(separator):
                 id="file_id",
                 content_url=path,
                 sha256="None",
-                encoding_format="text/plain",
+                encoding_formats=["text/plain"],
             )
         ]
         fields = []
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py
@@ -12,6 +12,7 @@
 from mlcroissant._src.core.constants import EncodingFormat
 from mlcroissant._src.core.git import download_git_lfs_file
 from mlcroissant._src.core.git import is_git_lfs_file
+from mlcroissant._src.core.optional import deps
 from mlcroissant._src.core.path import Path
 from mlcroissant._src.operation_graph.base_operation import Operation
 from mlcroissant._src.operation_graph.operations.download import is_url
@@ -21,6 +22,12 @@
 from mlcroissant._src.structure_graph.nodes.file_set import FileSet
 from mlcroissant._src.structure_graph.nodes.source import FileProperty
 
+try:
+    scipy = deps.scipy
+except ImportError:  # Also covers ModuleNotFoundError; scipy is only needed for ARFF.
+    scipy = None
+INSTALL_MESSAGE = "scipy is not installed and is a dependency."
+
 
 class ReadingMethod(enum.Enum):
     """Reading method derived from the fields that consume the FileObject/FileSet."""
@@ -82,65 +89,80 @@ class Read(Operation):
     folder: epath.Path
     fields: tuple[Field, ...]
 
-    def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
+    def _read_file_content(
+        self, encoding_formats: list[str], file: Path
+    ) -> pd.DataFrame:
-        """Extracts the `source` file to `target`."""
+        """Reads `file` into a pd.DataFrame according to `encoding_formats`.
+
+        ARFF takes precedence; otherwise the first supported format in the list wins.
+        Raises ValueError when none of the formats can be read.
+        """
         filepath = file.filepath
         if is_git_lfs_file(filepath):
             download_git_lfs_file(file)
         reading_method = _reading_method(self.node, self.fields)
+        if EncodingFormat.ARFF in encoding_formats:
+            if scipy is None:
+                raise NotImplementedError(INSTALL_MESSAGE)
+            # `loadarff` returns a `(data, meta)` tuple; only the records are used.
+            data = scipy.io.arff.loadarff(filepath)
+            if not isinstance(data, tuple) or len(data) != 2:
+                raise ValueError(
+                    "The loaded data from scipy.io.arff does not have the expected"
+                    " shape (a (data, meta) tuple). Please ensure the ARFF file is"
+                    " valid."
+                )
+            return pd.DataFrame(data[0])
 
         with filepath.open("rb") as file:
-            # TODO(https://github.com/mlcommons/croissant/issues/635).
-            if filepath.suffix == ".gz":
-                file = gzip.open(file, "rt", newline="")
-            if encoding_format == EncodingFormat.CSV:
-                return pd.read_csv(file)
-            elif encoding_format == EncodingFormat.TSV:
-                return pd.read_csv(file, sep="\t")
-            elif encoding_format == EncodingFormat.JSON:
-                json_content = json.load(file)
-                if reading_method == ReadingMethod.JSON:
-                    return parse_json_content(json_content, self.fields)
-                else:
-                    # Raw files are returned as a one-line pd.DataFrame.
-                    return pd.DataFrame({
-                        FileProperty.content: [json_content],
-                    })
-            elif encoding_format == EncodingFormat.JSON_LINES:
-                return pd.read_json(file, lines=True)
-            elif encoding_format == EncodingFormat.PARQUET:
-                try:
-                    df = pd.read_parquet(file)
-                    # Sometimes the author already set an index in Parquet, so we want
-                    # to reset it to always have the same format.
-                    df.reset_index(inplace=True)
-                    return df
-                except ImportError as e:
-                    raise ImportError(
-                        "Missing dependency to read Parquet files. pyarrow is not"
-                        " installed. Please, install `pip install"
-                        " mlcroissant[parquet]`."
-                    ) from e
-            elif encoding_format == EncodingFormat.TEXT:
-                if reading_method == ReadingMethod.LINES:
-                    return pd.read_csv(
-                        filepath, header=None, names=[FileProperty.lines]
-                    )
-                else:
+            # TODO(https://github.com/mlcommons/croissant/issues/635).
+            # Wrap once, outside the loop; re-wrapping a decompressed stream fails.
+            if filepath.suffix == ".gz":
+                file = gzip.open(file, "rt", newline="")
+            for encoding_format in encoding_formats:
+                if encoding_format == EncodingFormat.CSV:
+                    return pd.read_csv(file)
+                elif encoding_format == EncodingFormat.TSV:
+                    return pd.read_csv(file, sep="\t")
+                elif encoding_format == EncodingFormat.JSON:
+                    json_content = json.load(file)
+                    if reading_method == ReadingMethod.JSON:
+                        return parse_json_content(json_content, self.fields)
+                    # Raw files are returned as a one-line pd.DataFrame.
+                    return pd.DataFrame({
+                        FileProperty.content: [json_content],
+                    })
+                elif encoding_format == EncodingFormat.JSON_LINES:
+                    return pd.read_json(file, lines=True)
+                elif encoding_format == EncodingFormat.PARQUET:
+                    try:
+                        df = pd.read_parquet(file)
+                        # Sometimes the author already set an index in Parquet, so we
+                        # want to reset it to always have the same format.
+                        df.reset_index(inplace=True)
+                        return df
+                    except ImportError as e:
+                        raise ImportError(
+                            "Missing dependency to read Parquet files. pyarrow is not"
+                            " installed. Please, install `pip install"
+                            " mlcroissant[parquet]`."
+                        ) from e
+                elif encoding_format == EncodingFormat.TEXT:
+                    if reading_method == ReadingMethod.LINES:
+                        return pd.read_csv(
+                            filepath, header=None, names=[FileProperty.lines]
+                        )
+                    return pd.DataFrame({
+                        FileProperty.content: [file.read()],
+                    })
+                elif encoding_format in (EncodingFormat.MP3, EncodingFormat.JPG):
                     return pd.DataFrame({
                         FileProperty.content: [file.read()],
                     })
-            elif (
-                encoding_format == EncodingFormat.MP3
-                or encoding_format == EncodingFormat.JPG
-            ):
-                return pd.DataFrame({
-                    FileProperty.content: [file.read()],
-                })
-            else:
-                raise ValueError(
-                    f"Unsupported encoding format for file: {encoding_format}"
-                )
+            raise ValueError(
+                f"None of the provided encoding formats: {encoding_formats} for file"
+                f" {filepath} returned a valid pandas dataframe."
+            )
 
     def call(self, files: list[Path] | Path) -> pd.DataFrame:
         """See class' docstring."""
@@ -170,8 +192,8 @@ def call(self, files: list[Path] | Path) -> pd.DataFrame:
                     f'In node "{self.node.uuid}", file "{self.node.content_url}" is'
                     " either an invalid URL or an invalid path."
                 )
-            assert self.node.encoding_format, "Encoding format is not specified."
-            file_content = self._read_file_content(self.node.encoding_format, file)
+            assert self.node.encoding_formats, "Encoding format is not specified."
+            file_content = self._read_file_content(self.node.encoding_formats, file)
             if _should_append_line_numbers(self.fields):
                 file_content[FileProperty.lineNumbers] = range(len(file_content))
             file_content[FileProperty.filepath] = file.filepath
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read_test.py
@@ -44,7 +44,7 @@ def test_explicit_message_when_pyarrow_is_not_installed():
             read = Read(
                 operations=operations(),
                 node=create_test_file_object(
-                    encoding_format="application/x-parquet", content_url=content_url
+                    encoding_formats=["application/x-parquet"], content_url=content_url
                 ),
                 folder=folder,
                 fields=(),
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py
@@ -53,9 +53,14 @@ class FileObject(Node):
         input_types=[SDO.Text],
         url=SDO.description,
     )
-    encoding_format: str | None = mlc_dataclasses.jsonld_field(
+    encoding_formats: list[str] | None = mlc_dataclasses.jsonld_field(
+        cardinality="MANY",
         default=None,
-        description="The format of the file, given as a mime type.",
+        description=(
+            "The formats of the file, given as a mime type. Unregistered or niche"
+            " encoding and file formats can be indicated instead via the most"
+            " appropriate URL, e.g. defining Web page or a Wikipedia/Wikidata entry. "
+        ),
         input_types=[SDO.Text],
         url=SDO.encodingFormat,
     )
@@ -102,7 +107,7 @@ def __post_init__(self):
         Node.__post_init__(self)
         self.validate_name()
         uuid_field = "name" if self.ctx.is_v0() else "id"
-        self.assert_has_mandatory_properties("encoding_format", uuid_field)
+        self.assert_has_mandatory_properties("encoding_formats", uuid_field)
 
         if not self.contained_in:
             self.assert_has_mandatory_properties("content_url")
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py
@@ -31,7 +31,7 @@ def test_checks_are_performed(conforms_to, field_uuid):
         ctx = Context(conforms_to=conforms_to)
         create_test_node(FileObject, ctx=ctx)
         mandatory_mock.assert_has_calls([
-            mock.call("encoding_format", field_uuid), mock.call("content_url")
+            mock.call("encoding_formats", field_uuid), mock.call("content_url")
         ])
         exclusive_mock.assert_called_once_with(["md5", "sha256"])
         validate_name_mock.assert_called_once()
@@ -54,7 +54,7 @@ def test_checks_not_performed_for_live_dataset(conforms_to, field_uuid):
         ctx = Context(is_live_dataset=True, conforms_to=conforms_to)
         create_test_node(FileObject, ctx=ctx)
         mandatory_mock.assert_has_calls([
-            mock.call("encoding_format", field_uuid), mock.call("content_url")
+            mock.call("encoding_formats", field_uuid), mock.call("content_url")
         ])
         exclusive_mock.assert_not_called()
         validate_name_mock.assert_called_once()
@@ -86,7 +86,7 @@ def test_from_jsonld(encoding):
     assert file_object.id == "foo_id"
     assert file_object.description == "bar"
     assert file_object.content_url == "https://mlcommons.org"
-    assert file_object.encoding_format == encoding
+    assert file_object.encoding_formats == [encoding]
     assert (
         file_object.sha256
         == "48a7c257f3c90b2a3e529ddd2cca8f4f1bd8e49ed244ef53927649504ac55354"
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py
@@ -35,9 +35,14 @@ class FileSet(Node):
         input_types=[SDO.Text],
         url=SDO.description,
     )
-    encoding_format: str | None = mlc_dataclasses.jsonld_field(
+    encoding_formats: list[str] | None = mlc_dataclasses.jsonld_field(
+        cardinality="MANY",
         default=None,
-        description="The format of the file, given as a mime type.",
+        description=(
+            "The formats of the file, given as a mime type. Unregistered or niche"
+            " encoding and file formats can be indicated instead via the most"
+            " appropriate URL, e.g. defining Web page or a Wikipedia/Wikidata entry. "
+        ),
         input_types=[SDO.Text],
         url=SDO.encodingFormat,
     )
@@ -75,4 +80,4 @@ def __post_init__(self):
         Node.__post_init__(self)
         uuid_field = "name" if self.ctx.is_v0() else "id"
         self.validate_name()
-        self.assert_has_mandatory_properties("includes", "encoding_format", uuid_field)
+        self.assert_has_mandatory_properties("includes", "encoding_formats", uuid_field)
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set_test.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set_test.py
@@ -29,7 +29,7 @@ def test_checks_are_performed(conforms_to, field_uuid):
         ctx = Context(conforms_to=conforms_to)
         create_test_node(FileSet, ctx=ctx)
         mandatory_mock.assert_called_once_with(
-            "includes", "encoding_format", field_uuid
+            "includes", "encoding_formats", field_uuid
         )
         optional_mock.assert_not_called()
         validate_name_mock.assert_called_once()
@@ -54,7 +54,7 @@ def test_from_jsonld(conforms_to):
     assert file_set.id == "foo_id"
     assert file_set.description == "bar"
     assert file_set.contained_in == ["some.zip"]
-    assert file_set.encoding_format == "application/json"
+    assert file_set.encoding_formats == ["application/json"]
     assert file_set.excludes == ["*.csv"]
     assert file_set.includes == ["*.json"]
     assert not ctx.issues.errors
diff --git a/python/mlcroissant/pyproject.toml b/python/mlcroissant/pyproject.toml
diff --git a/python/mlcroissant/recipes/bounding-boxes.ipynb b/python/mlcroissant/recipes/bounding-boxes.ipynb
diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+{"features/0-checking_status": "<0", "features/1-duration": 6.0, "features/2-credit_history": "critical/other existing credit", "features/3-purpose": "radio/tv", "features/4-credit_amount": 1169.0, "features/5-savings_status": "no known savings", "features/6-employment": ">=7", "features/7-installment_commitment": 4.0, "features/8-personal_status": "male single", "features/9-other_parties": "none", "features/10-residence_since": 4.0, "features/11-property_magnitude": "real estate", "features/12-age": 67.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 2.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "yes", "features/19-foreign_worker": "yes", "features/20-class": "good"}
	`2`	+{"features/0-checking_status": "0<=X<200", "features/1-duration": 48.0, "features/2-credit_history": "existing paid", "features/3-purpose": "radio/tv", "features/4-credit_amount": 5951.0, "features/5-savings_status": "<100", "features/6-employment": "1<=X<4", "features/7-installment_commitment": 2.0, "features/8-personal_status": "female div/dep/mar", "features/9-other_parties": "none", "features/10-residence_since": 2.0, "features/11-property_magnitude": "real estate", "features/12-age": 22.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 1.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "none", "features/19-foreign_worker": "yes", "features/20-class": "bad"}
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ def test_readfield_with_subfields():`
`91`	`91`	`id="file_id",`
`92`	`92`	`content_url=csv_file,`
`93`	`93`	`sha256="None",`
`94`		`- encoding_format="text/csv",`
	`94`	`+ encoding_formats=["text/csv"],`
`95`	`95`	`)`
`96`	`96`	`]`
`97`	`97`	`fields = [`
`@@ -244,7 +244,7 @@ def test_extract_lines(separator):`
`244`	`244`	`id="file_id",`
`245`	`245`	`content_url=path,`
`246`	`246`	`sha256="None",`
`247`		`- encoding_format="text/plain",`
	`247`	`+ encoding_formats=["text/plain"],`
`248`	`248`	`)`
`249`	`249`	`]`
`250`	`250`	`fields = []`