Skip to content

Commit aa6d9e6

Browse files
authored
Add an EncodingFormat for .arff files (#835)
1 parent f441ad7 commit aa6d9e6

File tree

18 files changed

+1078
-79
lines changed

18 files changed

+1078
-79
lines changed

datasets/1.0/credit-g/metadata.json

Lines changed: 945 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"features/0-checking_status": "<0", "features/1-duration": 6.0, "features/2-credit_history": "critical/other existing credit", "features/3-purpose": "radio/tv", "features/4-credit_amount": 1169.0, "features/5-savings_status": "no known savings", "features/6-employment": ">=7", "features/7-installment_commitment": 4.0, "features/8-personal_status": "male single", "features/9-other_parties": "none", "features/10-residence_since": 4.0, "features/11-property_magnitude": "real estate", "features/12-age": 67.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 2.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "yes", "features/19-foreign_worker": "yes", "features/20-class": "good"}
2+
{"features/0-checking_status": "0<=X<200", "features/1-duration": 48.0, "features/2-credit_history": "existing paid", "features/3-purpose": "radio/tv", "features/4-credit_amount": 5951.0, "features/5-savings_status": "<100", "features/6-employment": "1<=X<4", "features/7-installment_commitment": 2.0, "features/8-personal_status": "female div/dep/mar", "features/9-other_parties": "none", "features/10-residence_since": 2.0, "features/11-property_magnitude": "real estate", "features/12-age": 22.0, "features/13-other_payment_plans": "none", "features/14-housing": "own", "features/15-existing_credits": 1.0, "features/16-job": "skilled", "features/17-num_dependents": 1.0, "features/18-own_telephone": "none", "features/19-foreign_worker": "yes", "features/20-class": "bad"}

docs/croissant-spec-draft.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,8 +622,8 @@ Most of the important properties needed to describe a `FileObject` are defined i
622622
<tr>
623623
<td><a href="https://schema.org/encodingFormat">sc:encodingFormat</a></td>
624624
<td><a href="http://schema.org/Text">Text</a></td>
625-
<td>ONE</td>
626-
<td>The format of the file, given as a mime type.</td>
625+
<td>MANY</td>
626+
<td>The formats of the file, each given as a MIME type. Unregistered or niche encoding and file formats can instead be indicated via the most appropriate URL, e.g. a defining Web page or a Wikipedia/Wikidata entry.</td>
627627
</tr>
628628
<tr>
629629
<td><a href="https://schema.org/sameAs">sc:sameAs</a></td>

python/mlcroissant/mlcroissant/_src/core/constants.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
179179
SCHEMA_ORG_DATE_PUBLISHED: "date_published",
180180
SCHEMA_ORG_DESCRIPTION: "description",
181181
SCHEMA_ORG_DISTRIBUTION: "distribution",
182-
SCHEMA_ORG_ENCODING_FORMAT: "encoding_format",
182+
SCHEMA_ORG_ENCODING_FORMAT: "encoding_formats",
183183
SCHEMA_ORG_KEYWORDS: "keywords",
184184
SCHEMA_ORG_LICENSE: "license",
185185
SCHEMA_ORG_MD5: "md5",
@@ -209,9 +209,14 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
209209
class EncodingFormat:
210210
"""Supported MIME Types in Croissant.
211211
212+
Unregistered or niche encoding and file formats can be indicated instead via the most
213+
appropriate URL, e.g. defining Web page or a Wikipedia/Wikidata entry.
214+
Supersedes fileFormat.
215+
212216
We inherit the wrong naming `encodingFormat` from https://schema.org/encodingFormat.
213217
"""
214218

219+
ARFF = "https://ml.cms.waikato.ac.nz/weka/arff.html"
215220
CSV = "text/csv"
216221
GIT = "git+https"
217222
JPG = "image/jpeg"

python/mlcroissant/mlcroissant/_src/core/optional.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,11 @@ def librosa(cls) -> types.ModuleType: # pylint: disable=invalid-name
9191
"""Cached librosa module."""
9292
return _try_import("librosa", package_name="librosa")
9393

94+
@cached_class_property
95+
def scipy(cls) -> types.ModuleType: # pylint: disable=invalid-name
96+
"""Cached scipy module."""
97+
return _try_import("scipy", package_name="scipy")
98+
9499
@cached_class_property
95100
def torchdata_datapipes(cls) -> types.ModuleType:
96101
"""Cached torchdata module."""

python/mlcroissant/mlcroissant/_src/operation_graph/graph.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,9 @@ def _add_operations_for_file_object(
8282
operation = first_operation
8383
# Extract the file if needed
8484
if (
85-
should_extract(node.encoding_format)
85+
should_extract(node.encoding_formats)
8686
and isinstance(successor, (FileObject, FileSet))
87-
and not should_extract(successor.encoding_format)
87+
and not should_extract(successor.encoding_formats)
8888
):
8989
operation = operation >> Extract(operations=operations, node=node)
9090
if isinstance(successor, FileSet):
@@ -93,7 +93,7 @@ def _add_operations_for_file_object(
9393
>> FilterFiles(operations=operations, node=successor)
9494
>> Concatenate(operations=operations, node=successor)
9595
)
96-
if node.encoding_format and not should_extract(node.encoding_format):
96+
if node.encoding_formats and not should_extract(node.encoding_formats):
9797
fields = tuple([
9898
field for field in node.recursive_successors if isinstance(field, Field)
9999
])
@@ -192,7 +192,10 @@ def from_nodes(cls, ctx: Context, metadata: Node) -> "OperationGraph":
192192
operations = Operations()
193193
for node in nx.topological_sort(ctx.graph):
194194
if isinstance(node, FileObject):
195-
if node.encoding_format == EncodingFormat.GIT:
195+
if (
196+
node.encoding_formats
197+
and EncodingFormat.GIT in node.encoding_formats
198+
):
196199
_add_operations_for_git(operations, node, ctx.folder)
197200
else:
198201
_add_operations_for_file_object(operations, node, ctx.folder)

python/mlcroissant/mlcroissant/_src/operation_graph/operations/download.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,10 @@ def call(self, *args) -> Path:
223223
del args # unused
224224
filepath = get_download_filepath(self.node)
225225
if not filepath.exists():
226-
if self.node.encoding_format == EncodingFormat.GIT:
226+
if (
227+
self.node.encoding_formats
228+
and EncodingFormat.GIT in self.node.encoding_formats
229+
):
227230
self._download_from_git(filepath)
228231
else:
229232
self._download_from_http(filepath)

python/mlcroissant/mlcroissant/_src/operation_graph/operations/extract.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
from mlcroissant._src.structure_graph.nodes.file_object import FileObject
1818

1919

20-
def should_extract(encoding_format: str | None) -> bool:
20+
def should_extract(encoding_formats: list[str] | None) -> bool:
2121
"""Whether the encoding format should be extracted (zip or tar)."""
22+
if not encoding_formats:
23+
return False
2224
return (
23-
encoding_format == EncodingFormat.TAR or encoding_format == EncodingFormat.ZIP
25+
EncodingFormat.TAR in encoding_formats or EncodingFormat.ZIP in encoding_formats
2426
)
2527

2628

python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_readfield_with_subfields():
9191
id="file_id",
9292
content_url=csv_file,
9393
sha256="None",
94-
encoding_format="text/csv",
94+
encoding_formats=["text/csv"],
9595
)
9696
]
9797
fields = [
@@ -244,7 +244,7 @@ def test_extract_lines(separator):
244244
id="file_id",
245245
content_url=path,
246246
sha256="None",
247-
encoding_format="text/plain",
247+
encoding_formats=["text/plain"],
248248
)
249249
]
250250
fields = []

python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py

Lines changed: 73 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from mlcroissant._src.core.constants import EncodingFormat
1313
from mlcroissant._src.core.git import download_git_lfs_file
1414
from mlcroissant._src.core.git import is_git_lfs_file
15+
from mlcroissant._src.core.optional import deps
1516
from mlcroissant._src.core.path import Path
1617
from mlcroissant._src.operation_graph.base_operation import Operation
1718
from mlcroissant._src.operation_graph.operations.download import is_url
@@ -21,6 +22,12 @@
2122
from mlcroissant._src.structure_graph.nodes.file_set import FileSet
2223
from mlcroissant._src.structure_graph.nodes.source import FileProperty
2324

25+
try:
26+
scipy = deps.scipy
27+
except ModuleNotFoundError:
28+
scipy = None
29+
INSTALL_MESSAGE = "scipy is not installed and is a dependency."
30+
2431

2532
class ReadingMethod(enum.Enum):
2633
"""Reading method derived from the fields that consume the FileObject/FileSet."""
@@ -82,65 +89,80 @@ class Read(Operation):
8289
folder: epath.Path
8390
fields: tuple[Field, ...]
8491

85-
def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
92+
def _read_file_content(
93+
self, encoding_formats: list[str], file: Path
94+
) -> pd.DataFrame:
8695
"""Reads the content of `file` into a pandas DataFrame based on its encoding formats."""
8796
filepath = file.filepath
8897
if is_git_lfs_file(filepath):
8998
download_git_lfs_file(file)
9099
reading_method = _reading_method(self.node, self.fields)
100+
if EncodingFormat.ARFF in encoding_formats:
101+
if scipy is None:
102+
raise NotImplementedError(INSTALL_MESSAGE)
103+
104+
data = scipy.io.arff.loadarff(filepath)
105+
if not isinstance(data, list) or len(data) != 1:
106+
raise ValueError(
107+
"The loaded data from scipy.io.arff does not have the expected"
108+
" shape (a list with one element). Please ensure the ARFF file is"
109+
" valid."
110+
)
111+
return pd.DataFrame(data[0])
91112

92113
with filepath.open("rb") as file:
93-
# TODO(https://github.com/mlcommons/croissant/issues/635).
94-
if filepath.suffix == ".gz":
95-
file = gzip.open(file, "rt", newline="")
96-
if encoding_format == EncodingFormat.CSV:
97-
return pd.read_csv(file)
98-
elif encoding_format == EncodingFormat.TSV:
99-
return pd.read_csv(file, sep="\t")
100-
elif encoding_format == EncodingFormat.JSON:
101-
json_content = json.load(file)
102-
if reading_method == ReadingMethod.JSON:
103-
return parse_json_content(json_content, self.fields)
104-
else:
105-
# Raw files are returned as a one-line pd.DataFrame.
106-
return pd.DataFrame({
107-
FileProperty.content: [json_content],
108-
})
109-
elif encoding_format == EncodingFormat.JSON_LINES:
110-
return pd.read_json(file, lines=True)
111-
elif encoding_format == EncodingFormat.PARQUET:
112-
try:
113-
df = pd.read_parquet(file)
114-
# Sometimes the author already set an index in Parquet, so we want
115-
# to reset it to always have the same format.
116-
df.reset_index(inplace=True)
117-
return df
118-
except ImportError as e:
119-
raise ImportError(
120-
"Missing dependency to read Parquet files. pyarrow is not"
121-
" installed. Please, install `pip install"
122-
" mlcroissant[parquet]`."
123-
) from e
124-
elif encoding_format == EncodingFormat.TEXT:
125-
if reading_method == ReadingMethod.LINES:
126-
return pd.read_csv(
127-
filepath, header=None, names=[FileProperty.lines]
128-
)
129-
else:
114+
for encoding_format in encoding_formats:
115+
# TODO(https://github.com/mlcommons/croissant/issues/635).
116+
if filepath.suffix == ".gz":
117+
file = gzip.open(file, "rt", newline="")
118+
if encoding_format == EncodingFormat.CSV:
119+
return pd.read_csv(file)
120+
elif encoding_format == EncodingFormat.TSV:
121+
return pd.read_csv(file, sep="\t")
122+
elif encoding_format == EncodingFormat.JSON:
123+
json_content = json.load(file)
124+
if reading_method == ReadingMethod.JSON:
125+
return parse_json_content(json_content, self.fields)
126+
else:
127+
# Raw files are returned as a one-line pd.DataFrame.
128+
return pd.DataFrame({
129+
FileProperty.content: [json_content],
130+
})
131+
elif encoding_format == EncodingFormat.JSON_LINES:
132+
return pd.read_json(file, lines=True)
133+
elif encoding_format == EncodingFormat.PARQUET:
134+
try:
135+
df = pd.read_parquet(file)
136+
# Sometimes the author already set an index in Parquet, so we
137+
# want to reset it to always have the same format.
138+
df.reset_index(inplace=True)
139+
return df
140+
except ImportError as e:
141+
raise ImportError(
142+
"Missing dependency to read Parquet files. pyarrow is not"
143+
" installed. Please, install `pip install"
144+
" mlcroissant[parquet]`."
145+
) from e
146+
elif encoding_format == EncodingFormat.TEXT:
147+
if reading_method == ReadingMethod.LINES:
148+
return pd.read_csv(
149+
filepath, header=None, names=[FileProperty.lines]
150+
)
151+
else:
152+
return pd.DataFrame({
153+
FileProperty.content: [file.read()],
154+
})
155+
elif (
156+
encoding_format == EncodingFormat.MP3
157+
or encoding_format == EncodingFormat.JPG
158+
):
130159
return pd.DataFrame({
131160
FileProperty.content: [file.read()],
132161
})
133-
elif (
134-
encoding_format == EncodingFormat.MP3
135-
or encoding_format == EncodingFormat.JPG
136-
):
137-
return pd.DataFrame({
138-
FileProperty.content: [file.read()],
139-
})
140-
else:
141-
raise ValueError(
142-
f"Unsupported encoding format for file: {encoding_format}"
143-
)
162+
raise ValueError(
163+
f"None of the provided encoding formats: {encoding_format} for file"
164+
f" {filepath} returned a valid pandas dataframe."
165+
)
144166

145167
def call(self, files: list[Path] | Path) -> pd.DataFrame:
146168
"""See class' docstring."""
@@ -170,8 +192,8 @@ def call(self, files: list[Path] | Path) -> pd.DataFrame:
170192
f'In node "{self.node.uuid}", file "{self.node.content_url}" is'
171193
" either an invalid URL or an invalid path."
172194
)
173-
assert self.node.encoding_format, "Encoding format is not specified."
174-
file_content = self._read_file_content(self.node.encoding_format, file)
195+
assert self.node.encoding_formats, "Encoding format is not specified."
196+
file_content = self._read_file_content(self.node.encoding_formats, file)
175197
if _should_append_line_numbers(self.fields):
176198
file_content[FileProperty.lineNumbers] = range(len(file_content))
177199
file_content[FileProperty.filepath] = file.filepath

0 commit comments

Comments
 (0)