From b15e3599084d2e9785dac8ee350ecc7d871c32e1 Mon Sep 17 00:00:00 2001 From: Tom Bagby Date: Mon, 14 Apr 2025 21:16:13 -0700 Subject: [PATCH 1/2] Preserve native sample rate, don't auto-resample to 22050 The default in librosa is sr=22050, which forces resampling; it has to be set explicitly to None to preserve the original sample rate. I doubt the resampling is intentional. If resampling is desired, the field description should include an explicit target sample rate to enable it. --- .../mlcroissant/_src/operation_graph/operations/field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 056899b6e..e54e5dbf5 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -96,7 +96,7 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None): else: raise ValueError(f"Type {type(value)} is not accepted for an image.") elif data_type == DataType.AUDIO_OBJECT: - output = deps.librosa.load(io.BytesIO(value)) + output = deps.librosa.load(io.BytesIO(value), sr=None) return output elif data_type == DataType.BOUNDING_BOX: # pytype: disable=wrong-arg-types return bounding_box.parse(value) From 66a178c779e9147580cac1c7d3482aad1c1cc776 Mon Sep 17 00:00:00 2001 From: Tom Bagby Date: Thu, 8 May 2025 10:47:00 -0700 Subject: [PATCH 2/2] Preserve np.ndarray in casting, do not cast to list. 
---
 .../_src/operation_graph/operations/field.py      |  2 ++
 .../_src/operation_graph/operations/field_test.py | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index e54e5dbf5..d5a1d260e 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -102,6 +102,8 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
         return bounding_box.parse(value)
     elif not isinstance(data_type, type):
         raise ValueError(f"No special case for type {data_type}.")
+    elif isinstance(value, np.ndarray) and issubclass(data_type, np.generic):
+        return value.astype(data_type)
     elif isinstance(value, list) or isinstance(value, np.ndarray):
         return [_cast_value(ctx=ctx, value=v, data_type=data_type) for v in value]
     elif data_type == bytes and not isinstance(value, bytes):
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 4299193b6..f71d8c3b1 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -51,6 +51,22 @@ def test_cast_value(conforms_to, value, data_type, expected):
     assert field._cast_value(ctx, value, data_type) == expected
 
 
+@parametrize_conforms_to()
+@pytest.mark.parametrize(
+    ["value", "data_type", "expected"],
+    [
+        [np.array([1, 2, 3]), DataType.INTEGER, np.array([1, 2, 3])],
+        [np.array([1, 2, 3]), DataType.FLOAT32, np.array([1.0, 2.0, 3.0])],
+    ],
+)
+def test_cast_value_ndarray(conforms_to, value, data_type, expected):
+    """Casting an np.ndarray yields an array equal to `expected` with its dtype."""
+    ctx = Context(conforms_to=conforms_to)
+    cast_value = field._cast_value(ctx, value, data_type)
+    np.testing.assert_array_equal(cast_value, expected)
+    assert cast_value.dtype == expected.dtype
+ + @parametrize_conforms_to() @pytest.mark.parametrize( ["value", "data_type"],