diff --git a/docs/croissant-spec-draft.md b/docs/croissant-spec-draft.md index 74109090a..5ddfde654 100644 --- a/docs/croissant-spec-draft.md +++ b/docs/croissant-spec-draft.md @@ -1088,6 +1088,7 @@ Croissant supports a few simple transformations that can be applied on the sourc - delimiter: split a string into an array using the supplied character. - regex: A regular expression to parse the data. - jsonPath: A JSON path to evaluate on the (JSON) data source. +- replace: A string of the form `pattern/replacement` to replace occurrences of `pattern` with `replacement`. Additional forward slashes (`/`) can be escaped with a backslash, and `pattern` can be a regular expression. For example, to extract information from a filename using a regular expression, we can write: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 056899b6e..c5db18b40 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -63,6 +63,20 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: raise ValueError(f"`format` only applies to dates. Got {field.data_type}") elif transform.separator is not None: return value.split(transform.separator) + elif transform.replace is not None: + if isinstance(value, pathlib.PurePath): + value = os.fspath(value) + # Split on unescaped slash. + parts = re.split(r"(?