Skip to content

Commit 52d05b6

Browse files
authored
refactor: use String.split instead of String.splitOn or String.splitToList (#11250)
This PR introduces a function `String.split` which is based on `String.Slice.split` and therefore supports all pattern types and returns a `Std.Iter String.Slice`. This supersedes the functions `String.splitOn` and `String.splitToList`, and we remove all uses of these functions from core. They will be deprecated in a future PR. Migrating from `String.splitOn` and `String.splitToList` is easy: we introduce functions `Iter.toStringList` and `Iter.toStringArray` that can be used to conveniently go from `Std.Iter String.Slice` to `List String` and `Array String`, so for example `s.splitOn "foo"` can be replaced by `s.split "foo" |>.toStringList`.
1 parent f7031c7 commit 52d05b6

File tree

36 files changed

+254
-180
lines changed

36 files changed

+254
-180
lines changed

src/Init/Data/Format/Instances.lean

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ module
77

88
prelude
99
public import Init.Data.Array.Basic
10-
import Init.Data.String.Basic
10+
import Init.Data.String.Search
1111

1212
public section
1313

@@ -47,7 +47,7 @@ Converts a string to a pretty-printer document, replacing newlines in the string
4747
`Std.Format.line`.
4848
-/
4949
def String.toFormat (s : String) : Std.Format :=
50-
Std.Format.joinSep (s.splitOn "\n") Std.Format.line
50+
Std.Format.joinSep (s.split '\n').toList Std.Format.line
5151

5252
instance : ToFormat String.Pos.Raw where
5353
format p := format p.byteIdx

src/Init/Data/String.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@ public import Init.Data.String.Modify
2525
public import Init.Data.String.Termination
2626
public import Init.Data.String.ToSlice
2727
public import Init.Data.String.Search
28+
public import Init.Data.String.Legacy

src/Init/Data/String/Basic.lean

Lines changed: 1 addition & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/-
22
Copyright (c) 2016 Microsoft Corporation. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
4-
Author: Leonardo de Moura, Mario Carneiro
4+
Author: Leonardo de Moura, Mario Carneiro, Markus Himmel
55
-/
66
module
77

@@ -2479,98 +2479,6 @@ where
24792479
def extract : (@& String) → (@& Pos.Raw) → (@& Pos.Raw) → String
24802480
| s, b, e => Pos.Raw.extract s b e
24812481

2482-
@[specialize] def splitAux (s : String) (p : Char → Bool) (b : Pos.Raw) (i : Pos.Raw) (r : List String) : List String :=
2483-
if h : i.atEnd s then
2484-
let r := (b.extract s i)::r
2485-
r.reverse
2486-
else
2487-
have := Nat.sub_lt_sub_left (Nat.gt_of_not_le (mt decide_eq_true h)) (Pos.Raw.lt_next s _)
2488-
if p (i.get s) then
2489-
let i' := i.next s
2490-
splitAux s p i' i' (b.extract s i :: r)
2491-
else
2492-
splitAux s p b (i.next s) r
2493-
termination_by s.rawEndPos.1 - i.1
2494-
2495-
/--
2496-
Splits a string at each character for which `p` returns `true`.
2497-
2498-
The characters that satisfy `p` are not included in any of the resulting strings. If multiple
2499-
characters in a row satisfy `p`, then the resulting list will contain empty strings.
2500-
2501-
Examples:
2502-
* `"coffee tea water".split (·.isWhitespace) = ["coffee", "tea", "water"]`
2503-
* `"coffee  tea  water".split (·.isWhitespace) = ["coffee", "", "tea", "", "water"]`
2504-
* `"fun x =>\n x + 1\n".split (· == '\n') = ["fun x =>", " x + 1", ""]`
2505-
-/
2506-
@[inline] def splitToList (s : String) (p : Char → Bool) : List String :=
2507-
splitAux s p 0 0 []
2508-
2509-
@[inline, deprecated splitToList (since := "2025-10-17")]
2510-
def split (s : String) (p : Char → Bool) : List String :=
2511-
splitToList s p
2512-
2513-
/--
2514-
Auxiliary for `splitOn`. Preconditions:
2515-
* `sep` is not empty
2516-
* `b <= i` are indexes into `s`
2517-
* `j` is an index into `sep`, and not at the end
2518-
2519-
It represents the state where we have currently parsed some split parts into `r` (in reverse order),
2520-
`b` is the beginning of the string / the end of the previous match of `sep`, and the first `j` bytes
2521-
of `sep` match the bytes `i-j .. i` of `s`.
2522-
-/
2523-
def splitOnAux (s sep : String) (b : Pos.Raw) (i : Pos.Raw) (j : Pos.Raw) (r : List String) : List String :=
2524-
if i.atEnd s then
2525-
let r := (b.extract s i)::r
2526-
r.reverse
2527-
else
2528-
if i.get s == j.get sep then
2529-
let i := i.next s
2530-
let j := j.next sep
2531-
if j.atEnd sep then
2532-
splitOnAux s sep i i 0 (b.extract s (i.unoffsetBy j)::r)
2533-
else
2534-
splitOnAux s sep b i j r
2535-
else
2536-
splitOnAux s sep b ((i.unoffsetBy j).next s) 0 r
2537-
termination_by (s.rawEndPos.1 - (j.byteDistance i), sep.rawEndPos.1 - j.1)
2538-
decreasing_by
2539-
focus
2540-
rename_i h _ _
2541-
left; exact Nat.sub_lt_sub_left
2542-
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
2543-
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Pos.Raw.lt_next s _))
2544-
focus
2545-
rename_i i₀ j₀ _ eq h'
2546-
rw [show (j₀.next sep).byteDistance (i₀.next s) = j₀.byteDistance i₀ by
2547-
change (_ + Char.utf8Size _) - (_ + Char.utf8Size _) = _
2548-
rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl]
2549-
right; exact Nat.sub_lt_sub_left
2550-
(Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h')))
2551-
(Pos.Raw.lt_next sep _)
2552-
focus
2553-
rename_i h _
2554-
left; exact Nat.sub_lt_sub_left
2555-
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
2556-
(Pos.Raw.lt_next s _)
2557-
2558-
/--
2559-
Splits a string `s` on occurrences of the separator string `sep`. The default separator is `" "`.
2560-
2561-
When `sep` is empty, the result is `[s]`. When `sep` occurs in overlapping patterns, the first match
2562-
is taken. There will always be exactly `n+1` elements in the returned list if there were `n`
2563-
non-overlapping matches of `sep` in the string. The separators are not included in the returned
2564-
substrings.
2565-
2566-
Examples:
2567-
* `"here is some text ".splitOn = ["here", "is", "some", "text", ""]`
2568-
* `"here is some text ".splitOn "some" = ["here is ", " text "]`
2569-
* `"here is some text ".splitOn "" = ["here is some text "]`
2570-
* `"ababacabac".splitOn "aba" = ["", "bac", "c"]`
2571-
-/
2572-
@[inline] def splitOn (s : String) (sep : String := " ") : List String :=
2573-
if sep == "" then [s] else splitOnAux s sep 0 0 0 []
25742482

25752483

25762484
def Pos.Raw.offsetOfPosAux (s : String) (pos : Pos.Raw) (i : Pos.Raw) (offset : Nat) : Nat :=

src/Init/Data/String/Legacy.lean

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/-
2+
Copyright (c) 2016 Microsoft Corporation. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Author: Leonardo de Moura, Mario Carneiro
5+
-/
6+
module
7+
8+
prelude
9+
public import Init.Data.String.Basic
10+
11+
/-!
12+
# Legacy string functions
13+
14+
This file contains `String` functions which have since been replaced by different functions and
15+
will be deprecated in the future.
16+
-/
17+
18+
public section
19+
20+
namespace String
21+
22+
@[specialize] def splitAux (s : String) (p : Char → Bool) (b : Pos.Raw) (i : Pos.Raw) (r : List String) : List String :=
23+
if h : i.atEnd s then
24+
let r := (b.extract s i)::r
25+
r.reverse
26+
else
27+
have := Nat.sub_lt_sub_left (Nat.gt_of_not_le (mt decide_eq_true h)) (Pos.Raw.lt_next s _)
28+
if p (i.get s) then
29+
let i' := i.next s
30+
splitAux s p i' i' (b.extract s i :: r)
31+
else
32+
splitAux s p b (i.next s) r
33+
termination_by s.rawEndPos.1 - i.1
34+
35+
/--
36+
Splits a string at each character for which `p` returns `true`.
37+
38+
The characters that satisfy `p` are not included in any of the resulting strings. If multiple
39+
characters in a row satisfy `p`, then the resulting list will contain empty strings.
40+
41+
This is a legacy function. Use `String.split` instead.
42+
43+
Examples:
44+
* `"coffee tea water".splitToList (·.isWhitespace) = ["coffee", "tea", "water"]`
45+
* `"coffee  tea  water".splitToList (·.isWhitespace) = ["coffee", "", "tea", "", "water"]`
46+
* `"fun x =>\n x + 1\n".splitToList (· == '\n') = ["fun x =>", " x + 1", ""]`
47+
-/
48+
@[inline] def splitToList (s : String) (p : Char → Bool) : List String :=
49+
splitAux s p 0 0 []
50+
51+
/--
52+
Auxiliary for `splitOn`. Preconditions:
53+
* `sep` is not empty
54+
* `b <= i` are indexes into `s`
55+
* `j` is an index into `sep`, and not at the end
56+
57+
It represents the state where we have currently parsed some split parts into `r` (in reverse order),
58+
`b` is the beginning of the string / the end of the previous match of `sep`, and the first `j` bytes
59+
of `sep` match the bytes `i-j .. i` of `s`.
60+
-/
61+
def splitOnAux (s sep : String) (b : Pos.Raw) (i : Pos.Raw) (j : Pos.Raw) (r : List String) : List String :=
62+
if i.atEnd s then
63+
let r := (b.extract s i)::r
64+
r.reverse
65+
else
66+
if i.get s == j.get sep then
67+
let i := i.next s
68+
let j := j.next sep
69+
if j.atEnd sep then
70+
splitOnAux s sep i i 0 (b.extract s (i.unoffsetBy j)::r)
71+
else
72+
splitOnAux s sep b i j r
73+
else
74+
splitOnAux s sep b ((i.unoffsetBy j).next s) 0 r
75+
termination_by (s.rawEndPos.1 - (j.byteDistance i), sep.rawEndPos.1 - j.1)
76+
decreasing_by
77+
focus
78+
rename_i h _ _
79+
left; exact Nat.sub_lt_sub_left
80+
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
81+
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Pos.Raw.lt_next s _))
82+
focus
83+
rename_i i₀ j₀ _ eq h'
84+
rw [show (j₀.next sep).byteDistance (i₀.next s) = j₀.byteDistance i₀ by
85+
change (_ + Char.utf8Size _) - (_ + Char.utf8Size _) = _
86+
rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl]
87+
right; exact Nat.sub_lt_sub_left
88+
(Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h')))
89+
(Pos.Raw.lt_next sep _)
90+
focus
91+
rename_i h _
92+
left; exact Nat.sub_lt_sub_left
93+
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
94+
(Pos.Raw.lt_next s _)
95+
96+
/--
97+
Splits a string `s` on occurrences of the separator string `sep`. The default separator is `" "`.
98+
99+
When `sep` is empty, the result is `[s]`. When `sep` occurs in overlapping patterns, the first match
100+
is taken. There will always be exactly `n+1` elements in the returned list if there were `n`
101+
non-overlapping matches of `sep` in the string. The separators are not included in the returned
102+
substrings.
103+
104+
This is a legacy function. Use `String.split` instead.
105+
106+
Examples:
107+
* `"here is some text ".splitOn = ["here", "is", "some", "text", ""]`
108+
* `"here is some text ".splitOn "some" = ["here is ", " text "]`
109+
* `"here is some text ".splitOn "" = ["here is some text "]`
110+
* `"ababacabac".splitOn "aba" = ["", "bac", "c"]`
111+
-/
112+
@[inline] def splitOn (s : String) (sep : String := " ") : List String :=
113+
if sep == "" then [s] else splitOnAux s sep 0 0 0 []
114+
115+
end String

src/Init/Data/String/Search.lean

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,25 @@ Examples:
9696
def find? [ToForwardSearcher ρ σ] (s : String) (pattern : ρ) : Option s.ValidPos :=
9797
s.startValidPos.find? pattern
9898

99+
/--
100+
Splits a string at each subslice that matches the pattern {name}`pat`.
101+
102+
The subslices that matched the pattern are not included in any of the resulting subslices. If
103+
multiple subslices in a row match the pattern, the resulting iterator will yield empty subslices.
104+
105+
This function is generic over all currently supported patterns.
106+
107+
Examples:
108+
* {lean}`("coffee tea water".split Char.isWhitespace).toList == ["coffee".toSlice, "tea".toSlice, "water".toSlice]`
109+
* {lean}`("coffee tea water".split ' ').toList == ["coffee".toSlice, "tea".toSlice, "water".toSlice]`
110+
* {lean}`("coffee tea water".split " tea ").toList == ["coffee".toSlice, "water".toSlice]`
111+
* {lean}`("ababababa".split "aba").toList == ["".toSlice, "b".toSlice, "ba".toSlice]`
112+
* {lean}`("baaab".split "aa").toList == ["b".toSlice, "ab".toSlice]`
113+
-/
114+
@[inline]
115+
def split [ToForwardSearcher ρ σ] (s : String) (pat : ρ) :=
116+
(s.toSlice.split pat : Std.Iter String.Slice)
117+
99118
end
100119

101120
end String

src/Init/Data/String/Slice.lean

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/-
22
Copyright (c) 2025 Lean FRO, LLC. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
4-
Authors: Henrik Böving
4+
Authors: Henrik Böving, Markus Himmel
55
-/
66
module
77

@@ -795,9 +795,8 @@ This function is generic over all currently supported patterns except
795795
{name}`String`/{name}`String.Slice`.
796796
797797
Examples:
798-
* {lean}`("coffee tea water".toSlice.find? Char.isWhitespace).map (·.get!) == some ' '`
799-
* {lean}`"tea".toSlice.find? (fun (c : Char) => c == 'X') == none`
800-
* {lean}`("coffee tea water".toSlice.find? "tea").map (·.get!) == some 't'`
798+
* {lean}`("coffee tea water".toSlice.revFind? Char.isWhitespace).map (·.get!) == some ' '`
799+
* {lean}`"tea".toSlice.revFind? (fun (c : Char) => c == 'X') == none`
801800
-/
802801
@[specialize pat]
803802
def revFind? [ToBackwardSearcher ρ σ] (s : Slice) (pat : ρ) : Option s.Pos :=
@@ -1371,4 +1370,20 @@ hierarchical, and the string is split at the dots ({lean}`'.'`).
13711370
def toName (s : Slice) : Lean.Name :=
13721371
s.toString.toName
13731372

1373+
instance : Std.ToFormat String.Slice where
1374+
format s := Std.ToFormat.format s.copy
1375+
13741376
end String.Slice
1377+
1378+
/-- Converts a {lean}`Std.Iter String.Slice` to a {lean}`List String`. -/
1379+
@[inline]
1380+
def Std.Iterators.Iter.toStringList {α : Type} [Std.Iterators.Iterator α Id String.Slice]
1381+
[Std.Iterators.Finite α Id] [Std.Iterators.IteratorCollect α Id Id]
1382+
(i : Std.Iter (α := α) String.Slice) : List String :=
1383+
i.map String.Slice.copy |>.toList
1384+
1385+
/-- Converts a {lean}`Std.Iter String.Slice` to an {lean}`Array String`. -/
1386+
def Std.Iterators.Iter.toStringArray {α : Type} [Std.Iterators.Iterator α Id String.Slice]
1387+
[Std.Iterators.Finite α Id] [Std.Iterators.IteratorCollect α Id Id]
1388+
(i : Std.Iter (α := α) String.Slice) : Array String :=
1389+
i.map String.Slice.copy |>.toArray

src/Init/System/FilePath.lean

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ module
88
prelude
99
public import Init.Data.String.Basic
1010
import Init.Data.String.Modify
11+
import Init.Data.String.Search
1112

1213
public section
1314

@@ -242,7 +243,7 @@ def withExtension (p : FilePath) (ext : String) : FilePath :=
242243
Splits a path into a list of individual file names at the platform-specific path separator.
243244
-/
244245
def components (p : FilePath) : List String :=
245-
p.normalize |>.toString.splitOn pathSeparator.toString
246+
p.normalize |>.toString.split pathSeparator.toString |>.toStringList
246247

247248
end FilePath
248249

@@ -273,7 +274,7 @@ Separates the entries in the `$PATH` (or `%PATH%`) environment variable by the c
273274
platform-dependent separator character.
274275
-/
275276
def parse (s : String) : SearchPath :=
276-
s.splitToList (fun c => SearchPath.separator == c) |>.map FilePath.mk
277+
s.split SearchPath.separator |>.map (FilePath.mk ∘ String.Slice.copy) |>.toList
277278

278279
/--
279280
Joins a list of paths into a suitable value for the current platform's `$PATH` (or `%PATH%`)

src/Lean/Compiler/FFI.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace Lean.Compiler.FFI
1919
private opaque getLeancExtraFlags : Unit → String
2020

2121
private def flagsStringToArray (s : String) : Array String :=
22-
s.splitOn.toArray |>.filter (· ≠ "")
22+
s.split ' ' |>.filter (!·.isEmpty) |>.toStringArray
2323

2424
/--
2525
Return C compiler flags for including Lean's headers.

src/Lean/Data/Lsp/Communication.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ module
99
prelude
1010
public import Lean.Data.JsonRpc
1111
import Init.Data.String.TakeDrop
12+
import Init.Data.String.Search
1213

1314
public section
1415

src/Lean/DocString/Formatter.lean

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,10 @@ partial def versoSyntaxToString' (stx : Syntax) : ReaderT Nat (StateM String) Un
154154
out "\n"
155155
let i ← read
156156
let s := Syntax.decodeStrLit (atomString s) |>.getD ""
157-
|>.splitToList (· == '\n')
158-
|>.map ("".pushn ' ' i ++ · ) |> "\n".intercalate
157+
|>.split '\n'
158+
|>.map (fun (s : String.Slice) => "".pushn ' ' i ++ s)
159+
|>.toList
160+
|> "\n".intercalate
159161
out s
160162
out <| "".pushn ' ' i
161163
out <| atomString tk2

0 commit comments

Comments
 (0)