Skip to content

Commit 16c7442

Browse files
committed
refactor: incorporate UTF8 material from String.Extra into String.Basic
1 parent 663d4d2 commit 16c7442

File tree

3 files changed

+203
-101
lines changed

3 files changed

+203
-101
lines changed

src/Init/Data/String/Basic.lean

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,55 @@ where
9494
(have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega)
9595
termination_by structural fuel
9696

97+
@[inline, expose]
98+
def ByteArray.validateUTF8 (b : ByteArray) : Bool :=
99+
go (b.size + 1) 0 (by simp) (by simp)
100+
where
101+
go (fuel : Nat) (i : Nat) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Bool :=
102+
match fuel, hf with
103+
| fuel + 1, _ =>
104+
if hi : i = b.size then
105+
true
106+
else
107+
match h : validateUtf8At b i with
108+
| false => false
109+
| true => go fuel (i + (b[i].utf8ByteSize (isUtf8FirstByte_of_validateUtf8At h)).byteIdx)
110+
?_ ?_
111+
termination_by structural fuel
112+
finally
113+
all_goals rw [ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar?] at h
114+
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
115+
exact add_utf8Size_utf8DecodeChar_le_size
116+
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
117+
have := add_utf8Size_utf8DecodeChar_le_size (h := h)
118+
have := (b.utf8DecodeChar i h).utf8Size_pos
119+
omega
120+
121+
theorem ByteArray.isSome_utf8Decode?Go_eq_validateUtf8Go {b : ByteArray} {fuel : Nat}
122+
{i : Nat} {acc : Array Char} {hi : i ≤ b.size} {hf : b.size - i < fuel} :
123+
(utf8Decode?.go b fuel i acc hi hf).isSome = validateUTF8.go b fuel i hi hf := by
124+
fun_induction utf8Decode?.go with
125+
| case1 => simp [validateUTF8.go]
126+
| case2 i acc hi fuel hf h₁ h₂ =>
127+
simp only [Option.isSome_none, validateUTF8.go, h₁, ↓reduceDIte, Bool.false_eq]
128+
split
129+
· rfl
130+
· rename_i heq
131+
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
132+
| case3 i acc hi fuel hf h₁ c h₂ ih =>
133+
simp [validateUTF8.go, h₁]
134+
split
135+
· rename_i heq
136+
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
137+
· rw [ih]
138+
congr
139+
rw [← ByteArray.utf8Size_utf8DecodeChar (h := by simp [h₂])]
140+
simp [utf8DecodeChar, h₂]
141+
142+
theorem ByteArray.isSome_utf8Decode?_eq_validateUtf8 {b : ByteArray} :
143+
b.utf8Decode?.isSome = b.validateUTF8 :=
144+
b.isSome_utf8Decode?Go_eq_validateUtf8Go
145+
97146
theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'}
98147
(hbb' : b = b') (hii' : i = i') (hacc : acc = acc') :
99148
ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by
@@ -151,7 +200,50 @@ theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} :
151200
rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size]
152201

153202
@[simp]
154-
theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
203+
theorem ByteArray.validateUTF8_eq_true_iff {b : ByteArray} :
204+
b.validateUTF8 = true ↔ IsValidUtf8 b := by
205+
rw [← isSome_utf8Decode?_eq_validateUtf8, isSome_utf8Decode?_iff]
206+
207+
@[simp]
208+
theorem ByteArray.validateUTF8_eq_false_iff {b : ByteArray} :
209+
b.validateUTF8 = false ↔ ¬ IsValidUtf8 b := by
210+
simp [← Bool.not_eq_true]
211+
212+
instance {b : ByteArray} : Decidable b.IsValidUtf8 :=
213+
decidable_of_iff (b.validateUTF8 = true) ByteArray.validateUTF8_eq_true_iff
214+
215+
/--
216+
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
217+
the corresponding string.
218+
-/
219+
@[inline, expose]
220+
def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUtf8) : String :=
221+
.ofByteArray a h
222+
223+
/--
224+
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
225+
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
226+
-/
227+
@[inline, expose] def String.fromUTF8? (a : ByteArray) : Option String :=
228+
if h : a.IsValidUtf8 then some (fromUTF8 a h) else none
229+
230+
/--
231+
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
232+
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
233+
-/
234+
@[inline, expose] def String.fromUTF8! (a : ByteArray) : String :=
235+
if h : a.IsValidUtf8 then fromUTF8 a h else panic! "invalid UTF-8 string"
236+
237+
/--
238+
Encodes a string in UTF-8 as an array of bytes.
239+
-/
240+
@[extern "lean_string_to_utf8"]
241+
def String.toUTF8 (a : @& String) : ByteArray :=
242+
a.bytes
243+
244+
@[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl)
245+
246+
@[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
155247

156248
/--
157249
Appends two strings. Usually accessed via the `++` operator.

src/Init/Data/String/Decode.lean

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} :
499499
omega
500500
simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj]
501501

502+
@[inline, expose]
503+
public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool :=
504+
true
505+
506+
theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} :
507+
verify₁ w h = (assemble₁ w h).isSome := by
508+
simp [verify₁, assemble₁]
509+
502510
/-! # `assemble₂` -/
503511

504512
@[inline, expose]
@@ -531,6 +539,14 @@ where finally
531539
rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁]
532540
omega
533541

542+
@[inline, expose]
543+
public def verify₂ (w x : UInt8) : Bool :=
544+
if isInvalidContinuationByte x then
545+
false
546+
else
547+
let r := assemble₂Unchecked w x
548+
0x80 ≤ r
549+
534550
theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) :
535551
(x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by
536552
apply BitVec.eq_of_toNat_eq
@@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} :
628644
BitVec.setWidth_setWidth_eq_self]
629645
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2
630646

647+
theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by
648+
simp only [verify₂, assemble₂]
649+
split
650+
· simp
651+
· split <;> simp_all
652+
631653
/-! # `assemble₃` -/
632654

633655
@[inline, expose]
@@ -671,6 +693,14 @@ where finally
671693
← Nat.two_pow_add_eq_or_of_lt (by omega)]
672694
omega
673695

696+
@[inline, expose]
697+
public def verify₃ (w x y : UInt8) : Bool :=
698+
if isInvalidContinuationByte x || isInvalidContinuationByte y then
699+
false
700+
else
701+
let r := assemble₃Unchecked w x y
702+
0x800 ≤ r ∧ (r < 0xd8000xdfff < r)
703+
674704
theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec =
675705
(w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by
676706
have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide)
@@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} :
772802
← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self]
773803
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2
774804

805+
theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} :
806+
verify₃ w x y = (assemble₃ w x y).isSome := by
807+
simp only [verify₃, assemble₃]
808+
split
809+
· simp
810+
· split
811+
· simp_all [← UInt32.not_le]
812+
· split
813+
· simp_all
814+
· rename_i h h'
815+
simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h'
816+
775817
/-! # `assemble₄` -/
776818

777819
@[inline, expose]
@@ -799,6 +841,14 @@ where finally
799841
simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂
800842
exact Or.inr ⟨by omega, by omega⟩
801843

844+
@[inline, expose]
845+
public def verify₄ (w x y z : UInt8) : Bool :=
846+
if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then
847+
false
848+
else
849+
let r := assemble₄Unchecked w x y z
850+
0x10000 ≤ r ∧ r ≤ 0x10ffff
851+
802852
theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec =
803853
(w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by
804854
have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide)
@@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} :
917967
Nat.reducePow, Nat.reduceMod, gt_iff_lt]
918968
omega
919969

970+
theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} :
971+
verify₄ w x y z = (assemble₄ w x y z).isSome := by
972+
simp only [verify₄, assemble₄]
973+
split
974+
· simp
975+
· split
976+
· simp_all [← UInt32.not_lt]
977+
· split
978+
· simp_all
979+
· simp_all [← UInt32.not_lt]
980+
920981
end ByteArray.utf8DecodeChar?
921982

922983
open ByteArray.utf8DecodeChar?
923984

924-
/- # `utf8DecodeChar?` -/
985+
/- # `utf8DecodeChar?` and `validateUtf8At` -/
925986

926987
@[inline, expose]
927988
public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char :=
@@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char
9441005
else none
9451006
else none
9461007

1008+
@[inline, expose]
1009+
public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool :=
1010+
if h₀ : i < bytes.size then
1011+
match h : parseFirstByte bytes[i] with
1012+
| .invalid => false
1013+
| .done => verify₁ bytes[i] h
1014+
| .oneMore =>
1015+
if h₁ : i + 1 < bytes.size then
1016+
verify₂ bytes[i] bytes[i + 1]
1017+
else
1018+
false
1019+
| .twoMore =>
1020+
if h₁ : i + 2 < bytes.size then
1021+
verify₃ bytes[i] bytes[i + 1] bytes[i + 2]
1022+
else false
1023+
| .threeMore =>
1024+
if h₁ : i + 3 < bytes.size then
1025+
verify₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3]
1026+
else false
1027+
else false
1028+
9471029
/-! # `utf8DecodeChar?` low-level API -/
9481030

9491031
theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char}
@@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b :
11301212
rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq]
11311213
exact ⟨this, h⟩
11321214

1215+
public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :
1216+
b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by
1217+
simp only [validateUtf8At, utf8DecodeChar?]
1218+
split
1219+
· split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂,
1220+
verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄]
1221+
· simp
1222+
11331223
/-! # Corollaries -/
11341224

11351225
public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) :
@@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i :
11951285
have := c.utf8Size_pos
11961286
omega
11971287

1288+
public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} :
1289+
validateUtf8At b i = true → i < b.size :=
1290+
validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar?
1291+
11981292
public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c)
11991293
(b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by
12001294
have := le_size_of_utf8DecodeChar?_eq_some h
@@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat}
12111305
public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char :=
12121306
(utf8DecodeChar? bytes i).get h
12131307

1308+
public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} :
1309+
i + (b.utf8DecodeChar i h).utf8Size ≤ b.size :=
1310+
le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar])
1311+
12141312
public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} :
12151313
utf8DecodeChar b i h =
12161314
utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by
@@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra
13631461
conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc]
13641462
exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append
13651463

1464+
public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At {b : ByteArray} {i : Nat} :
1465+
(h : validateUtf8At b i = true) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by
1466+
simp only [validateUtf8At_eq_isSome_utf8DecodeChar?]
1467+
exact isUtf8FirstByte_of_isSome_utf8DecodeChar?
1468+
13661469
theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} :
13671470
(((String.utf8EncodeChar c)[0]'(by simp [c.utf8Size_pos])).utf8ByteSize
13681471
UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by

0 commit comments

Comments
 (0)