@@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} :
499499 omega
500500 simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj]
501501
502+ @[inline, expose]
503+ public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool :=
504+ true
505+
506+ theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} :
507+ verify₁ w h = (assemble₁ w h).isSome := by
508+ simp [verify₁, assemble₁]
509+
502510/-! # `assemble₂` -/
503511
504512@[inline, expose]
@@ -531,6 +539,14 @@ where finally
531539 rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁]
532540 omega
533541
542+ @[inline, expose]
543+ public def verify₂ (w x : UInt8) : Bool :=
544+ if isInvalidContinuationByte x then
545+ false
546+ else
547+ let r := assemble₂Unchecked w x
548+ 0x80 ≤ r
549+
534550theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8 ) :
535551 (x &&& UInt8.ofNat (2 ^ n - 1 )).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by
536552 apply BitVec.eq_of_toNat_eq
@@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} :
628644 BitVec.setWidth_setWidth_eq_self]
629645 simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2
630646
647+ theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by
648+ simp only [verify₂, assemble₂]
649+ split
650+ · simp
651+ · split <;> simp_all
652+
631653/-! # `assemble₃` -/
632654
633655@[inline, expose]
@@ -671,6 +693,14 @@ where finally
671693 ← Nat.two_pow_add_eq_or_of_lt (by omega)]
672694 omega
673695
696+ @[inline, expose]
697+ public def verify₃ (w x y : UInt8) : Bool :=
698+ if isInvalidContinuationByte x || isInvalidContinuationByte y then
699+ false
700+ else
701+ let r := assemble₃Unchecked w x y
702+ 0x800 ≤ r ∧ (r < 0xd800 ∨ 0xdfff < r)
703+
674704theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec =
675705 (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ).setWidth 32 := by
676706 have hw : (w &&& 15 ).toUInt32.toBitVec = (w.toBitVec.setWidth 4 ).setWidth 32 := helper₃ 4 (by decide)
@@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} :
772802 ← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self]
773803 simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2
774804
805+ theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} :
806+ verify₃ w x y = (assemble₃ w x y).isSome := by
807+ simp only [verify₃, assemble₃]
808+ split
809+ · simp
810+ · split
811+ · simp_all [← UInt32.not_le]
812+ · split
813+ · simp_all
814+ · rename_i h h'
815+ simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h'
816+
775817/-! # `assemble₄` -/
776818
777819@[inline, expose]
@@ -799,6 +841,14 @@ where finally
799841 simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂
800842 exact Or.inr ⟨by omega, by omega⟩
801843
844+ @[inline, expose]
845+ public def verify₄ (w x y z : UInt8) : Bool :=
846+ if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then
847+ false
848+ else
849+ let r := assemble₄Unchecked w x y z
850+ 0x10000 ≤ r ∧ r ≤ 0x10ffff
851+
802852theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec =
803853 (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6 ).setWidth 32 := by
804854 have hw : (w &&& 7 ).toUInt32.toBitVec = (w.toBitVec.setWidth 3 ).setWidth 32 := helper₃ 3 (by decide)
@@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} :
917967 Nat.reducePow, Nat.reduceMod, gt_iff_lt]
918968 omega
919969
970+ theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} :
971+ verify₄ w x y z = (assemble₄ w x y z).isSome := by
972+ simp only [verify₄, assemble₄]
973+ split
974+ · simp
975+ · split
976+ · simp_all [← UInt32.not_lt]
977+ · split
978+ · simp_all
979+ · simp_all [← UInt32.not_lt]
980+
920981end ByteArray.utf8DecodeChar?
921982
922983open ByteArray.utf8DecodeChar?
923984
924- /- # `utf8DecodeChar?` -/
985+ /- # `utf8DecodeChar?` and `validateUtf8At` -/
925986
926987@[inline, expose]
927988public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char :=
@@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char
9441005 else none
9451006 else none
9461007
1008+ @[inline, expose]
1009+ public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool :=
1010+ if h₀ : i < bytes.size then
1011+ match h : parseFirstByte bytes[i] with
1012+ | .invalid => false
1013+ | .done => verify₁ bytes[i] h
1014+ | .oneMore =>
1015+ if h₁ : i + 1 < bytes.size then
1016+ verify₂ bytes[i] bytes[i + 1 ]
1017+ else
1018+ false
1019+ | .twoMore =>
1020+ if h₁ : i + 2 < bytes.size then
1021+ verify₃ bytes[i] bytes[i + 1 ] bytes[i + 2 ]
1022+ else false
1023+ | .threeMore =>
1024+ if h₁ : i + 3 < bytes.size then
1025+ verify₄ bytes[i] bytes[i + 1 ] bytes[i + 2 ] bytes[i + 3 ]
1026+ else false
1027+ else false
1028+
9471029/-! # `utf8DecodeChar?` low-level API -/
9481030
9491031theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char}
@@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b :
11301212 rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq]
11311213 exact ⟨this, h⟩
11321214
1215+ public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :
1216+ b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by
1217+ simp only [validateUtf8At, utf8DecodeChar?]
1218+ split
1219+ · split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂,
1220+ verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄]
1221+ · simp
1222+
11331223/-! # Corollaries -/
11341224
11351225public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) :
@@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i :
11951285 have := c.utf8Size_pos
11961286 omega
11971287
1288+ public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} :
1289+ validateUtf8At b i = true → i < b.size :=
1290+ validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar?
1291+
11981292public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c)
11991293 (b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by
12001294 have := le_size_of_utf8DecodeChar?_eq_some h
@@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat}
12111305public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char :=
12121306 (utf8DecodeChar? bytes i).get h
12131307
1308+ public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} :
1309+ i + (b.utf8DecodeChar i h).utf8Size ≤ b.size :=
1310+ le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar])
1311+
12141312public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} :
12151313 utf8DecodeChar b i h =
12161314 utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by
@@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra
13631461 conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc]
13641462 exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append
13651463
1464+ public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At {b : ByteArray} {i : Nat} :
1465+ (h : validateUtf8At b i = true ) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by
1466+ simp only [validateUtf8At_eq_isSome_utf8DecodeChar?]
1467+ exact isUtf8FirstByte_of_isSome_utf8DecodeChar?
1468+
13661469theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} :
13671470 (((String.utf8EncodeChar c)[0 ]'(by simp [c.utf8Size_pos])).utf8ByteSize
13681471 UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by
0 commit comments