@@ -17,19 +17,42 @@ extension BPETokenizer {
1717
/// Read the merges.txt file at `url` into a dictionary mapping each bigram (token pair)
/// to its zero-based line number, which serves as the pair's merge rank/priority.
///
/// Lines are delimited by `\n`. Empty lines and lines beginning with `#` produce no
/// entry but still advance the line counter, so ranks follow physical line positions.
/// If the same pair occurs on several lines, the last occurrence wins.
///
/// - Parameter url: Location of the merges file.
/// - Returns: Token pairs keyed to the index of the line on which they appear.
/// - Throws: Any error raised while loading `url`, or
///   `FileReadError.invalidMergeFileLine` (carrying a one-based line number) when a
///   line does not split into exactly two space-separated tokens.
static func readMerges(url: URL) throws -> [TokenPair: Int] {
    let data = try Data(contentsOf: url)
    var merges = [TokenPair: Int]()
    var index = 0
    // Reused scratch buffer holding the bytes of the line currently being scanned.
    var line = [UInt8]()
    for byte in data {
        if byte == UInt8(ascii: "\n") {
            if let pair = try parseMergesLine(line, index: index) {
                merges[pair] = index
            }
            // Keep the allocation so each subsequent line does not reallocate.
            line.removeAll(keepingCapacity: true)
            index += 1
        } else {
            line.append(byte)
        }
    }

    // A file whose last line is not terminated by "\n" still has that line sitting in
    // the buffer; parse it as well so the final merge rule is not silently dropped.
    // (An empty buffer yields nil, so newline-terminated files are unaffected.)
    if let pair = try parseMergesLine(line, index: index) {
        merges[pair] = index
    }

    return merges
}
38+ 
/// Parse one raw line of a merges file into a token pair.
///
/// - Parameters:
///   - line: The bytes of the line, without its terminating newline.
///   - index: Zero-based position of the line in the file; reported one-based in errors.
/// - Returns: The pair described by the line, or `nil` when the line is empty or
///   starts with `#`.
/// - Throws: `FileReadError.invalidMergeFileLine` when the line does not split into
///   exactly two space-separated fields.
static func parseMergesLine(_ line: [UInt8], index: Int) throws -> TokenPair? {
    // Nothing to parse on blank lines or on lines introduced by '#'.
    guard let leading = line.first, leading != UInt8(ascii: "#") else {
        return nil
    }

    let fields = line.split(separator: UInt8(ascii: " "))
    guard fields.count == 2 else {
        throw FileReadError.invalidMergeFileLine(index + 1)
    }

    let left = String(bytes: fields[0])
    let right = String(bytes: fields[1])
    return TokenPair(left, right)
}
49+ } 
50+ 
extension String {
    /// Creates a string by copying a collection of UTF-8 code units straight into the
    /// new string's storage via `String(unsafeUninitializedCapacity:initializingUTF8With:)`.
    ///
    /// - Parameter bytes: The UTF-8 code units that make up the string's contents.
    init(bytes: some Collection<UInt8>) {
        let byteCount = bytes.count
        self.init(unsafeUninitializedCapacity: byteCount) { storage in
            // The returned end index is not needed; the initialized count is byteCount.
            _ = storage.initialize(fromContentsOf: bytes)
            return byteCount
        }
    }
}
0 commit comments