@@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do
     * `{:str, 6, "foo"}`

   """
-  @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
-  def tokenize(str) do
-    tokenize_line(str, _line = 1, _tokens_acc = [])
+  @spec tokenize(binary, [Expo.PO.parse_option()]) ::
+          {:ok, [token]} | {:error, pos_integer, binary}
+  def tokenize(str, opts \\ []) do
+    strip_meta? = Keyword.get(opts, :strip_meta, false)
+    tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
   end

   # Reverse str_lines strings.
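
For orientation, a minimal sketch of the new entry point introduced above (the calls and input strings are hypothetical; per this diff, `:strip_meta` defaults to `false`, so existing single-argument callers are unaffected):

    # Omitting the options list is equivalent to passing strip_meta: false.
    {:ok, tokens_default} = Expo.PO.Tokenizer.tokenize(~s(msgid "id"\n))

    # Opting in: "#"-prefixed lines are discarded instead of tokenized
    # (see the new clause in the next hunk).
    {:ok, tokens_stripped} = Expo.PO.Tokenizer.tokenize(~s(msgid "id"\n), strip_meta: true)
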
@@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do
   end

   # End of file.
-  defp tokenize_line(<<>>, line, acc) do
+  defp tokenize_line(<<>>, line, _strip_meta?, acc) do
     {:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
   end

   # Go to the next line.
-  defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
-    tokenize_line(rest, line + 1, acc)
+  defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line + 1, strip_meta?, acc)
   end

   # Skip other whitespace.
-  defp tokenize_line(<<char, rest::binary>>, line, acc)
+  defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
        when char in @whitespace_no_nl do
-    tokenize_line(rest, line, acc)
+    tokenize_line(rest, line, strip_meta?, acc)
+  end
+
+  # Skip meta information when strip_meta is enabled.
+  defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
+    from_next_line = discard_until_nl(rest)
+    tokenize_line(from_next_line, line, true, acc)
   end

   # Obsolete comment.
-  defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
-    tokenize_line(rest, line, [{:obsolete, line} | acc])
+  defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
   end

   # Previous comment.
-  defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
-    tokenize_line(rest, line, [{:previous, line} | acc])
+  defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
   end

   # Normal comment.
-  defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
+  defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
     {contents, rest} = to_eol_or_eof(rest, "")
-    tokenize_line(rest, line, [{:comment, line, contents} | acc])
+    tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
   end

   # Keywords.
   for kw <- @string_keywords do
-    defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
+    defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
          when char in @whitespace do
       acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
-      tokenize_line(rest, line, acc)
+      tokenize_line(rest, line, strip_meta?, acc)
     end

-    defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
+    defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
       {:error, line, "no space after '#{unquote(kw)}'"}
     end
   end

   # `msgstr`.
-  defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
+  defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
     case tokenize_plural_form(rest, "") do
       {:ok, plural_form, rest} ->
         # The order of the :plural_form and :msgstr tokens is inverted since
         # the `acc` array of tokens will be reversed at the end.
         acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
-        tokenize_line(rest, line, acc)
+        tokenize_line(rest, line, strip_meta?, acc)

       {:error, reason} ->
         {:error, line, reason}
     end
   end

-  defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
+  defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
        when char in @whitespace do
     acc = [{:msgstr, line} | acc]
-    tokenize_line(rest, line, acc)
+    tokenize_line(rest, line, strip_meta?, acc)
   end

-  defp tokenize_line("msgstr" <> _rest, line, _acc) do
+  defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
     {:error, line, "no space after 'msgstr'"}
   end

   # String.
-  defp tokenize_line(<<?", rest::binary>>, line, acc) do
+  defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
     case tokenize_string(rest, "") do
       {:ok, string, rest} ->
-        tokenize_line(rest, line, add_str_lines(line, string, acc))
+        tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))

       {:error, reason} ->
         {:error, line, reason}
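
The effect of the new `#`-skipping clause, sketched against the clauses in this hunk (hypothetical input; tokens other than the comment are elided):

    # Default path: "# note" goes through the normal-comment clause and is
    # tokenized as {:comment, 1, "# note"}.
    {:ok, with_meta} = Expo.PO.Tokenizer.tokenize("# note\nmsgid \"id\"\n")

    # With strip_meta: true the same line is discarded by discard_until_nl/1
    # before any comment token is built, so no {:comment, _, _} tuple appears.
    {:ok, without_meta} = Expo.PO.Tokenizer.tokenize("# note\nmsgid \"id\"\n", strip_meta: true)
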
@@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do
   # a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
   # we assume there's an unknown keyword. We parse it with a regex
   # so that the error message is informative.
-  defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
+  defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
        when letter in ?a..?z or letter in ?A..?Z do
     next_word = List.first(Regex.run(~r/\w+/u, binary))
     {:error, line, "unknown keyword '#{next_word}'"}
@@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do
   # Last resort: this is just a plain unexpected token. We take the first
   # Unicode char of the given binary and build an informative error message
   # (with the codepoint of the char).
-  defp tokenize_line(binary, line, _acc) when is_binary(binary) do
+  defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
     # To get the first Unicode char, we convert to char list first.
     [char | _] = String.to_charlist(binary)
     msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
     {:error, line, :unicode.characters_to_binary(msg)}
   end

+  defp discard_until_nl(content)
+  defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
+  defp discard_until_nl(<<>>), do: <<>>
+  defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
+
   @obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a

   # Collapse the string into the previous str_lines token if there is one *on the same line*.
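
The `discard_until_nl/1` helper added here stops at the newline rather than consuming it, so the existing `?\n` clause still advances the line counter. A standalone sketch of that behaviour (module name and sample input are made up; the clauses mirror the ones added above):

    defmodule DiscardUntilNlDemo do
      # Drop bytes until a newline or end of input, keeping the newline itself
      # in the returned binary.
      def discard_until_nl(<<?\n, _rest::binary>> = content), do: content
      def discard_until_nl(<<>>), do: <<>>
      def discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
    end

    DiscardUntilNlDemo.discard_until_nl("translator note\nmsgid \"id\"")
    #=> "\nmsgid \"id\""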