Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit bd94b21

Browse files
Authored Sep 10, 2024
Add option to strip metadata when parsing PO files (#141)
1 parent deb65ba commit bd94b21

File tree

4 files changed

+83
-32
lines changed

4 files changed

+83
-32
lines changed
 

‎lib/expo/po.ex

+11-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,17 @@ defmodule Expo.PO do
66
alias Expo.Messages
77
alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError}
88

9-
@type parse_option :: {:file, Path.t()}
9+
@typedoc """
10+
Parsing option.
11+
12+
* `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors
13+
don't have a path.
14+
15+
* `:strip_meta` (`t:boolean/0`) - include only messages (no comments and other metadata) from the `.po` file
16+
to reduce memory usage when meta information is not needed.
17+
Defaults to `false`.
18+
"""
19+
@type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()}
1020

1121
@doc """
1222
Dumps a `Expo.Messages` struct as iodata.

‎lib/expo/po/parser.ex

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do
1212
def parse(content, opts) do
1313
content = prune_bom(content, Keyword.get(opts, :file, "nofile"))
1414

15-
with {:ok, tokens} <- tokenize(content),
15+
with {:ok, tokens} <- tokenize(content, opts),
1616
{:ok, po} <- parse_tokens(tokens),
1717
{:ok, po} <- check_for_duplicates(po) do
1818
{:ok, %Messages{po | file: Keyword.get(opts, :file)}}
@@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do
2222
end
2323
end
2424

25-
defp tokenize(content) do
26-
case Tokenizer.tokenize(content) do
25+
defp tokenize(content, opts) do
26+
case Tokenizer.tokenize(content, opts) do
2727
{:ok, tokens} -> {:ok, tokens}
2828
{:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}}
2929
end

‎lib/expo/po/tokenizer.ex

+39-26
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do
5252
* `{:str, 6, "foo"}`
5353
5454
"""
55-
@spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
56-
def tokenize(str) do
57-
tokenize_line(str, _line = 1, _tokens_acc = [])
55+
@spec tokenize(binary, [Expo.PO.parse_option()]) ::
56+
{:ok, [token]} | {:error, pos_integer, binary}
57+
def tokenize(str, opts \\ []) do
58+
strip_meta? = Keyword.get(opts, :strip_meta, false)
59+
tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
5860
end
5961

6062
# Reverse str_lines strings.
@@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do
8688
end
8789

8890
# End of file.
89-
defp tokenize_line(<<>>, line, acc) do
91+
defp tokenize_line(<<>>, line, _strip_meta?, acc) do
9092
{:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
9193
end
9294

9395
# Go to the next line.
94-
defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
95-
tokenize_line(rest, line + 1, acc)
96+
defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
97+
tokenize_line(rest, line + 1, strip_meta?, acc)
9698
end
9799

98100
# Skip other whitespace.
99-
defp tokenize_line(<<char, rest::binary>>, line, acc)
101+
defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
100102
when char in @whitespace_no_nl do
101-
tokenize_line(rest, line, acc)
103+
tokenize_line(rest, line, strip_meta?, acc)
104+
end
105+
106+
# Skip meta information (comments) when the :strip_meta option is enabled.
107+
defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
108+
from_next_line = discard_until_nl(rest)
109+
tokenize_line(from_next_line, line, true, acc)
102110
end
103111

104112
# Obsolete comment.
105-
defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
106-
tokenize_line(rest, line, [{:obsolete, line} | acc])
113+
defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
114+
tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
107115
end
108116

109117
# Previous comment.
110-
defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
111-
tokenize_line(rest, line, [{:previous, line} | acc])
118+
defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
119+
tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
112120
end
113121

114122
# Normal comment.
115-
defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
123+
defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
116124
{contents, rest} = to_eol_or_eof(rest, "")
117-
tokenize_line(rest, line, [{:comment, line, contents} | acc])
125+
tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
118126
end
119127

120128
# Keywords.
121129
for kw <- @string_keywords do
122-
defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
130+
defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
123131
when char in @whitespace do
124132
acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
125-
tokenize_line(rest, line, acc)
133+
tokenize_line(rest, line, strip_meta?, acc)
126134
end
127135

128-
defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
136+
defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
129137
{:error, line, "no space after '#{unquote(kw)}'"}
130138
end
131139
end
132140

133141
# `msgstr`.
134-
defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
142+
defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
135143
case tokenize_plural_form(rest, "") do
136144
{:ok, plural_form, rest} ->
137145
# The order of the :plural_form and :msgstr tokens is inverted since
138146
# the `acc` array of tokens will be reversed at the end.
139147
acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
140-
tokenize_line(rest, line, acc)
148+
tokenize_line(rest, line, strip_meta?, acc)
141149

142150
{:error, reason} ->
143151
{:error, line, reason}
144152
end
145153
end
146154

147-
defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
155+
defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
148156
when char in @whitespace do
149157
acc = [{:msgstr, line} | acc]
150-
tokenize_line(rest, line, acc)
158+
tokenize_line(rest, line, strip_meta?, acc)
151159
end
152160

153-
defp tokenize_line("msgstr" <> _rest, line, _acc) do
161+
defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
154162
{:error, line, "no space after 'msgstr'"}
155163
end
156164

157165
# String.
158-
defp tokenize_line(<<?", rest::binary>>, line, acc) do
166+
defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
159167
case tokenize_string(rest, "") do
160168
{:ok, string, rest} ->
161-
tokenize_line(rest, line, add_str_lines(line, string, acc))
169+
tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))
162170

163171
{:error, reason} ->
164172
{:error, line, reason}
@@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do
170178
# a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
171179
# we assume there's an unknown keyword. We parse it with a regex
172180
# so that the error message is informative.
173-
defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
181+
defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
174182
when letter in ?a..?z or letter in ?A..?Z do
175183
next_word = List.first(Regex.run(~r/\w+/u, binary))
176184
{:error, line, "unknown keyword '#{next_word}'"}
@@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do
180188
# Last resort: this is just a plain unexpected token. We take the first
181189
# Unicode char of the given binary and build an informative error message
182190
# (with the codepoint of the char).
183-
defp tokenize_line(binary, line, _acc) when is_binary(binary) do
191+
defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
184192
# To get the first Unicode char, we convert to char list first.
185193
[char | _] = String.to_charlist(binary)
186194
msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
187195
{:error, line, :unicode.characters_to_binary(msg)}
188196
end
189197

198+
defp discard_until_nl(content)
199+
defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
200+
defp discard_until_nl(<<>>), do: <<>>
201+
defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
202+
190203
@obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a
191204

192205
# Collapse the string into the previous str_lines token if there is one *on the same line*.

‎test/expo/parser_test.exs

+30-2
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,36 @@ defmodule Expo.ParserTest do
454454
end
455455
end
456456

457-
defp parse(string) do
458-
case PO.parse_string(string) do
457+
describe "strip meta" do
458+
test "does not include extra information" do
459+
assert [
460+
%Message.Plural{
461+
msgid: ["foo"],
462+
msgid_plural: ["foos"],
463+
msgstr: %{0 => ["bar"], 1 => ["bars"]},
464+
comments: [],
465+
extracted_comments: [],
466+
references: []
467+
}
468+
] =
469+
parse(
470+
"""
471+
# This is a message
472+
#: lib/foo.ex:32
473+
# Ah, another comment!
474+
#. An extracted comment
475+
msgid "foo"
476+
msgid_plural "foos"
477+
msgstr[0] "bar"
478+
msgstr[1] "bars"
479+
""",
480+
strip_meta: true
481+
)
482+
end
483+
end
484+
485+
defp parse(string, options \\ []) do
486+
case PO.parse_string(string, options) do
459487
{:ok, %Messages{messages: messages}} ->
460488
messages
461489

0 commit comments

Comments
 (0)
Please sign in to comment.