Skip to content

Commit 3bb3c9d

Browse files
rstzcopybara-github
authored and committed
[YDF] Use RE2 for regexes during inference
PiperOrigin-RevId: 911192343
1 parent 0a54b57 commit 3bb3c9d

10 files changed

Lines changed: 64 additions & 41 deletions

File tree

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ Changelog under `yggdrasil_decision_forests/port/python/CHANGELOG.md`.
55

66
## HEAD
77

8+
### Breaking changes
9+
10+
- Replace std::regex with Google's RE2 during inference. Models using regular
11+
expressions for tokenization must now follow the RE2 syntax. See
12+
https://github.com/google/re2/wiki/Syntax for a detailed breakdown of
13+
this syntax. Note that other regexes (notably for the data spec columns)
14+
can still use the more permissive std::regex syntax.
15+
16+
## HEAD
17+
818
### Features
919

1020
- "True" and "False" are now recognized as boolean in CSV files. This change

MODULE.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ bazel_dep(name = "rules_cc", version = "0.2.0")
2121
bazel_dep(name = "rules_proto", version = "7.1.0")
2222
bazel_dep(name = "rules_python", version = "1.6.0")
2323
bazel_dep(name = "zlib", version = "1.3.1.bcr.7")
24+
bazel_dep(name = "re2", version = "2025-11-05.bcr.1")
2425

2526
bazel_dep(name = "emsdk", version = "4.0.13", dev_dependency = True)
2627

yggdrasil_decision_forests/dataset/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ cc_library_ydf(
6969
"@com_google_absl//absl/strings",
7070
"@com_google_absl//absl/strings:str_format",
7171
"@com_google_absl//absl/types:span",
72+
"@re2//re2",
7273
],
7374
)
7475

yggdrasil_decision_forests/dataset/data_spec.cc

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include "absl/strings/string_view.h"
4444
#include "absl/strings/substitute.h"
4545
#include "absl/types/span.h"
46+
#include "re2/re2.h"
4647
#include "yggdrasil_decision_forests/dataset/data_spec.pb.h"
4748
#include "yggdrasil_decision_forests/dataset/example.pb.h"
4849
#include "yggdrasil_decision_forests/utils/logging.h"
@@ -703,12 +704,23 @@ absl::Status Tokenize(const absl::string_view text,
703704
absl::StrSplit(cased_text, absl::ByAnyChar(tokenizer.separator()));
704705
break;
705706
case proto::Tokenizer::REGEX_MATCH: {
706-
std::string remaining = cased_text;
707-
std::regex re(tokenizer.regex());
708-
std::smatch sm;
709-
while (std::regex_search(remaining, sm, re)) {
710-
unit_tokens.emplace_back(sm.str());
711-
remaining = sm.suffix();
707+
RE2 re(tokenizer.regex());
708+
if (!re.ok()) {
709+
return absl::InvalidArgumentError(
710+
absl::StrCat("Invalid regular expression: ", tokenizer.regex()));
711+
}
712+
absl::string_view sp(cased_text);
713+
size_t startpos = 0;
714+
absl::string_view submatch;
715+
while (re.Match(sp, startpos, sp.size(), RE2::UNANCHORED, &submatch, 1)) {
716+
unit_tokens.emplace_back(submatch.data(), submatch.size());
717+
startpos = (submatch.data() - sp.data()) + submatch.size();
718+
if (submatch.empty()) {
719+
startpos++;
720+
}
721+
if (startpos > sp.size()) {
722+
break;
723+
}
712724
}
713725
} break;
714726
case proto::Tokenizer::CHARACTER:

yggdrasil_decision_forests/port/python/ydf/learner/wrapper/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ cc_library_ydf(
3434
"@com_google_absl//absl/status:statusor",
3535
"@com_google_absl//absl/strings",
3636
"@com_google_absl//absl/strings:str_format",
37+
"@re2//re2",
3738
"@ydf_cc//yggdrasil_decision_forests/learner:abstract_learner",
3839
"@ydf_cc//yggdrasil_decision_forests/learner:abstract_learner_cc_proto",
3940
"@ydf_cc//yggdrasil_decision_forests/learner:learner_library",

yggdrasil_decision_forests/port/python/ydf/learner/wrapper/wrapper_generator.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include <cstddef>
2121
#include <iterator>
2222
#include <memory>
23-
#include <regex>
2423
#include <string>
2524
#include <tuple>
2625
#include <utility>
@@ -39,6 +38,7 @@
3938
#include "absl/strings/str_split.h"
4039
#include "absl/strings/string_view.h"
4140
#include "absl/strings/substitute.h"
41+
#include "re2/re2.h"
4242
#include "yggdrasil_decision_forests/learner/abstract_learner.h"
4343
#include "yggdrasil_decision_forests/learner/abstract_learner.pb.h"
4444
#include "yggdrasil_decision_forests/learner/learner_library.h"
@@ -934,15 +934,15 @@ absl::Status FixDefaultFieldsDocumentation(std::string* fields_documentation,
934934
return absl::InvalidArgumentError(
935935
"Missing documentation for discretize_numerical_columns");
936936
}
937-
std::regex end_pattern("\\n {4}[^ \\t\\n\\r\\f\\v]");
938-
auto search_start_it =
939-
fields_documentation->cbegin() + start_pos + start_marker.length();
940-
std::smatch match;
941-
size_t end_pos;
942-
943-
if (std::regex_search(search_start_it, fields_documentation->cend(), match,
944-
end_pattern)) {
945-
end_pos = std::distance(fields_documentation->cbegin(), match[0].first);
937+
static const LazyRE2 end_pattern(R"((\n {4}[^ \t\n\r\f\v]))");
938+
STATUS_CHECK(end_pattern->ok());
939+
940+
absl::string_view subset = absl::string_view(*fields_documentation)
941+
.substr(start_pos + start_marker.length());
942+
absl::string_view match;
943+
944+
if (RE2::PartialMatch(subset, *end_pattern, &match)) {
945+
size_t end_pos = match.data() - fields_documentation->data();
946946
size_t length = end_pos - start_pos;
947947
fields_documentation->replace(start_pos, length, replacement_string);
948948
} else {

yggdrasil_decision_forests/utils/BUILD

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ cc_library_ydf(
114114
"@com_google_absl//absl/strings",
115115
"@com_google_absl//absl/strings:str_format",
116116
"@com_google_absl//absl/types:optional",
117+
"@re2//re2",
117118
],
118119
)
119120

@@ -130,6 +131,7 @@ cc_library_ydf(
130131
"@com_google_absl//absl/status",
131132
"@com_google_absl//absl/strings",
132133
"@google_cloud_cpp//:storage",
134+
"@re2//re2",
133135
],
134136
alwayslink = 1,
135137
)
@@ -218,6 +220,7 @@ cc_library_ydf(
218220
"@com_google_absl//absl/strings",
219221
"@com_google_absl//absl/strings:str_format",
220222
"@com_google_protobuf//:protobuf",
223+
"@re2//re2",
221224
] + select({
222225
"//yggdrasil_decision_forests:tensorflow_with_header_lib": [
223226
"@local_config_tf//:tensorflow",

yggdrasil_decision_forests/utils/filesystem_default.cc

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
#include <initializer_list>
2424
#include <ios>
2525
#include <memory>
26-
#include <regex> // NOLINT
2726
#include <string>
2827
#include <utility>
2928
#include <vector>
@@ -36,6 +35,7 @@
3635
#include "absl/strings/str_replace.h"
3736
#include "absl/strings/string_view.h"
3837
#include "absl/types/optional.h"
38+
#include "re2/re2.h"
3939
#include "yggdrasil_decision_forests/utils/filesystem_interface.h"
4040
#include "yggdrasil_decision_forests/utils/logging.h"
4141
#include "yggdrasil_decision_forests/utils/status_macros.h"
@@ -164,18 +164,12 @@ std::string JoinPathList(std::initializer_list<absl::string_view> paths) {
164164

165165
bool GenerateShardedFilenames(absl::string_view spec,
166166
std::vector<std::string>* names) {
167-
std::regex num_shard_pattern(R"((.*)\@(\*|[0-9]+)(?:(\..+))?)");
168-
std::smatch match;
167+
static const LazyRE2 num_shard_pattern = {R"((.*)\@(\*|[0-9]+)(?:(\..+))?)"};
168+
std::string prefix, count, suffix;
169169
std::string str_spec(spec);
170-
if (!std::regex_match(str_spec, match, num_shard_pattern)) {
170+
if (!RE2::FullMatch(str_spec, *num_shard_pattern, &prefix, &count, &suffix)) {
171171
return false;
172172
}
173-
if (match.size() != 4) {
174-
return false;
175-
}
176-
const auto prefix = match[1].str();
177-
const auto count = match[2].str();
178-
const auto suffix = match[3].str();
179173

180174
int int_count;
181175
if (count == "*") {
@@ -212,7 +206,8 @@ absl::Status Match(absl::string_view pattern, std::vector<std::string>* results,
212206
const auto filename = fs::path(SV_ABSL_TO_STD(pattern)).filename().string();
213207
std::string regexp_filename =
214208
absl::StrReplaceAll(filename, {{".", "\\."}, {"*", ".*"}, {"?", "."}});
215-
std::regex regexp_pattern(regexp_filename);
209+
RE2 regexp_pattern(regexp_filename);
210+
STATUS_CHECK(regexp_pattern.ok());
216211
std::error_code error;
217212

218213
const fs::directory_iterator path_end;
@@ -221,7 +216,7 @@ absl::Status Match(absl::string_view pattern, std::vector<std::string>* results,
221216
if (!fs::is_regular_file(path->path())) {
222217
continue;
223218
}
224-
if (std::regex_match(path->path().filename().string(), regexp_pattern)) {
219+
if (RE2::FullMatch(path->path().filename().string(), regexp_pattern)) {
225220
results->push_back(path->path().string());
226221
}
227222
}

yggdrasil_decision_forests/utils/filesystem_default_gcs.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include <cstring>
1919
#include <initializer_list>
2020
#include <memory>
21-
#include <regex> // NOLINT
2221
#include <string>
2322
#include <utility>
2423
#include <vector>
@@ -28,6 +27,7 @@
2827
#include "absl/strings/str_replace.h"
2928
#include "absl/strings/string_view.h"
3029
#include "google/cloud/storage/client.h"
30+
#include "re2/re2.h"
3131
#include "yggdrasil_decision_forests/utils/bytestream.h"
3232
#include "yggdrasil_decision_forests/utils/filesystem_default.h"
3333
#include "yggdrasil_decision_forests/utils/filesystem_interface.h"
@@ -198,11 +198,15 @@ class FileSystemImplementation : public FileSystemInterface {
198198

199199
std::string regexp_filename = absl::StrReplaceAll(
200200
cloud_path.object, {{".", "\\."}, {"*", ".*"}, {"?", "."}});
201-
std::regex regexp_pattern(regexp_filename);
201+
RE2 regexp_pattern(regexp_filename);
202+
if (!regexp_pattern.ok()) {
203+
return absl::InvalidArgumentError(
204+
"Invalid regular expression generated from pattern.");
205+
}
202206

203207
for (const auto& candidate : candidates) {
204208
GCS_RETURN_IF_ERROR(candidate.status());
205-
if (!std::regex_match(candidate->name(), regexp_pattern)) {
209+
if (!RE2::FullMatch(candidate->name(), regexp_pattern)) {
206210
continue;
207211
}
208212
results->push_back(

yggdrasil_decision_forests/utils/filesystem_tensorflow_impl.cc

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include <cstring>
1919
#include <initializer_list>
2020
#include <memory>
21-
#include <regex> // NOLINT
2221
#include <string>
2322
#include <vector>
2423

@@ -31,6 +30,7 @@
3130
#include "src/google/protobuf/message.h"
3231
#include "src/google/protobuf/message_lite.h"
3332
#include "src/google/protobuf/text_format.h"
33+
#include "re2/re2.h"
3434
#include "tensorflow/core/platform/env.h"
3535
#include "tensorflow/core/platform/file_system.h"
3636
#include "tensorflow/core/platform/path.h"
@@ -225,18 +225,14 @@ class FileSystemImplementation : public FileSystemInterface {
225225

226226
bool GenerateShardedFilenames(absl::string_view spec,
227227
std::vector<std::string>* names) override {
228-
std::regex num_shard_pattern(R"((.*)\@(\*|[0-9]+)(?:(\..+))?)");
229-
std::smatch match;
228+
static const LazyRE2 num_shard_pattern = {
229+
R"((.*)\@(\*|[0-9]+)(?:(\..+))?)"};
230+
std::string prefix, count, suffix;
230231
std::string str_spec(spec);
231-
if (!std::regex_match(str_spec, match, num_shard_pattern)) {
232+
if (!RE2::FullMatch(str_spec, *num_shard_pattern, &prefix, &count,
233+
&suffix)) {
232234
return false;
233235
}
234-
if (match.size() != 4) {
235-
return false;
236-
}
237-
const auto prefix = match[1].str();
238-
const auto count = match[2].str();
239-
const auto suffix = match[3].str();
240236

241237
int int_count;
242238
if (count == "*") {

0 commit comments

Comments
 (0)