Skip to content

Commit 5ddcf40

Browse files
Add regex replace functionality to transformation filter extractors [Revised] (#309)
* Add regex replace functionality to transformation filter extractors (#301) * initial extractor implementation of replace functionality * minor changes, testing against mergeExtractorsToBody and extraction callback * add changelog entry * update API to use new mode selector update transformation_filter proto * minor updates to comments in transformation_filter.proto * remove existing references to no-longer-existing replace_all setting * update replacement_text_ to a std::optional<std::string> * remove duplicate mode enum * update comment indicating that subgroup should never exceed regex_result size * add AttemptReplaceFromNoMatchNonNilSubgroup test * prevent string reallocation * remove unnecessary if block + variable in replaceAllValues * clean up new tests * inline replacement_text in inja_transformer_test.cc * more test cleanup * update function signatures, remove replaced_value_ * support dynamic metadata as extractor input * update changelog location * add API changes to go with 3175ca9 * revert support for dynamic metadata as an extractor input 3175ca9 and e2668be * refactor calls to extract/replace * rename replace to extractDestructive, add breaks to switch statement * update data types to match updated function signatures in inja_transformer_test.cc * respond to review comments * update changelog location * update changelog location * separate destructive extractors and non-destructive extractors * fix match_not_null edge case * update inline documentation for new proto field * add test demonstrating use of format specifiers * update REPLACE_ALL mode to return input on no match * return input on no match in single replace case
1 parent fb359b8 commit 5ddcf40

File tree

7 files changed

+832
-35
lines changed

7 files changed

+832
-35
lines changed

api/envoy/config/filter/http/transformation/v2/transformation_filter.proto

+39-4
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,19 @@ message Transformation {
153153
// Extractions can be used to extract information from the request/response.
154154
// The extracted information can then be referenced in template fields.
155155
message Extraction {
156+
// The mode of operation for the extraction.
157+
enum Mode {
158+
// Default mode. Extract the value of the subgroup-th capturing group.
159+
EXTRACT = 0;
160+
// Replace the value of the subgroup-th capturing group with the replacement_text.
161+
// Note: replacement_text must be set for this mode.
162+
SINGLE_REPLACE = 1;
163+
// Replace all matches of the regex in the source with the replacement_text.
164+
// Note: replacement_text must be set for this mode.
165+
// Note: subgroup must not be set for this mode. Configuration will be rejected if subgroup is set.
166+
// Note: restrictions on the regex are different for this mode. See the regex field for more details.
167+
REPLACE_ALL = 2;
168+
}
156169

157170
// The source of the extraction
158171
oneof source {
@@ -162,15 +175,37 @@ message Extraction {
162175
google.protobuf.Empty body = 4;
163176
}
164177

165-
// Only strings matching this regular expression will be part of the
166-
// extraction. The most simple value for this field is '.*', which matches the
167-
// whole source. The field is required. If extraction fails the result is an
168-
// empty value.
178+
// The regex field specifies the regular expression used for matching against the source content. This field is required.
179+
// - In EXTRACT mode, the entire source must match the regex. The subgroup-th capturing group,
180+
// if specified, determines which part of the match is extracted. If the regex does not match the source,
181+
// the result of the extraction will be an empty value.
182+
// - In SINGLE_REPLACE mode, the regex also needs to match the entire source. The subgroup-th capturing group
183+
// is targeted for replacement with the replacement_text. If the regex does not match the source,
184+
// the result of the extraction will be the source itself.
185+
// - In REPLACE_ALL mode, the regex is applied repeatedly to find all occurrences within the source that match.
186+
// Each matching occurrence is replaced with the replacement_text, and the subgroup field is not used. If the
187+
// regex does not match the source the result of the extraction will be the source itself.
169188
string regex = 2;
170189

171190
// If your regex contains capturing groups, use this field to determine which
172191
// group should be selected.
192+
// For EXTRACT and SINGLE_REPLACE, refers to the portion of the text
193+
// to extract/replace.
194+
// Config will be rejected if this is specified in REPLACE_ALL mode.
173195
uint32 subgroup = 3;
196+
197+
// Used in SINGLE_REPLACE and REPLACE_ALL modes.
198+
// `replacement_text` is used to format the substitution for matched sequences in the input string
199+
// - In SINGLE_REPLACE mode, the content in the subgroup-th capturing group is replaced with the `replacement_text`.
200+
// - In REPLACE_ALL mode, each sequence matching the specified regex in the input is replaced with the `replacement_text`.
201+
// The replacement_text may contain special syntax, such as $1, $2, etc., to refer to captured groups within the regular expression.
202+
// The value contained within `replacement_text` is treated as a string, and is passed to std::regex_replace as the replacement string.
203+
// See https://en.cppreference.com/w/cpp/regex/regex_replace for more details.
204+
google.protobuf.StringValue replacement_text = 5;
205+
206+
// The mode of operation for the extraction.
207+
// Defaults to EXTRACT.
208+
Mode mode = 6;
174209
}
175210

176211
// Defines a transformation template.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
changelog:
2+
- type: NEW_FEATURE
3+
resolvesIssue: false
4+
issueLink: https://github.com/solo-io/gloo/issues/8706
5+
description: >
6+
Update transformation filter extractors to support regex
7+
replace/replace all operations on extracted values.

source/extensions/filters/http/transformation/inja_transformer.cc

+170-10
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ getHeader(const Http::RequestOrResponseHeaderMap &header_map,
5656
Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
5757
: headername_(extractor.header()), body_(extractor.has_body()),
5858
group_(extractor.subgroup()),
59-
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())) {
59+
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())),
60+
replacement_text_(extractor.has_replacement_text() ? std::make_optional(extractor.replacement_text().value()) : std::nullopt),
61+
mode_(extractor.mode()) {
6062
// mark count == number of sub groups, and we need to add one for match number
6163
// 0 so we test for < instead of <= see:
6264
// http://www.cplusplus.com/reference/regex/basic_regex/mark_count/
@@ -65,6 +67,26 @@ Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
6567
fmt::format("group {} requested for regex with only {} sub groups",
6668
group_, extract_regex_.mark_count()));
6769
}
70+
71+
switch (mode_) {
72+
case ExtractionApi::EXTRACT:
73+
break;
74+
case ExtractionApi::SINGLE_REPLACE:
75+
if (!replacement_text_.has_value()) {
76+
throw EnvoyException("SINGLE_REPLACE mode set but no replacement text provided");
77+
}
78+
break;
79+
case ExtractionApi::REPLACE_ALL:
80+
if (!replacement_text_.has_value()) {
81+
throw EnvoyException("REPLACE_ALL mode set but no replacement text provided");
82+
}
83+
if (group_ != 0) {
84+
throw EnvoyException("REPLACE_ALL mode set but subgroup is not 0");
85+
}
86+
break;
87+
default:
88+
throw EnvoyException("Unknown mode");
89+
}
6890
}
6991

7092
absl::string_view
@@ -84,6 +106,37 @@ Extractor::extract(Http::StreamFilterCallbacks &callbacks,
84106
}
85107
}
86108

109+
std::string
110+
Extractor::extractDestructive(Http::StreamFilterCallbacks &callbacks,
111+
const Http::RequestOrResponseHeaderMap &header_map,
112+
GetBodyFunc &body) const {
113+
// determines which destructive extraction function to call based on the mode
114+
auto extractFunc = [&](Http::StreamFilterCallbacks& callbacks, absl::string_view sv) {
115+
switch (mode_) {
116+
case ExtractionApi::SINGLE_REPLACE:
117+
return replaceIndividualValue(callbacks, sv);
118+
case ExtractionApi::REPLACE_ALL:
119+
return replaceAllValues(callbacks, sv);
120+
default:
121+
// Handle unknown mode
122+
throw EnvoyException("Cannot use extractDestructive with unsupported mode");
123+
}
124+
};
125+
126+
if (body_) {
127+
const std::string &string_body = body();
128+
absl::string_view sv(string_body);
129+
return extractFunc(callbacks, sv);
130+
} else {
131+
const Http::HeaderMap::GetResult header_entries = getHeader(header_map, headername_);
132+
if (header_entries.empty()) {
133+
return "";
134+
}
135+
const auto &header_value = header_entries[0]->value().getStringView();
136+
return extractFunc(callbacks, header_value);
137+
}
138+
}
139+
87140
absl::string_view
88141
Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
89142
absl::string_view value) const {
@@ -105,6 +158,63 @@ Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
105158
return "";
106159
}
107160

161+
// Match a regex against the input value and replace the matched subgroup with the replacement_text_ value
162+
std::string
163+
Extractor::replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
164+
absl::string_view value) const {
165+
std::match_results<absl::string_view::const_iterator> regex_result;
166+
167+
// if there are no matches, return the original input value
168+
if (!std::regex_search(value.begin(), value.end(), regex_result, extract_regex_)) {
169+
ENVOY_STREAM_LOG(debug, "replaceIndividualValue: extractor regex did not match input. Returning input", callbacks);
170+
return std::string(value.begin(), value.end());
171+
}
172+
173+
// if the subgroup specified is greater than the number of subgroups in the regex, return the original input value
174+
if (group_ >= regex_result.size()) {
175+
// this should never happen as we test this in the ctor.
176+
ASSERT("no such group in the regex");
177+
ENVOY_STREAM_LOG(debug, "replaceIndividualValue: invalid group specified for regex. Returning input", callbacks);
178+
return std::string(value.begin(), value.end());
179+
}
180+
181+
// if the regex doesn't match the entire input value, return the original input value
182+
if (regex_result[0].length() != long(value.length())) {
183+
ENVOY_STREAM_LOG(debug, "replaceIndividualValue: Regex did not match entire input value. This is not allowed in SINGLE_REPLACE mode. Returning input", callbacks);
184+
return std::string(value.begin(), value.end());
185+
}
186+
187+
// Create a new string with the maximum possible length after replacement
188+
auto max_possible_length = value.length() + replacement_text_.value().length();
189+
std::string replaced;
190+
replaced.reserve(max_possible_length);
191+
192+
auto subgroup_start = regex_result[group_].first;
193+
auto subgroup_end = regex_result[group_].second;
194+
195+
// Copy the initial part of the string until the match
196+
replaced.assign(value.begin(), subgroup_start);
197+
198+
// Append the replacement text
199+
replaced += replacement_text_.value();
200+
201+
// Append the remaining part of the string after the match
202+
replaced.append(subgroup_end, value.end());
203+
204+
return replaced;
205+
}
206+
207+
// Match a regex against the input value and replace all instances of the regex with the replacement_text_ value
208+
std::string
209+
Extractor::replaceAllValues(Http::StreamFilterCallbacks&,
210+
absl::string_view value) const {
211+
std::string input(value.begin(), value.end());
212+
std::string replaced;
213+
214+
// replace all instances of the regex in the input value with the replacement_text_ value
215+
return std::regex_replace(input, extract_regex_, replacement_text_.value(), std::regex_constants::match_not_null);
216+
}
217+
108218
// A TransformerInstance is constructed by the InjaTransformer constructor at config time
109219
// on the main thread. It access thread-local storage which is populated during the
110220
// InjaTransformer::transform method call, which happens on the request path on any
@@ -181,6 +291,11 @@ json TransformerInstance::extracted_callback(const inja::Arguments &args) const
181291
if (value_it != ctx.extractions_->end()) {
182292
return value_it->second;
183293
}
294+
295+
const auto destructive_value_it = ctx.destructive_extractions_->find(name);
296+
if (destructive_value_it != ctx.destructive_extractions_->end()) {
297+
return destructive_value_it->second;
298+
}
184299
return "";
185300
}
186301

@@ -546,26 +661,70 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
546661
}
547662
// get the extractions
548663
std::unordered_map<std::string, absl::string_view> extractions;
664+
std::unordered_map<std::string, std::string> destructive_extractions;
665+
549666
if (advanced_templates_) {
550-
extractions.reserve(extractors_.size());
667+
auto extractions_size = 0;
668+
auto destructive_extractions_size = 0;
669+
for (const auto &named_extractor : extractors_) {
670+
switch(named_extractor.second.mode()) {
671+
case ExtractionApi::REPLACE_ALL:
672+
case ExtractionApi::SINGLE_REPLACE: {
673+
destructive_extractions_size++;
674+
break;
675+
}
676+
case ExtractionApi::EXTRACT: {
677+
extractions_size++;
678+
break;
679+
}
680+
default: {
681+
PANIC_DUE_TO_CORRUPT_ENUM
682+
}
683+
}
684+
}
685+
686+
extractions.reserve(extractions_size);
687+
destructive_extractions.reserve(destructive_extractions_size);
551688
}
552689

553690
for (const auto &named_extractor : extractors_) {
554691
const std::string &name = named_extractor.first;
555-
if (advanced_templates_) {
556-
extractions[name] =
557-
named_extractor.second.extract(callbacks, header_map, get_body);
558-
} else {
559-
absl::string_view name_to_split = name;
560-
json *current = &json_body;
692+
693+
// prepare variables for non-advanced_templates_ scenario
694+
absl::string_view name_to_split;
695+
json* current = nullptr;
696+
if (!advanced_templates_) {
697+
name_to_split = name;
698+
current = &json_body;
561699
for (size_t pos = name_to_split.find("."); pos != std::string::npos;
562700
pos = name_to_split.find(".")) {
563701
auto &&field_name = name_to_split.substr(0, pos);
564702
current = &(*current)[std::string(field_name)];
565703
name_to_split = name_to_split.substr(pos + 1);
566704
}
567-
(*current)[std::string(name_to_split)] =
568-
named_extractor.second.extract(callbacks, header_map, get_body);
705+
}
706+
707+
switch(named_extractor.second.mode()) {
708+
case ExtractionApi::REPLACE_ALL:
709+
case ExtractionApi::SINGLE_REPLACE: {
710+
if (advanced_templates_) {
711+
destructive_extractions[name] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
712+
} else {
713+
(*current)[std::string(name_to_split)] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
714+
}
715+
break;
716+
}
717+
case ExtractionApi::EXTRACT: {
718+
if (advanced_templates_) {
719+
extractions[name] = named_extractor.second.extract(callbacks, header_map, get_body);
720+
} else {
721+
(*current)[std::string(name_to_split)] = named_extractor.second.extract(callbacks, header_map, get_body);
722+
}
723+
break;
724+
}
725+
default: {
726+
PANIC_DUE_TO_CORRUPT_ENUM
727+
}
569728
}
570729
}
571730

@@ -584,6 +743,7 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
584743
typed_tls_data.request_headers_ = request_headers;
585744
typed_tls_data.body_ = &get_body;
586745
typed_tls_data.extractions_ = &extractions;
746+
typed_tls_data.destructive_extractions_ = &destructive_extractions;
587747
typed_tls_data.context_ = &json_body;
588748
typed_tls_data.environ_ = &environ_;
589749
typed_tls_data.cluster_metadata_ = cluster_metadata;

source/extensions/filters/http/transformation/inja_transformer.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ namespace HttpFilters {
2525
namespace Transformation {
2626

2727
using GetBodyFunc = std::function<const std::string &()>;
28+
using ExtractionApi = envoy::api::v2::filter::http::Extraction;
2829

2930
struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
3031
public:
@@ -33,6 +34,7 @@ struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
3334
const Http::RequestOrResponseHeaderMap *header_map_;
3435
const Http::RequestHeaderMap *request_headers_;
3536
const GetBodyFunc *body_;
37+
const std::unordered_map<std::string, std::string> *destructive_extractions_;
3638
const std::unordered_map<std::string, absl::string_view> *extractions_;
3739
const nlohmann::json *context_;
3840
const std::unordered_map<std::string, std::string> *environ_;
@@ -82,15 +84,24 @@ class Extractor : Logger::Loggable<Logger::Id::filter> {
8284
absl::string_view extract(Http::StreamFilterCallbacks &callbacks,
8385
const Http::RequestOrResponseHeaderMap &header_map,
8486
GetBodyFunc &body) const;
85-
87+
std::string extractDestructive(Http::StreamFilterCallbacks &callbacks,
88+
const Http::RequestOrResponseHeaderMap &header_map,
89+
GetBodyFunc &body) const;
90+
// Returns the extraction mode this extractor was configured with.
// Returned by value: the mode is a plain enum, so a reference buys nothing.
ExtractionApi::Mode mode() const { return mode_; }
8691
private:
8792
absl::string_view extractValue(Http::StreamFilterCallbacks &callbacks,
8893
absl::string_view value) const;
94+
std::string replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
95+
absl::string_view value) const;
96+
std::string replaceAllValues(Http::StreamFilterCallbacks &callbacks,
97+
absl::string_view value) const;
8998

9099
const Http::LowerCaseString headername_;
91100
const bool body_;
92101
const unsigned int group_;
93102
const std::regex extract_regex_;
103+
const std::optional<const std::string> replacement_text_;
104+
const ExtractionApi::Mode mode_;
94105
};
95106

96107
class InjaTransformer : public Transformer {

test/extensions/filters/http/transformation/BUILD

+15
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ envoy_gloo_cc_test(
2828
],
2929
)
3030

31+
# Unit tests for the regex replace modes of the transformation filter extractors.
envoy_gloo_cc_test(
    name = "inja_transformer_replace_test",
    srcs = ["inja_transformer_replace_test.cc"],
    repository = "@envoy",
    deps = [
        "//source/extensions/filters/http/transformation:inja_transformer_lib",
        "@envoy//source/common/common:base64_lib",
        "@envoy//source/common/common:random_generator_lib",
        "@envoy//test/mocks/http:http_mocks",
        "@envoy//test/mocks/server:server_mocks",
        "@envoy//test/mocks/upstream:upstream_mocks",
        "@envoy//test/test_common:environment_lib",
    ],
)
45+
3146
envoy_cc_test_binary(
3247
name = "inja_transformer_speed_test",
3348
srcs = ["inja_transformer_speed_test.cc"],

0 commit comments

Comments
 (0)