diff --git a/CMakeLists.txt b/CMakeLists.txt index 270abcf5..89c9ef45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,9 @@ message(STATUS "Found fmt ${fmt_VERSION}.") find_package(Microsoft.GSL 4.0.0 REQUIRED) message(STATUS "Found Microsoft.GSL ${Microsoft.GSL_VERSION}.") +find_package(ystdlib 0.1.0 REQUIRED) +message(STATUS "Found ystdlib ${ystdlib_VERSION}.") + if(log_surgeon_ENABLE_TESTS) find_package(Catch2 3.8.1 REQUIRED) message(STATUS "Found Catch2 ${Catch2_VERSION}.") @@ -137,6 +140,7 @@ target_link_libraries( PUBLIC fmt::fmt Microsoft.GSL::GSL + ystdlib::error_handling ) target_include_directories( diff --git a/README.md b/README.md index 818e2c57..87c92371 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ while (false == parser.done()) { if (ErrorCode err{parser.parse_next_event()}; ErrorCode::Success != err) { throw runtime_error("Parsing Failed"); } + LogEventView const& event{parser.get_log_parser().get_log_event_view()}; // Get and print the timestamp Token* timestamp{event.get_timestamp()}; @@ -63,7 +64,7 @@ while (false == parser.done()) { } // Get and print the log-level - auto const& loglevels = event.get_variables(*loglevel_id); + auto const& loglevels{event.get_variables(*loglevel_id)}; if (false == loglevels.empty()) { // In case there are multiple matches, just get the first one cout << "loglevel:" << loglevels[0]->to_string_view() << endl; @@ -72,8 +73,7 @@ while (false == parser.done()) { // Other analysis... 
// Print the entire event - LogEventView const& event = parser.get_log_parser().get_log_event_view(); - cout << event->to_string() << endl; + cout << event.to_string() << endl; } ``` @@ -91,6 +91,7 @@ Requirements: * [GSL] >= 4.0.0 * [Task] >= 3.38 * [uv] >= 0.7.10 +* [ystdlib-cpp] >= 0.1.0 To build and install the project to `$HOME/.local`: @@ -193,3 +194,4 @@ The following are issues we're aware of and working on: [GSL]: https://github.com/microsoft/GSL [Task]: https://taskfile.dev/ [uv]: https://docs.astral.sh/uv +[ystdlib-cpp]: https://github.com/y-scope/ystdlib-cpp diff --git a/cmake/log_surgeon-config.cmake.in b/cmake/log_surgeon-config.cmake.in index 2a838746..31f11c44 100644 --- a/cmake/log_surgeon-config.cmake.in +++ b/cmake/log_surgeon-config.cmake.in @@ -10,6 +10,10 @@ if(@Microsoft.GSL_FOUND@) find_dependency(Microsoft.GSL) endif() +if(@ystdlib_FOUND@) + find_dependency(ystdlib) +endif() + set_and_check(log_surgeon_INCLUDE_DIR "@PACKAGE_LOG_SURGEON_INSTALL_INCLUDE_DIR@") check_required_components(log_surgeon) diff --git a/src/.clang-format b/src/.clang-format index d371375d..b7953357 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -9,7 +9,7 @@ IncludeCategories: Priority: 4 # External library headers. Update when adding new libraries. 
- - Regex: "^<(fmt|gsl)" + - Regex: "^<(fmt|gsl|ystdlib)" Priority: 3 # C system headers diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index 35392451..730f90b8 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -1,12 +1,20 @@ #include "LogEvent.hpp" +#include #include +#include #include #include +#include +#include #include #include +#include +#include + #include +#include #include #include #include @@ -57,51 +65,104 @@ auto LogEventView::get_logtype() const -> std::string { auto token_view{m_log_output_buffer->get_mutable_token(i)}; auto const rule_id{token_view.get_type_ids()->at(0)}; if (static_cast(SymbolId::TokenUncaughtString) == rule_id) { - logtype += token_view.to_string_view(); + logtype.append(token_view.to_string_view()); + continue; + } + + bool is_first_token{}; + if (m_log_output_buffer->has_header()) { + is_first_token = 0 == i; } else { - bool is_first_token; - if (m_log_output_buffer->has_header()) { - is_first_token = 0 == i; - } else { - is_first_token = 1 == i; - } - if (static_cast(SymbolId::TokenNewline) != rule_id && false == is_first_token) - { - logtype += token_view.release_delimiter(); - } - auto const& optional_captures{m_log_parser.m_lexer.get_captures_from_rule_id(rule_id)}; - if (optional_captures.has_value()) { - auto capture_view{token_view}; - auto const& captures{optional_captures.value()}; - for (auto const capture : captures) { - auto const [reg_start_id, reg_end_id]{ - m_log_parser.m_lexer.get_reg_ids_from_capture(capture) - }; - auto const start_positions{ - capture_view.get_reversed_reg_positions(reg_start_id) - }; - auto const end_positions{capture_view.get_reversed_reg_positions(reg_end_id)}; - - auto const& capture_name{capture->get_name()}; - if (false == start_positions.empty() && -1 < start_positions[0] - && false == end_positions.empty() && -1 < end_positions[0]) - { - capture_view.set_end_pos(start_positions[0]); - logtype.append(capture_view.to_string_view()); 
- logtype.append("<" + capture_name + ">"); - capture_view.set_start_pos(end_positions[0]); - } - } - capture_view.set_end_pos(token_view.get_end_pos()); - logtype.append(capture_view.to_string_view()); - } else { - logtype += "<" + m_log_parser.get_id_symbol(rule_id) + ">"; + is_first_token = 1 == i; + } + if (static_cast(SymbolId::TokenNewline) != rule_id && false == is_first_token) { + logtype += token_view.release_delimiter(); + } + + auto const matches{get_capture_matches(token_view)}; + if (matches.has_error()) { + logtype.append("<" + m_log_parser.get_id_symbol(rule_id) + ">"); + continue; + } + auto prev_end_pos{token_view.get_start_pos()}; + for (auto const& match : matches.value()) { + if (match.m_leaf) { + logtype.append( + token_view.get_sub_token(prev_end_pos, match.m_pos.m_start).to_string_view() + ); + logtype.append("<" + match.m_capture->get_name() + ">"); + prev_end_pos = match.m_pos.m_end; } } + logtype.append( + token_view.get_sub_token(prev_end_pos, token_view.get_end_pos()).to_string_view() + ); } return logtype; } +auto LogEventView::get_capture_matches(Token const& root_var) const + -> ystdlib::error_handling::Result> { + auto captures{ + get_log_parser().m_lexer.get_captures_from_rule_id(root_var.get_type_ids()->at(0)) + }; + if (false == captures.has_value()) { + return LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroups}; + } + + auto cmp{[](Token::CaptureMatch const& a, Token::CaptureMatch const& b) -> bool { + if (a.m_pos.m_start != b.m_pos.m_start) { + return a.m_pos.m_start < b.m_pos.m_start; + } + return a.m_pos.m_end > b.m_pos.m_end; + }}; + std::set ordered_matches; + for (auto const* const capture : captures.value()) { + auto position{get_capture_position(root_var, capture)}; + if (position.has_error()) { + if (LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroupMatch} == position.error()) { + continue; + } + return position.error(); + } + ordered_matches.emplace(capture, position.value(), true); + } + if 
(ordered_matches.empty()) { + return {{}}; + } + + std::vector matches; + matches.reserve(ordered_matches.size()); + auto const last_match{std::prev(ordered_matches.end())}; + for (auto match{ordered_matches.begin()}; match != last_match; ++match) { + auto next_match{std::next(match)}; + auto leaf{false}; + if (match->m_pos.m_end <= next_match->m_pos.m_start) { + leaf = true; + } + matches.emplace_back(match->m_capture, match->m_pos, leaf); + } + matches.emplace_back(last_match->m_capture, last_match->m_pos, true); + return matches; +} + +auto LogEventView::get_capture_position( + Token const& root_var, + finite_automata::Capture const* const& capture +) const -> ystdlib::error_handling::Result { + auto const [start_reg_id, end_reg_id]{ + get_log_parser().m_lexer.get_reg_ids_from_capture(capture) + }; + auto const start_positions{root_var.get_reversed_reg_positions(start_reg_id)}; + auto const end_positions{root_var.get_reversed_reg_positions(end_reg_id)}; + if (start_positions.empty() || 0 > start_positions[0] || end_positions.empty() + || 0 > end_positions[0]) + { + return LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroupMatch}; + } + return {start_positions[0], end_positions[0]}; +} + LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} { set_multiline(src.is_multiline()); m_log_output_buffer->set_has_header(src.m_log_output_buffer->has_header()); @@ -147,3 +208,24 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} } } } // namespace log_surgeon + +using log_surgeon::LogEventErrorCodeEnum; + +using LogEventErrorCategory = ystdlib::error_handling::ErrorCategory; + +template <> +auto LogEventErrorCategory::name() const noexcept -> char const* { + return "log_surgeon::LogEvent"; +} + +template <> +auto LogEventErrorCategory::message(LogEventErrorCodeEnum error_enum) const -> std::string { + switch (error_enum) { + case LogEventErrorCodeEnum::NoCaptureGroups: + return "LogEvent NoCaptureGroups"; + 
case LogEventErrorCodeEnum::NoCaptureGroupMatch: + return "LogEvent NoCaptureGroupMatch"; + default: + return "Unrecognized LogEventErrorCode"; + } +} diff --git a/src/log_surgeon/LogEvent.hpp b/src/log_surgeon/LogEvent.hpp index d2f87729..e012f882 100644 --- a/src/log_surgeon/LogEvent.hpp +++ b/src/log_surgeon/LogEvent.hpp @@ -1,10 +1,17 @@ #ifndef LOG_SURGEON_LOG_EVENT_HPP #define LOG_SURGEON_LOG_EVENT_HPP +#include +#include #include +#include #include #include +#include +#include + +#include #include #include @@ -93,13 +100,16 @@ class LogEventView { [[nodiscard]] auto to_string() const -> std::string; /** - * Constructs a user friendly/readable representation of the log event's - * logtype. A logtype is essentially the static text of a log event with the - * variable components replaced with their name. Therefore, two separate log - * events from the same logging source code may have the same logtype. + * Constructs a user friendly/readable representation of the log event's logtype. A logtype is + * essentially the static text of a log event with the variable components replaced with their + * name. Therefore, two separate log events from the same logging source code may have the same + * logtype. + * + * If a schema variable may contain capture groups, any leaf capture group matches will be + * replaced with their name while all other text is treated as static text. * @return The logtype of the log. */ - auto get_logtype() const -> std::string; + [[nodiscard]] auto get_logtype() const -> std::string; /** * Adds a Token to the array of tokens of a particular token type. @@ -115,6 +125,37 @@ class LogEventView { m_log_var_occurrences[token_type_id].push_back(token_ptr); } + /** + * Retrieves the position of match of type `capture` within `root_var`. `root_var` must be the + * root parent variable containing `capture`. + * @param root_var The parent log surgeon schema variable for `capture`. + * @param capture The capture group type. 
+ * @return A result containing a `Token::CaptureMatchPosition` on success, or an error code indicating the + * failure: + * - LogEventErrorCodeEnum::NoCaptureGroupMatch if `root_var` contains no valid match positions + * for `capture`. + */ + [[nodiscard]] auto get_capture_position( + Token const& root_var, + finite_automata::Capture const* const& capture + ) const -> ystdlib::error_handling::Result; + + /** + * Returns the capture group matches within `root_var` sorted by their appearance within the + * text, with parent capture groups appearing before their children. More formally, they are + * sorted by increasing start position and then by decreasing end position. + * + * Since capture groups can only overlap when nested and cannot span across the boundary of + * another group, a capture group is a leaf if its end position is less than or equal to the start position of + * the next capture group (or it is the last capture group). + * @param root_var The root variable to get the capture groups from. + * @return A result containing the sorted capture group matches (empty if no matches were + * found), or an error code indicating the failure: + * - LogEventErrorCodeEnum::NoCaptureGroups if no capture groups exist for `root_var`. 
+ */ + [[nodiscard]] auto get_capture_matches(log_surgeon::Token const& root_var) const + -> ystdlib::error_handling::Result>; + // TODO: have LogParser own the output buffer as a LogEventView is already // tied to a single log parser std::unique_ptr m_log_output_buffer; @@ -143,6 +184,15 @@ class LogEvent : public LogEventView { private: std::vector m_buffer; }; + +enum class LogEventErrorCodeEnum : uint8_t { + NoCaptureGroups, + NoCaptureGroupMatch +}; + +using LogEventErrorCode = ystdlib::error_handling::ErrorCode; } // namespace log_surgeon +YSTDLIB_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(log_surgeon::LogEventErrorCodeEnum); + #endif // LOG_SURGEON_LOG_EVENT_HPP diff --git a/src/log_surgeon/Token.hpp b/src/log_surgeon/Token.hpp index 6bc68701..f9503afa 100644 --- a/src/log_surgeon/Token.hpp +++ b/src/log_surgeon/Token.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -16,6 +17,35 @@ namespace log_surgeon { class Token { public: + using position_t = finite_automata::PrefixTree::position_t; + + /** + * Stores the position of a capture group's match within a `Token`. + */ + struct CaptureMatchPosition { + CaptureMatchPosition(position_t start, position_t end) : m_start(start), m_end(end) {} + + position_t m_start; + position_t m_end; + }; + + /** + * Stores information on a capture group's match within a `Token`. 
+ * - `m_capture`: reference to the type of capture + * - `m_pos`: the position within the `Token` + * - `m_leaf`: true if the match contains no nested capture group matches + */ + struct CaptureMatch { + CaptureMatch(finite_automata::Capture const* capture, CaptureMatchPosition pos, bool leaf) + : m_capture(capture), + m_pos(pos), + m_leaf(leaf) {} + + finite_automata::Capture const* m_capture; + CaptureMatchPosition m_pos; + bool m_leaf; + }; + Token() = default; Token(size_t start_pos, @@ -75,7 +105,7 @@ class Token { [[nodiscard]] auto get_length() const -> size_t; [[nodiscard]] auto get_reversed_reg_positions(reg_id_t const reg_id) const - -> std::vector { + -> std::vector { return m_reg_handler.get_reversed_positions(reg_id); } diff --git a/taskfiles/deps.yaml b/taskfiles/deps.yaml index 33ffc438..f1bf8b7a 100644 --- a/taskfiles/deps.yaml +++ b/taskfiles/deps.yaml @@ -11,6 +11,9 @@ vars: # This path must be kept in-sync with its usage in CMakeLists.txt and examples/CMakeLists.txt. 
G_DEPS_CMAKE_SETTINGS_DIR: "{{.G_DEPS_DIR}}/cmake-settings" + # Library names + G_BOOST_LIB_NAME: "Boost" + tasks: install-all: run: "once" @@ -27,6 +30,20 @@ tasks: - "install-catch2" - "install-fmt" - "install-microsoft.gsl" + - "install-ystdlib" + + install-boost: + internal: true + run: "once" + cmds: + - task: "utils:boost:download-and-install" + vars: + CMAKE_SETTINGS_DIR: "{{.G_DEPS_CMAKE_SETTINGS_DIR}}" + FILE_SHA256: "d6c69e4459eb5d6ec208250291221e7ff4a2affde9af6e49c9303b89c687461f" + URL: "https://github.com/boostorg/boost/releases/download/boost-1.87.0\ + /boost-1.87.0-b2-nodocs.tar.gz" + TARGETS: ["headers"] + WORK_DIR: "{{.G_DEPS_DIR}}" install-catch2: internal: true @@ -76,3 +93,22 @@ tasks: TAR_SHA256: "f0e32cb10654fea91ad56bde89170d78cfbf4363ee0b01d8f097de2ba49f6ce9" TAR_URL: "https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.tar.gz" WORK_DIR: "{{.G_DEPS_DIR}}" + + install-ystdlib: + internal: true + run: "once" + deps: + - "install-boost" + cmds: + - task: "utils:cmake:install-remote-tar" + vars: + CMAKE_GEN_ARGS: + - "-C {{.G_DEPS_CMAKE_SETTINGS_DIR}}/{{.G_BOOST_LIB_NAME}}.cmake" + - "-DCMAKE_BUILD_TYPE=Release" + - "-DCMAKE_INSTALL_MESSAGE=LAZY" + - "-Dystdlib_BUILD_TESTING=OFF" + CMAKE_PACKAGE_NAME: "ystdlib" + CMAKE_SETTINGS_DIR: "{{.G_DEPS_CMAKE_SETTINGS_DIR}}" + TAR_SHA256: "379c04c86715f39cffa09bb3ebc22b4c45af9a63ea3efa2081d210289fcd2af6" + TAR_URL: "https://github.com/y-scope/ystdlib-cpp/archive/c03806a.tar.gz" + WORK_DIR: "{{.G_DEPS_DIR}}" diff --git a/tests/test-buffer-parser.cpp b/tests/test-buffer-parser.cpp index fbe426b9..c86b5c5a 100644 --- a/tests/test-buffer-parser.cpp +++ b/tests/test-buffer-parser.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -22,7 +23,6 @@ using log_surgeon::finite_automata::PrefixTree; using log_surgeon::rule_id_t; using log_surgeon::Schema; using log_surgeon::SymbolId; -using std::pair; using std::string; using std::string_view; using std::unordered_map; @@ -34,10 +34,24 @@ struct 
CapturePositions { vector m_end_positions; }; +struct ExpectedCaptureMatch { + ExpectedCaptureMatch(string name, CapturePositions pos, bool leaf) + : m_name(std::move(name)), + m_pos(std::move(pos)), + m_leaf(leaf) {} + + ExpectedCaptureMatch(string name, CapturePositions pos) + : ExpectedCaptureMatch(std::move(name), std::move(pos), true) {} + + string m_name; + CapturePositions m_pos; + bool m_leaf; +}; + struct ExpectedToken { string_view m_raw_string; string m_type; - vector> m_captures; + vector m_captures; }; struct ExpectedEvent { @@ -128,10 +142,11 @@ auto parse_and_validate( return; } + // Testing direct usage of fields to inspect capture group matches. REQUIRE(expected_captures.size() == optional_captures.value().size()); for (uint32_t j{0}; j < optional_captures.value().size(); j++) { - auto const capture{optional_captures.value()[j]}; - auto const [expected_name, expected_positions]{expected_captures[j]}; + auto const* capture{optional_captures.value()[j]}; + auto const [expected_name, expected_positions, unused]{expected_captures[j]}; REQUIRE(expected_name == capture->get_name()); auto const [start_reg_id, end_reg_id]{lexer.get_reg_ids_from_capture(capture)}; auto actual_start_positions{token.get_reversed_reg_positions(start_reg_id)}; @@ -145,6 +160,51 @@ auto parse_and_validate( REQUIRE(expected_start_positions == actual_start_positions); REQUIRE(expected_end_positions == actual_end_positions); } + + // Testing event API for capture group matches. 
+ auto sorted_expected_captures{expected_tokens[i].m_captures}; + std::sort( + sorted_expected_captures.begin(), + sorted_expected_captures.end(), + [](ExpectedCaptureMatch const& a, ExpectedCaptureMatch const& b) -> bool { + if (a.m_pos.m_start_positions.empty() + || 0 > a.m_pos.m_start_positions[0]) { + return false; + } + if (b.m_pos.m_start_positions.empty() + || 0 > b.m_pos.m_start_positions[0]) { + return true; + } + if (a.m_pos.m_start_positions[0] != b.m_pos.m_start_positions[0]) { + return a.m_pos.m_start_positions[0] < b.m_pos.m_start_positions[0]; + } + return a.m_pos.m_end_positions[0] > b.m_pos.m_end_positions[0]; + } + ); + auto matches{event.get_capture_matches(token)}; + REQUIRE(false == matches.has_error()); + size_t captures_with_no_match{0}; + for (size_t j{0}; j < sorted_expected_captures.size(); ++j) { + auto const [expected_name, expected_positions, expected_leaf]{ + sorted_expected_captures[j] + }; + auto const [expected_start_positions, expected_end_positions]{ + expected_positions + }; + if (expected_start_positions.empty() || 0 > expected_start_positions[0] + || expected_end_positions.empty() || 0 > expected_end_positions[0]) + { + ++captures_with_no_match; + continue; + } + auto const capture{matches.value().at(j - captures_with_no_match)}; + REQUIRE(expected_name == capture.m_capture->get_name()); + REQUIRE(expected_start_positions[0] == capture.m_pos.m_start); + REQUIRE(expected_end_positions[0] == capture.m_pos.m_end); + REQUIRE(expected_leaf == capture.m_leaf); + } + REQUIRE(sorted_expected_captures.size() - captures_with_no_match + == matches.value().size()); } } } @@ -356,6 +416,63 @@ TEST_CASE("single_line_with_optional_capture", "[BufferParser]") { parse_and_validate(buffer_parser, cInput, {expected_event}); } +/** + * @ingroup test_buffer_parser_capture + * @brief Validates tokenization behavior when using nested capture groups in variable schemas. 
+ * + * This test is an extension of `single_line_with_capture` that verifies the correct behaviour when + * capture groups are nested. + * + * ### Schema Definition + * @code + * delimiters: \n\r[:, + * myVar:userID=(?abc_(?\d{3})) + * @endcode + * + * ### Test Input + * @code + * "userID=abc_123 userID=abc_456" + * @endcode + * + * ### Expected Logtype + * @code + * "userID= userID=" + * @endcode + * + * ### Expected Tokenization + * @code + * "userID=abc_123" -> "myVar" with "abc_123" -> "full", "123" -> "uid" + * " userID=abc_456" -> "myVar" with "abc_456" -> "full", "456" -> "uid" + * @endcode + */ +TEST_CASE("single_line_with_nested_capture", "[BufferParser]") { + constexpr string_view cDelimitersSchema{R"(delimiters: \n\r[:,)"}; + constexpr string_view cVarSchema{R"(myVar:userID=(?abc_(?\d{3})))"}; + constexpr string_view cInput{"userID=abc_123 userID=abc_456"}; + + ExpectedEvent const expected_event{ + .m_logtype{R"(userID=abc_ userID=abc_)"}, + .m_timestamp_raw{""}, + .m_tokens{ + {{"userID=abc_123", + "myVar", + {{{"uid", {.m_start_positions{11}, .m_end_positions{14}}}, + {"full", {.m_start_positions{7}, .m_end_positions{14}}, false}}}}, + {" userID=abc_456", + "myVar", + {{{"uid", {.m_start_positions{26}, .m_end_positions{29}}}, + {"full", {.m_start_positions{22}, .m_end_positions{29}}, false}}}}} + } + }; + + Schema schema; + schema.add_delimiters(cDelimitersSchema); + schema.add_variable(cVarSchema, -1); + BufferParser buffer_parser(std::move(schema.release_schema_ast_ptr())); + + parse_and_validate(buffer_parser, cInput, {expected_event}); +} + /** * @defgroup test_buffer_parser_default_schema Buffer parser using the default schema. * @brief Tests for CLP's default variable schema: timestamp, int, float, hex, key-value pairs,