Skip to content

Commit 4cebdaa

Browse files
committed
Merge branch develop to master
2 parents 7729ade + ae8c8ee commit 4cebdaa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+1534
-245
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22

33
This is a list of notable changes to Hyperscan, in reverse chronological order.
44

5+
## [5.2.0] 2019-07-12
6+
- Literal API: add new API `hs_compile_lit()` and `hs_compile_lit_multi()` to
7+
process pure literal rule sets. The 2 literal APIs treat each expression text
8+
in a literal sense without recognizing any regular grammars.
9+
- Logical combination: add support for purely negative combinations, which
10+
report a match at EOD when no sub-expressions have matched.
11+
- Windows porting: support shared library (DLL) on Windows with available tools
12+
hscheck, hsbench and hsdump.
13+
- Bugfix for issue #148: fix uninitialized use of `scatter_unit_uX` due to
14+
padding.
15+
- Bugfix for issue #155: fix numerical result out of range error.
16+
- Bugfix for issue #165: avoid corruption of pending combination report in
17+
streaming mode.
18+
- Bugfix for issue #174: fix scratch free issue when memory allocation fails.
19+
520
## [5.1.1] 2019-04-03
621
- Add extra detection and handling when invalid rose programs are triggered.
722
- Bugfix for issue #136: fix CMake parsing of CPU architecture for GCC-9.

CMakeLists.txt

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11)
22
project (hyperscan C CXX)
33

44
set (HS_MAJOR_VERSION 5)
5-
set (HS_MINOR_VERSION 1)
6-
set (HS_PATCH_VERSION 1)
5+
set (HS_MINOR_VERSION 2)
6+
set (HS_PATCH_VERSION 0)
77
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
88

99
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -31,6 +31,7 @@ else()
3131
endif()
3232

3333
if(CMAKE_BUILD_TYPE MATCHES RELEASE|RELWITHDEBINFO|MINSIZEREL)
34+
message(STATUS "using release build")
3435
set(RELEASE_BUILD TRUE)
3536
else()
3637
set(RELEASE_BUILD FALSE)
@@ -109,11 +110,9 @@ option(BUILD_SHARED_LIBS "Build shared libs instead of static" OFF)
109110
option(BUILD_STATIC_AND_SHARED "Build shared libs as well as static" OFF)
110111

111112
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
112-
if (WIN32)
113-
message(FATAL_ERROR "Windows DLLs currently not supported")
114-
else()
115113
message(STATUS "Building shared libraries")
116-
endif()
114+
else()
115+
message(STATUS "Building static libraries")
117116
endif()
118117

119118
if (NOT BUILD_SHARED_LIBS)
@@ -151,9 +150,6 @@ if(MSVC OR MSVC_IDE)
151150
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark")
152151
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
153152
else()
154-
# todo: change these as required
155-
set(ARCH_C_FLAGS "/arch:AVX2")
156-
set(ARCH_CXX_FLAGS "/arch:AVX2")
157153
set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS")
158154
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}")
159155
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD")
@@ -1298,12 +1294,14 @@ endif()
12981294
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
12991295
if (NOT FAT_RUNTIME)
13001296
add_library(hs_runtime_shared SHARED src/hs_version.c
1301-
src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec_shared>)
1297+
src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec_shared>
1298+
hs_runtime.def)
13021299
else()
13031300
add_library(hs_runtime_shared SHARED src/hs_version.c
13041301
src/hs_valid_platform.c
13051302
$<TARGET_OBJECTS:hs_exec_common_shared>
1306-
${RUNTIME_SHLIBS})
1303+
${RUNTIME_SHLIBS}
1304+
hs_runtime.def)
13071305
endif()
13081306
set_target_properties(hs_runtime_shared PROPERTIES
13091307
VERSION ${LIB_VERSION}
@@ -1349,7 +1347,7 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
13491347
${RUNTIME_SHLIBS})
13501348
endif ()
13511349

1352-
add_library(hs_shared SHARED ${hs_shared_SRCS})
1350+
add_library(hs_shared SHARED ${hs_shared_SRCS} hs.def)
13531351

13541352
add_dependencies(hs_shared ragel_Parser)
13551353
set_target_properties(hs_shared PROPERTIES

chimera/ch_compile.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
322322
ch_misc_free(info);
323323

324324
u32 guardflags;
325-
guardflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH;
325+
guardflags = flags | HS_FLAG_PREFILTER;
326326
guard = isHyperscanSupported(pattern, guardflags, platform);
327327
} else {
328328
// We can't even prefilter this pattern, so we're dependent on Big Dumb

doc/dev-reference/compilation.rst

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,75 @@ version of Hyperscan used to scan with it.
5454
Hyperscan provides support for targeting a database at a particular CPU
5555
platform; see :ref:`instr_specialization` for details.
5656

57+
=====================
58+
Compile Pure Literals
59+
=====================
60+
61+
A pure literal is a special case of a regular expression. A character sequence is
62+
regarded as a pure literal if and only if each character is read and
63+
interpreted independently. No syntax association happens between any adjacent
64+
characters.
65+
66+
For example, given an expression written as :regexp:`/bc?/`, we could say it is
67+
a regular expression, meaning character ``b`` followed by nothing
68+
or by one character ``c``. On the other hand, we could also say it is a pure
69+
literal expression, with the meaning that this is a character sequence of 3-byte
70+
length, containing characters ``b``, ``c`` and ``?``. In regular case, the
71+
question mark character ``?`` has a particular syntax role called 0-1 quantifier,
72+
which has a syntax association with the character preceding it. Similar
73+
characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
74+
``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``.
75+
In the pure literal case, however, all these meta characters lose their special
76+
meanings and are treated as ordinary ASCII characters.
77+
78+
Hyperscan was initially designed to process common regular expressions. It is
79+
hence embedded with a complex parser to do comprehensive regular grammar
80+
interpretation. In particular, the identification of the above meta characters is the
81+
basic step for the interpretation of far more complex regular grammars.
82+
83+
However, in real cases, patterns may not always be regular expressions. They
84+
could just be pure literals. Problems arise if the pure literals contain
85+
regular meta characters. If fed directly into the traditional Hyperscan
86+
compile API, all these meta characters will be interpreted in predefined ways,
87+
which is unnecessary and produces entirely unexpected results. To avoid
88+
such misinterpretation by the traditional API, users have to preprocess these
89+
literal patterns by converting the meta characters into some other formats:
90+
either by adding a backslash ``\`` before certain meta characters, or by
91+
converting all the characters into a hexadecimal representation.
92+
93+
In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
94+
95+
#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
96+
database.
97+
98+
#. :c:func:`hs_compile_lit_multi`: compiles an array of pure literals into a
99+
pattern database. All of the supplied patterns will be scanned for
100+
concurrently at scan time, with user-supplied identifiers returned when they
101+
match.
102+
103+
These 2 APIs are designed for use cases where all patterns contained in the
104+
target rule set are pure literals. Users can pass the initial pure literal
105+
content directly into these APIs without worrying about writing regular meta
106+
characters in their patterns. No preprocessing work is needed any more.
107+
108+
For the new APIs, the ``length`` of each literal pattern is a newly added parameter.
109+
Hyperscan determines where each input expression ends from this explicitly
110+
supplied length, rather than by looking for a terminating ``\0`` character in
111+
the string.
112+
113+
Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_MULTILINE`,
114+
:c:member:`HS_FLAG_SINGLEMATCH`, :c:member:`HS_FLAG_SOM_LEFTMOST`.
115+
116+
.. note:: The literal compilation APIs do not support :ref:`extparam`. At
117+
runtime, the traditional runtime APIs can still be
118+
used to match pure literal patterns.
119+
120+
.. note:: If the target rule set contains at least one regular expression,
121+
please use traditional compile APIs :c:func:`hs_compile`,
122+
:c:func:`hs_compile_multi` and :c:func:`hs_compile_ext_multi`.
123+
The new literal APIs introduced here are designed for rule sets
124+
containing only pure literal expressions.
125+
57126
***************
58127
Pattern Support
59128
***************

hs.def

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; Hyperscan DLL export definitions
2+
3+
LIBRARY hs
4+
5+
EXPORTS
6+
hs_alloc_scratch
7+
hs_clone_scratch
8+
hs_close_stream
9+
hs_compile
10+
hs_compile_ext_multi
11+
hs_compile_multi
12+
hs_compress_stream
13+
hs_copy_stream
14+
hs_database_info
15+
hs_database_size
16+
hs_deserialize_database
17+
hs_deserialize_database_at
18+
hs_expand_stream
19+
hs_expression_ext_info
20+
hs_expression_info
21+
hs_free_compile_error
22+
hs_free_database
23+
hs_free_scratch
24+
hs_open_stream
25+
hs_populate_platform
26+
hs_reset_and_copy_stream
27+
hs_reset_and_expand_stream
28+
hs_reset_stream
29+
hs_scan
30+
hs_scan_stream
31+
hs_scan_vector
32+
hs_scratch_size
33+
hs_serialize_database
34+
hs_serialized_database_info
35+
hs_serialized_database_size
36+
hs_set_allocator
37+
hs_set_database_allocator
38+
hs_set_misc_allocator
39+
hs_set_scratch_allocator
40+
hs_set_stream_allocator
41+
hs_stream_size
42+
hs_valid_platform
43+
hs_version

hs_runtime.def

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
; Hyperscan DLL export definitions
2+
3+
LIBRARY hs_runtime
4+
5+
EXPORTS
6+
hs_alloc_scratch
7+
hs_clone_scratch
8+
hs_close_stream
9+
hs_compress_stream
10+
hs_copy_stream
11+
hs_database_info
12+
hs_database_size
13+
hs_deserialize_database
14+
hs_deserialize_database_at
15+
hs_expand_stream
16+
hs_free_database
17+
hs_free_scratch
18+
hs_open_stream
19+
hs_reset_and_copy_stream
20+
hs_reset_and_expand_stream
21+
hs_reset_stream
22+
hs_scan
23+
hs_scan_stream
24+
hs_scan_vector
25+
hs_scratch_size
26+
hs_serialize_database
27+
hs_serialized_database_info
28+
hs_serialized_database_size
29+
hs_set_allocator
30+
hs_set_database_allocator
31+
hs_set_misc_allocator
32+
hs_set_scratch_allocator
33+
hs_set_stream_allocator
34+
hs_stream_size
35+
hs_valid_platform
36+
hs_version

src/compiler/compiler.cpp

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2015-2018, Intel Corporation
2+
* Copyright (c) 2015-2019, Intel Corporation
33
*
44
* Redistribution and use in source and binary forms, with or without
55
* modification, are permitted provided that the following conditions are met:
@@ -56,11 +56,13 @@
5656
#include "parser/unsupported.h"
5757
#include "parser/utf8_validate.h"
5858
#include "rose/rose_build.h"
59+
#include "rose/rose_internal.h"
5960
#include "som/slot_manager_dump.h"
6061
#include "util/bytecode_ptr.h"
6162
#include "util/compile_error.h"
6263
#include "util/target_info.h"
6364
#include "util/verify_types.h"
65+
#include "util/ue2string.h"
6466

6567
#include <algorithm>
6668
#include <cassert>
@@ -107,6 +109,46 @@ void validateExt(const hs_expr_ext &ext) {
107109

108110
}
109111

112+
void ParsedLitExpression::parseLiteral(const char *expression, size_t len,
113+
bool nocase) {
114+
const char *c = expression;
115+
for (size_t i = 0; i < len; i++) {
116+
lit.push_back(*c, nocase);
117+
c++;
118+
}
119+
}
120+
121+
/**
 * \brief Build a parsed pure-literal expression from raw text and flags.
 *
 * Validates \p flags, selects the SOM mode and converts the first
 * \p expLength bytes of \p expression into the stored literal.
 *
 * \param index_in   Expression index, forwarded to \c expr.
 * \param expression Start of the literal text.
 * \param expLength  Number of bytes of \p expression to use.
 * \param flags      HS_FLAG_ bit mask supplied by the caller.
 * \param report     Report ID, forwarded to \c expr.
 *
 * Throws CompileError when \p flags contains bits outside HS_FLAG_ALL, or
 * when HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are both set.
 */
ParsedLitExpression::ParsedLitExpression(unsigned index_in,
                                         const char *expression,
                                         size_t expLength, unsigned flags,
                                         ReportID report)
    : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false,
           SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) {
    // These HS_FLAG_ values carry no meaning for a pure literal expression:
    // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET

    // Reject any bit that is not a known flag.
    const bool hasUnknownFlag = (flags & ~HS_FLAG_ALL) != 0;
    if (hasUnknownFlag) {
        DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags);
        throw CompileError("Unrecognised flag.");
    }

    const bool wantSingleMatch = (flags & HS_FLAG_SINGLEMATCH) != 0;
    const bool wantSomLeftmost = (flags & HS_FLAG_SOM_LEFTMOST) != 0;

    // FIXME: we disallow highlander + SOM, see UE-1850.
    if (wantSingleMatch && wantSomLeftmost) {
        throw CompileError("HS_FLAG_SINGLEMATCH is not supported in "
                           "combination with HS_FLAG_SOM_LEFTMOST.");
    }

    // expr was initialised with SOM_NONE; upgrade it only on request.
    if (wantSomLeftmost) {
        expr.som = SOM_LEFT;
    }

    // Convert the expression text into the ue2_literal.
    const bool nocase = (flags & HS_FLAG_CASELESS) != 0;
    parseLiteral(expression, expLength, nocase);
}
151+
110152
ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
111153
unsigned flags, ReportID report,
112154
const hs_expr_ext *ext)
@@ -345,6 +387,49 @@ void addExpression(NG &ng, unsigned index, const char *expression,
345387
}
346388
}
347389

390+
/**
 * \brief Add a single pure literal expression to \p ng.
 *
 * \param ng         Graph/compile state that receives the literal.
 * \param index      Index of this expression within the caller's rule set.
 * \param expression Start of the literal text; must not be null.
 * \param flags      HS_FLAG_ bit mask. Only HS_FLAG_CASELESS,
 *                   HS_FLAG_MULTILINE, HS_FLAG_SINGLEMATCH and
 *                   HS_FLAG_SOM_LEFTMOST are accepted.
 * \param ext        Optional extended parameters; its flags must be zero.
 * \param id         Report ID raised on match.
 * \param expLength  Length of \p expression in bytes. This value, not a
 *                   terminating NUL, defines where the literal ends.
 *
 * Throws CompileError if extended parameters are set, if \p expLength
 * exceeds the configured pattern length limit, or if an unsupported flag is
 * supplied. Any CompileError thrown while constructing the
 * ParsedLitExpression also propagates to the caller.
 */
void addLitExpression(NG &ng, unsigned index, const char *expression,
                      unsigned flags, const hs_expr_ext *ext, ReportID id,
                      size_t expLength) {
    assert(expression);
    const CompileContext &cc = ng.cc;
    // NOTE(review): '%s' assumes the buffer is NUL-terminated, which this API
    // does not require; affects debug output only -- confirm and bound it.
    DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s', len='%zu'\n", index,
                 id, flags, expression, expLength);

    // Extended parameters are not supported for pure literal patterns.
    if (ext && ext->flags != 0LLU) {
        throw CompileError("Extended parameters are not supported for pure "
                           "literal matching API.");
    }

    // Ensure that our pattern isn't too long (in characters). The caller
    // supplies the length explicitly; strlen() would stop early at an
    // embedded NUL and could read past the end of an unterminated buffer.
    if (expLength > cc.grey.limitPatternLength) {
        throw CompileError("Pattern length exceeds limit.");
    }

    // Filter out flags not supported by the pure literal API.
    u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
                         HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION |
                         HS_FLAG_QUIET;

    if (flags & not_supported) {
        throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, "
                           "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are "
                           "supported in literal API.");
    }

    // This expression must be a pure literal, we can build ue2_literal
    // directly based on expression text.
    ParsedLitExpression ple(index, expression, expLength, flags, id);

    // Feed the ue2_literal into Rose.
    const auto &expr = ple.expr;
    if (ng.addLiteral(ple.lit, expr.index, expr.report, expr.highlander,
                      expr.som, expr.quiet)) {
        DEBUG_PRINTF("took pure literal\n");
        return;
    }
    // NOTE(review): when addLiteral() returns false the literal is dropped
    // without any error being raised -- confirm this is intended.
}
432+
348433
static
349434
bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
350435
const u32 minWidth =
@@ -416,10 +501,13 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
416501
}
417502

418503

419-
struct hs_database *build(NG &ng, unsigned int *length) {
504+
struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
420505
assert(length);
421506

422507
auto rose = generateRoseEngine(ng);
508+
struct RoseEngine *roseHead = rose.get();
509+
roseHead->pureLiteral = pureFlag;
510+
423511
if (!rose) {
424512
throw CompileError("Unable to generate bytecode.");
425513
}

0 commit comments

Comments
 (0)