From f793ad36b23f4a26262b7e41637798911290fccd Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sat, 21 Feb 2015 22:01:30 -0800 Subject: [PATCH 01/14] Change the initial vector sizes for attributes and stringbuffers. This is based on statistical analysis of ~60K webpages from one segment of the CommonCrawl corpus. As a result, initial attribute vector size goes from 4 to 1, and initial stringbuffer size goes from 10 to 5 (initial child vector size was found to already be the optimum value, at 1). The program I used to analyze memory usage & allocations is here: https://github.com/nostrademons/GumboStats Benchmark results, using first that program and then ./benchmark: v0.10.0 This is the current master. parse_time: mean=4936.51, median=3395.50, 95th%=14145.60, max=167992.00 allocations: mean=19782.47, median=14190.00, 95th%=55512.45, max=805162.00 bytes_allocated: mean=697919.74, median=498341.00, 95th%=1962711.40, max=50036916.00 high_water_mark: mean=572190.85, median=204060.00, 95th%=832980.15, max=4294967294.00 hacker_news.html: 4015 microseconds. html5_spec.html: 690645 microseconds. xinhua.html: 27348 microseconds. baidu.html: 5443 microseconds. yahoo.html: 24486 microseconds. google.html: 23592 microseconds. bbc.html: 7975 microseconds. arabic_newspapers.html: 5998 microseconds. wikipedia.html: 36205 microseconds. With attributes = 0, string buffer size = 3 I tried this first as a "good enough" solution, but it lost a couple percentage points in speed in exchange for halving typical memory consumption. parse_time: mean=5227.30, median=3591.00, 95th%=14954.30, max=170137.00 allocations: mean=22808.84, median=16349.00, 95th%=64060.30, max=849723.00 bytes_allocated: mean=634346.26, median=453030.50, 95th%=1805861.60, max=49914829.00 high_water_mark: mean=5833772.92, median=108432.50, 95th%=472434.40, max=4294967295.00 hacker_news.html: 6021 microseconds. html5_spec.html: 723033 microseconds. xinhua.html: 20712 microseconds. baidu.html: 5426 microseconds. yahoo.html: 25010 microseconds. google.html: 23691 microseconds. bbc.html: 8555 microseconds. arabic_newspapers.html: 6216 microseconds. wikipedia.html: 36706 microseconds. With attributes = 1, string buffer size = 3 This is the final commit. Speed is statistically unchanged, but memory usage is down 30%. parse_time: mean=4924.18, median=3421.50, 95th%=14013.30, max=170302.00 allocations: mean=21730.23, median=15597.00, 95th%=61035.05, max=852718.00 bytes_allocated: mean=645380.09, median=461001.00, 95th%=1828158.55, max=49884481.00 high_water_mark: mean=3985020.84, median=139070.00, 95th%=591329.45, max=4294967295.00 hacker_news.html: 4670 microseconds. html5_spec.html: 697663 microseconds. xinhua.html: 24023 microseconds. baidu.html: 5576 microseconds. yahoo.html: 25100 microseconds. google.html: 24000 microseconds. bbc.html: 8214 microseconds. arabic_newspapers.html: 6377 microseconds. wikipedia.html: 36961 microseconds. --- src/string_buffer.c | 4 +++- src/tokenizer.c | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/string_buffer.c b/src/string_buffer.c index f7d9712f..03282bbf 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -26,7 +26,9 @@ struct GumboInternalParser; -static const size_t kDefaultStringBufferSize = 10; +// Size chosen via statistical analysis of ~60K websites. +// 99% of text nodes and 98% of attribute names/values fit in this initial size. +static const size_t kDefaultStringBufferSize = 5; static void maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, diff --git a/src/tokenizer.c b/src/tokenizer.c index 8c0b4db4..75f6c0a2 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -697,7 +697,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) { gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer); assert(tag_state->_attributes.data == NULL); - gumbo_vector_init(parser, 4, &tag_state->_attributes); + // Initial size chosen by statistical analysis of a corpus of 60k webpages. + // 99.5% of elements have 0 elements, 93% of the remainder have 1. These + // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1 + // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs. + gumbo_vector_init(parser, 1, &tag_state->_attributes); tag_state->_drop_next_attr_value = false; tag_state->_is_start_tag = is_start_tag; tag_state->_is_self_closing = false; From e9e3817ec8c38916a20439faacc4b132e3cf8ff6 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sat, 21 Feb 2015 14:55:48 -0800 Subject: [PATCH 02/14] Bump version number to 0.9.4. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1c20cef5..67240c41 100644 --- a/setup.py +++ b/setup.py @@ -169,7 +169,7 @@ def run(self): ] setup(name='gumbo', - version='0.9.2', + version='0.9.4', description='Python bindings for Gumbo HTML parser', long_description=README, url='http://github.com/google/gumbo-parser', From 89340dfdc8ab0b01a8beeab65e2e91ffdb1e5d13 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 02:35:42 -0800 Subject: [PATCH 03/14] Fix bug in StringBuffer tests that depends upon the default size of a stringbuffer (it fails if there are less than 7 characters available and doesn't explicitly reserve space). --- tests/string_buffer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/string_buffer.cc b/tests/string_buffer.cc index b8966cf8..8e6f8d83 100644 --- a/tests/string_buffer.cc +++ b/tests/string_buffer.cc @@ -95,6 +95,7 @@ TEST_F(GumboStringBufferTest, AppendCodepoint_4Bytes) { } TEST_F(GumboStringBufferTest, ToString) { + gumbo_string_buffer_reserve(&parser_, 8, &buffer_); strcpy(buffer_.data, "012345"); buffer_.length = 7; From edb6508addfa8d2fea340a8ee694c6fe39a2824e Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 02:37:53 -0800 Subject: [PATCH 04/14] Reinitialize va_args with each call to vsnprintf in the error printing routines. vsnprintf may modify the va_args state, and in doing so it introduced potential memory corruption in the buffer it writes to. --- src/error.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/error.c b/src/error.c index 0cae4639..563dfde2 100644 --- a/src/error.c +++ b/src/error.c @@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256; static int print_message(GumboParser* parser, GumboStringBuffer* output, const char* format, ...) { va_list args; - va_start(args, format); int remaining_capacity = output->capacity - output->length; + va_start(args, format); int bytes_written = vsnprintf(output->data + output->length, remaining_capacity, format, args); + va_end(args); #ifdef _MSC_VER if (bytes_written == -1) { // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of @@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. gumbo_string_buffer_reserve(parser, output->capacity * 2, output); + va_start(args, format); int result = vsnprintf(output->data + output->length, remaining_capacity, format, args); va_end(args); @@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, #else // -1 in standard C99 indicates an encoding error. Return 0 and do nothing. if (bytes_written == -1) { - va_end(args); return 0; } #endif @@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, gumbo_string_buffer_reserve( parser, output->capacity + bytes_written, output); remaining_capacity = output->capacity - output->length; + va_start(args, format); bytes_written = vsnprintf(output->data + output->length, remaining_capacity, format, args); + va_end(args); } output->length += bytes_written; - va_end(args); return bytes_written; } From b84193e09d7cc8e9c09b3e790b1f60687b46562d Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 12:15:27 -0800 Subject: [PATCH 05/14] Change repeated calls to string_buffer_destroy/string_buffer_init to a string_buffer_clear that just zeros out the vector instead of reallocating. Benchmarks: parse_time: median=3217.00, 95th%=13199.00 traversal_time: median=51000.00, 95th%=343000.00 allocations: median=12459.50, 95th%=49388.75 bytes_allocated: median=441710.00, 95th%=1756603.10 high_water_mark: median=194527.00, 95th%=796959.05 hacker_news.html: 4574 microseconds. html5_spec.html: 654969 microseconds. xinhua.html: 19776 microseconds. baidu.html: 5675 microseconds. yahoo.html: 27398 microseconds. google.html: 30849 microseconds. bbc.html: 7762 microseconds. arabic_newspapers.html: 5798 microseconds. wikipedia.html: 35138 microseconds. This gives a roughly 8-10% improvement in CPU performance, and reduces the total number of bytes allocated by about 5%. However, the cost of it is that it eliminates virtually all the memory savings of changing the initial vector numbers (it's maybe 5% over master, vs. about 30%). The source of that is that now the temporary vectors can grow unbounded, and so a large string will continue consuming memory until parsing is complete. I'm investigating some approaches to mitigate that. --- src/parser.c | 3 +-- src/string_buffer.c | 5 +++++ src/string_buffer.h | 5 +++++ src/tokenizer.c | 9 +++------ 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/parser.c b/src/parser.c index 8ca85933..12fa5fec 100644 --- a/src/parser.c +++ b/src/parser.c @@ -938,8 +938,7 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { insert_node(parser, text_node, location); } - gumbo_string_buffer_destroy(parser, &buffer_state->_buffer); - gumbo_string_buffer_init(parser, &buffer_state->_buffer); + gumbo_string_buffer_clear(parser, &buffer_state->_buffer); buffer_state->_type = GUMBO_NODE_WHITESPACE; assert(buffer_state->_buffer.length == 0); } diff --git a/src/string_buffer.c b/src/string_buffer.c index 03282bbf..4bc90581 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -102,6 +102,11 @@ char* gumbo_string_buffer_to_string( return buffer; } +void gumbo_string_buffer_clear( + struct GumboInternalParser* parser, GumboStringBuffer* input) { + input->length = 0; +} + void gumbo_string_buffer_destroy( struct GumboInternalParser* parser, GumboStringBuffer* buffer) { gumbo_parser_deallocate(parser, buffer->data); diff --git a/src/string_buffer.h b/src/string_buffer.h index 4ddff8a9..cc139862 100644 --- a/src/string_buffer.h +++ b/src/string_buffer.h @@ -70,6 +70,11 @@ void gumbo_string_buffer_append_string( char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input); +// Reinitialize this string buffer. This clears it by setting length=0. It +// does not zero out the buffer itself. +void gumbo_string_buffer_clear( + struct GumboInternalParser* parser, GumboStringBuffer* input); + // Deallocates this GumboStringBuffer. void gumbo_string_buffer_destroy( struct GumboInternalParser* parser, GumboStringBuffer* buffer); diff --git a/src/tokenizer.c b/src/tokenizer.c index 75f6c0a2..89bbb499 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -356,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; assert(!tokenizer->_temporary_buffer_emit); utf8iterator_mark(&tokenizer->_input); - gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer); // The temporary buffer and script data buffer are the same object in the // spec, so the script data buffer should be cleared as well. - gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); } // Appends a codepoint to the temporary buffer. @@ -1595,8 +1593,7 @@ static StateResult handle_script_double_escaped_lt_state( int c, GumboToken* output) { if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END); - gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); return emit_current_char(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); From 814d53775b5ca17c464d94048f997be2762610b9 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 14:31:57 -0800 Subject: [PATCH 06/14] Bound string_buffer_clear to 8x initial size; any more than that and its reinitialized to a fresh buffer. This reduces the median high water mark from 195K to ~180K (still well above the 140K that we'd get by always reallocating the temporary buffers), but at the cost of about 50% of the CPU gains we got from gumbo_string_buffer_clear. I'm curious how it interacts with original-buffer-return and arenas, though. --- src/string_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/string_buffer.c b/src/string_buffer.c index 4bc90581..fc6c5a24 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -105,6 +105,13 @@ char* gumbo_string_buffer_to_string( void gumbo_string_buffer_clear( struct GumboInternalParser* parser, GumboStringBuffer* input) { input->length = 0; + if (input->capacity > kDefaultStringBufferSize * 8) { + // This approach to clearing means that the buffer can grow unbounded and + // tie up memory that may be needed for parsing the rest of the document, so + // we free and reinitialize the buffer if its grown more than 3 doublings. + gumbo_string_buffer_destroy(parser, input); + gumbo_string_buffer_init(parser, input); + } } void gumbo_string_buffer_destroy( From 64af6c3ce31b5b8d7136ee26c34e210960e894c1 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 15:12:52 -0800 Subject: [PATCH 07/14] Move the buffer out of the temporary stringbuffer to final string nodes. The effect of this change is actually (and counterintuitively) starkly negative: CPU time increases by ~5%, total bytes allocated increases by almost 50%, and memory used increases by 25%. The reason for this is that most text strings are very small, 1-2 chars, and so moving a 6-char buffer to the parse tree and allocating another one instead of just allocating the 1-2 chars necessary and resetting the original buffer is a big loss. I'm keeping this commit solely to serve as a base for an arena implementation, which may be able to win back much of this. --- src/string_buffer.c | 7 ++++--- tests/string_buffer.cc | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/string_buffer.c b/src/string_buffer.c index fc6c5a24..bb5a7fcb 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -28,7 +28,7 @@ struct GumboInternalParser; // Size chosen via statistical analysis of ~60K websites. // 99% of text nodes and 98% of attribute names/values fit in this initial size. -static const size_t kDefaultStringBufferSize = 5; +static const size_t kDefaultStringBufferSize = 6; static void maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, @@ -96,9 +96,10 @@ void gumbo_string_buffer_append_string( char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input) { - char* buffer = gumbo_parser_allocate(parser, input->length + 1); - memcpy(buffer, input->data, input->length); + maybe_resize_string_buffer(parser, input->length + 1, input); + char* buffer = input->data; buffer[input->length] = '\0'; + gumbo_string_buffer_init(parser, input); return buffer; } diff --git a/tests/string_buffer.cc b/tests/string_buffer.cc index 8e6f8d83..c72170cb 100644 --- a/tests/string_buffer.cc +++ b/tests/string_buffer.cc @@ -47,7 +47,7 @@ class GumboStringBufferTest : public GumboTest { TEST_F(GumboStringBufferTest, Reserve) { gumbo_string_buffer_reserve(&parser_, 21, &buffer_); - EXPECT_EQ(40, buffer_.capacity); + EXPECT_EQ(24, buffer_.capacity); strcpy(buffer_.data, "01234567890123456789"); buffer_.length = 20; NullTerminateBuffer(); From 71533990e75db6e7e1c8e707d9ed073fd3507ae1 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 21:55:36 -0800 Subject: [PATCH 08/14] Simple arena implementation. Seems to save about 10-20% on runtime over the previous performance fixes. I'm trying to get memory-usage numbers, but since this removes the custom allocator machinery, it'll take some invasive temporary functions to let the stats program query it. Benchmarks: hacker_news.html: 4153 microseconds. html5_spec.html: 588906 microseconds. xinhua.html: 15219 microseconds. baidu.html: 5563 microseconds. yahoo.html: 23156 microseconds. google.html: 23392 microseconds. bbc.html: 7011 microseconds. arabic_newspapers.html: 4687 microseconds. wikipedia.html: 28795 microseconds. --- Makefile.am | 2 ++ src/arena.c | 60 +++++++++++++++++++++++++++++++++++++++++++ src/arena.h | 55 +++++++++++++++++++++++++++++++++++++++ src/gumbo.h | 29 ++++++++++++--------- src/parser.c | 40 ++++++++--------------------- src/util.c | 6 ++--- tests/string_piece.cc | 7 +++-- tests/test_utils.cc | 49 +++-------------------------------- 8 files changed, 154 insertions(+), 94 deletions(-) create mode 100644 src/arena.c create mode 100644 src/arena.h diff --git a/Makefile.am b/Makefile.am index 1f9db1e8..3665e774 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,6 +50,8 @@ lib_LTLIBRARIES = libgumbo.la libgumbo_la_CFLAGS = -Wall libgumbo_la_LDFLAGS = -version-info 1:0:0 -no-undefined libgumbo_la_SOURCES = \ + src/arena.c \ + src/arena.h \ src/attribute.c \ src/attribute.h \ src/char_ref.c \ diff --git a/src/arena.c b/src/arena.c new file mode 100644 index 00000000..dfb86b38 --- /dev/null +++ b/src/arena.c @@ -0,0 +1,60 @@ +// Copyright 2015 Jonathan Tang. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jonathan.d.tang@gmail.com (Jonathan Tang) + +#include "arena.h" + +#include +#include + +#include "util.h" + +void arena_init(GumboArena* arena) { + assert(arena != NULL); + arena->head = malloc(sizeof(GumboArenaChunk)); + arena->head->next = NULL; + arena->allocation_ptr = arena->head->data; + gumbo_debug("Initializing arena @%x\n", arena->head); +} + +void arena_destroy(GumboArena* arena) { + GumboArenaChunk* chunk = arena->head; + while (chunk) { + gumbo_debug("Freeing arena chunk @%x\n", chunk); + GumboArenaChunk* to_free = chunk; + chunk = chunk->next; + free(to_free); + } +} + +void* gumbo_arena_malloc(void* userdata, size_t size) { + GumboArena* arena = userdata; + GumboArenaChunk* current_chunk = arena->head; + if (arena->allocation_ptr >= current_chunk->data + CHUNK_SIZE - size) { + GumboArenaChunk* new_chunk = malloc(sizeof(GumboArenaChunk)); + gumbo_debug("Allocating new arena chunk @%x\n", new_chunk); + new_chunk->next = current_chunk; + arena->head = new_chunk; + arena->allocation_ptr = new_chunk->data; + } + void* obj = arena->allocation_ptr; + arena->allocation_ptr += (size + ALIGNMENT - 1) & ~(ALIGNMENT - 1); + return obj; +} + +void arena_free(void* userdata, void* obj) { + // No-op. +} + diff --git a/src/arena.h b/src/arena.h new file mode 100644 index 00000000..162fcd73 --- /dev/null +++ b/src/arena.h @@ -0,0 +1,55 @@ +// Copyright 2015 Jonathan Tang. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jonathan.d.tang@gmail.com (Jonathan Tang) + +#ifndef GUMBO_ARENA_H_ +#define GUMBO_ARENA_H_ + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// 400K is right around the median total memory allocation (on a corpus of ~60K +// webpages taken from CommonCrawl), and allows up to 50K 8-byte allocations. +// The 95th percentile is roughly 2M, which would give 5 arena chunks. +// This should allow ~50% of webpages to parse in a single arena chunk, and the +// vast majority to use no more than 5, while still keeping typical memory usage +// well under a meg. +#define CHUNK_SIZE 400000 +#define ALIGNMENT 8 + +typedef struct GumboInternalArenaChunk { + struct GumboInternalArenaChunk* next; + char data[CHUNK_SIZE]; +} GumboArenaChunk; + +// Initialize an arena, allocating the first chunk for it. +void arena_init(GumboArena* arena); + +// Destroy an arena, freeing all memory used by it and all objects contained. +void arena_destroy(GumboArena* arena); + +// gumbo_arena_malloc is defined in gumbo.h + +// No-op free function for use as a custom allocator. +void arena_free(void* arena, void* obj); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_ARENA_H_ diff --git a/src/gumbo.h b/src/gumbo.h index cbc2b0f0..05630e9d 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -576,18 +576,6 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); * Use kGumboDefaultOptions for sensible defaults, and only set what you need. */ typedef struct GumboInternalOptions { - /** A memory allocator function. Default: malloc. */ - GumboAllocatorFunction allocator; - - /** A memory deallocator function. Default: free. */ - GumboDeallocatorFunction deallocator; - - /** - * An opaque object that's passed in as the first argument to all callbacks - * used by this library. Default: NULL. - */ - void* userdata; - /** * The tab-stop size, for computing positions in source code that uses tabs. * Default: 8. @@ -613,6 +601,16 @@ typedef struct GumboInternalOptions { /** Default options struct; use this with gumbo_parse_with_options. */ extern const GumboOptions kGumboDefaultOptions; +/** Base struct for an arena. */ +struct GumboInternalArenaChunk; + +typedef struct GumboInternalArena { + struct GumboInternalArenaChunk* head; + char* allocation_ptr; +} GumboArena; + +void* gumbo_arena_malloc(void* userdata, size_t size); + /** The output struct containing the results of the parse. */ typedef struct GumboInternalOutput { /** @@ -635,6 +633,13 @@ typedef struct GumboInternalOutput { * reported so we can work out something appropriate for your use-case. */ GumboVector /* GumboError */ errors; + + /** + * Arena for default memory allocation. This is initialized on parse start + * when using the default memory allocator; it consumes little memory (a + * couple pointers) when a custom memory allocator is supplied. + */ + GumboArena arena; } GumboOutput; /** diff --git a/src/parser.c b/src/parser.c index 12fa5fec..c6510579 100644 --- a/src/parser.c +++ b/src/parser.c @@ -21,6 +21,7 @@ #include #include +#include "arena.h" #include "attribute.h" #include "error.h" #include "gumbo.h" @@ -56,18 +57,7 @@ static bool handle_in_template(GumboParser*, GumboToken*); static GumboNode* destroy_node(GumboParser*, GumboNode*); -static void* malloc_wrapper(void* unused, size_t size) { - return malloc(size); -} - -static void free_wrapper(void* unused, void* ptr) { - free(ptr); -} - const GumboOptions kGumboDefaultOptions = { - &malloc_wrapper, - &free_wrapper, - NULL, 8, false, -1, @@ -501,11 +491,9 @@ static GumboNode* new_document_node(GumboParser* parser) { return document_node; } -static void output_init(GumboParser* parser) { - GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); +static void output_init(GumboParser* parser, GumboOutput* output) { output->root = NULL; output->document = new_document_node(parser); - parser->_output = output; gumbo_init_errors(parser); } @@ -4055,10 +4043,16 @@ GumboOutput* gumbo_parse_fragment( const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace) { GumboParser parser; parser._options = options; + // Must come first, since all the other init functions allocate memory. The + // arena is stored in the GumboOutput structure, so that must be allocated + // manually. + parser._output = malloc(sizeof(GumboOutput)); + arena_init(&parser._output->arena); + // Next initialize the parser state. parser_state_init(&parser); // Must come after parser_state_init, since creating the document node must // reference parser_state->_current_node. - output_init(&parser); + output_init(&parser, parser._output); // And this must come after output_init, because initializing the tokenizer // reads the first character and that may cause a UTF-8 decode error // (inserting into output->errors) if that's invalid. @@ -4155,18 +4149,6 @@ GumboOutput* gumbo_parse_fragment( } void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) { - // Need a dummy GumboParser because the allocator comes along with the - // options object. - GumboParser parser; - parser._parser_state = NULL; - parser._options = options; - GumboNode* current = output->document; - while (current) { - current = destroy_node(&parser, current); - } - for (int i = 0; i < output->errors.length; ++i) { - gumbo_error_destroy(&parser, output->errors.data[i]); - } - gumbo_vector_destroy(&parser, &output->errors); - gumbo_parser_deallocate(&parser, output); + arena_destroy(&output->arena); + free(output); } diff --git a/src/util.c b/src/util.c index a3dafd79..5d6ca6e1 100644 --- a/src/util.c +++ b/src/util.c @@ -32,12 +32,10 @@ const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 }; void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { - return parser->_options->allocator(parser->_options->userdata, num_bytes); + return gumbo_arena_malloc(&parser->_output->arena, num_bytes); } -void gumbo_parser_deallocate(GumboParser* parser, void* ptr) { - parser->_options->deallocator(parser->_options->userdata, ptr); -} +void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {} char* gumbo_copy_stringz(GumboParser* parser, const char* str) { char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1); diff --git a/tests/string_piece.cc b/tests/string_piece.cc index 965ee5aa..4040bfdc 100644 --- a/tests/string_piece.cc +++ b/tests/string_piece.cc @@ -74,13 +74,12 @@ TEST_F(GumboStringPieceTest, CaseNotEqual_Str2Shorter) { } TEST_F(GumboStringPieceTest, Copy) { - GumboParser parser; - parser._options = &kGumboDefaultOptions; + GumboParser* parser = &parser_; INIT_GUMBO_STRING(str1, "bar"); GumboStringPiece str2; - gumbo_string_copy(&parser, &str2, &str1); + gumbo_string_copy(parser, &str2, &str1); EXPECT_TRUE(gumbo_string_equals(&str1, &str2)); - gumbo_parser_deallocate(&parser, (void*) str2.data); + gumbo_parser_deallocate(parser, (void*) str2.data); } } // namespace diff --git a/tests/test_utils.cc b/tests/test_utils.cc index 7fc47711..f57bd73c 100644 --- a/tests/test_utils.cc +++ b/tests/test_utils.cc @@ -16,6 +16,7 @@ #include "test_utils.h" +#include "arena.h" #include "error.h" #include "util.h" @@ -142,54 +143,14 @@ void SanityCheckPointers(const char* input, size_t input_length, } } -// Custom allocator machinery to sanity check for memory leaks. Normally we can -// use heapcheck/valgrind/ASAN for this, but they only give the -// results when the program terminates. This means that if the parser is run in -// a loop (say, a MapReduce) and there's a leak, it may end up exhausting memory -// before it can catch the particular document responsible for the leak. These -// allocators let us check each document individually for leaks. - -static void* LeakDetectingMalloc(void* userdata, size_t size) { - MallocStats* stats = static_cast(userdata); - stats->bytes_allocated += size; - ++stats->objects_allocated; - // Arbitrary limit of 2G on allocation; parsing any reasonable document - // shouldn't take more than that. - assert(stats->bytes_allocated < (1 << 31)); - void* obj = malloc(size); - // gumbo_debug("Allocated %u bytes at %x.\n", size, obj); - return obj; -} - -static void LeakDetectingFree(void* userdata, void* ptr) { - MallocStats* stats = static_cast(userdata); - if (ptr) { - ++stats->objects_freed; - // gumbo_debug("Freed %x.\n"); - free(ptr); - } -} - -void InitLeakDetection(GumboOptions* options, MallocStats* stats) { - stats->bytes_allocated = 0; - stats->objects_allocated = 0; - stats->objects_freed = 0; - - options->allocator = LeakDetectingMalloc; - options->deallocator = LeakDetectingFree; - options->userdata = stats; -} - - GumboTest::GumboTest() : options_(kGumboDefaultOptions), errors_are_expected_(false), text_("") { - InitLeakDetection(&options_, &malloc_stats_); options_.max_errors = 100; parser_._options = &options_; - parser_._output = static_cast( - gumbo_parser_allocate(&parser_, sizeof(GumboOutput))); + parser_._output = static_cast(calloc(1, sizeof(GumboOutput))); + arena_init(&parser_._output->arena); gumbo_init_errors(&parser_); } @@ -204,7 +165,5 @@ GumboTest::~GumboTest() { parser_._output->errors.data[i]), text_); } } - gumbo_destroy_errors(&parser_); - gumbo_parser_deallocate(&parser_, parser_._output); - EXPECT_EQ(malloc_stats_.objects_allocated, malloc_stats_.objects_freed); + gumbo_destroy_output(parser_._options, parser_._output); } From d39588b51cb67156eb7f44d65ad9bfbe76dcc184 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 22:30:52 -0800 Subject: [PATCH 09/14] Tweak some magic numbers, setting minimum string_buffer size to 8 (no sense using less, as we always return word-aligned chunks) and arena size to 200K. --- src/arena.c | 6 ++++-- src/arena.h | 6 +++--- src/string_buffer.c | 6 ++++-- tests/string_buffer.cc | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/arena.c b/src/arena.c index dfb86b38..15f90a49 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,9 @@ void arena_destroy(GumboArena* arena) { void* gumbo_arena_malloc(void* userdata, size_t size) { GumboArena* arena = userdata; GumboArenaChunk* current_chunk = arena->head; - if (arena->allocation_ptr >= current_chunk->data + CHUNK_SIZE - size) { + size_t aligned_size = (size + ARENA_ALIGNMENT - 1) & ~(ARENA_ALIGNMENT - 1); + if (arena->allocation_ptr >= + current_chunk->data + ARENA_CHUNK_SIZE - aligned_size) { GumboArenaChunk* new_chunk = malloc(sizeof(GumboArenaChunk)); gumbo_debug("Allocating new arena chunk @%x\n", new_chunk); new_chunk->next = current_chunk; @@ -50,7 +52,7 @@ void* gumbo_arena_malloc(void* userdata, size_t size) { arena->allocation_ptr = new_chunk->data; } void* obj = arena->allocation_ptr; - arena->allocation_ptr += (size + ALIGNMENT - 1) & ~(ALIGNMENT - 1); + arena->allocation_ptr += aligned_size; return obj; } diff --git a/src/arena.h b/src/arena.h index 162fcd73..b5a7ee63 100644 --- a/src/arena.h +++ b/src/arena.h @@ -29,12 +29,12 @@ extern "C" { // This should allow ~50% of webpages to parse in a single arena chunk, and the // vast majority to use no more than 5, while still keeping typical memory usage // well under a meg. -#define CHUNK_SIZE 400000 -#define ALIGNMENT 8 +#define ARENA_CHUNK_SIZE 399992 +#define ARENA_ALIGNMENT 8 typedef struct GumboInternalArenaChunk { struct GumboInternalArenaChunk* next; - char data[CHUNK_SIZE]; + char data[ARENA_CHUNK_SIZE]; } GumboArenaChunk; // Initialize an arena, allocating the first chunk for it. diff --git a/src/string_buffer.c b/src/string_buffer.c index bb5a7fcb..941bcd1e 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -27,8 +27,10 @@ struct GumboInternalParser; // Size chosen via statistical analysis of ~60K websites. -// 99% of text nodes and 98% of attribute names/values fit in this initial size. -static const size_t kDefaultStringBufferSize = 6; +// 99% of text nodes and 98% of attribute names/values fit within 5 characters. +// Since the arena allocator only ever returns word-aligned chunks, however, it +// makes no sense to use less than 8 chars. +static const size_t kDefaultStringBufferSize = 8; static void maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, diff --git a/tests/string_buffer.cc b/tests/string_buffer.cc index c72170cb..daf0954b 100644 --- a/tests/string_buffer.cc +++ b/tests/string_buffer.cc @@ -47,7 +47,7 @@ class GumboStringBufferTest : public GumboTest { TEST_F(GumboStringBufferTest, Reserve) { gumbo_string_buffer_reserve(&parser_, 21, &buffer_); - EXPECT_EQ(24, buffer_.capacity); + EXPECT_EQ(32, buffer_.capacity); strcpy(buffer_.data, "01234567890123456789"); buffer_.length = 20; NullTerminateBuffer(); From 2b7b2359eed4c95226fe7b4f545062ed4ee19c61 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 22 Feb 2015 23:52:28 -0800 Subject: [PATCH 10/14] Temporary change to expose chunk allocation stats to GumboStats. --- src/arena.c | 8 ++++++++ src/gumbo.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/src/arena.c b/src/arena.c index 15f90a49..c8e1cb54 100644 --- a/src/arena.c +++ b/src/arena.c @@ -21,12 +21,15 @@ #include "util.h" +unsigned int gChunksAllocated; + void arena_init(GumboArena* arena) { assert(arena != NULL); arena->head = malloc(sizeof(GumboArenaChunk)); arena->head->next = NULL; arena->allocation_ptr = arena->head->data; gumbo_debug("Initializing arena @%x\n", arena->head); + gChunksAllocated = 1; } void arena_destroy(GumboArena* arena) { @@ -50,12 +53,17 @@ void* gumbo_arena_malloc(void* userdata, size_t size) { new_chunk->next = current_chunk; arena->head = new_chunk; arena->allocation_ptr = new_chunk->data; + ++gChunksAllocated; } void* obj = arena->allocation_ptr; arena->allocation_ptr += aligned_size; return obj; } +unsigned int gumbo_arena_chunks_allocated() { + return gChunksAllocated; +} + void arena_free(void* userdata, void* obj) { // No-op. } diff --git a/src/gumbo.h b/src/gumbo.h index 05630e9d..fa85a8c1 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -611,6 +611,8 @@ typedef struct GumboInternalArena { void* gumbo_arena_malloc(void* userdata, size_t size); +unsigned int gumbo_arena_chunks_allocated(); + /** The output struct containing the results of the parse. */ typedef struct GumboInternalOutput { /** From b349c7d633f9eed444079118ceccd47bb88abaf0 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 23 Feb 2015 18:26:44 -0800 Subject: [PATCH 11/14] Handle out of memory errors for string buffers. This truncates the buffer at the maximum arena chunk size and continues parsing, setting the out_of_memory flag. --- src/error.c | 16 +++++++++++----- src/gumbo.h | 13 +++++++++++++ src/parser.c | 3 +++ src/string_buffer.c | 40 ++++++++++++++++++++++++++++++++-------- src/string_buffer.h | 5 +++-- src/util.c | 4 ++++ src/util.h | 3 +++ 7 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/error.c b/src/error.c index 563dfde2..931bc385 100644 --- a/src/error.c +++ b/src/error.c @@ -47,7 +47,9 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, // enough. In this case, we'll double the buffer size and hope it fits when // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. - gumbo_string_buffer_reserve(parser, output->capacity * 2, output); + if (!gumbo_string_buffer_reserve(parser, output->capacity * 2, output)) { + return 0; + }; va_start(args, format); int result = vsnprintf(output->data + output->length, remaining_capacity, format, args); @@ -62,8 +64,10 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, #endif if (bytes_written > remaining_capacity) { - gumbo_string_buffer_reserve( - parser, output->capacity + bytes_written, output); + if (!gumbo_string_buffer_reserve( + parser, output->capacity + bytes_written, output)) { + return output->capacity; + } remaining_capacity = output->capacity - output->length; va_start(args, format); bytes_written = vsnprintf(output->data + output->length, @@ -242,8 +246,10 @@ void gumbo_caret_diagnostic_to_string( gumbo_string_buffer_append_codepoint(parser, '\n', output); gumbo_string_buffer_append_string(parser, &original_line, output); gumbo_string_buffer_append_codepoint(parser, '\n', output); - gumbo_string_buffer_reserve( - parser, output->length + error->position.column, output); + if (!gumbo_string_buffer_reserve( + parser, output->length + error->position.column, output)) { + return; + } int num_spaces = error->position.column - 1; memset(output->data + output->length, ' ', num_spaces); output->length += num_spaces; diff --git a/src/gumbo.h b/src/gumbo.h index fa85a8c1..a61c14cf 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -642,6 +642,19 @@ typedef struct GumboInternalOutput { * couple pointers) when a custom memory allocator is supplied. */ GumboArena arena; + + /** + * Flag set if an out-of-memory condition occurs. This can either be because + * a stringbuffer or vector requested a single chunk larger than the arena + * chunk size, or because the system malloc failed. (The latter is not + * implemented yet - on most modern OSes, malloc never returns NULL and + * instead overcommits virtual memory.) Gumbo makes its best effort to + * recover from OOM errors: if the reason was that a buffer exceeded maximum + * chunk size, it truncates that buffer at the maximum chunk size, refuses to + * write to it anymore, and continues parsing. If the system malloc fails, it + * returns the parse tree it's parsed up until that point. + */ + bool out_of_memory; } GumboOutput; /** diff --git a/src/parser.c b/src/parser.c index c6510579..c394f423 100644 --- a/src/parser.c +++ b/src/parser.c @@ -494,6 +494,9 @@ static GumboNode* new_document_node(GumboParser* parser) { static void output_init(GumboParser* parser, GumboOutput* output) { output->root = NULL; output->document = new_document_node(parser); + // Arena is initialized before this is called, so we have memory to initialize + // the parser state. + output->out_of_memory = false; gumbo_init_errors(parser); } diff --git a/src/string_buffer.c b/src/string_buffer.c index 941bcd1e..d641e905 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -21,6 +21,7 @@ #include #include +#include "arena.h" #include "string_piece.h" #include "util.h" @@ -32,7 +33,7 @@ struct GumboInternalParser; // makes no sense to use less than 8 chars. static const size_t kDefaultStringBufferSize = 8; -static void maybe_resize_string_buffer( +static bool maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, GumboStringBuffer* buffer) { size_t new_length = buffer->length + additional_chars; @@ -41,14 +42,30 @@ static void maybe_resize_string_buffer( new_capacity *= 2; } if (new_capacity != buffer->capacity) { + if (new_capacity > ARENA_CHUNK_SIZE) { + if (buffer->capacity == ARENA_CHUNK_SIZE) { + // If we have already resized the buffer to the maximum chunk size, then + // we're out of memory, and we ignore any more writes to the buffer. + gumbo_set_out_of_memory(parser); + return false; + } + // Otherwise, this is the first time we've hit the new max. Resize the + // allocation to take up a whole chunk, but don't set an error condition + // and let writes proceed. + new_capacity = ARENA_CHUNK_SIZE; + } char* new_data = gumbo_parser_allocate(parser, new_capacity); memcpy(new_data, buffer->data, buffer->length); gumbo_parser_deallocate(parser, buffer->data); buffer->data = new_data; buffer->capacity = new_capacity; } + return true; } +#define ENSURE_CAPACITY_OR_RETURN(capacity, buffer) \ + if (!maybe_resize_string_buffer(parser, (capacity), (buffer))) { return; } + void gumbo_string_buffer_init( struct GumboInternalParser* parser, GumboStringBuffer* output) { output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize); @@ -56,10 +73,11 @@ void gumbo_string_buffer_init( output->capacity = kDefaultStringBufferSize; } -void gumbo_string_buffer_reserve( +bool gumbo_string_buffer_reserve( struct GumboInternalParser* parser, size_t min_capacity, GumboStringBuffer* output) { - maybe_resize_string_buffer(parser, min_capacity - output->length, output); + return maybe_resize_string_buffer( + parser, min_capacity - output->length, output); } void gumbo_string_buffer_append_codepoint( @@ -81,7 +99,7 @@ void gumbo_string_buffer_append_codepoint( num_bytes = 3; prefix = 0xf0; } - maybe_resize_string_buffer(parser, num_bytes + 1, output); + ENSURE_CAPACITY_OR_RETURN(num_bytes + 1, output); output->data[output->length++] = prefix | (c >> (num_bytes * 6)); for (int i = num_bytes - 1; i >= 0; --i) { output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6))); @@ -91,16 +109,22 @@ void gumbo_string_buffer_append_codepoint( void gumbo_string_buffer_append_string( struct GumboInternalParser* parser, GumboStringPiece* str, GumboStringBuffer* output) { - maybe_resize_string_buffer(parser, str->length, output); + ENSURE_CAPACITY_OR_RETURN(str->length, output); memcpy(output->data + output->length, str->data, str->length); output->length += str->length; } char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input) { - maybe_resize_string_buffer(parser, input->length + 1, input); - char* buffer = input->data; - buffer[input->length] = '\0'; + char* buffer; + if (maybe_resize_string_buffer(parser, input->length + 1, input)) { + buffer = input->data; + buffer[input->length] = '\0'; + } else { + // Out of memory: replace the last character. + buffer = input->data; + buffer[input->length - 1] = '\0'; + } gumbo_string_buffer_init(parser, input); return buffer; } diff --git a/src/string_buffer.h b/src/string_buffer.h index cc139862..8b940eed 100644 --- a/src/string_buffer.h +++ b/src/string_buffer.h @@ -50,8 +50,9 @@ void gumbo_string_buffer_init( // Ensures that the buffer contains at least a certain amount of space. Most // useful with snprintf and the other length-delimited string functions, which -// may want to write directly into the buffer. -void gumbo_string_buffer_reserve( +// may want to write directly into the buffer. Returns false on an allocation +// failure - the client should *not* try to write to the buffer in this case. +bool gumbo_string_buffer_reserve( struct GumboInternalParser* parser, size_t min_capacity, GumboStringBuffer* output); diff --git a/src/util.c b/src/util.c index 5d6ca6e1..488d85d0 100644 --- a/src/util.c +++ b/src/util.c @@ -43,6 +43,10 @@ char* gumbo_copy_stringz(GumboParser* parser, const char* str) { return buffer; } +void gumbo_set_out_of_memory(GumboParser* parser) { + parser->_output->out_of_memory = true; +} + // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG // to use. void gumbo_debug(const char* format, ...) { diff --git a/src/util.h b/src/util.h index 28b6905b..4817f77f 100644 --- a/src/util.h +++ b/src/util.h @@ -51,6 +51,9 @@ void* gumbo_parser_allocate( // config options. void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr); +// Sets the out-of-memory flag on the output. +void gumbo_set_out_of_memory(struct GumboInternalParser* parser); + // Debug wrapper for printf, to make it easier to turn off debugging info when // required. void gumbo_debug(const char* format, ...); From 16ed80f316956bfbea8b60c354abe16117fd36fd Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Tue, 24 Feb 2015 10:32:26 -0800 Subject: [PATCH 12/14] Add out-of-memory handling to arenas, and a configurable chunk size. This adds an out_of_memory flag to GumboOutput, which is set if malloc fails or a vector is requested that exceeds the maximum chunk size. The allocator will longjmp back to gumbo_parse, and then return the partial parse tree with the flag set. It also adds an arena_chunk_size member to GumboOptions, for sizing the arena. --- src/arena.c | 25 +++++++++++++++++-------- src/arena.h | 18 ++++++------------ src/gumbo.h | 13 +++++++++++-- src/parser.c | 8 +++++++- src/parser.h | 6 ++++++ src/string_buffer.c | 9 +++++---- src/util.c | 11 ++++++++++- tests/parser.cc | 16 ++++++++++++++++ tests/test_utils.cc | 2 +- 9 files changed, 79 insertions(+), 29 deletions(-) diff --git a/src/arena.c b/src/arena.c index c8e1cb54..53da0c95 100644 --- a/src/arena.c +++ b/src/arena.c @@ -23,12 +23,15 @@ unsigned int gChunksAllocated; -void arena_init(GumboArena* arena) { +#define ARENA_ALIGNMENT 8 + +void arena_init(GumboArena* arena, size_t chunk_size) { assert(arena != NULL); - arena->head = malloc(sizeof(GumboArenaChunk)); + arena->head = malloc(chunk_size); arena->head->next = NULL; arena->allocation_ptr = arena->head->data; - gumbo_debug("Initializing arena @%x\n", arena->head); + gumbo_debug( + "Initializing arena with chunk size %d @%x\n", chunk_size, arena->head); gChunksAllocated = 1; } @@ -42,14 +45,20 @@ void arena_destroy(GumboArena* arena) { } } -void* gumbo_arena_malloc(void* userdata, size_t size) { - GumboArena* arena = userdata; +void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size) { GumboArenaChunk* current_chunk = arena->head; size_t aligned_size = (size + ARENA_ALIGNMENT - 1) & ~(ARENA_ALIGNMENT - 1); if (arena->allocation_ptr >= - current_chunk->data + ARENA_CHUNK_SIZE - aligned_size) { - GumboArenaChunk* new_chunk = malloc(sizeof(GumboArenaChunk)); - gumbo_debug("Allocating new arena chunk @%x\n", new_chunk); + current_chunk->data + chunk_size - aligned_size) { + if (size > chunk_size) { + return NULL; + } + GumboArenaChunk* new_chunk = malloc(chunk_size); + gumbo_debug( + "Allocating new arena chunk of size %d @%x\n", chunk_size, new_chunk); + if (!new_chunk) { + return NULL; + } new_chunk->next = current_chunk; arena->head = new_chunk; arena->allocation_ptr = new_chunk->data; diff --git a/src/arena.h b/src/arena.h index b5a7ee63..4949cffb 100644 --- a/src/arena.h +++ b/src/arena.h @@ -23,27 +23,21 @@ extern "C" { #endif -// 400K is right around the median total memory allocation (on a corpus of ~60K -// webpages taken from CommonCrawl), and allows up to 50K 8-byte allocations. -// The 95th percentile is roughly 2M, which would give 5 arena chunks. -// This should allow ~50% of webpages to parse in a single arena chunk, and the -// vast majority to use no more than 5, while still keeping typical memory usage -// well under a meg. -#define ARENA_CHUNK_SIZE 399992 -#define ARENA_ALIGNMENT 8 - typedef struct GumboInternalArenaChunk { struct GumboInternalArenaChunk* next; - char data[ARENA_CHUNK_SIZE]; + char data[]; } GumboArenaChunk; // Initialize an arena, allocating the first chunk for it. -void arena_init(GumboArena* arena); +void arena_init(GumboArena* arena, size_t chunk_size); // Destroy an arena, freeing all memory used by it and all objects contained. void arena_destroy(GumboArena* arena); -// gumbo_arena_malloc is defined in gumbo.h +// Allocate an object in an arena. chunk_size must remain constant between +// allocations. Returns NULL if either the program requests size > chunk_size +// or the system malloc fails. +void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size); // No-op free function for use as a custom allocator. void arena_free(void* arena, void* obj); diff --git a/src/gumbo.h b/src/gumbo.h index a61c14cf..e96df92d 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -596,6 +596,17 @@ typedef struct GumboInternalOptions { * Default: -1 */ int max_errors; + + /** + * The memory size of new arena chunks. The default (800K) is enough to parse + * ~60% of webpages in a single chunk; 95% will parse in under 2M. However, + * individual allocations cannot exceed the size of a single chunk; a small + * percentage (< 1/1000) of pages have very long text blocks that will be + * truncated by this, with the out_of_memory flag set. If you really need + * precise data for them and have extra memory to spend, try increasing this + * setting as necessary. + */ + size_t arena_chunk_size; } GumboOptions; /** Default options struct; use this with gumbo_parse_with_options. */ @@ -609,8 +620,6 @@ typedef struct GumboInternalArena { char* allocation_ptr; } GumboArena; -void* gumbo_arena_malloc(void* userdata, size_t size); - unsigned int gumbo_arena_chunks_allocated(); /** The output struct containing the results of the parse. */ diff --git a/src/parser.c b/src/parser.c index c394f423..63b976f3 100644 --- a/src/parser.c +++ b/src/parser.c @@ -61,6 +61,7 @@ const GumboOptions kGumboDefaultOptions = { 8, false, -1, + 799992, }; static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); @@ -4050,7 +4051,7 @@ GumboOutput* gumbo_parse_fragment( // arena is stored in the GumboOutput structure, so that must be allocated // manually. parser._output = malloc(sizeof(GumboOutput)); - arena_init(&parser._output->arena); + arena_init(&parser._output->arena, options->arena_chunk_size); // Next initialize the parser state. parser_state_init(&parser); // Must come after parser_state_init, since creating the document node must @@ -4075,6 +4076,11 @@ GumboOutput* gumbo_parse_fragment( GumboToken token; bool has_error = false; + if (setjmp(parser._out_of_memory_jmp)) { + parser._output->out_of_memory = true; + return parser._output; + } + do { if (state->_reprocess_current_token) { state->_reprocess_current_token = false; diff --git a/src/parser.h b/src/parser.h index 95019e3e..7ecd4e34 100644 --- a/src/parser.h +++ b/src/parser.h @@ -20,6 +20,8 @@ #ifndef GUMBO_PARSER_H_ #define GUMBO_PARSER_H_ +#include + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +50,10 @@ typedef struct GumboInternalParser { // The internal parser state. Initialized on parse start and destroyed on // parse end; end-users will never see a non-garbage value in this pointer. struct GumboInternalParserState* _parser_state; + + // A jmp_buf to use in case of out-of-memory conditions. This jumps back to + // gumbo_parse, which then returns after setting the out_of_memory flag. + jmp_buf _out_of_memory_jmp; } GumboParser; #ifdef __cplusplus diff --git a/src/string_buffer.c b/src/string_buffer.c index d641e905..d92427a6 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -21,7 +21,8 @@ #include #include -#include "arena.h" +#include "gumbo.h" +#include "parser.h" #include "string_piece.h" #include "util.h" @@ -42,8 +43,8 @@ static bool maybe_resize_string_buffer( new_capacity *= 2; } if (new_capacity != buffer->capacity) { - if (new_capacity > ARENA_CHUNK_SIZE) { - if (buffer->capacity == ARENA_CHUNK_SIZE) { + if (new_capacity > parser->_options->arena_chunk_size) { + if (buffer->capacity == parser->_options->arena_chunk_size) { // If we have already resized the buffer to the maximum chunk size, then // we're out of memory, and we ignore any more writes to the buffer. gumbo_set_out_of_memory(parser); @@ -52,7 +53,7 @@ static bool maybe_resize_string_buffer( // Otherwise, this is the first time we've hit the new max. Resize the // allocation to take up a whole chunk, but don't set an error condition // and let writes proceed. - new_capacity = ARENA_CHUNK_SIZE; + new_capacity = parser->_options->arena_chunk_size; } char* new_data = gumbo_parser_allocate(parser, new_capacity); memcpy(new_data, buffer->data, buffer->length); diff --git a/src/util.c b/src/util.c index 488d85d0..e501d61f 100644 --- a/src/util.c +++ b/src/util.c @@ -17,12 +17,14 @@ #include "util.h" #include +#include #include #include #include #include #include +#include "arena.h" #include "gumbo.h" #include "parser.h" @@ -32,7 +34,14 @@ const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 }; void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { - return gumbo_arena_malloc(&parser->_output->arena, num_bytes); + void* result = arena_malloc( + &parser->_output->arena, + parser->_options->arena_chunk_size, + num_bytes); + if (result == NULL) { + longjmp(parser->_out_of_memory_jmp, num_bytes); + } + return result; } void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {} diff --git a/tests/parser.cc b/tests/parser.cc index 55e39e4f..66c06659 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -74,6 +74,22 @@ class GumboParserTest : public ::testing::Test { GumboNode* root_; }; +TEST_F(GumboParserTest, OutOfMemory) { + std::string html("
    "); + for (int i = 0; i < 1000; ++i) { + html += "
  • This should create a child vector larger than the chunk."; + } + html += "
"; + + options_.arena_chunk_size = 4000; + Parse(html); + + EXPECT_TRUE(output_->out_of_memory); + // The partial parse tree should still be constructed. + GumboNode* body; + GetAndAssertBody(root_, &body); +} + TEST_F(GumboParserTest, NullDocument) { Parse(""); ASSERT_TRUE(root_); diff --git a/tests/test_utils.cc b/tests/test_utils.cc index f57bd73c..aa398bd2 100644 --- a/tests/test_utils.cc +++ b/tests/test_utils.cc @@ -150,7 +150,7 @@ GumboTest::GumboTest() : options_.max_errors = 100; parser_._options = &options_; parser_._output = static_cast(calloc(1, sizeof(GumboOutput))); - arena_init(&parser_._output->arena); + arena_init(&parser_._output->arena, options_.arena_chunk_size); gumbo_init_errors(&parser_); } From 77b494017a4da06f78f419d78c76a99242a95f64 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 25 Feb 2015 19:32:58 -0800 Subject: [PATCH 13/14] Fix arena allocation size. It wasn't taking into account the linked list fields on GumboArenaChunk when allocating memory, causing code to blow past the end of the allocation. --- configure.ac | 2 +- src/arena.c | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index e2d4e9f2..8f12ce06 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.65]) -AC_INIT([gumbo], [0.9.2], [jonathan.d.tang@gmail.com]) +AC_INIT([gumbo], [1.0.0], [jonathan.d.tang@gmail.com]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_SRCDIR([src/parser.c]) #AC_CONFIG_HEADERS([config.h]) diff --git a/src/arena.c b/src/arena.c index 53da0c95..0eb0993d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -51,12 +51,15 @@ void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size) { if (arena->allocation_ptr >= current_chunk->data + chunk_size - aligned_size) { if (size > chunk_size) { + gumbo_debug("Allocation size %d exceeds chunk size %d", size, chunk_size); return NULL; } - GumboArenaChunk* new_chunk = malloc(chunk_size); - gumbo_debug( - "Allocating new arena chunk of size %d @%x\n", chunk_size, new_chunk); + size_t memory_block_size = chunk_size + sizeof(GumboArenaChunk); + GumboArenaChunk* new_chunk = malloc(memory_block_size); + gumbo_debug("Allocating new arena chunk of size %d @%x\n", + memory_block_size, new_chunk); if (!new_chunk) { + gumbo_debug("Malloc failed.\n"); return NULL; } new_chunk->next = current_chunk; @@ -66,6 +69,7 @@ void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size) { } void* obj = arena->allocation_ptr; arena->allocation_ptr += aligned_size; + assert(arena->allocation_ptr <= arena->head->data + chunk_size); return obj; } From e19a9fa15988ee49df79b85ed873ef14e5b2f92e Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 2 Mar 2015 20:00:24 -0800 Subject: [PATCH 14/14] Properly handle requests for allocations > chunk_size (it returns a fresh, dedicated block for them), and remove arena_chunk_size as a configurable parameter. --- benchmarks/benchmark.cc | 2 +- src/arena.c | 70 +++++++++++++++++++++++++++-------------- src/arena.h | 12 ++----- src/error.c | 16 +++------- src/gumbo.h | 11 ------- src/parser.c | 3 +- src/string_buffer.c | 41 +++++------------------- src/string_buffer.h | 2 +- src/util.c | 1 - tests/parser.cc | 16 ---------- tests/test_utils.cc | 2 +- 11 files changed, 66 insertions(+), 110 deletions(-) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 9c2c1c86..e3da8c34 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -25,7 +25,7 @@ #include "gumbo.h" -static const int kNumReps = 10; +static const int kNumReps = 200; int main(int argc, char** argv) { if (argc != 1) { diff --git a/src/arena.c b/src/arena.c index 0eb0993d..fedff8d7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -23,15 +23,27 @@ unsigned int gChunksAllocated; -#define ARENA_ALIGNMENT 8 +// Alignment of each returned allocation block. We make sure everything is +// pointer-aligned. +#define ARENA_ALIGNMENT (sizeof(void*)) -void arena_init(GumboArena* arena, size_t chunk_size) { +// Size of a single arena chunk. Most recent Intel CPUs have a 256K L2 cache +// on-core, so we try to size a chunk to fit in that with a little extra room +// for the stack. Measurements on a corpus of ~60K webpages indicate that +// ... +#define ARENA_CHUNK_SIZE 240000 + +typedef struct GumboInternalArenaChunk { + struct GumboInternalArenaChunk* next; + char data[ARENA_CHUNK_SIZE]; +} GumboArenaChunk; + +void arena_init(GumboArena* arena) { assert(arena != NULL); - arena->head = malloc(chunk_size); + arena->head = malloc(sizeof(GumboArenaChunk)); arena->head->next = NULL; arena->allocation_ptr = arena->head->data; - gumbo_debug( - "Initializing arena with chunk size %d @%x\n", chunk_size, arena->head); + gumbo_debug("Initializing arena @%x\n", arena->head); gChunksAllocated = 1; } @@ -45,31 +57,41 @@ void arena_destroy(GumboArena* arena) { } } -void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size) { - GumboArenaChunk* current_chunk = arena->head; +static void* allocate_new_chunk(GumboArena* arena, size_t size) { + GumboArenaChunk* new_chunk = malloc(size); + gumbo_debug("Allocating new arena chunk of size %d @%x\n", size, new_chunk); + if (!new_chunk) { + gumbo_debug("Malloc failed.\n"); + return NULL; + } + ++gChunksAllocated; + new_chunk->next = arena->head; + arena->head = new_chunk; + return new_chunk->data; +} + +void* arena_malloc(GumboArena* arena, size_t size) { size_t aligned_size = (size + ARENA_ALIGNMENT - 1) & ~(ARENA_ALIGNMENT - 1); if (arena->allocation_ptr >= - current_chunk->data + chunk_size - aligned_size) { - if (size > chunk_size) { - gumbo_debug("Allocation size %d exceeds chunk size %d", size, chunk_size); - return NULL; - } - size_t memory_block_size = chunk_size + sizeof(GumboArenaChunk); - GumboArenaChunk* new_chunk = malloc(memory_block_size); - gumbo_debug("Allocating new arena chunk of size %d @%x\n", - memory_block_size, new_chunk); - if (!new_chunk) { - gumbo_debug("Malloc failed.\n"); - return NULL; + arena->head->data + ARENA_CHUNK_SIZE - aligned_size) { + if (size > ARENA_CHUNK_SIZE) { + // Big block requested; we allocate a chunk of memory of the requested + // size, add it to the list, and then immediately allocate another one. + gumbo_debug( + "Allocation size %d exceeds chunk size %d", size, ARENA_CHUNK_SIZE); + size_t total_chunk_size = + size + sizeof(GumboArenaChunk) - ARENA_CHUNK_SIZE; + void* result = allocate_new_chunk(arena, total_chunk_size); + arena->allocation_ptr = + allocate_new_chunk(arena, sizeof(GumboArenaChunk)); + return result; } - new_chunk->next = current_chunk; - arena->head = new_chunk; - arena->allocation_ptr = new_chunk->data; - ++gChunksAllocated; + // Normal operation: allocate the default arena chunk size. + arena->allocation_ptr = allocate_new_chunk(arena, sizeof(GumboArenaChunk)); } void* obj = arena->allocation_ptr; arena->allocation_ptr += aligned_size; - assert(arena->allocation_ptr <= arena->head->data + chunk_size); + assert(arena->allocation_ptr <= arena->head->data + ARENA_CHUNK_SIZE); return obj; } diff --git a/src/arena.h b/src/arena.h index 4949cffb..5238b5be 100644 --- a/src/arena.h +++ b/src/arena.h @@ -23,21 +23,15 @@ extern "C" { #endif -typedef struct GumboInternalArenaChunk { - struct GumboInternalArenaChunk* next; - char data[]; -} GumboArenaChunk; - // Initialize an arena, allocating the first chunk for it. -void arena_init(GumboArena* arena, size_t chunk_size); +void arena_init(GumboArena* arena); // Destroy an arena, freeing all memory used by it and all objects contained. void arena_destroy(GumboArena* arena); // Allocate an object in an arena. chunk_size must remain constant between -// allocations. Returns NULL if either the program requests size > chunk_size -// or the system malloc fails. -void* arena_malloc(GumboArena* arena, size_t chunk_size, size_t size); +// allocations. Returns NULL if the system malloc fails. +void* arena_malloc(GumboArena* arena, size_t size); // No-op free function for use as a custom allocator. void arena_free(void* arena, void* obj); diff --git a/src/error.c b/src/error.c index 931bc385..563dfde2 100644 --- a/src/error.c +++ b/src/error.c @@ -47,9 +47,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, // enough. In this case, we'll double the buffer size and hope it fits when // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. - if (!gumbo_string_buffer_reserve(parser, output->capacity * 2, output)) { - return 0; - }; + gumbo_string_buffer_reserve(parser, output->capacity * 2, output); va_start(args, format); int result = vsnprintf(output->data + output->length, remaining_capacity, format, args); @@ -64,10 +62,8 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, #endif if (bytes_written > remaining_capacity) { - if (!gumbo_string_buffer_reserve( - parser, output->capacity + bytes_written, output)) { - return output->capacity; - } + gumbo_string_buffer_reserve( + parser, output->capacity + bytes_written, output); remaining_capacity = output->capacity - output->length; va_start(args, format); bytes_written = vsnprintf(output->data + output->length, @@ -246,10 +242,8 @@ void gumbo_caret_diagnostic_to_string( gumbo_string_buffer_append_codepoint(parser, '\n', output); gumbo_string_buffer_append_string(parser, &original_line, output); gumbo_string_buffer_append_codepoint(parser, '\n', output); - if (!gumbo_string_buffer_reserve( - parser, output->length + error->position.column, output)) { - return; - } + gumbo_string_buffer_reserve( + parser, output->length + error->position.column, output); int num_spaces = error->position.column - 1; memset(output->data + output->length, ' ', num_spaces); output->length += num_spaces; diff --git a/src/gumbo.h b/src/gumbo.h index e96df92d..92094836 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -596,17 +596,6 @@ typedef struct GumboInternalOptions { * Default: -1 */ int max_errors; - - /** - * The memory size of new arena chunks. The default (800K) is enough to parse - * ~60% of webpages in a single chunk; 95% will parse in under 2M. However, - * individual allocations cannot exceed the size of a single chunk; a small - * percentage (< 1/1000) of pages have very long text blocks that will be - * truncated by this, with the out_of_memory flag set. If you really need - * precise data for them and have extra memory to spend, try increasing this - * setting as necessary. - */ - size_t arena_chunk_size; } GumboOptions; /** Default options struct; use this with gumbo_parse_with_options. */ diff --git a/src/parser.c b/src/parser.c index 63b976f3..85becb07 100644 --- a/src/parser.c +++ b/src/parser.c @@ -61,7 +61,6 @@ const GumboOptions kGumboDefaultOptions = { 8, false, -1, - 799992, }; static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); @@ -4051,7 +4050,7 @@ GumboOutput* gumbo_parse_fragment( // arena is stored in the GumboOutput structure, so that must be allocated // manually. parser._output = malloc(sizeof(GumboOutput)); - arena_init(&parser._output->arena, options->arena_chunk_size); + arena_init(&parser._output->arena); // Next initialize the parser state. parser_state_init(&parser); // Must come after parser_state_init, since creating the document node must diff --git a/src/string_buffer.c b/src/string_buffer.c index d92427a6..941bcd1e 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -21,8 +21,6 @@ #include #include -#include "gumbo.h" -#include "parser.h" #include "string_piece.h" #include "util.h" @@ -34,7 +32,7 @@ struct GumboInternalParser; // makes no sense to use less than 8 chars. static const size_t kDefaultStringBufferSize = 8; -static bool maybe_resize_string_buffer( +static void maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, GumboStringBuffer* buffer) { size_t new_length = buffer->length + additional_chars; @@ -43,30 +41,14 @@ static bool maybe_resize_string_buffer( new_capacity *= 2; } if (new_capacity != buffer->capacity) { - if (new_capacity > parser->_options->arena_chunk_size) { - if (buffer->capacity == parser->_options->arena_chunk_size) { - // If we have already resized the buffer to the maximum chunk size, then - // we're out of memory, and we ignore any more writes to the buffer. - gumbo_set_out_of_memory(parser); - return false; - } - // Otherwise, this is the first time we've hit the new max. Resize the - // allocation to take up a whole chunk, but don't set an error condition - // and let writes proceed. - new_capacity = parser->_options->arena_chunk_size; - } char* new_data = gumbo_parser_allocate(parser, new_capacity); memcpy(new_data, buffer->data, buffer->length); gumbo_parser_deallocate(parser, buffer->data); buffer->data = new_data; buffer->capacity = new_capacity; } - return true; } -#define ENSURE_CAPACITY_OR_RETURN(capacity, buffer) \ - if (!maybe_resize_string_buffer(parser, (capacity), (buffer))) { return; } - void gumbo_string_buffer_init( struct GumboInternalParser* parser, GumboStringBuffer* output) { output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize); @@ -74,11 +56,10 @@ void gumbo_string_buffer_init( output->capacity = kDefaultStringBufferSize; } -bool gumbo_string_buffer_reserve( +void gumbo_string_buffer_reserve( struct GumboInternalParser* parser, size_t min_capacity, GumboStringBuffer* output) { - return maybe_resize_string_buffer( - parser, min_capacity - output->length, output); + maybe_resize_string_buffer(parser, min_capacity - output->length, output); } void gumbo_string_buffer_append_codepoint( @@ -100,7 +81,7 @@ void gumbo_string_buffer_append_codepoint( num_bytes = 3; prefix = 0xf0; } - ENSURE_CAPACITY_OR_RETURN(num_bytes + 1, output); + maybe_resize_string_buffer(parser, num_bytes + 1, output); output->data[output->length++] = prefix | (c >> (num_bytes * 6)); for (int i = num_bytes - 1; i >= 0; --i) { output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6))); @@ -110,22 +91,16 @@ void gumbo_string_buffer_append_codepoint( void gumbo_string_buffer_append_string( struct GumboInternalParser* parser, GumboStringPiece* str, GumboStringBuffer* output) { - ENSURE_CAPACITY_OR_RETURN(str->length, output); + maybe_resize_string_buffer(parser, str->length, output); memcpy(output->data + output->length, str->data, str->length); output->length += str->length; } char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input) { - char* buffer; - if (maybe_resize_string_buffer(parser, input->length + 1, input)) { - buffer = input->data; - buffer[input->length] = '\0'; - } else { - // Out of memory: replace the last character. - buffer = input->data; - buffer[input->length - 1] = '\0'; - } + maybe_resize_string_buffer(parser, input->length + 1, input); + char* buffer = input->data; + buffer[input->length] = '\0'; gumbo_string_buffer_init(parser, input); return buffer; } diff --git a/src/string_buffer.h b/src/string_buffer.h index 8b940eed..6213ccb2 100644 --- a/src/string_buffer.h +++ b/src/string_buffer.h @@ -52,7 +52,7 @@ void gumbo_string_buffer_init( // useful with snprintf and the other length-delimited string functions, which // may want to write directly into the buffer. Returns false on an allocation // failure - the client should *not* try to write to the buffer in this case. -bool gumbo_string_buffer_reserve( +void gumbo_string_buffer_reserve( struct GumboInternalParser* parser, size_t min_capacity, GumboStringBuffer* output); diff --git a/src/util.c b/src/util.c index e501d61f..1bc89f91 100644 --- a/src/util.c +++ b/src/util.c @@ -36,7 +36,6 @@ const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 }; void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { void* result = arena_malloc( &parser->_output->arena, - parser->_options->arena_chunk_size, num_bytes); if (result == NULL) { longjmp(parser->_out_of_memory_jmp, num_bytes); diff --git a/tests/parser.cc b/tests/parser.cc index 66c06659..55e39e4f 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -74,22 +74,6 @@ class GumboParserTest : public ::testing::Test { GumboNode* root_; }; -TEST_F(GumboParserTest, OutOfMemory) { - std::string html("
    "); - for (int i = 0; i < 1000; ++i) { - html += "
  • This should create a child vector larger than the chunk."; - } - html += "
"; - - options_.arena_chunk_size = 4000; - Parse(html); - - EXPECT_TRUE(output_->out_of_memory); - // The partial parse tree should still be constructed. - GumboNode* body; - GetAndAssertBody(root_, &body); -} - TEST_F(GumboParserTest, NullDocument) { Parse(""); ASSERT_TRUE(root_); diff --git a/tests/test_utils.cc b/tests/test_utils.cc index aa398bd2..f57bd73c 100644 --- a/tests/test_utils.cc +++ b/tests/test_utils.cc @@ -150,7 +150,7 @@ GumboTest::GumboTest() : options_.max_errors = 100; parser_._options = &options_; parser_._output = static_cast(calloc(1, sizeof(GumboOutput))); - arena_init(&parser_._output->arena, options_.arena_chunk_size); + arena_init(&parser_._output->arena); gumbo_init_errors(&parser_); }