Skip to content
This repository was archived by the owner on Feb 15, 2023. It is now read-only.

Arena #309

Open
wants to merge 14 commits into
base: v1.0.0
Choose a base branch
from
Open

Arena #309

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ lib_LTLIBRARIES = libgumbo.la
libgumbo_la_CFLAGS = -Wall
libgumbo_la_LDFLAGS = -version-info 1:0:0 -no-undefined
libgumbo_la_SOURCES = \
src/arena.c \
src/arena.h \
src/attribute.c \
src/attribute.h \
src/char_ref.c \
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

#include "gumbo.h"

static const int kNumReps = 10;
static const int kNumReps = 200;

int main(int argc, char** argv) {
if (argc != 1) {
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.

AC_PREREQ([2.65])
AC_INIT([gumbo], [0.9.2], [[email protected]])
AC_INIT([gumbo], [1.0.0], [[email protected]])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([src/parser.c])
#AC_CONFIG_HEADERS([config.h])
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def run(self):
]

setup(name='gumbo',
version='0.9.2',
version='0.9.4',
description='Python bindings for Gumbo HTML parser',
long_description=README,
url='http://github.com/google/gumbo-parser',
Expand Down
105 changes: 105 additions & 0 deletions src/arena.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2015 Jonathan Tang. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: [email protected] (Jonathan Tang)

#include "arena.h"

#include <assert.h>
#include <stdlib.h>

#include "util.h"

// Running count of arena chunks obtained from the system allocator. Note that
// this is a single file-scope counter shared by every arena: arena_init()
// resets it to 1, allocate_new_chunk() increments it, and
// gumbo_arena_chunks_allocated() returns it.
unsigned int gChunksAllocated;

// Alignment of each returned allocation block. We make sure everything is
// pointer-aligned.
#define ARENA_ALIGNMENT (sizeof(void*))

// Size of a single arena chunk. Most recent Intel CPUs have a 256K L2 cache
// on-core, so we try to size a chunk to fit in that with a little extra room
// for the stack. Measurements on a corpus of ~60K webpages indicate that
// ...
// This is the size of the chunk's data area only; the chunk header (the
// `next` link below) is stored in addition to it.
#define ARENA_CHUNK_SIZE 240000

// One node in an arena's singly linked list of chunks. Oversized requests are
// served by allocating a node with more than ARENA_CHUNK_SIZE bytes behind
// `data` (see arena_malloc), so `data` may be longer than declared here.
typedef struct GumboInternalArenaChunk {
// Next chunk in the list, or NULL for the last one.
struct GumboInternalArenaChunk* next;
// Storage that allocations are carved out of.
char data[ARENA_CHUNK_SIZE];
} GumboArenaChunk;

// Initializes |arena| by allocating its first chunk and pointing the
// allocation cursor at the start of that chunk's data area. Also resets the
// global chunk counter.
//
// If the system malloc fails, the arena is left empty (head and
// allocation_ptr both NULL) instead of dereferencing a NULL pointer; callers
// can detect this by checking arena->head, and arena_destroy() is safe to call
// on such an arena.
void arena_init(GumboArena* arena) {
  assert(arena != NULL);
  GumboArenaChunk* first_chunk = malloc(sizeof(GumboArenaChunk));
  // %p with a void* argument is the only portable way to print a pointer
  // (assuming gumbo_debug forwards to a printf-family function).
  gumbo_debug("Initializing arena @%p\n", (void*) first_chunk);
  arena->head = first_chunk;
  if (first_chunk == NULL) {
    arena->allocation_ptr = NULL;
    gChunksAllocated = 0;
    return;
  }
  first_chunk->next = NULL;
  arena->allocation_ptr = first_chunk->data;
  gChunksAllocated = 1;
}

// Releases every chunk owned by |arena| back to the system allocator. All
// pointers previously handed out by arena_malloc() for this arena become
// invalid. The arena's fields are cleared afterwards so that a stale
// GumboArena cannot be used to reach freed memory and a second call is a
// harmless no-op.
void arena_destroy(GumboArena* arena) {
  assert(arena != NULL);
  GumboArenaChunk* chunk = arena->head;
  while (chunk != NULL) {
    // Read the link before freeing the node that holds it.
    GumboArenaChunk* next = chunk->next;
    // %p with a void* argument is the portable pointer conversion (assuming
    // gumbo_debug forwards to a printf-family function).
    gumbo_debug("Freeing arena chunk @%p\n", (void*) chunk);
    free(chunk);
    chunk = next;
  }
  arena->head = NULL;
  arena->allocation_ptr = NULL;
}

// Obtains a chunk of |size| total bytes (chunk header included) from the
// system malloc, pushes it onto the front of |arena|'s chunk list and bumps
// the global chunk counter.
//
// Returns a pointer to the new chunk's data area, or NULL if malloc fails, in
// which case |arena| and the counter are left untouched.
static void* allocate_new_chunk(GumboArena* arena, size_t size) {
  GumboArenaChunk* new_chunk = malloc(size);
  // size_t and pointers must not be printed with %d/%x; cast to the types the
  // conversions expect (assuming gumbo_debug forwards to a printf-family
  // function).
  gumbo_debug("Allocating new arena chunk of size %lu @%p\n",
      (unsigned long) size, (void*) new_chunk);
  if (new_chunk == NULL) {
    gumbo_debug("Malloc failed.\n");
    return NULL;
  }
  ++gChunksAllocated;
  new_chunk->next = arena->head;
  arena->head = new_chunk;
  return new_chunk->data;
}

// Allocates |size| bytes from |arena|. The reserved space is rounded up to a
// multiple of ARENA_ALIGNMENT so that every returned block is pointer-aligned.
//
// Requests that fit in the current chunk are served by bumping the allocation
// cursor. Requests that do not fit get a fresh standard chunk. Requests larger
// than a standard chunk get a dedicated, exactly-sized chunk, after which a
// fresh standard chunk is started for subsequent allocations.
//
// Returns NULL if the system malloc fails or if |size| is so large that the
// size computations would overflow. On failure the arena stays consistent:
// blocks handed out earlier remain valid and later calls may succeed.
void* arena_malloc(GumboArena* arena, size_t size) {
  assert(arena != NULL);

  // Reject sizes for which rounding up, or adding the chunk header below,
  // would wrap around size_t and silently yield a too-small block.
  if (size > (size_t) -1 - sizeof(GumboArenaChunk)) {
    return NULL;
  }
  const size_t aligned_size =
      (size + ARENA_ALIGNMENT - 1) & ~(ARENA_ALIGNMENT - 1);

  if (aligned_size > ARENA_CHUNK_SIZE) {
    // Big block requested; we allocate a chunk of memory of the requested
    // size, add it to the list, and then immediately allocate another one.
    gumbo_debug("Allocation size %lu exceeds chunk size %d",
        (unsigned long) size, ARENA_CHUNK_SIZE);
    const size_t total_chunk_size =
        aligned_size + (sizeof(GumboArenaChunk) - ARENA_CHUNK_SIZE);
    void* result = allocate_new_chunk(arena, total_chunk_size);
    if (result == NULL) {
      // Nothing was linked in; the current chunk is still usable.
      return NULL;
    }
    char* fresh_data = allocate_new_chunk(arena, sizeof(GumboArenaChunk));
    if (fresh_data != NULL) {
      arena->allocation_ptr = fresh_data;
    } else {
      // The dedicated chunk is now the list head and belongs entirely to the
      // caller. Park the cursor at the "chunk full" position so the next call
      // tries to allocate a new chunk rather than carving into this one.
      arena->allocation_ptr = arena->head->data + ARENA_CHUNK_SIZE;
    }
    return result;
  }

  // Bytes still free in the current chunk; zero when the arena has no usable
  // chunk (for example after a failed arena_init). Computing the remainder
  // avoids forming a pointer before the start of the data array.
  size_t remaining = 0;
  if (arena->head != NULL && arena->allocation_ptr != NULL) {
    remaining = (size_t) (
        arena->head->data + ARENA_CHUNK_SIZE - arena->allocation_ptr);
  }

  if (aligned_size > remaining) {
    // Normal operation: allocate the default arena chunk size.
    char* fresh_data = allocate_new_chunk(arena, sizeof(GumboArenaChunk));
    if (fresh_data == NULL) {
      // Leave the cursor where it was so the arena state stays valid.
      return NULL;
    }
    arena->allocation_ptr = fresh_data;
  }

  void* obj = arena->allocation_ptr;
  arena->allocation_ptr += aligned_size;
  assert(arena->allocation_ptr <= arena->head->data + ARENA_CHUNK_SIZE);
  return obj;
}

// Returns the number of arena chunks allocated since the most recent
// arena_init() call. The counter is global rather than per-arena, so the value
// reflects whichever arena was initialized last plus any chunks added since.
//
// Declared with (void): in C an empty parameter list is not a prototype and
// would let callers pass arbitrary arguments without a diagnostic.
unsigned int gumbo_arena_chunks_allocated(void) {
  return gChunksAllocated;
}

// Deallocation hook for arena-backed memory. Individual objects are never
// released on their own; their storage is reclaimed in bulk when the chunks
// are freed by arena_destroy(), so this function intentionally does nothing.
void arena_free(void* userdata, void* obj) {
  // Both arguments are deliberately ignored.
  (void) userdata;
  (void) obj;
}

43 changes: 43 additions & 0 deletions src/arena.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2015 Jonathan Tang. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: [email protected] (Jonathan Tang)

#ifndef GUMBO_ARENA_H_
#define GUMBO_ARENA_H_

#include "gumbo.h"

#ifdef __cplusplus
extern "C" {
#endif

// Initialize an arena, allocating the first chunk for it and pointing the
// arena's allocation cursor at the start of that chunk.
void arena_init(GumboArena* arena);

// Destroy an arena, freeing all memory used by it and all objects contained.
// Every pointer previously returned by arena_malloc for this arena is invalid
// afterwards.
void arena_destroy(GumboArena* arena);

// Allocate an object of `size` bytes in an arena. The space reserved is
// rounded up to pointer alignment. Requests larger than one arena chunk are
// given a dedicated chunk of their own. Returns NULL if the system malloc
// fails.
void* arena_malloc(GumboArena* arena, size_t size);

// No-op free function for use as a custom allocator. The parameter list
// mirrors GumboDeallocatorFunction (void* userdata, void* ptr); both arguments
// are ignored, since memory is only released by arena_destroy.
void arena_free(void* arena, void* obj);

#ifdef __cplusplus
}
#endif

#endif // GUMBO_ARENA_H_
8 changes: 5 additions & 3 deletions src/error.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256;
static int print_message(GumboParser* parser, GumboStringBuffer* output,
const char* format, ...) {
va_list args;
va_start(args, format);
int remaining_capacity = output->capacity - output->length;
va_start(args, format);
int bytes_written = vsnprintf(output->data + output->length,
remaining_capacity, format, args);
va_end(args);
#ifdef _MSC_VER
if (bytes_written == -1) {
// vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
Expand All @@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
// we retry (letting it fail and returning 0 if it doesn't), since there's
// no way to smartly resize the buffer.
gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
va_start(args, format);
int result = vsnprintf(output->data + output->length,
remaining_capacity, format, args);
va_end(args);
Expand All @@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
#else
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
if (bytes_written == -1) {
va_end(args);
return 0;
}
#endif
Expand All @@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
gumbo_string_buffer_reserve(
parser, output->capacity + bytes_written, output);
remaining_capacity = output->capacity - output->length;
va_start(args, format);
bytes_written = vsnprintf(output->data + output->length,
remaining_capacity, format, args);
va_end(args);
}
output->length += bytes_written;
va_end(args);
return bytes_written;
}

Expand Down
42 changes: 30 additions & 12 deletions src/gumbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -576,18 +576,6 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
*/
typedef struct GumboInternalOptions {
/** A memory allocator function. Default: malloc. */
GumboAllocatorFunction allocator;

/** A memory deallocator function. Default: free. */
GumboDeallocatorFunction deallocator;

/**
* An opaque object that's passed in as the first argument to all callbacks
* used by this library. Default: NULL.
*/
void* userdata;

/**
* The tab-stop size, for computing positions in source code that uses tabs.
* Default: 8.
Expand All @@ -613,6 +601,16 @@ typedef struct GumboInternalOptions {
/** Default options struct; use this with gumbo_parse_with_options. */
extern const GumboOptions kGumboDefaultOptions;

/**
 * Base struct for an arena. The chunk type is only forward-declared here; its
 * definition lives in arena.c.
 */
struct GumboInternalArenaChunk;

/**
 * An arena: a linked list of chunks plus a cursor into the newest chunk.
 * Initialize with arena_init and release with arena_destroy (see arena.h).
 */
typedef struct GumboInternalArena {
/** Most recently allocated chunk; new chunks are pushed at the front. */
struct GumboInternalArenaChunk* head;
/** Next free byte in the head chunk; advanced by each allocation. */
char* allocation_ptr;
} GumboArena;

/**
 * Returns the number of arena chunks allocated so far. The count is kept in a
 * single global counter that arena_init resets, so it is not per-arena.
 */
unsigned int gumbo_arena_chunks_allocated();

/** The output struct containing the results of the parse. */
typedef struct GumboInternalOutput {
/**
Expand All @@ -635,6 +633,26 @@ typedef struct GumboInternalOutput {
* reported so we can work out something appropriate for your use-case.
*/
GumboVector /* GumboError */ errors;

/**
* Arena for default memory allocation. This is initialized on parse start
* when using the default memory allocator; it consumes little memory (a
* couple pointers) when a custom memory allocator is supplied.
*/
GumboArena arena;

/**
* Flag set if an out-of-memory condition occurs. This can either be because
* a stringbuffer or vector requested a single chunk larger than the arena
* chunk size, or because the system malloc failed. (The latter is not
* implemented yet - on most modern OSes, malloc never returns NULL and
* instead overcommits virtual memory.) Gumbo makes its best effort to
* recover from OOM errors: if the reason was that a buffer exceeded maximum
* chunk size, it truncates that buffer at the maximum chunk size, refuses to
* write to it anymore, and continues parsing. If the system malloc fails, it
* returns the parse tree it's parsed up until that point.
*/
bool out_of_memory;
} GumboOutput;

/**
Expand Down
51 changes: 20 additions & 31 deletions src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <string.h>
#include <strings.h>

#include "arena.h"
#include "attribute.h"
#include "error.h"
#include "gumbo.h"
Expand Down Expand Up @@ -56,18 +57,7 @@ static bool handle_in_template(GumboParser*, GumboToken*);
static GumboNode* destroy_node(GumboParser*, GumboNode*);


static void* malloc_wrapper(void* unused, size_t size) {
return malloc(size);
}

static void free_wrapper(void* unused, void* ptr) {
free(ptr);
}

const GumboOptions kGumboDefaultOptions = {
&malloc_wrapper,
&free_wrapper,
NULL,
8,
false,
-1,
Expand Down Expand Up @@ -501,11 +491,12 @@ static GumboNode* new_document_node(GumboParser* parser) {
return document_node;
}

static void output_init(GumboParser* parser) {
GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
static void output_init(GumboParser* parser, GumboOutput* output) {
output->root = NULL;
output->document = new_document_node(parser);
parser->_output = output;
// Arena is initialized before this is called, so we have memory to initialize
// the parser state.
output->out_of_memory = false;
gumbo_init_errors(parser);
}

Expand Down Expand Up @@ -938,8 +929,7 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
insert_node(parser, text_node, location);
}

gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
gumbo_string_buffer_init(parser, &buffer_state->_buffer);
gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
buffer_state->_type = GUMBO_NODE_WHITESPACE;
assert(buffer_state->_buffer.length == 0);
}
Expand Down Expand Up @@ -4056,10 +4046,16 @@ GumboOutput* gumbo_parse_fragment(
const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace) {
GumboParser parser;
parser._options = options;
// Must come first, since all the other init functions allocate memory. The
// arena is stored in the GumboOutput structure, so that must be allocated
// manually.
parser._output = malloc(sizeof(GumboOutput));
arena_init(&parser._output->arena);
// Next initialize the parser state.
parser_state_init(&parser);
// Must come after parser_state_init, since creating the document node must
// reference parser_state->_current_node.
output_init(&parser);
output_init(&parser, parser._output);
// And this must come after output_init, because initializing the tokenizer
// reads the first character and that may cause a UTF-8 decode error
// (inserting into output->errors) if that's invalid.
Expand All @@ -4079,6 +4075,11 @@ GumboOutput* gumbo_parse_fragment(
GumboToken token;
bool has_error = false;

if (setjmp(parser._out_of_memory_jmp)) {
parser._output->out_of_memory = true;
return parser._output;
}

do {
if (state->_reprocess_current_token) {
state->_reprocess_current_token = false;
Expand Down Expand Up @@ -4156,18 +4157,6 @@ GumboOutput* gumbo_parse_fragment(
}

void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
// Need a dummy GumboParser because the allocator comes along with the
// options object.
GumboParser parser;
parser._parser_state = NULL;
parser._options = options;
GumboNode* current = output->document;
while (current) {
current = destroy_node(&parser, current);
}
for (int i = 0; i < output->errors.length; ++i) {
gumbo_error_destroy(&parser, output->errors.data[i]);
}
gumbo_vector_destroy(&parser, &output->errors);
gumbo_parser_deallocate(&parser, output);
arena_destroy(&output->arena);
free(output);
}
Loading