Skip to content

Commit 5d735c9

Browse files
C: Use hb_string_T for token_T.value (#687)
This pull request changes the `token_T.value` to use `hb_string_T` and adapts the call sites. Token values become non-owning slices into the source buffer, eliminating per-token `strdup` calls during lexing and parsing. As a result, `token_copy` becomes a shallow struct copy and `token_free` no longer frees the value. Two constants, `HB_STRING_EMPTY` and `HB_STRING_NULL`, are introduced to distinguish a valid empty string from the absence of a value. Call sites that previously checked `token->value == NULL` now use `hb_string_is_null`, while sites that checked for empty content use `hb_string_is_empty`. The public API is simplified by removing `herb_lex_file` and `herb_lex_to_buffer`. `herb_lex_file` had an inconsistent lifetime contract, it read a file, lexed it, then freed the source, leaving tokens with dangling pointers. Callers should now read the file themselves and pass the `source` to `herb_lex`. `herb_lex_to_buffer` was only used by the C-CLI and C tests, so it moves to an internal `lex_helpers.h` header. Since token values are non-owning, callers must keep the source string alive for as long as tokens or AST nodes are in use. All existing bindings already satisfy this naturally, they hold a reference to the source (Ruby string, JNI string, `std::string`, etc.) for the duration of the operation and convert to native objects before releasing it. #### Comparison **`make bench_allocs`** ##### Before (current main `2990a34cc77681bcd45bb21b4c4320065c5ea129`) ``` === Allocation Benchmark === [small] (35 bytes input) lex small allocs: 32 deallocs: 0 bytes_alloc: 691 tokens: 16 parse small allocs: 106 deallocs: 68 bytes_alloc: 2597 [medium] (650 bytes input) lex medium allocs: 420 deallocs: 0 bytes_alloc: 9260 tokens: 210 parse medium allocs: 1355 deallocs: 897 bytes_alloc: 33354 [large] (2878 bytes input) lex large allocs: 1384 deallocs: 0 bytes_alloc: 31250 tokens: 692 parse large allocs: 4394 deallocs: 2963 bytes_alloc: 110160 ``` ##### After (this pull request) ``` === Allocation Benchmark === [small] (35 bytes input) lex small allocs: 16 deallocs: 0 bytes_alloc: 768 tokens: 16 parse small allocs: 59 deallocs: 34 bytes_alloc: 2844 [medium] (650 bytes input) lex medium allocs: 210 deallocs: 0 bytes_alloc: 10080 tokens: 210 parse medium allocs: 764 deallocs: 451 bytes_alloc: 36038 [large] (2878 bytes input) lex large allocs: 692 deallocs: 0 bytes_alloc: 33216 tokens: 692 parse large allocs: 2465 deallocs: 1491 bytes_alloc: 117003 ``` ##### Conclusion This pull request does 50% less allocations while lexing and 40-50% less allocations while parsing, though overall, it has to slightly allocate more total bytes. --------- Co-authored-by: Marco Roth <marco.roth@intergga.ch>
1 parent 951f34a commit 5d735c9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+1793
-485
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ docs/docs/linter/rules/
66
# Herb output
77
/herb
88
/run_herb_tests
9+
/bench_allocs
910

1011
# Herb Ruby extension
1112
/ext/herb/extconf.h

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ test_sources = $(wildcard test/**/*.c)
2020
test_objects = $(test_sources:.c=.o)
2121
non_main_objects = $(filter-out src/main.o, $(objects))
2222

23+
bench_allocs_exec = bench_allocs
24+
bench_allocs_source = bench/bench_allocs.c
25+
2326
soext ?= $(shell ruby -e 'puts RbConfig::CONFIG["DLEXT"]')
2427
lib_name = $(build_dir)/lib$(exec).$(soext)
2528
static_lib_name = $(build_dir)/lib$(exec).a
@@ -102,9 +105,14 @@ test/%.o: test/%.c templates prism
102105
test: $(test_objects) $(non_main_objects)
103106
$(cc) $(test_objects) $(non_main_objects) $(test_cflags) $(test_ldflags) -o $(test_exec)
104107

108+
.PHONY: bench_allocs
109+
bench_allocs: $(non_main_objects)
110+
$(cc) $(bench_allocs_source) $(non_main_objects) $(flags) $(prism_ldflags) -o $(bench_allocs_exec)
111+
./$(bench_allocs_exec)
112+
105113
.PHONY: clean
106114
clean:
107-
rm -f $(exec) $(test_exec) $(lib_name) $(shared_lib_name) $(ruby_extension)
115+
rm -f $(exec) $(test_exec) $(bench_allocs_exec) $(lib_name) $(shared_lib_name) $(ruby_extension)
108116
rm -rf $(objects) $(test_objects) $(extension_objects) lib/herb/*.bundle tmp
109117
rm -rf $(prism_path)
110118
rake prism:clean

bench/bench_allocs.c

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#include "../src/include/herb.h"
2+
#include "../src/include/util/hb_allocator.h"
3+
4+
#include <stdio.h>
5+
#include <stdlib.h>
6+
#include <string.h>
7+
8+
static const char* SMALL_INPUT = "<div class=\"hello\"><%= foo %></div>";
9+
10+
static const char* MEDIUM_INPUT =
11+
"<!DOCTYPE html>\n"
12+
"<html lang=\"en\">\n"
13+
"<head>\n"
14+
" <meta charset=\"UTF-8\">\n"
15+
" <title><%= @title %></title>\n"
16+
"</head>\n"
17+
"<body>\n"
18+
" <div id=\"app\" class=\"container\">\n"
19+
" <h1><%= @heading %></h1>\n"
20+
" <% @items.each do |item| %>\n"
21+
" <div class=\"item\" data-id=\"<%= item.id %>\">\n"
22+
" <span class=\"name\"><%= item.name %></span>\n"
23+
" <p><%= item.description %></p>\n"
24+
" <% if item.active? %>\n"
25+
" <span class=\"badge\">Active</span>\n"
26+
" <% else %>\n"
27+
" <span class=\"badge inactive\">Inactive</span>\n"
28+
" <% end %>\n"
29+
" </div>\n"
30+
" <% end %>\n"
31+
" </div>\n"
32+
" <%# This is a comment %>\n"
33+
" <%= render partial: 'footer', locals: { year: 2024 } %>\n"
34+
"</body>\n"
35+
"</html>\n";
36+
37+
static const char* LARGE_INPUT =
38+
"<!DOCTYPE html>\n"
39+
"<html lang=\"<%= I18n.locale %>\">\n"
40+
"<head>\n"
41+
" <meta charset=\"UTF-8\">\n"
42+
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"
43+
" <title><%= @page_title || 'Default' %></title>\n"
44+
" <%= csrf_meta_tags %>\n"
45+
" <%= csp_meta_tag %>\n"
46+
" <%= stylesheet_link_tag 'application' %>\n"
47+
" <%= javascript_include_tag 'application' %>\n"
48+
"</head>\n"
49+
"<body class=\"<%= body_class %>\">\n"
50+
" <nav class=\"navbar\">\n"
51+
" <div class=\"nav-brand\">\n"
52+
" <%= link_to root_path do %>\n"
53+
" <img src=\"<%= asset_path('logo.png') %>\" alt=\"Logo\">\n"
54+
" <% end %>\n"
55+
" </div>\n"
56+
" <ul class=\"nav-links\">\n"
57+
" <% @nav_items.each do |nav| %>\n"
58+
" <li class=\"<%= 'active' if current_page?(nav.path) %>\">\n"
59+
" <%= link_to nav.label, nav.path, class: 'nav-link' %>\n"
60+
" </li>\n"
61+
" <% end %>\n"
62+
" </ul>\n"
63+
" <div class=\"nav-user\">\n"
64+
" <% if current_user %>\n"
65+
" <span><%= current_user.name %></span>\n"
66+
" <%= link_to 'Logout', logout_path, method: :delete %>\n"
67+
" <% else %>\n"
68+
" <%= link_to 'Login', login_path %>\n"
69+
" <% end %>\n"
70+
" </div>\n"
71+
" </nav>\n"
72+
" <main id=\"content\">\n"
73+
" <% if flash[:notice] %>\n"
74+
" <div class=\"alert alert-info\"><%= flash[:notice] %></div>\n"
75+
" <% end %>\n"
76+
" <% if flash[:alert] %>\n"
77+
" <div class=\"alert alert-danger\"><%= flash[:alert] %></div>\n"
78+
" <% end %>\n"
79+
" <div class=\"container\">\n"
80+
" <h1><%= @heading %></h1>\n"
81+
" <table class=\"table\">\n"
82+
" <thead>\n"
83+
" <tr>\n"
84+
" <th>Name</th>\n"
85+
" <th>Email</th>\n"
86+
" <th>Role</th>\n"
87+
" <th>Status</th>\n"
88+
" <th>Actions</th>\n"
89+
" </tr>\n"
90+
" </thead>\n"
91+
" <tbody>\n"
92+
" <% @users.each do |user| %>\n"
93+
" <tr id=\"user-<%= user.id %>\" class=\"<%= cycle('odd', 'even') %>\">\n"
94+
" <td><%= user.name %></td>\n"
95+
" <td><%= mail_to user.email %></td>\n"
96+
" <td><%= user.role.titleize %></td>\n"
97+
" <td>\n"
98+
" <% if user.active? %>\n"
99+
" <span class=\"badge badge-success\">Active</span>\n"
100+
" <% elsif user.suspended? %>\n"
101+
" <span class=\"badge badge-warning\">Suspended</span>\n"
102+
" <% else %>\n"
103+
" <span class=\"badge badge-danger\">Inactive</span>\n"
104+
" <% end %>\n"
105+
" </td>\n"
106+
" <td>\n"
107+
" <%= link_to 'View', user_path(user), class: 'btn btn-sm' %>\n"
108+
" <%= link_to 'Edit', edit_user_path(user), class: 'btn btn-sm' %>\n"
109+
" <% if can?(:destroy, user) %>\n"
110+
" <%= link_to 'Delete', user_path(user), method: :delete, class: 'btn btn-sm btn-danger', data: { confirm: 'Are you sure?' } %>\n"
111+
" <% end %>\n"
112+
" </td>\n"
113+
" </tr>\n"
114+
" <% end %>\n"
115+
" </tbody>\n"
116+
" </table>\n"
117+
" <%= render 'shared/pagination', collection: @users %>\n"
118+
" </div>\n"
119+
" </main>\n"
120+
" <footer class=\"footer\">\n"
121+
" <p>&copy; <%= Time.current.year %> Company</p>\n"
122+
" </footer>\n"
123+
"</body>\n"
124+
"</html>\n";
125+
126+
typedef struct {
127+
const char* name;
128+
const char* source;
129+
} test_case_T;
130+
131+
static void run_lex_benchmark(const char* name, const char* source) {
132+
hb_allocator_T allocator = hb_allocator_with_tracking();
133+
134+
hb_array_T* tokens = herb_lex(source, &allocator);
135+
136+
hb_allocator_tracking_stats_T* stats = hb_allocator_tracking_stats(&allocator);
137+
138+
printf(" lex %-10s allocs: %-6zu deallocs: %-6zu bytes_alloc: %-8zu tokens: %zu\n",
139+
name, stats->allocation_count, stats->deallocation_count, stats->bytes_allocated, tokens->size);
140+
141+
herb_free_tokens(&tokens, &allocator);
142+
hb_allocator_destroy(&allocator);
143+
}
144+
145+
static void run_parse_benchmark(const char* name, const char* source) {
146+
hb_allocator_T allocator = hb_allocator_with_tracking();
147+
148+
AST_DOCUMENT_NODE_T* root = herb_parse(source, NULL, &allocator);
149+
150+
hb_allocator_tracking_stats_T* stats = hb_allocator_tracking_stats(&allocator);
151+
152+
printf(" parse %-9s allocs: %-6zu deallocs: %-6zu bytes_alloc: %-8zu\n",
153+
name, stats->allocation_count, stats->deallocation_count, stats->bytes_allocated);
154+
155+
ast_node_free((AST_NODE_T*) root, &allocator);
156+
hb_allocator_destroy(&allocator);
157+
}
158+
159+
int main(void) {
160+
test_case_T cases[] = {
161+
{ "small", SMALL_INPUT },
162+
{ "medium", MEDIUM_INPUT },
163+
{ "large", LARGE_INPUT },
164+
};
165+
166+
size_t num_cases = sizeof(cases) / sizeof(cases[0]);
167+
168+
printf("=== Allocation Benchmark ===\n\n");
169+
170+
for (size_t i = 0; i < num_cases; i++) {
171+
printf("[%s] (%zu bytes input)\n", cases[i].name, strlen(cases[i].source));
172+
run_lex_benchmark(cases[i].name, cases[i].source);
173+
run_parse_benchmark(cases[i].name, cases[i].source);
174+
printf("\n");
175+
}
176+
177+
return 0;
178+
}

config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ errors:
3232
message:
3333
template: "Found %s when expecting %s at (%u:%u)."
3434
arguments:
35-
- token_type_to_friendly_string(found->type)
36-
- token_type_to_friendly_string(expected_type)
35+
- hb_string(token_type_to_friendly_string(found->type))
36+
- hb_string(token_type_to_friendly_string(expected_type))
3737
- found->location.start.line
3838
- found->location.start.column
3939

ext/herb/extension.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ static VALUE Herb_lex_file(int argc, VALUE* argv, VALUE self) {
121121

122122
lex_args_T args = { 0 };
123123
args.source = read_file_to_ruby_string(file_path);
124+
char* string = (char*) check_string(args.source);
124125

125126
if (!hb_allocator_init(&args.allocator, HB_ALLOCATOR_ARENA)) { return Qnil; }
126127

127-
args.tokens = herb_lex_file(file_path, &args.allocator);
128+
args.tokens = herb_lex(string, &args.allocator);
128129

129130
if (print_arena_stats) { hb_arena_print_stats((hb_arena_T*) args.allocator.context); }
130131

ext/herb/extension_helpers.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "../../src/include/location.h"
1010
#include "../../src/include/position.h"
1111
#include "../../src/include/token.h"
12+
#include "../../src/include/util/hb_string.h"
1213

1314
const char* check_string(VALUE value) {
1415
if (NIL_P(value)) { return NULL; }
@@ -44,14 +45,19 @@ VALUE rb_range_from_c_struct(range_T range) {
4445
return rb_class_new_instance(2, args, cRange);
4546
}
4647

48+
VALUE rb_string_from_hb_string(hb_string_T string) {
49+
if (hb_string_is_null(string)) { return Qnil; }
50+
51+
return rb_utf8_str_new(string.data, string.length);
52+
}
53+
4754
VALUE rb_token_from_c_struct(token_T* token) {
4855
if (!token) { return Qnil; }
4956

50-
VALUE value = token->value ? rb_utf8_str_new_cstr(token->value) : Qnil;
51-
57+
VALUE value = rb_string_from_hb_string(token->value);
5258
VALUE range = rb_range_from_c_struct(token->range);
5359
VALUE location = rb_location_from_c_struct(token->location);
54-
VALUE type = rb_utf8_str_new_cstr(token_type_to_string(token->type));
60+
VALUE type = rb_string_from_hb_string(token_type_to_string(token->type));
5561

5662
VALUE args[4] = { value, range, location, type };
5763

ext/herb/extension_helpers.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
const char* check_string(VALUE value);
1313
VALUE read_file_to_ruby_string(const char* file_path);
14+
VALUE rb_string_from_hb_string(hb_string_T string);
1415

1516
VALUE rb_position_from_c_struct(position_T position);
1617
VALUE rb_location_from_c_struct(location_T location);

java/extension_helpers.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,24 @@
1010
#include "../../src/include/range.h"
1111
#include "../../src/include/token.h"
1212
#include "../../src/include/util/hb_array.h"
13+
#include "../../src/include/util/hb_string.h"
1314

1415
#include <stdio.h>
1516
#include <stdlib.h>
1617
#include <string.h>
1718

19+
jstring CreateStringFromHbString(JNIEnv* env, hb_string_T string) {
20+
if (hb_string_is_null(string)) { return NULL; }
21+
22+
char* c_string = hb_string_to_c_string_using_malloc(string);
23+
if (!c_string) { return NULL; }
24+
25+
jstring result = (*env)->NewStringUTF(env, c_string);
26+
free(c_string);
27+
28+
return result;
29+
}
30+
1831
jobject CreatePosition(JNIEnv* env, position_T position) {
1932
jclass positionClass = (*env)->FindClass(env, "org/herb/Position");
2033
jmethodID constructor = (*env)->GetMethodID(env, positionClass, "<init>", "(II)V");
@@ -47,8 +60,8 @@ jobject CreateToken(JNIEnv* env, token_T* token) {
4760
jmethodID constructor = (*env)->GetMethodID(
4861
env, tokenClass, "<init>", "(Ljava/lang/String;Ljava/lang/String;Lorg/herb/Location;Lorg/herb/Range;)V");
4962

50-
jstring type = (*env)->NewStringUTF(env, token_type_to_string(token->type));
51-
jstring value = (*env)->NewStringUTF(env, token->value);
63+
jstring type = CreateStringFromHbString(env, token_type_to_string(token->type));
64+
jstring value = CreateStringFromHbString(env, token->value);
5265
jobject location = CreateLocation(env, token->location);
5366
jobject range = CreateRange(env, token->range);
5467

java/extension_helpers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
#include "../../src/include/range.h"
1010
#include "../../src/include/token.h"
1111
#include "../../src/include/util/hb_array.h"
12+
#include "../../src/include/util/hb_string.h"
1213

1314
#ifdef __cplusplus
1415
extern "C" {
1516
#endif
1617

18+
jstring CreateStringFromHbString(JNIEnv* env, hb_string_T string);
1719
jobject CreatePosition(JNIEnv* env, position_T position);
1820
jobject CreateLocation(JNIEnv* env, location_T location);
1921
jobject CreateRange(JNIEnv* env, range_T range);

javascript/packages/core/src/util.ts

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,3 @@ export function ensureString(object: any): string {
55

66
throw new TypeError("Argument must be a string")
77
}
8-
9-
export function convertToUTF8(string: string) {
10-
const bytes = []
11-
12-
for (let i = 0; i < string.length; i++) {
13-
bytes.push(string.charCodeAt(i))
14-
}
15-
16-
const decoder = new TextDecoder("utf-8")
17-
18-
return decoder.decode(new Uint8Array(bytes))
19-
}

0 commit comments

Comments
 (0)