
Commit 4fde77f

C lexer usable in Bun
1 parent bdef763 commit 4fde77f

File tree

7 files changed: +166 -25 lines changed


cimp/lexer.c

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include "lexer.h"
+
+//
+// ▰ ▰ ▰ ▰ ▰ ▰ ▰
+// Export to JS
+// ▰ ▰ ▰ ▰ ▰ ▰ ▰
+//
+
+char* lex(uint32_t* text, size_t text_len) {
+  printf("[lex] Lexing %zu characters\n", text_len);
+
+  Lexer lexer;
+  lexer_init(&lexer);
+  lexer_parse_chunk(&lexer, text, text_len);
+  lexer_finish(&lexer);
+
+  char out[1024 * 1024];
+  lexer_to_js(&lexer, out, sizeof(out));
+  printf("[lex] Processed %zu tokens\n", lexer.processed_len);
+  lexer_free(&lexer);
+  // Return a copy of the output string
+  return strdup(out);
+}

cimp/lexer.h

Lines changed: 65 additions & 13 deletions
@@ -121,8 +121,61 @@ void lexer_reset(Lexer *lexer) {
  lexer->processed_len = 0;
}

-// void lexer_to_js(const Lexer *lexer, ...)
-// TODO
+/*
+ * Convert the lexer state to a JavaScript array of tokens.
+ */
+static inline void lexer_to_js(const Lexer *lexer, char *out, size_t out_size) {
+  if (!lexer || !out || out_size == 0) {
+    if (out && out_size > 0) {
+      out[0] = '\0';
+    }
+    return;
+  }
+
+  // Initialize an empty string in case there are no tokens
+  out[0] = '\0';
+  size_t current_len = 0;
+  size_t remaining_size = out_size;
+
+  // Start the JavaScript array
+  int written = snprintf(out, remaining_size, "[\n");
+  if (written < 0 || (size_t)written >= remaining_size) {
+    // Not enough space, out is now truncated but null-terminated.
+    return;
+  }
+  current_len += written;
+  remaining_size -= written;
+
+  // Iterate over each processed token
+  for (size_t i = 0; i < lexer->processed_len; ++i) {
+    // Convert the current token to its JS representation
+    // We pass the pointer to the current end of the output string
+    token_to_js(&lexer->processed[i], out + current_len, remaining_size);
+
+    // Update current length and remaining size
+    size_t token_len = strlen(out + current_len);
+    current_len += token_len;
+
+    if (current_len >= out_size - 1) {
+      // Buffer is full, we can't add a comma or the closing bracket.
+      return;
+    }
+    remaining_size = out_size - current_len;
+
+    // Add a comma if this is not the last token
+    if (i < lexer->processed_len - 1) {
+      written = snprintf(out + current_len, remaining_size, ",\n");
+      if (written < 0 || (size_t)written >= remaining_size) {
+        return; // Not enough space for comma
+      }
+      current_len += written;
+      remaining_size -= written;
+    }
+  }
+
+  // Close the JavaScript array
+  snprintf(out + current_len, remaining_size, "\n]");
+}

/*
 * Transition to a new lexer state.
@@ -166,8 +219,8 @@ static void lexer__commit(Lexer *lexer) {
           last_pos - token->pos_start, token->pos_start, last_pos);
  } else {
    const char *name = token_name_utf8(token);
-    printf("[Lexer__commit] Commit token: name=%s, type: %d, pos: %zu-%zu\n",
-           name, token->type, token->pos_start, last_pos);
+    printf("[Lexer__commit] Commit token type: %d: name=%s, params: %zu, pos: %zu-%zu\n",
+           token->type, name, token->param_len, token->pos_start, last_pos);
  }

  // Check if we need to reallocate the processed tokens array
@@ -181,27 +234,25 @@ static void lexer__commit(Lexer *lexer) {
    }
  }

-  // Add the pending token to the processed tokens
-  // This creates a copy, so the pending token must
-  // be freed or reset later
+  // Shallow copy the pending token to the processed tokens
  lexer->processed[lexer->processed_len++] = *token;
-  // Re-create the pending token
-  token_reset(&lexer->pendNode);
-  // lexer->pendNode = *token_create();
+  // Create a new pending token for the next cycle
+  // TODO :: check if this causes memory leaks !!
+  lexer->pendNode = *token_create();
  lexer->pendNode.pos_start = last_pos;
  lexer->pendNode.pos_end = last_pos;
}

static inline void lexer__commit_param(Lexer *lexer) {
  token_param_append(&lexer->pendNode, &lexer->pendParam);
  // Re-create the pending parameter
-  // Maybe is should be reset instead?
+  // TODO :: check if this causes memory leaks !!
  lexer->pendParam = *param_create();
}

static inline void lexer__parse_one(Lexer *lexer, uint32_t curr, uint32_t prev) {
-  // printf("i=%ld - STATE :: %u ;; new CHAR :: (%d) ;; prev CHAR :: (%d)\n",
-  //        lexer->index, lexer->state, (int)curr, (int)prev);
+  printf("i=%ld - STATE :: %u ;; new CHAR :: (%d) ;; prev CHAR :: (%d)\n",
+         lexer->index, lexer->state, (int)curr, (int)prev);

  if (lexer->state == STATE_RAW_TEXT) {
    // Could this be the beginning of a new tag?
@@ -389,6 +440,7 @@ static inline void lexer__parse_one(Lexer *lexer, uint32_t curr, uint32_t prev)
    }
    // Is this a valid closing quote?
    else if (curr == value_0 && is_quote(curr) && value_z != '\\') {
+      param_val_append(&lexer->pendParam, curr);
      lexer__commit_param(lexer);
      lexer__transition(lexer, STATE_INSIDE_TAG);
    }

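A note on the new lexer_to_js helper: it concatenates each token's token_to_js output into a single bracketed, comma-separated JS array literal. The sketch below is a hand-written illustration of that string, not actual lexer output; the field values are made up, and the key style follows the {type:...,pos_start:...,pos_end:...} shape asserted in test_tok.c and the example comment in token.h. Because the keys are unquoted, the string is a JS literal rather than JSON, which is why lexer2.ts (below) converts it with eval:

// Illustrative only: a hand-written sample of the string lexer_to_js builds.
const sample = `[
{type:1,pos_start:0,pos_end:16,name:'div',params:[{id:'main'}]},
{type:0,pos_start:16,pos_end:22}
]`;

// JSON.parse would reject the unquoted keys; eval (or new Function) accepts them.
console.log(eval(sample));
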
cimp/lexer2.ts

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import { dlopen, FFIType, ptr } from "bun:ffi";
+const path = "liblexer.so";
+const {
+  symbols: { lex },
+} = dlopen(path, {
+  lex: {
+    args: ["ptr", "usize"],
+    returns: FFIType.cstring,
+  },
+});
+
+// Method 2: Using Bun's FFI with a C source file
+// import { cc, FFIType, ptr } from "bun:ffi";
+// import source from "./lexer.c" with { type: "file" };
+// const {
+//   symbols: { lex },
+// } = cc({
+//   source,
+//   symbols: {
+//     lex: {
+//       args: ["ptr", "usize"],
+//       returns: FFIType.cstring,
+//     },
+//   },
+// });
+
+const buff = new Uint32Array(Buffer.from("hello\0", "utf8"));
+const result = lex(ptr(buff), buff.length);
+console.log(eval(result.toString()));

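One caveat about lexer2.ts above: new Uint32Array(Buffer.from("hello\0", "utf8")) widens each UTF-8 byte into one array element, which matches the one-codepoint-per-element buffer that lex() expects only for ASCII input. Below is a minimal sketch of a converter that would also cover the non-ASCII strings exercised in test_lex.c; the toUtf32 helper is hypothetical, not part of this commit, and the snippet assumes liblexer.so has already been built from cimp/lexer.c:

import { dlopen, FFIType, ptr } from "bun:ffi";

// Same binding as in lexer2.ts above.
const { symbols: { lex } } = dlopen("liblexer.so", {
  lex: { args: ["ptr", "usize"], returns: FFIType.cstring },
});

// Hypothetical helper: one UTF-32 codepoint per element, with a trailing 0
// to mirror the "hello\0" buffer used above.
function toUtf32(text: string): Uint32Array {
  const codepoints = [...text].map((ch) => ch.codePointAt(0)!);
  return new Uint32Array([...codepoints, 0]);
}

const input = toUtf32('French <àéìòù/>');
const result = lex(ptr(input), input.length);
console.log(eval(result.toString()));
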
cimp/main.c

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+#include "lexer.h"
+
+int main(int argc, char *argv[]) {
+  // The program name is argv[0], first arg is argv[1].
+  // The app expects exactly one argument: the filename.
+  if (argc != 2) {
+    // Print usage instructions to standard error if the argument is missing.
+    fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
+    return 1;
+  }
+
+  Lexer lexer;
+  lexer_init(&lexer);
+
+  const char *filename = argv[1];
+  printf("Processing file: %s\n", filename);
+  lexer_parse_file(&lexer, filename);
+
+  char out[1024 * 1024];
+  lexer_to_js(&lexer, out, sizeof(out));
+  printf("Lexer tokens: %s\n", out);
+
+  lexer_free(&lexer);
+  return 0;
+}

cimp/test/test_lex.c

Lines changed: 18 additions & 5 deletions
@@ -40,6 +40,17 @@ void test_simple_single(void) {
  TEST_ASSERT_EQUAL(5, lexer.processed[1].pos_end);
  TEST_ASSERT_EQUAL_STRING("b", token_name_utf8(&lexer.processed[1]));
  lexer_reset(&lexer);
+
+  lexer_parse_chunk(&lexer, U"<div id=\"main\"/>", 0);
+  TEST_ASSERT_EQUAL(16, lexer.index);
+  TEST_ASSERT_EQUAL(1, lexer.processed_len);
+  TEST_ASSERT_EQUAL(TYPE_SINGLE_TAG, lexer.processed[0].type);
+  TEST_ASSERT_EQUAL(0, lexer.processed[0].pos_start);
+  TEST_ASSERT_EQUAL(16, lexer.processed[0].pos_end);
+
+  char out[96];
+  lexer_to_js(&lexer, out, sizeof(out));
+  printf("Lexer output: %s\n", out);
}

void test_simple_double(void) {
@@ -71,7 +82,6 @@ void test_simple_double(void) {
void test_unicode_chars(void) {
  lexer_init(&lexer);
  lexer_parse_text(&lexer, (const uint32_t *)U"French <àéìòùÀÉÌÒÙ/> German <äöüßÄÖÜ/>\0");
-  // lexer_display(&lexer);
  TEST_ASSERT_EQUAL(38, lexer.index);
  TEST_ASSERT_EQUAL(4, lexer.processed_len);
  TEST_ASSERT_EQUAL(TYPE_RAW_TEXT, lexer.processed[0].type);
@@ -100,10 +110,13 @@ void test_unicode_chars(void) {
void test_parse_file(void) {
  lexer_init(&lexer);
  lexer_parse_file(&lexer, "fixtures/menu.xml");
-  // lexer_display(&lexer);
  TEST_ASSERT_EQUAL(250, lexer.index);
  TEST_ASSERT_EQUAL(27, lexer.processed_len);

+  // char out[1200];
+  // lexer_to_js(&lexer, out, sizeof(out));
+  // printf("Lexer output: %s\n", out);
+
  TEST_ASSERT_EQUAL(TYPE_RAW_TEXT, lexer.processed[0].type);
  TEST_ASSERT_EQUAL(TYPE_DOUBLE_TAG, lexer.processed[1].type);
  TEST_ASSERT_EQUAL_STRING(U"breakfast_menu", lexer.processed[1].name);
@@ -122,8 +135,8 @@ void test_parse_file(void) {

//
// ▰ ▰ ▰ ▰ ▰ ▰ ▰
-// Serious testss
-// ▰ ▰ ▰ ▰ ▰ ▰ ▰
+// Serious testz
+// ▰ ▰ ▰ ▰ ▰ ▰ ▰
//

typedef struct {
@@ -279,6 +292,6 @@ int main(void) {
  RUN_TEST(test_simple_double);
  RUN_TEST(test_unicode_chars);
  RUN_TEST(test_parse_file);
-  // RUN_TEST(test_input_output);
+  RUN_TEST(test_input_output);
  return UNITY_END();
}

cimp/test/test_tok.c

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ void test_token_param_append(void) {

  char out[64];
  token_to_js(token, out, sizeof(out));
-  TEST_ASSERT_EQUAL_STRING("{type:0,pos_start:0,pos_end:0}", out);
+  TEST_ASSERT_EQUAL_STRING("{}", out);
  token->type = TYPE_SINGLE_TAG; // Set type for testing
  token->pos_end = 10;
  token_to_js(token, out, sizeof(out));

cimp/token.h

Lines changed: 5 additions & 6 deletions
@@ -72,7 +72,7 @@ static inline bool param_val_append(LexParam *param, uint32_t codepoint) {

static inline void param_to_js(const LexParam *param, char *out, size_t out_size) {
  if (!param || !out || out_size == 0) return;
-  if (param->key_len == 0 && param->val.len == 0) {
+  if (param->key_len == 0 || param->val.len == 0) {
    snprintf(out, out_size, "{}");
    return; // Empty param
  }
@@ -206,7 +206,7 @@ static inline bool token_grow_params(LexToken *tok) {
 */
static inline const char *token_name_utf8(const LexToken *tok) {
  if (!tok || tok->name_len == 0) return NULL; // No name
-  static char utf8_name[MAX_NAME_LEN * 4]; // Enough for UTF-8 encoding
+  static char utf8_name[MAX_NAME_LEN * 4];  // Enough for UTF-8 encoding
  size_t pos = 0;
  for (size_t i = 0; i < tok->name_len && pos < sizeof(utf8_name) - 1; i++) {
    char temp[4];
@@ -238,8 +238,6 @@ static inline bool token_param_append(LexToken *tok, LexParam *p) {
  if (tok->param_len >= tok->param_cap) {
    if (!token_grow_params(tok)) return false; // Allocation failed
  }
-  // printf("[Token_param_append] Appending param: key_len=%zu, val_len=%zu\n",
-  //        p->key_len, p->val.len);
  tok->params[tok->param_len++] = *p; // Copy the Param
  return true;
}
@@ -253,7 +251,7 @@ static inline void token_to_js(const LexToken *tok, char *out, size_t out_size)
  // Example double tag:
  // {type: 2, pos_start: 0, pos_end: 10, name: 'name2', params: [{param_key: 'param_value'}]}
  if (!tok || !out || out_size == 0) return;
-  if (tok->name_len == 0 && tok->param_len == 0) {
+  if (tok->pos_end <= tok->pos_start) {
    snprintf(out, out_size, "{}");
    return; // Empty token
  }
@@ -273,7 +271,8 @@ static inline void token_to_js(const LexToken *tok, char *out, size_t out_size)
  if (tok->param_len > 0) {
    pos += snprintf(out + pos, out_size - pos, ",params:[{");
    for (size_t i = 0; i < tok->param_len; ++i) {
-      char param_js[256]; // Buffer for a single parameter's JS representation
+      // Limited buffer for a single parameter's JS representation
+      char param_js[1024];
      param_to_js(&tok->params[i], param_js, sizeof(param_js));
      pos += snprintf(out + pos, out_size - pos, "%s", param_js);
      if (i < tok->param_len - 1) {

0 commit comments
