Skip to content

Commit d7936fe

Browse files
committed
Save work
1 parent 75e21df commit d7936fe

File tree

12 files changed

+5556
-0
lines changed

12 files changed

+5556
-0
lines changed

.clang-format

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
BasedOnStyle: Google
2+
IndentWidth: 4
3+
ColumnLimit: 0

.gitignore

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,20 @@ dist
128128
.yarn/build-state.yml
129129
.yarn/install-state.gz
130130
.pnp.*
131+
132+
# Executables
133+
*.exe
134+
*.out
135+
*.app
136+
*.i*86
137+
*.x86_64
138+
*.hex
139+
140+
# Debug files
141+
*.dSYM/
142+
*.su
143+
*.idb
144+
*.pdb
145+
146+
# Twofold specific
147+
cimg/lexer

cimp/fixtures/cyril.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Доброе утро мир (Good morning world)
2+
Здравствуй и прощай! (Hello, and goodbye!)

cimp/fixtures/greek.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Καλημέρα κόσμε (Good morning world)
2+
Γεια σας, και αντίο! (Hëllö, and goodbye!)

cimp/lexer.h

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#include <stdbool.h>
2+
#include <stdint.h>
3+
#include <stdlib.h>
4+
5+
#include "token.h"
6+
7+
#define OPEN_TAG_CHAR '<'
8+
#define CLOSE_TAG_CHAR '>'
9+
#define OPEN_EXPR_CHAR '{'
10+
#define CLOSE_EXPR_CHAR '}'
11+
#define LAST_STOPPER_CHAR '/'
12+
13+
// Reports whether `c` is horizontal whitespace: space, tab, form feed or
// vertical tab. Line terminators are not included.
static inline bool is_space(uint32_t c) {
    return c == ' ' || c == '\t' || c == '\f' || c == '\v';
}
17+
18+
// Reports whether `c` is a line terminator: line feed or carriage return.
static inline bool is_newline(uint32_t c) {
    return c == '\n' || c == '\r';
}
22+
23+
// Reports whether `c` is a quote character: single quote, double quote
// or backtick.
static inline bool is_quote(uint32_t c) {
    return c == '\'' || c == '"' || c == '`';
}
27+
28+
// lower latin, greek & cyrillic alphabet
29+
// the beginning of a tag, or param name
30+
// Reports whether code point `c` is a lowercase Latin, Greek or Cyrillic
// letter. U+00F7 (the division sign) sits inside the Latin-1 lowercase range
// but is not a letter, so it is rejected explicitly.
static inline bool is_allowed_start(uint32_t c) {
    return (c >= 'a' && c <= 'z') ||              // a-z
           (c >= 224 && c <= 255 && c != 247) ||  // à-ÿ, except ÷
           (c >= 940 && c <= 974) ||              // ά-ώ
           (c >= 1072 && c <= 1103);              // а-я
}
36+
37+
// arabic numbers, all latin, greek & cyrillic
38+
// inside the tag name, or param name
39+
// Reports whether code point `c` is an ASCII digit, an underscore, or a
// Latin, Greek or Cyrillic letter of either case. U+00D7 (multiplication
// sign) and U+00F7 (division sign) lie inside the Latin-1 letter range but
// are not letters, so they are rejected explicitly.
static inline bool is_allowed_alpha(uint32_t c) {
    return (c >= '0' && c <= '9') ||                          // 0-9
           (c >= 'A' && c <= 'Z') ||                          // A-Z
           (c >= 'a' && c <= 'z') ||                          // a-z
           c == '_' ||
           (c >= 192 && c <= 255 && c != 215 && c != 247) ||  // À-ÿ, except × ÷
           (c >= 904 && c <= 974) ||                          // Έ-Ω ά-ώ
           (c >= 1040 && c <= 1103);                          // А-Я а-я
}
47+
48+
// States of the lexer state machine. The numeric values are spelled out so
// they stay stable if entries are reordered; note that 7 is not assigned.
typedef enum {
    STATE_RAW_TEXT = 0,
    STATE_OPEN_TAG = 1,
    STATE_CLOSE_TAG = 2,
    STATE_TAG_NAME = 3,
    STATE_INSIDE_TAG = 4,
    STATE_PARAM_NAME = 5,
    STATE_PARAM_VALUE = 6,
    STATE_EQUAL = 8,
    STATE_FINAL = 9,
} LexerState;
59+
60+
// Lexer struct
61+
// ▰ ▰ ▰ ▰ ▰ ▰ ▰
62+
typedef struct {
63+
// curr index in the text buffer
64+
size_t index;
65+
LexerState state;
66+
LexerState priorState;
67+
size_t processed_len;
68+
size_t processed_cap;
69+
LexParam pendParam;
70+
LexToken pendNode;
71+
LexToken *processed;
72+
} Lexer;
73+
// ▰ ▰ ▰
74+
75+
// Initialize the lexer with default values
76+
// Puts `lexer` into its starting state and allocates the token buffer
// (96 entries, zero-filled). A NULL `lexer` is reported on stderr and ignored.
// If the buffer cannot be allocated, the failure is reported on stderr and
// `processed_cap` is set to 0 so the capacity never claims storage that does
// not exist.
void lexer_init(Lexer *lexer) {
    if (!lexer) {
        fprintf(stderr, "Lexer pointer is NULL\n");
        return;
    }
    printf("[Lexer_init] Initializing lexer\n");
    lexer->index = 0;
    lexer->state = STATE_RAW_TEXT;
    lexer->priorState = STATE_RAW_TEXT;
    lexer->processed_len = 0;
    lexer->processed_cap = 96;
    // NOTE(review): the objects returned by param_create()/token_create() are
    // copied by value and the returned pointers are dropped. If those
    // functions heap-allocate, the containers leak; if they can return NULL,
    // this dereference crashes. Confirm their contract.
    lexer->pendParam = *param_create();
    lexer->pendNode = *token_create();
    lexer->processed = calloc(lexer->processed_cap, sizeof *lexer->processed);
    if (!lexer->processed) {
        fprintf(stderr, "[Lexer_init] Failed to allocate token buffer\n");
        lexer->processed_cap = 0;
    }
}
92+
93+
// Releases the resources owned by `lexer`: the token buffer, the pending
// parameter's value string and the pending token. The Lexer object itself is
// not freed, as it might be on the stack or part of another struct.
// The buffer pointer and its counters are cleared afterwards so that calling
// lexer_free() again does not free the buffer a second time.
void lexer_free(Lexer *lexer) {
    if (!lexer) {
        fprintf(stderr, "Lexer pointer is NULL\n");
        return;
    }
    // NOTE(review): the tokens stored in `processed` are not released
    // individually here; confirm whether they own heap memory.
    free(lexer->processed);
    lexer->processed = NULL;
    lexer->processed_len = 0;
    lexer->processed_cap = 0;
    str_free(&lexer->pendParam.val);
    token_free(&lexer->pendNode);
    printf("[Lexer_free] Freed lexer resources\n");
}
105+
106+
// Returns `lexer` to its starting state without releasing the token buffer:
// the position and both states are rewound, the pending parameter and token
// are reset, and every token currently in use is reset before the count is
// set back to zero. A NULL `lexer` is reported on stderr and ignored.
void lexer_reset(Lexer *lexer) {
    if (!lexer) {
        fprintf(stderr, "Lexer pointer is NULL\n");
        return;
    }
    printf("\n[Lexer__reset] Resetting lexer state\n\n");
    lexer->index = 0;
    lexer->state = STATE_RAW_TEXT;
    lexer->priorState = STATE_RAW_TEXT;
    param_reset(&lexer->pendParam);
    token_reset(&lexer->pendNode);
    for (size_t i = 0; i < lexer->processed_len; i++) {
        token_reset(&lexer->processed[i]);
    }
    lexer->processed_len = 0;
}

cimp/str8.h

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
5+
6+
// Returns the sequence length (1-4) implied by the first byte of a UTF-8
// sequence. The byte is NOT validated: continuation bytes (0x80-0xBF) map
// to 2 and 0xF8-0xFF map to 4.
static inline int utf8_len_byte0(unsigned char c) {
    if (c < 0x80) return 1;
    if (c < 0xE0) return 2;
    if (c < 0xF0) return 3;
    return 4;
}

// Decodes the UTF-8 sequence starting at `s`.
//
// On success returns the code point and stores the number of bytes consumed
// (1-4) in `*bytes`. On malformed input returns U+FFFD and stores 1 in
// `*bytes`, so a caller can skip one byte and resynchronise. Malformed input
// is: a stray continuation byte, an invalid lead byte, a missing or invalid
// continuation byte, an overlong encoding, a surrogate, or a value above
// U+10FFFF.
//
// The incoming value of `*bytes` is not read. `s` must be NUL-terminated or
// otherwise hold a complete sequence: decoding stops at the first byte that
// is not a continuation byte, which includes the terminator.
static inline uint32_t utf8_decode(const char *s, size_t *bytes) {
    const unsigned char *p = (const unsigned char *)s;
    unsigned char lead = p[0];

    *bytes = 1;  // ASCII, and the amount to skip on every error path
    if (lead < 0x80) {
        return lead;
    }
    // 0x80-0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
    // forms, and 0xF5-0xFF would encode values above U+10FFFF.
    if (lead < 0xC2 || lead > 0xF4) {
        return 0xFFFD;
    }

    int len = utf8_len_byte0(lead);
    uint32_t cp;
    uint32_t min_cp;  // smallest code point that needs `len` bytes
    switch (len) {
        case 2:
            cp = (uint32_t)(lead & 0x1F) << 6;
            min_cp = 0x80;
            break;
        case 3:
            cp = (uint32_t)(lead & 0x0F) << 12;
            min_cp = 0x800;
            break;
        case 4:
            cp = (uint32_t)(lead & 0x07) << 18;
            min_cp = 0x10000;
            break;
        default:
            return 0xFFFD;  // invalid
    }

    for (int i = 1; i < len; ++i) {
        unsigned char c = p[i];
        if ((c & 0xC0) != 0x80) return 0xFFFD;
        cp |= (uint32_t)(c & 0x3F) << (6 * (len - 1 - i));
    }

    if (cp < min_cp || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
        return 0xFFFD;
    }
    *bytes = (size_t)len;
    return cp;
}
45+
46+
// Encodes one code-point. Returns number of bytes written (1-4).
47+
// Writes the UTF-8 encoding of `cp` into `bytes` (no NUL terminator is
// added) and returns the number of bytes written (1-4). Returns 0 and writes
// nothing when `cp` is not encodable: a surrogate (U+D800-U+DFFF) or a value
// above U+10FFFF.
static size_t utf8_encode(uint32_t cp, char bytes[4]) {
    if (cp >= 0xD800 && cp <= 0xDFFF) {
        // Surrogates are reserved for UTF-16 and have no UTF-8 form
        return 0;
    }
    if (cp <= 0x7F) {
        // 1-byte sequence (ASCII)
        bytes[0] = (char)cp;
        return 1;
    }
    if (cp <= 0x7FF) {
        // 2-byte sequence
        bytes[0] = (char)(0xC0 | (cp >> 6));
        bytes[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp <= 0xFFFF) {
        // 3-byte sequence
        bytes[0] = (char)(0xE0 | (cp >> 12));
        bytes[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        bytes[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    if (cp <= 0x10FFFF) {
        // 4-byte sequence
        bytes[0] = (char)(0xF0 | (cp >> 18));
        bytes[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
        bytes[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
        bytes[3] = (char)(0x80 | (cp & 0x3F));
        return 4;
    }
    // Beyond the Unicode range
    return 0;
}
77+
78+
#define STR_MIN_CAPACITY 24
79+
80+
// String8 struct
81+
// ▰ ▰ ▰ ▰ ▰ ▰ ▰
82+
typedef struct {
    char *data;       // UTF-8 bytes; kept NUL-terminated at data[byte_len]
    size_t len;       // number of codepoints (not bytes!)
    size_t cap;       // allocated bytes, including room for the terminator
    size_t byte_len;  // current number of bytes used, excluding the terminator
} String8;
88+
// ▰ ▰ ▰ ▰
89+
90+
String8 *str_new(size_t initial_capacity) {
91+
if (initial_capacity < STR_MIN_CAPACITY)
92+
initial_capacity = STR_MIN_CAPACITY;
93+
String8 *s8 = calloc(1, sizeof(String8));
94+
if (!s8)
95+
return NULL;
96+
s8->cap = initial_capacity;
97+
s8->data = calloc(s8->cap, sizeof(char));
98+
if (!s8->data) {
99+
free(s8);
100+
return NULL;
101+
}
102+
return s8;
103+
}
104+
105+
// Releases the byte buffer owned by `s8` and leaves it as an empty string
// with no storage, so a repeated str_free() on the same object is harmless.
// The String8 itself is NOT freed: the caller is responsible for that, as it
// might be on the stack or part of another struct. NULL is ignored.
void str_free(String8 *s8) {
    if (!s8) {
        return;
    }
    free(s8->data);
    s8->data = NULL;
    s8->len = 0;
    s8->byte_len = 0;
    s8->cap = 0;
}
112+
113+
// Empties `s8` without releasing its buffer: both lengths become zero and,
// when a buffer exists, it is re-terminated at the first byte. NULL is
// ignored.
void str_clear(String8 *s8) {
    if (!s8) {
        return;
    }
    if (s8->data) {
        s8->data[0] = '\0';
    }
    s8->byte_len = 0;
    s8->len = 0;
}
120+
121+
static inline size_t str_length(const String8 *s8) {
122+
return s8->len;
123+
}
124+
125+
static inline uint32_t str_first_codepoint(const String8 *s8) {
126+
if (s8->len == 0) return 0;
127+
size_t len = s8->byte_len;
128+
return utf8_decode(s8->data, &len);
129+
}
130+
131+
static uint32_t str_last_codepoint(const String8 *s8) {
132+
if (s8->len == 0) return 0;
133+
size_t pos = s8->byte_len;
134+
while (pos > 0) {
135+
unsigned char b = s8->data[pos - 1];
136+
if ((b & 0xC0) != 0x80) { // not a continuation byte
137+
size_t len = s8->byte_len - (pos - 1);
138+
return utf8_decode(s8->data + pos - 1, &len);
139+
}
140+
pos--;
141+
}
142+
return 0xFFFD;
143+
}
144+
145+
// Makes sure `s8` can hold `needed` more bytes plus the NUL terminator,
// growing the buffer by doubling when it cannot. Returns false, leaving the
// string untouched, if the required size does not fit in size_t or the
// reallocation fails. A string with no storage yet (cap == 0 or data == NULL)
// is handled: growth then starts from one byte.
static bool str_ensure_capacity(String8 *s8, size_t needed) {
    // byte_len + needed + 1 must not wrap around
    if (needed >= SIZE_MAX - s8->byte_len) return false;
    size_t required = s8->byte_len + needed + 1;  // +1 for the null terminator
    if (s8->data && required <= s8->cap) return true;

    size_t new_cap = s8->cap ? s8->cap : 1;
    while (new_cap < required) {
        if (new_cap > SIZE_MAX / 2) {
            // Doubling would wrap; settle for exactly what is required
            new_cap = required;
            break;
        }
        new_cap *= 2;
    }
    char *new_data = realloc(s8->data, new_cap);
    if (!new_data) return false;
    s8->data = new_data;
    s8->cap = new_cap;
    return true;
}
158+
159+
// Appends codepoint `cp` to `s8` as UTF-8 and keeps the buffer
// NUL-terminated. Returns false, leaving the string unchanged, when `cp`
// cannot be encoded or the buffer cannot grow. A trace line is printed to
// stdout on every call.
static bool str_append_uint32(String8 *s8, uint32_t cp) {
    char temp[4];
    printf("str_append_uint32 codepoint: %u\n", cp);
    size_t bytes = utf8_encode(cp, temp);
    if (bytes == 0) return false;  // invalid codepoint
    if (!str_ensure_capacity(s8, bytes)) return false;
    // Copy the encoded bytes to the end of the string
    memcpy(s8->data + s8->byte_len, temp, bytes);
    s8->byte_len += bytes;
    s8->data[s8->byte_len] = '\0';
    s8->len++;
    return true;
}
172+
173+
bool str_equal(const String8 *a, const String8 *b) {
174+
if (!a || !b) {
175+
return (a == b); // both NULL → equal, one NULL → not
176+
}
177+
if (a->len != b->len) {
178+
return false; // different number of codepoints
179+
}
180+
if (a->byte_len != b->byte_len) {
181+
return false; // different byte length → can't be equal
182+
}
183+
// If byte lengths are same and codepoint counts are same,
184+
// do byte comparison — because UTF-8 is deterministic.
185+
// Same codepoints → same encoding → same bytes.
186+
return memcmp(a->data, b->data, a->byte_len) == 0;
187+
}
188+
189+
// void str_append_utf8(String8 *s8, const char *utf8_str) {
190+
// if (!utf8_str) return;
191+
// const char *p = utf8_str;
192+
// while (*p) {
193+
// size_t len = strlen(p);
194+
// uint32_t cp = utf8_decode(p, &len);
195+
// if (cp == 0xFFFD) len = 1; // skip invalid
196+
// str_append_uint32(s8, cp);
197+
// p += len;
198+
// }
199+
// }
200+
201+
// Appends the whole content of `filename` to `fs` and updates its byte and
// codepoint counts. Every malformed byte is counted as one codepoint.
//
// Returns true on success; an empty file is a success that leaves `fs`
// untouched. Returns false, leaving the content of `fs` unchanged, when the
// file cannot be opened, its size cannot be determined, the buffer cannot
// grow, or reading fails.
bool str_read_file(String8 *fs, const char *filename) {
    FILE *f = fopen(filename, "rb");  // binary: bytes are read untranslated
    if (!f) return false;

    // Determine the file size
    if (fseek(f, 0, SEEK_END) != 0) {
        fclose(f);
        return false;
    }
    long fsize = ftell(f);
    if (fsize < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fclose(f);
        return false;
    }
    if (fsize == 0) {
        fclose(f);
        return true;  // empty file is OK
    }
    if (!str_ensure_capacity(fs, (size_t)fsize)) {
        fclose(f);
        return false;  // allocation failed
    }

    char *start = fs->data + fs->byte_len;
    size_t bytes_read = fread(start, 1, (size_t)fsize, f);
    bool read_failed = ferror(f) != 0;
    fclose(f);
    if (read_failed) {
        // fread may have overwritten the old terminator; put it back
        start[0] = '\0';
        return false;
    }

    // Terminate BEFORE decoding: the decoder stops at the first byte that is
    // not a continuation byte, so a truncated sequence at the end of the file
    // can never make it read past the data that was just loaded.
    start[bytes_read] = '\0';

    // Walk the new bytes to update the codepoint count
    const char *p = start;
    size_t remaining = bytes_read;
    while (remaining > 0) {
        size_t len = remaining;
        uint32_t cp = utf8_decode(p, &len);
        // U+FFFD doubles as the decoder's error value; only a literal
        // EF BF BD in the file is a genuine three-byte codepoint.
        bool literal_fffd = remaining >= 3 && memcmp(p, "\xEF\xBF\xBD", 3) == 0;
        if (cp == 0xFFFD && !literal_fffd) len = 1;
        if (len == 0 || len > remaining) len = 1;  // never step outside the data
        fs->len++;
        p += len;
        remaining -= len;
    }

    fs->byte_len += bytes_read;
    return true;
}

0 commit comments

Comments
 (0)