ShinyTrinkets
diff --git a/‎.clang-format‎
Lines changed: 3 additions & 0 deletions b/‎.clang-format‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 17 additions & 0 deletions b/‎.gitignore‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎cimp/fixtures/cyril.txt‎
Lines changed: 2 additions & 0 deletions b/‎cimp/fixtures/cyril.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cimp/fixtures/greek.txt‎
Lines changed: 2 additions & 0 deletions b/‎cimp/fixtures/greek.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cimp/lexer.h‎
Lines changed: 121 additions & 0 deletions b/‎cimp/lexer.h‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎cimp/str8.h‎
Lines changed: 238 additions & 0 deletions b/‎cimp/str8.h‎
Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,3 @@
+BasedOnStyle: Google
+IndentWidth: 4
+ColumnLimit: 0
@@ -128,3 +128,20 @@ dist
 .yarn/build-state.yml
 .yarn/install-state.gz
 .pnp.*
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Twofold specific
+cimp/lexer
@@ -0,0 +1,2 @@
+Доброе утро мир (Good morning world)
+Здравствуй и прощай! (Hello, and goodbye!)
@@ -0,0 +1,2 @@
+Καλημέρα κόσμε (Good morning world)
+Γεια σας, και αντίο! (Hëllö, and goodbye!)
@@ -0,0 +1,121 @@
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "token.h"
+
+#define OPEN_TAG_CHAR '<'      // delimiter literal; by its name, the character that starts a tag
+#define CLOSE_TAG_CHAR '>'     // delimiter literal; by its name, the character that ends a tag
+#define OPEN_EXPR_CHAR '{'     // delimiter literal; by its name, the character that starts an expression
+#define CLOSE_EXPR_CHAR '}'    // delimiter literal; by its name, the character that ends an expression
+#define LAST_STOPPER_CHAR '/'  // NOTE(review): presumably the slash of closing/self-closing tags; no use is visible in this header - confirm
+
+// True when `c` is a space, tab, form feed or vertical tab.
+// Line breaks ('\n', '\r') are not matched.
+static inline bool is_space(uint32_t c) {
+    switch (c) {
+        case ' ':
+        case '\t':
+        case '\f':
+        case '\v':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when `c` is a line feed or a carriage return.
+static inline bool is_newline(uint32_t c) {
+    if (c == '\n') {
+        return true;
+    }
+    return c == '\r';
+}
+
+// True when `c` is one of the three quote characters: ' " `
+static inline bool is_quote(uint32_t c) {
+    switch (c) {
+        case '\'':
+        case '"':
+        case '`':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// lower latin, greek & cyrillic alphabet
+// the beginning of a tag, or param name
+//
+// Returns true when the code point `c` is a lowercase letter from one of the
+// ranges below. The division sign U+00F7 falls inside the Latin-1 range but
+// is not a letter, so it is rejected explicitly.
+static inline bool is_allowed_start(uint32_t c) {
+    if (c == 0xF7) return false;      // ÷ is a math sign, not a letter
+    return (c >= 'a' && c <= 'z') ||  // a-z
+           (c >= 224 && c <= 255) ||  // à-ÿ  (U+00E0..U+00FF)
+           (c >= 940 && c <= 974) ||  // ά-ώ  (U+03AC..U+03CE)
+           (c >= 1072 && c <= 1103);  // а-я  (U+0430..U+044F)
+}
+
+// arabic numbers, all latin, greek & cyrillic
+// inside the tag name, or param name
+//
+// Returns true when the code point `c` is a digit, an underscore, or a letter
+// (either case) from one of the ranges below. The multiplication sign U+00D7
+// and the division sign U+00F7 fall inside the Latin-1 range but are not
+// letters, so they are rejected explicitly.
+static inline bool is_allowed_alpha(uint32_t c) {
+    if (c == 0xD7 || c == 0xF7) return false;  // × and ÷ are math signs
+    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
+           (c >= 'a' && c <= 'z') || c == '_' ||
+           (c >= 192 && c <= 255) ||  // À-ÿ  (U+00C0..U+00FF)
+           (c >= 904 && c <= 974) ||  // Έ-ώ  (U+0388..U+03CE)
+           (c >= 1040 && c <= 1103);  // А-я  (U+0410..U+044F)
+}
+
+// States of the lexer. Every value is spelled out; note that 7 is not
+// assigned to any state.
+typedef enum {
+    STATE_RAW_TEXT = 0,
+    STATE_OPEN_TAG = 1,
+    STATE_CLOSE_TAG = 2,
+    STATE_TAG_NAME = 3,
+    STATE_INSIDE_TAG = 4,
+    STATE_PARAM_NAME = 5,
+    STATE_PARAM_VALUE = 6,
+    STATE_EQUAL = 8,
+    STATE_FINAL = 9,
+} LexerState;
+
+// Lexer struct: cursor, current/previous state, pending items and the token array
+// ▰ ▰ ▰ ▰ ▰ ▰ ▰
+typedef struct {
+    // curr index in the text buffer
+    size_t index;
+    LexerState state;       // current state; lexer_init/lexer_reset set it to STATE_RAW_TEXT
+    LexerState priorState;  // NOTE(review): presumably the state before the latest transition - confirm in the lexer code
+    size_t processed_len;   // number of entries of `processed` in use
+    size_t processed_cap;   // number of entries allocated for `processed`
+    LexParam pendParam;     // by its name, the param being built - TODO confirm
+    LexToken pendNode;      // by its name, the token being built - TODO confirm
+    LexToken *processed;    // heap array allocated in lexer_init, released in lexer_free
+} Lexer;
+// ▰ ▰ ▰
+
+// Initialize the lexer with default values.
+//
+// Puts `lexer` at index 0 in STATE_RAW_TEXT, seeds the pending param and
+// token from param_create()/token_create(), and allocates a zeroed array of
+// 96 tokens. A NULL `lexer` is reported on stderr and ignored.
+//
+// Failure handling: when param_create()/token_create() returns NULL the
+// matching member is zero-filled instead of dereferencing NULL; when the
+// token array cannot be allocated, `processed` is NULL and `processed_cap`
+// is 0. Each failure is reported on stderr.
+void lexer_init(Lexer *lexer) {
+    if (!lexer) {
+        fprintf(stderr, "Lexer pointer is NULL\n");
+        return;
+    }
+    printf("[Lexer_init] Initializing lexer\n");
+    lexer->index = 0;
+    lexer->state = STATE_RAW_TEXT;
+    lexer->priorState = STATE_RAW_TEXT;
+    lexer->processed_len = 0;
+    lexer->processed_cap = 96;
+
+    // NOTE(review): the results are copied by value and the pointers are then
+    // dropped. If param_create()/token_create() heap-allocate the struct they
+    // return (not visible from this file), that outer allocation is leaked -
+    // confirm in token.h and release it here if so.
+    LexParam *param = param_create();
+    if (param) {
+        lexer->pendParam = *param;
+    } else {
+        fprintf(stderr, "[Lexer_init] param_create failed\n");
+        lexer->pendParam = (LexParam){0};
+    }
+
+    LexToken *token = token_create();
+    if (token) {
+        lexer->pendNode = *token;
+    } else {
+        fprintf(stderr, "[Lexer_init] token_create failed\n");
+        lexer->pendNode = (LexToken){0};
+    }
+
+    lexer->processed = calloc(lexer->processed_cap, sizeof *lexer->processed);
+    if (!lexer->processed) {
+        fprintf(stderr, "[Lexer_init] Cannot allocate the token array\n");
+        lexer->processed_cap = 0;  // keep capacity consistent with the NULL array
+    }
+}
+
+// Releases the resources owned by `lexer`: the token array, the value string
+// of the pending param, and the pending token. A NULL `lexer` is reported on
+// stderr and ignored.
+//
+// The token array pointer and its length/capacity are cleared afterwards, so
+// a second lexer_free() or a later lexer_reset() does not touch the freed
+// array.
+//
+// NOTE(review): the entries of `processed` are not released one by one;
+// whether they own memory of their own is not visible from this file.
+void lexer_free(Lexer *lexer) {
+    if (!lexer) {
+        fprintf(stderr, "Lexer pointer is NULL\n");
+        return;
+    }
+    free(lexer->processed);
+    lexer->processed = NULL;
+    lexer->processed_len = 0;
+    lexer->processed_cap = 0;
+    str_free(&lexer->pendParam.val);
+    token_free(&lexer->pendNode);
+    // Note: Do NOT free lexer itself, as it might be
+    // on the stack or part of another struct.
+    printf("[Lexer_free] Freed lexer resources\n");
+}
+
+// Rewinds `lexer` for another run: index 0, both states STATE_RAW_TEXT, the
+// pending param/token reset, and every used entry of `processed` reset
+// before the used count drops to zero. The token array itself is kept.
+// A NULL `lexer` is reported on stderr and ignored.
+void lexer_reset(Lexer *lexer) {
+    if (lexer == NULL) {
+        fprintf(stderr, "Lexer pointer is NULL\n");
+        return;
+    }
+    printf("\n[Lexer__reset] Resetting lexer state\n\n");
+
+    lexer->priorState = STATE_RAW_TEXT;
+    lexer->state = STATE_RAW_TEXT;
+    lexer->index = 0;
+
+    param_reset(&lexer->pendParam);
+    token_reset(&lexer->pendNode);
+
+    // Walk the used part of the array front to back
+    LexToken *tok = lexer->processed;
+    for (size_t left = lexer->processed_len; left > 0; left--, tok++) {
+        token_reset(tok);
+    }
+    lexer->processed_len = 0;
+}
@@ -0,0 +1,238 @@
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Sequence length (1-4) suggested by the first byte `c` of a UTF-8 sequence.
+// The byte is not validated: values 0x80-0xBF report 2 and everything from
+// 0xF0 upwards reports 4.
+static inline int utf8_len_byte0(unsigned char c) {
+    int len = 4;
+    if (c < 0x80) {
+        len = 1;
+    } else if (c < 0xE0) {
+        len = 2;
+    } else if (c < 0xF0) {
+        len = 3;
+    }
+    return len;
+}
+
+// Decodes the UTF-8 sequence that starts at `s`.
+//
+// `s` must be readable up to the end of the sequence or up to the first byte
+// that is not a continuation byte (a NUL terminator qualifies), whichever
+// comes first. The value held in `*bytes` on entry is not read.
+//
+// Returns the code point and stores the number of bytes it occupies (1-4) in
+// `*bytes`. Malformed input - a stray continuation byte, an impossible lead
+// byte, a missing continuation byte, an overlong form, a surrogate, or a
+// value above U+10FFFF - returns 0xFFFD and stores 1, so the caller can
+// resynchronise on the next byte. A well-formed U+FFFD (EF BF BD) also
+// returns 0xFFFD but stores 3.
+static inline uint32_t utf8_decode(const char *s, size_t *bytes) {
+    // Smallest code point that legitimately needs a sequence of each length
+    static const uint32_t min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
+    const unsigned char *p = (const unsigned char *)s;
+    const unsigned char lead = p[0];
+
+    *bytes = 1;  // also the resync distance for every error return below
+    if (lead < 0x80) {
+        return lead;
+    }
+
+    uint32_t cp;
+    size_t len;
+    if (lead < 0xC0) {
+        return 0xFFFD;  // continuation byte where a lead byte is expected
+    } else if (lead < 0xE0) {
+        len = 2;
+        cp = lead & 0x1F;
+    } else if (lead < 0xF0) {
+        len = 3;
+        cp = lead & 0x0F;
+    } else if (lead < 0xF8) {
+        len = 4;
+        cp = lead & 0x07;
+    } else {
+        return 0xFFFD;  // 0xF8..0xFF never appear in UTF-8
+    }
+
+    for (size_t i = 1; i < len; ++i) {
+        const unsigned char c = p[i];
+        if ((c & 0xC0) != 0x80) return 0xFFFD;
+        cp = (cp << 6) | (uint32_t)(c & 0x3F);
+    }
+
+    if (cp < min_cp[len]) return 0xFFFD;                 // overlong form
+    if (cp >= 0xD800 && cp <= 0xDFFF) return 0xFFFD;     // UTF-16 surrogate
+    if (cp > 0x10FFFF) return 0xFFFD;                    // beyond Unicode
+
+    *bytes = len;
+    return cp;
+}
+
+// Encodes one code-point as UTF-8 into `bytes` (no NUL terminator is added).
+// Returns the number of bytes written (1-4), or 0 with nothing written when
+// `cp` is not a Unicode scalar value: a surrogate (U+D800..U+DFFF) or a value
+// above U+10FFFF.
+static size_t utf8_encode(uint32_t cp, char bytes[4]) {
+    if (cp >= 0xD800 && cp <= 0xDFFF) {
+        // Surrogates are reserved for UTF-16 and have no UTF-8 form
+        return 0;
+    }
+    if (cp <= 0x7F) {
+        // 1-byte sequence (ASCII)
+        bytes[0] = (char)cp;
+        return 1;
+    }
+    if (cp <= 0x7FF) {
+        // 2-byte sequence
+        bytes[0] = (char)(0xC0 | (cp >> 6));
+        bytes[1] = (char)(0x80 | (cp & 0x3F));
+        return 2;
+    }
+    if (cp <= 0xFFFF) {
+        // 3-byte sequence
+        bytes[0] = (char)(0xE0 | (cp >> 12));
+        bytes[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
+        bytes[2] = (char)(0x80 | (cp & 0x3F));
+        return 3;
+    }
+    if (cp <= 0x10FFFF) {
+        // 4-byte sequence
+        bytes[0] = (char)(0xF0 | (cp >> 18));
+        bytes[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
+        bytes[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
+        bytes[3] = (char)(0x80 | (cp & 0x3F));
+        return 4;
+    }
+    // Invalid Unicode code point
+    return 0;
+}
+
+#define STR_MIN_CAPACITY 24  // smallest buffer size, in bytes, that str_new allocates
+
+// String8 struct: growable UTF-8 string tracking both its byte count and its codepoint count
+// ▰ ▰ ▰ ▰ ▰ ▰ ▰
+typedef struct {
+    char *data;       // UTF-8 bytes; the append/read functions in this file keep it NUL-terminated
+    size_t len;       // number of codepoints (not bytes!)
+    size_t cap;       // allocated bytes
+    size_t byte_len;  // current number of bytes used (terminator not counted)
+} String8;
+// ▰ ▰ ▰ ▰
+
+// Creates an empty heap-allocated String8 whose zero-filled buffer holds
+// `initial_capacity` bytes, raised to STR_MIN_CAPACITY when smaller.
+// Returns NULL when either allocation fails. str_free() releases only the
+// buffer, so the returned struct itself must also be passed to free().
+String8 *str_new(size_t initial_capacity) {
+    const size_t cap = initial_capacity < STR_MIN_CAPACITY
+                           ? STR_MIN_CAPACITY
+                           : initial_capacity;
+
+    String8 *s8 = calloc(1, sizeof(String8));
+    if (s8 == NULL) {
+        return NULL;
+    }
+
+    s8->data = calloc(cap, sizeof(char));
+    if (s8->data != NULL) {
+        s8->cap = cap;
+        return s8;
+    }
+
+    // Buffer allocation failed: do not hand out a half-built string
+    free(s8);
+    return NULL;
+}
+
+// Releases the byte buffer owned by `s8`; NULL is accepted and ignored.
+// The fields are cleared afterwards so that a repeated str_free() is a no-op
+// rather than a double free. The string must not be appended to again
+// without being re-created.
+void str_free(String8 *s8) {
+    if (!s8) {
+        return;
+    }
+    free(s8->data);
+    s8->data = NULL;
+    s8->len = 0;
+    s8->cap = 0;
+    s8->byte_len = 0;
+    // Do NOT free s8 itself. The caller is responsible for that,
+    // as s8 might be on the stack or part of another struct.
+}
+
+// Empties `s8` without releasing its buffer: both length counters go to zero
+// and, when a buffer exists, its first byte becomes the terminator.
+// A NULL `s8`, or a string whose buffer pointer is NULL, is handled safely.
+void str_clear(String8 *s8) {
+    if (!s8) {
+        return;
+    }
+    if (s8->data) {
+        s8->data[0] = '\0';
+    }
+    s8->byte_len = 0;
+    s8->len = 0;
+}
+
+// Number of codepoints (not bytes) stored in `s8`. `s8` must not be NULL.
+static inline size_t str_length(const String8 *s8) {
+    const size_t codepoints = s8->len;
+    return codepoints;
+}
+
+// First codepoint of `s8` as decoded by utf8_decode(), or 0 when the string
+// holds no codepoints. `s8` must not be NULL.
+static inline uint32_t str_first_codepoint(const String8 *s8) {
+    size_t consumed = s8->byte_len;
+    return s8->len != 0 ? utf8_decode(s8->data, &consumed) : 0;
+}
+
+// Last codepoint of `s8`, or 0 when the string holds no codepoints.
+// Scans backwards past continuation bytes (10xxxxxx) to the byte that starts
+// the final sequence and decodes from there; returns 0xFFFD when no such
+// byte is found. `s8` must not be NULL.
+static uint32_t str_last_codepoint(const String8 *s8) {
+    if (s8->len == 0) {
+        return 0;
+    }
+    for (size_t i = s8->byte_len; i-- > 0;) {
+        const unsigned char b = s8->data[i];
+        if ((b & 0xC0) == 0x80) {
+            continue;  // still inside the sequence, keep walking back
+        }
+        size_t span = s8->byte_len - i;
+        return utf8_decode(s8->data + i, &span);
+    }
+    return 0xFFFD;
+}
+
+// Makes sure `s8` can take `needed` more bytes plus the NUL terminator,
+// growing the buffer by doubling when it cannot.
+//
+// Returns true when the room is available. Returns false, leaving `s8`
+// untouched, when the required size does not fit in size_t or when realloc
+// fails. A string with no buffer yet (cap == 0 or data == NULL) gets a fresh
+// allocation of at least STR_MIN_CAPACITY bytes instead of looping on 0 * 2.
+// Bytes added by growth are not initialised.
+static bool str_ensure_capacity(String8 *s8, size_t needed) {
+    const size_t size_max = (size_t)-1;
+
+    // byte_len + needed + 1 must not wrap around
+    if (needed >= size_max - s8->byte_len) return false;
+    // +1 for the null terminator
+    const size_t required = s8->byte_len + needed + 1;
+    if (s8->data && required <= s8->cap) return true;
+
+    size_t new_cap = s8->cap ? s8->cap : STR_MIN_CAPACITY;
+    while (new_cap < required) {
+        if (new_cap > size_max / 2) {
+            new_cap = required;  // doubling would overflow; take the exact size
+            break;
+        }
+        new_cap *= 2;
+    }
+
+    char *new_data = realloc(s8->data, new_cap);
+    if (!new_data) return false;
+    s8->data = new_data;
+    s8->cap = new_cap;
+    return true;
+}
+
+// Appends the codepoint `cp` to `s8` as UTF-8 and re-terminates the buffer.
+// Logs the codepoint on stdout first. Returns false, leaving the string
+// unchanged, when utf8_encode() rejects `cp` or the buffer cannot grow.
+static bool str_append_uint32(String8 *s8, uint32_t cp) {
+    printf("str_append_uint32 codepoint: %u\n", cp);
+
+    char encoded[4];
+    const size_t n = utf8_encode(cp, encoded);
+    if (n == 0 || !str_ensure_capacity(s8, n)) {
+        return false;
+    }
+
+    // Write the new bytes right after the current content
+    char *tail = s8->data + s8->byte_len;
+    memcpy(tail, encoded, n);
+    tail[n] = '\0';
+    s8->byte_len += n;
+    s8->len += 1;
+    return true;
+}
+
+// Content equality of two strings. Two NULL pointers compare equal; a NULL
+// and a non-NULL pointer do not. Otherwise the strings are equal when their
+// codepoint counts, byte counts and bytes all match - the counts are checked
+// first so the byte comparison only runs on same-sized buffers.
+bool str_equal(const String8 *a, const String8 *b) {
+    if (a == NULL || b == NULL) {
+        return a == b;
+    }
+    const bool same_shape = a->len == b->len && a->byte_len == b->byte_len;
+    return same_shape && memcmp(a->data, b->data, a->byte_len) == 0;
+}
+
+// void str_append_utf8(String8 *s8, const char *utf8_str) {
+//     if (!utf8_str) return;
+//     const char *p = utf8_str;
+//     while (*p) {
+//         size_t len = strlen(p);
+//         uint32_t cp = utf8_decode(p, &len);
+//         if (cp == 0xFFFD) len = 1;  // skip invalid
+//         str_append_uint32(s8, cp);
+//         p += len;
+//     }
+// }
+
+// Appends the whole content of `filename` to `fs`, updating both the byte
+// count and the codepoint count, and keeps the buffer NUL-terminated.
+//
+// Returns true on success (an empty file is a success and changes nothing).
+// Returns false when an argument is NULL, the file cannot be opened, its
+// size cannot be determined, the buffer cannot grow, or the read fails; in
+// all of those cases the previous content of `fs` is left as it was.
+//
+// Each byte that does not start a valid sequence counts as one codepoint.
+// A well-formed U+FFFD already present in the file (EF BF BD) counts as a
+// single codepoint rather than being mistaken for a decoding error.
+bool str_read_file(String8 *fs, const char *filename) {
+    if (!fs || !filename) return false;
+
+    FILE *f = fopen(filename, "rb");  // binary: UTF-8 has no BOM issues
+    if (!f) return false;
+
+    // Check file size; every step can fail (e.g. on a non-seekable stream)
+    long fsize = -1;
+    if (fseek(f, 0, SEEK_END) == 0) {
+        fsize = ftell(f);
+    }
+    if (fsize < 0 || fseek(f, 0, SEEK_SET) != 0) {
+        fclose(f);
+        return false;
+    }
+    if (fsize == 0) {
+        fclose(f);
+        return true;  // empty file is OK
+    }
+    if (!str_ensure_capacity(fs, (size_t)fsize)) {
+        fclose(f);
+        return false;  // allocation failed
+    }
+
+    char *start = fs->data + fs->byte_len;
+    size_t bytes_read = fread(start, 1, (size_t)fsize, f);
+    bool read_failed = ferror(f) != 0;
+    fclose(f);
+
+    if (read_failed) {
+        start[0] = '\0';  // drop the partial data, keep the old content intact
+        return false;
+    }
+
+    // Terminate before decoding: a sequence cut off at the end of the data
+    // then stops at the NUL instead of reading stale bytes behind it.
+    start[bytes_read] = '\0';
+
+    // Decode sequence-by-sequence to update length
+    const unsigned char *p = (const unsigned char *)start;
+    size_t remaining = bytes_read;
+    while (remaining > 0) {
+        size_t len = remaining;
+        uint32_t cp = utf8_decode((const char *)p, &len);
+        bool literal_fffd = remaining >= 3 && p[0] == 0xEF && p[1] == 0xBF && p[2] == 0xBD;
+        if (literal_fffd) {
+            len = 3;
+        } else if (cp == 0xFFFD || len == 0 || len > remaining) {
+            len = 1;  // invalid byte: count it alone and resynchronise
+        }
+        fs->len++;
+        p += len;
+        remaining -= len;
+    }
+
+    fs->byte_len += bytes_read;
+    return true;
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+BasedOnStyle: Google`
	`2`	`+IndentWidth: 4`
	`3`	`+ColumnLimit: 0`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Доброе утро мир (Good morning world)`
	`2`	`+Здравствуй и прощай! (Hello, and goodbye!)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Καλημέρα κόσμε (Good morning world)`
	`2`	`+Γεια σας, και αντίο! (Hëllö, and goodbye!)`