|
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| 5 | + |
// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by the
// lead byte `c`.
//
// Bytes that can never start a well-formed sequence -- continuation bytes
// (0x80-0xBF), the overlong-only leads 0xC0/0xC1, and 0xF5-0xFF -- report a
// length of 1, so a caller stepping through a buffer resynchronises on the
// very next byte instead of skipping data that may be valid.
static inline int utf8_len_byte0(unsigned char c) {
    if (c < 0x80) return 1; // ASCII
    if (c < 0xC2) return 1; // stray continuation byte or overlong-only lead
    if (c < 0xE0) return 2;
    if (c < 0xF0) return 3;
    if (c < 0xF5) return 4;
    return 1;               // 0xF5-0xFF would encode beyond U+10FFFF
}
| 12 | + |
// Decodes the single UTF-8 sequence that starts at `s`.
//
// On success returns the code point and stores the sequence length (1-4) in
// `*bytes`. On malformed input -- an impossible lead byte, a missing
// continuation byte, an overlong form, a UTF-16 surrogate, or a value above
// U+10FFFF -- returns U+FFFD and stores 1 in `*bytes`, so the caller can
// resynchronise on the next byte.
//
// The value of `*bytes` on entry is ignored. Reading stops at the first byte
// that is not a continuation byte, so a NUL-terminated buffer is never read
// past its terminator.
static inline uint32_t utf8_decode(const char *s, size_t *bytes) {
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char lead = p[0];

    *bytes = 1;
    if (lead < 0x80) return lead;

    size_t len;
    uint32_t cp;
    uint32_t min_cp; // smallest value that legitimately needs `len` bytes
    if ((lead & 0xE0) == 0xC0) {
        len = 2;
        cp = lead & 0x1Fu;
        min_cp = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        len = 3;
        cp = lead & 0x0Fu;
        min_cp = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        len = 4;
        cp = lead & 0x07u;
        min_cp = 0x10000;
    } else {
        return 0xFFFD; // continuation byte or 0xF8-0xFF in lead position
    }

    for (size_t i = 1; i < len; ++i) {
        const unsigned char c = p[i];
        if ((c & 0xC0) != 0x80) return 0xFFFD;
        cp = (cp << 6) | (c & 0x3Fu);
    }

    if (cp < min_cp) return 0xFFFD;                   // overlong encoding
    if (cp > 0x10FFFF) return 0xFFFD;                 // outside Unicode
    if (cp >= 0xD800 && cp <= 0xDFFF) return 0xFFFD;  // UTF-16 surrogate

    *bytes = len;
    return cp;
}
| 45 | + |
// Encodes the Unicode scalar value `cp` as UTF-8 into `bytes`.
//
// Returns the number of bytes written (1-4). Returns 0 and leaves `bytes`
// untouched when `cp` is not encodable: a value above U+10FFFF or a UTF-16
// surrogate (U+D800-U+DFFF), which has no well-formed UTF-8 representation.
// No NUL terminator is written.
static size_t utf8_encode(uint32_t cp, char bytes[4]) {
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
        return 0;
    }
    if (cp <= 0x7F) {
        // 1-byte sequence (ASCII)
        bytes[0] = (char)cp;
        return 1;
    }
    if (cp <= 0x7FF) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        bytes[0] = (char)(0xC0 | (cp >> 6));
        bytes[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp <= 0xFFFF) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = (char)(0xE0 | (cp >> 12));
        bytes[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        bytes[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    bytes[0] = (char)(0xF0 | (cp >> 18));
    bytes[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
    bytes[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
    bytes[3] = (char)(0x80 | (cp & 0x3F));
    return 4;
}
| 77 | + |
// Smallest buffer size, in bytes, that str_new() will allocate.
#define STR_MIN_CAPACITY 24

// String8: a UTF-8 string that records both how many bytes and how many
// code points it currently holds.
typedef struct {
    // Buffer holding the UTF-8 encoded bytes.
    char *data;
    // Number of code points stored (not bytes).
    size_t len;
    // Size of the `data` allocation, in bytes.
    size_t cap;
    // Number of bytes of `data` currently in use.
    size_t byte_len;
} String8;
| 89 | + |
| 90 | +String8 *str_new(size_t initial_capacity) { |
| 91 | + if (initial_capacity < STR_MIN_CAPACITY) |
| 92 | + initial_capacity = STR_MIN_CAPACITY; |
| 93 | + String8 *s8 = calloc(1, sizeof(String8)); |
| 94 | + if (!s8) |
| 95 | + return NULL; |
| 96 | + s8->cap = initial_capacity; |
| 97 | + s8->data = calloc(s8->cap, sizeof(char)); |
| 98 | + if (!s8->data) { |
| 99 | + free(s8); |
| 100 | + return NULL; |
| 101 | + } |
| 102 | + return s8; |
| 103 | +} |
| 104 | + |
| 105 | +void str_free(String8 *s8) { |
| 106 | + if (s8) { |
| 107 | + free(s8->data); |
| 108 | + // Do NOT free s8 itself. The caller is responsible for that, |
| 109 | + // as s8 might be on the stack or part of another struct. |
| 110 | + } |
| 111 | +} |
| 112 | + |
| 113 | +void str_clear(String8 *s8) { |
| 114 | + if (s8) { |
| 115 | + s8->data[0] = '\0'; |
| 116 | + s8->byte_len = 0; |
| 117 | + s8->len = 0; |
| 118 | + } |
| 119 | +} |
| 120 | + |
| 121 | +static inline size_t str_length(const String8 *s8) { |
| 122 | + return s8->len; |
| 123 | +} |
| 124 | + |
| 125 | +static inline uint32_t str_first_codepoint(const String8 *s8) { |
| 126 | + if (s8->len == 0) return 0; |
| 127 | + size_t len = s8->byte_len; |
| 128 | + return utf8_decode(s8->data, &len); |
| 129 | +} |
| 130 | + |
| 131 | +static uint32_t str_last_codepoint(const String8 *s8) { |
| 132 | + if (s8->len == 0) return 0; |
| 133 | + size_t pos = s8->byte_len; |
| 134 | + while (pos > 0) { |
| 135 | + unsigned char b = s8->data[pos - 1]; |
| 136 | + if ((b & 0xC0) != 0x80) { // not a continuation byte |
| 137 | + size_t len = s8->byte_len - (pos - 1); |
| 138 | + return utf8_decode(s8->data + pos - 1, &len); |
| 139 | + } |
| 140 | + pos--; |
| 141 | + } |
| 142 | + return 0xFFFD; |
| 143 | +} |
| 144 | + |
| 145 | +static bool str_ensure_capacity(String8 *s8, size_t needed) { |
| 146 | + // +1 for the null terminator |
| 147 | + if (s8->byte_len + needed + 1 <= s8->cap) return true; |
| 148 | + size_t new_cap = s8->cap; |
| 149 | + while (new_cap < s8->byte_len + needed + 1) { |
| 150 | + new_cap *= 2; |
| 151 | + } |
| 152 | + char *new_data = realloc(s8->data, new_cap); |
| 153 | + if (!new_data) return false; |
| 154 | + s8->data = new_data; |
| 155 | + s8->cap = new_cap; |
| 156 | + return true; |
| 157 | +} |
| 158 | + |
| 159 | +static bool str_append_uint32(String8 *s8, uint32_t cp) { |
| 160 | + char temp[4]; |
| 161 | + printf("str_append_uint32 codepoint: %u\n", cp); |
| 162 | + size_t bytes = utf8_encode(cp, temp); |
| 163 | + if (bytes == 0) return false; // invalid codepoint |
| 164 | + if (!str_ensure_capacity(s8, bytes)) return false; |
| 165 | + // Copy the bytes to the end of the string |
| 166 | + memcpy(s8->data + s8->byte_len, temp, bytes); |
| 167 | + s8->byte_len += bytes; |
| 168 | + s8->data[s8->byte_len] = '\0'; |
| 169 | + s8->len++; |
| 170 | + return true; |
| 171 | +} |
| 172 | + |
| 173 | +bool str_equal(const String8 *a, const String8 *b) { |
| 174 | + if (!a || !b) { |
| 175 | + return (a == b); // both NULL → equal, one NULL → not |
| 176 | + } |
| 177 | + if (a->len != b->len) { |
| 178 | + return false; // different number of codepoints |
| 179 | + } |
| 180 | + if (a->byte_len != b->byte_len) { |
| 181 | + return false; // different byte length → can't be equal |
| 182 | + } |
| 183 | + // If byte lengths are same and codepoint counts are same, |
| 184 | + // do byte comparison — because UTF-8 is deterministic. |
| 185 | + // Same codepoints → same encoding → same bytes. |
| 186 | + return memcmp(a->data, b->data, a->byte_len) == 0; |
| 187 | +} |
| 188 | + |
| 189 | +// void str_append_utf8(String8 *s8, const char *utf8_str) { |
| 190 | +// if (!utf8_str) return; |
| 191 | +// const char *p = utf8_str; |
| 192 | +// while (*p) { |
| 193 | +// size_t len = strlen(p); |
| 194 | +// uint32_t cp = utf8_decode(p, &len); |
| 195 | +// if (cp == 0xFFFD) len = 1; // skip invalid |
| 196 | +// str_append_uint32(s8, cp); |
| 197 | +// p += len; |
| 198 | +// } |
| 199 | +// } |
| 200 | + |
| 201 | +bool str_read_file(String8 *fs, const char *filename) { |
| 202 | + FILE *f = fopen(filename, "rb"); // binary: UTF-8 has no BOM issues |
| 203 | + if (!f) return false; |
| 204 | + |
| 205 | + // Check file size |
| 206 | + fseek(f, 0, SEEK_END); |
| 207 | + long fsize = ftell(f); |
| 208 | + fseek(f, 0, SEEK_SET); |
| 209 | + |
| 210 | + if (fsize <= 0) { |
| 211 | + fclose(f); |
| 212 | + return true; // empty file is OK |
| 213 | + } |
| 214 | + if (!str_ensure_capacity(fs, fsize)) { |
| 215 | + fclose(f); |
| 216 | + return false; // allocation failed |
| 217 | + } |
| 218 | + |
| 219 | + size_t bytes_read = fread(fs->data + fs->byte_len, 1, fsize, f); |
| 220 | + fclose(f); |
| 221 | + |
| 222 | + // Decode byte-by-byte to update length |
| 223 | + const char *p = fs->data + fs->byte_len; |
| 224 | + size_t remaining = bytes_read; |
| 225 | + while (remaining > 0) { |
| 226 | + size_t len = remaining; |
| 227 | + uint32_t cp = utf8_decode(p, &len); |
| 228 | + if (cp == 0xFFFD) len = 1; |
| 229 | + fs->len++; |
| 230 | + p += len; |
| 231 | + remaining -= len; |
| 232 | + } |
| 233 | + |
| 234 | + fs->byte_len += bytes_read; |
| 235 | + fs->data[fs->byte_len] = '\0'; |
| 236 | + |
| 237 | + return true; |
| 238 | +} |
0 commit comments