diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 3fbe9b44..d78a542e 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -20,7 +20,7 @@ permissions:
jobs:
versioning:
name: Update Version
- runs-on: ubuntu-24
+ runs-on: ubuntu-24.04
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -402,7 +402,7 @@ jobs:
test_alpine:
name: Alpine Linux
- runs-on: ubuntu-24
+ runs-on: ubuntu-24.04
container:
image: alpine:latest
options: --privileged # If needed for certain Docker operations
@@ -451,7 +451,7 @@ jobs:
]
strategy:
matrix:
- os: [ubuntu-24, macos-13, windows-2022]
+ os: [ubuntu-24.04, macos-13, windows-2022]
python-version: ["36", "37", "38", "39", "310", "311", "312"]
steps:
- uses: actions/checkout@v4
@@ -462,7 +462,7 @@ jobs:
# We only need QEMU for Linux builds
- name: Setup QEMU
- if: matrix.os == 'ubuntu-24'
+ if: matrix.os == 'ubuntu-24.04'
uses: docker/setup-qemu-action@v3
- name: Install cibuildwheel
run: python -m pip install cibuildwheel
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 144ae8b0..a841803b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -19,7 +19,7 @@ permissions:
jobs:
versioning:
name: Update Version
- runs-on: ubuntu-24
+ runs-on: ubuntu-24.04
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -49,7 +49,7 @@ jobs:
rebase:
name: Rebase Dev. Branch
- runs-on: ubuntu-24
+ runs-on: ubuntu-24.04
if: github.ref == 'refs/heads/main'
needs: versioning
steps:
@@ -78,7 +78,7 @@ jobs:
needs: versioning
strategy:
matrix:
- os: [ubuntu-24, macos-13, windows-2022]
+ os: [ubuntu-24.04, macos-13, windows-2022]
python-version: ["36", "37", "38", "39", "310", "311", "312"]
steps:
- uses: actions/checkout@v4
@@ -90,7 +90,7 @@ jobs:
with:
python-version: 3.x
- name: Setup QEMU
- if: matrix.os == 'ubuntu-24' # We only need QEMU for Linux builds
+ if: matrix.os == 'ubuntu-24.04' # We only need QEMU for Linux builds
uses: docker/setup-qemu-action@v3
- name: Install cibuildwheel
run: python -m pip install cibuildwheel
@@ -153,7 +153,7 @@ jobs:
# publish_javascript:
# name: Publish JavaScript
# needs: versioning
- # runs-on: ubuntu-24
+ # runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# with:
diff --git a/.vscode/settings.json b/.vscode/settings.json
index ee77189d..980956d1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -55,7 +55,9 @@
"Hirschberg's",
"Horspool",
"Hyyro",
+ "illformed",
"initproc",
+ "inplace",
"intp",
"isprintable",
"itemsize",
diff --git a/README.md b/README.md
index 40b3258f..96a3c1fe 100644
--- a/README.md
+++ b/README.md
@@ -186,6 +186,28 @@ __Who is this for?__
arm: 25.8 MB/s
+
+
+ Mapping Characters with Look-Up Table Transforms |
+
+
+ ⚪ |
+
+ transform
+ x86: 3.81 ·
+ arm: 2.65 GB/s
+ |
+
+ str.translate
+ x86: 260.0 ·
+ arm: 140.0 MB/s
+ |
+
+ sz_look_up_transform
+ x86: 21.2 ·
+ arm: 8.5 GB/s
+ |
+
Get sorted order, ≅ 8 million English words 6 |
@@ -373,6 +395,25 @@ x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepsepara
x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
```
+You can also transform the string using Look-Up Tables (LUTs), mapping it to a different character set.
+By default this returns a copy - `str` for `str` inputs and `bytes` for other types; pass `inplace=True` to overwrite the input buffer instead and get `None` back.
+
+```py
+x: str = text.translate('chars', {}, start=0, end=sys.maxsize, inplace=False)
+x: bytes = text.translate(b'chars', {}, start=0, end=sys.maxsize, inplace=False)
+```
+
+For efficiency reasons, pass the LUT as a string or bytes object, not as a dictionary.
+This can be useful in high-throughput applications dealing with binary data, including bioinformatics and image processing.
+Here is an example:
+
+```py
+import stringzilla as sz
+look_up_table = bytes(range(256)) # Identity LUT
+image = open("/image/path.jpeg", "rb").read()
+sz.translate(image, look_up_table, inplace=True)
+```
+
### Collection-Level Operations
Once split into a `Strs` object, you can sort, shuffle, and reorganize the slices, with minimum memory footprint.
@@ -1024,6 +1065,18 @@ char uuid[36];
sz::randomize(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer
```
+### Bulk Replacements
+
+In text processing, it's often necessary to replace all occurrences of a specific substring or set of characters within a string.
+Standard library functions may not offer the most efficient or convenient methods for performing bulk replacements, especially when dealing with large strings or performance-critical applications.
+
+- `haystack.replace_all(needle_string, replacement_string)`
+- `haystack.replace_all(sz::char_set(""), replacement_string)`
+- `haystack.try_replace_all(needle_string, replacement_string)`
+- `haystack.try_replace_all(sz::char_set(""), replacement_string)`
+- `haystack.transform(sz::look_up_table::identity())`
+- `haystack.transform(sz::look_up_table::identity(), haystack.data())`
+
### Levenshtein Edit Distance and Alignment Scores
Levenshtein and Hamming edit distance are provided for both byte-strings and UTF-8 strings.
diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
index 0068c11f..bd6dbdf6 100644
--- a/include/stringzilla/stringzilla.h
+++ b/include/stringzilla/stringzilla.h
@@ -149,6 +149,17 @@
#endif // SZ_DYNAMIC_DISPATCH
#endif // SZ_DYNAMIC
+/**
+ * @brief Declaration specifier requesting 64-byte alignment; expands to nothing on unrecognized compilers.
+ */
+#if defined(_MSC_VER)
+#define SZ_ALIGN64 __declspec(align(64))
+#elif defined(__GNUC__) || defined(__clang__)
+#define SZ_ALIGN64 __attribute__((aligned(64)))
+#else
+#define SZ_ALIGN64
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -172,6 +183,9 @@ typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits
#else // if SZ_AVOID_LIBC:
+// ! The C standard doesn't specify the signedness of char.
+// ! On x86 char is signed by default while on Arm it is unsigned by default.
+// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`.
typedef signed char sz_i8_t; // Always 8 bits
typedef unsigned char sz_u8_t; // Always 8 bits
typedef unsigned short sz_u16_t; // Always 16 bits
diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp
index 6a65038f..736877df 100644
--- a/include/stringzilla/stringzilla.hpp
+++ b/include/stringzilla/stringzilla.hpp
@@ -1962,6 +1962,7 @@ class basic_string_slice {
* * `try_` exception-free "try" operations that returning non-zero values on success,
* * `replace_all` and `erase_all` similar to Boost,
* * `edit_distance` - Levenshtein distance computation reusing the allocator,
+ * * `translate` - character mapping,
* * `randomize`, `random` - for fast random string generation.
*
* Functions defined for `basic_string_slice`, but not present in `basic_string`:
@@ -3413,7 +3414,8 @@ class basic_string {
}
/**
- * @brief Maps all chatacters in the current string into another buffer using the provided lookup table.
+ * @brief Maps all characters in the current string into another buffer using the provided lookup table.
+ * @param output The buffer to write the transformed string into.
*/
void transform(look_up_table const &table, pointer output) const noexcept {
sz_ptr_t start;
@@ -3875,7 +3877,7 @@ void transform(basic_string_slice string, basic_look_up_table
void transform(basic_string_slice source, basic_look_up_table const &table,
diff --git a/python/lib.c b/python/lib.c
index 85aef1aa..696c5383 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -662,6 +662,17 @@ static PyObject *Str_repr(Str *self) {
static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash(self->memory.start, self->memory.length); }
+static char const doc_like_hash[] = //
+ "Compute the hash value of the string.\n\n"
+ "This function can be called as a method on a Str object or as a standalone function.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object (if called as a method).\n"
+ " text (str): The string to hash (if called as a function).\n\n"
+ "Returns:\n"
+ " int: The hash value of the string.\n\n"
+ "Raises:\n"
+ " TypeError: If the argument is not string-like or incorrect number of arguments is provided.";
+
static PyObject *Str_like_hash(PyObject *self, PyObject *args, PyObject *kwargs) {
// Check minimum arguments
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
@@ -1193,6 +1204,17 @@ static PyObject *Strs_richcompare(PyObject *self, PyObject *other, int op) {
}
}
+static char const doc_decode[] = //
+ "Decode the bytes into a Unicode string with a given encoding.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " encoding (str, optional): The encoding to use (default is 'utf-8').\n"
+ " errors (str, optional): Error handling scheme (default is 'strict').\n\n"
+ "Returns:\n"
+ " str: The decoded Unicode string.\n\n"
+ "Raises:\n"
+ " UnicodeDecodeError: If decoding fails.";
+
static PyObject *Str_decode(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
@@ -1235,9 +1257,14 @@ static PyObject *Str_decode(PyObject *self, PyObject *args, PyObject *kwargs) {
return PyUnicode_Decode(text.start, text.length, encoding.start, errors.start);
}
-/**
- * @brief Saves a StringZilla string to disk.
- */
+static char const doc_write_to[] = //
+ "Write the string to a file.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " filename (str): The file path to write to.\n\n"
+ "Returns:\n"
+ " None.";
+
static PyObject *Str_write_to(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
@@ -1308,11 +1335,14 @@ static PyObject *Str_write_to(PyObject *self, PyObject *args, PyObject *kwargs)
Py_RETURN_NONE;
}
-/**
- * @brief Given a native StringZilla string, suggests it's offset within another native StringZilla string.
- * Very practical when dealing with large files.
- * @return Unsigned integer on success.
- */
+static char const doc_offset_within[] = //
+ "Return the raw byte offset of this StringZilla string within a larger StringZilla string.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The substring.\n"
+ " larger (Str): The larger string to search within.\n\n"
+ "Returns:\n"
+ " int: The byte offset where 'self' is found within 'larger', or -1 if not found.";
+
static PyObject *Str_offset_within(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
@@ -1438,6 +1468,16 @@ static int _Str_find_implementation_( //
return 1;
}
+static char const doc_contains[] = //
+ "Check if a string contains a substring.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to search for.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " bool: True if the substring is found, False otherwise.";
+
static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1448,6 +1488,16 @@ static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs)
else { Py_RETURN_TRUE; }
}
+static char const doc_find[] = //
+ "Find the first occurrence of a substring.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to find.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: The index of the first occurrence, or -1 if not found.";
+
static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1457,6 +1507,18 @@ static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_index[] = //
+ "Find the first occurrence of a substring or raise an error if not found.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to find.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: The index of the first occurrence.\n\n"
+ "Raises:\n"
+ " ValueError: If the substring is not found.";
+
static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1470,6 +1532,16 @@ static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_rfind[] = //
+ "Find the last occurrence of a substring.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to find.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: The index of the last occurrence, or -1 if not found.";
+
static PyObject *Str_rfind(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1479,6 +1551,18 @@ static PyObject *Str_rfind(PyObject *self, PyObject *args, PyObject *kwargs) {
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_rindex[] = //
+ "Find the last occurrence of a substring or raise an error if not found.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to find.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: The index of the last occurrence.\n\n"
+ "Raises:\n"
+ " ValueError: If the substring is not found.";
+
static PyObject *Str_rindex(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1547,14 +1631,41 @@ static PyObject *_Str_partition_implementation(PyObject *self, PyObject *args, P
return result_tuple;
}
+static char const doc_partition[] = //
+ "Split the string into a 3-tuple around the first occurrence of a separator.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to partition by.\n\n"
+ "Returns:\n"
+ " tuple: A 3-tuple (head, separator, tail). If the separator is not found, returns (self, '', '').";
+
static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_partition_implementation(self, args, kwargs, &sz_find, sz_false_k);
}
+static char const doc_rpartition[] = //
+ "Split the string into a 3-tuple around the last occurrence of a separator.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to partition by.\n\n"
+ "Returns:\n"
+ " tuple: A 3-tuple (head, separator, tail). If the separator is not found, returns ('', '', self).";
+
static PyObject *Str_rpartition(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_partition_implementation(self, args, kwargs, &sz_rfind, sz_true_k);
}
+static char const doc_count[] = //
+ "Count the occurrences of a substring.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " substring (str): The substring to count.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n"
+ " allowoverlap (bool, optional): Count overlapping occurrences (default is False).\n\n"
+ "Returns:\n"
+ " int: The number of occurrences of the substring.";
+
static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
@@ -1603,7 +1714,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
while (haystack.length) {
sz_cptr_t ptr = sz_find(haystack.start, haystack.length, needle.start, needle.length);
sz_bool_t found = ptr != NULL;
- sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+ sz_size_t offset = found ? (sz_size_t)(ptr - haystack.start) : haystack.length;
count += found;
haystack.start += offset + found;
haystack.length -= offset + found;
@@ -1613,7 +1724,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
while (haystack.length) {
sz_cptr_t ptr = sz_find(haystack.start, haystack.length, needle.start, needle.length);
sz_bool_t found = ptr != NULL;
- sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+ sz_size_t offset = found ? (sz_size_t)(ptr - haystack.start) : haystack.length;
count += found;
haystack.start += offset + needle.length;
haystack.length -= offset + needle.length * found;
@@ -1679,10 +1790,28 @@ static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kw
return PyLong_FromSize_t(distance);
}
+static char const doc_edit_distance[] = //
+ "Compute the Levenshtein edit distance between two strings.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The first string.\n"
+ " other (str): The second string to compare.\n"
+ " bound (int, optional): Optional maximum distance to compute (default is no bound).\n\n"
+ "Returns:\n"
+ " int: The edit distance (number of insertions, deletions, substitutions).";
+
static PyObject *Str_edit_distance(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_edit_distance(self, args, kwargs, &sz_edit_distance);
}
+static char const doc_edit_distance_unicode[] = //
+ "Compute the Levenshtein edit distance between two Unicode strings.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The first string.\n"
+ " other (str): The second string to compare.\n"
+ " bound (int, optional): Optional maximum distance to compute (default is no bound).\n\n"
+ "Returns:\n"
+ " int: The edit distance in Unicode characters.";
+
static PyObject *Str_edit_distance_unicode(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_edit_distance(self, args, kwargs, &sz_edit_distance_utf8);
}
@@ -1737,14 +1866,43 @@ static PyObject *_Str_hamming_distance(PyObject *self, PyObject *args, PyObject
return PyLong_FromSize_t(distance);
}
+static char const doc_hamming_distance[] = //
+ "Compute the Hamming distance between two strings.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The first string.\n"
+ " other (str): The second string to compare.\n"
+ " bound (int, optional): Optional maximum distance to compute (default is no bound).\n\n"
+ "Returns:\n"
+ " int: The Hamming distance, including differing bytes and length difference.";
+
static PyObject *Str_hamming_distance(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_hamming_distance(self, args, kwargs, &sz_hamming_distance);
}
+static char const doc_hamming_distance_unicode[] = //
+ "Compute the Hamming distance between two Unicode strings.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The first string.\n"
+ " other (str): The second string to compare.\n"
+ " bound (int, optional): Optional maximum distance to compute (default is no bound).\n\n"
+ "Returns:\n"
+ " int: The Hamming distance, including differing Unicode characters and length difference.";
+
static PyObject *Str_hamming_distance_unicode(PyObject *self, PyObject *args, PyObject *kwargs) {
return _Str_hamming_distance(self, args, kwargs, &sz_hamming_distance_utf8);
}
+static char const doc_alignment_score[] = //
+ "Compute the Needleman-Wunsch alignment score between two strings.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The first string.\n"
+ " other (str): The second string to align.\n"
+ " substitution_matrix (numpy.ndarray): A 256x256 substitution cost matrix.\n"
+ " gap_score (int): The score for introducing a gap.\n"
+ " bound (int, optional): Optional maximum score to compute (default is no bound).\n\n"
+ "Returns:\n"
+ " int: The alignment score.";
+
static PyObject *Str_alignment_score(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
@@ -1841,6 +1999,16 @@ static PyObject *Str_alignment_score(PyObject *self, PyObject *args, PyObject *k
return PyLong_FromSsize_t(score);
}
+static char const doc_startswith[] = //
+ "Check if a string starts with a given prefix.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " prefix (str): The prefix to check.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " bool: True if the string starts with the prefix, False otherwise.";
+
static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
@@ -1877,13 +2045,23 @@ static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs
// Apply start and end arguments
str.start += start;
str.length -= start;
- if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+ if (end != PY_SSIZE_T_MAX && (sz_size_t)(end - start) < str.length) { str.length = (sz_size_t)(end - start); }
if (str.length < prefix.length) { Py_RETURN_FALSE; }
else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; }
else { Py_RETURN_FALSE; }
}
+static char const doc_endswith[] = //
+ "Check if a string ends with a given suffix.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " suffix (str): The suffix to check.\n"
+ " start (int, optional): The starting index (default is 0).\n"
+ " end (int, optional): The ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " bool: True if the string ends with the suffix, False otherwise.";
+
static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
@@ -1920,17 +2098,31 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs)
// Apply start and end arguments
str.start += start;
str.length -= start;
- if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+ if (end != PY_SSIZE_T_MAX && (sz_size_t)(end - start) < str.length) { str.length = (sz_size_t)(end - start); }
if (str.length < suffix.length) { Py_RETURN_FALSE; }
else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; }
else { Py_RETURN_FALSE; }
}
+static char const doc_translate[] = //
+ "Perform transformation of a string using a look-up table.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " table (str or bytes or dict): A 256-byte string-like look-up table, or a dictionary mapping single characters to single characters.\n"
+ " start (int, optional): The starting index for translation (default is 0).\n"
+ " end (int, optional): The ending index for translation (default is the string length).\n"
+ " inplace (bool, optional): If True, the string is modified in place (default is False).\n\n"
+ "Returns:\n"
+ " Union[None, str, bytes]: If inplace is False, a new string is returned, otherwise None.\n\n"
+ "Raises:\n"
+ " ValueError: If the table is not 256 bytes long.\n"
+ " TypeError: If the table is not a string or dictionary.";
+
static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
- if (nargs < !is_member + 1 || nargs > !is_member + 3) {
+ if (nargs < !is_member + 1 || nargs > !is_member + 4) {
PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
return NULL;
}
@@ -1939,6 +2131,7 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs)
PyObject *look_up_table_obj = PyTuple_GET_ITEM(args, !is_member);
PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+ PyObject *inplace_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL;
// Optional start and end arguments
Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
@@ -1953,27 +2146,104 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs)
return NULL;
}
- sz_string_view_t str, look_up_table;
- if (!export_string_like(str_obj, &str.start, &str.length) ||
- !export_string_like(look_up_table_obj, &look_up_table.start, &look_up_table.length)) {
- PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+ sz_string_view_t str;
+ if (!export_string_like(str_obj, &str.start, &str.length)) {
+ PyErr_SetString(PyExc_TypeError, "First argument must be string-like");
+ return NULL;
+ }
+
+ sz_string_view_t look_up_table_str;
+ SZ_ALIGN64 char look_up_table[256];
+ if (export_string_like(look_up_table_obj, &look_up_table_str.start, &look_up_table_str.length)) {
+ // Export
+ if (look_up_table_str.length != 256) {
+ PyErr_SetString(PyExc_ValueError, "The look-up table must be exactly 256 bytes long");
+ return NULL;
+ }
+ memcpy(&look_up_table[0], look_up_table_str.start, look_up_table_str.length);
+ }
+ else if (PyDict_Check(look_up_table_obj)) {
+
+ // If any character is not defined, it will be replaced with itself:
+ for (int i = 0; i < 256; i++) { look_up_table[i] = (char)i; }
+
+ // Process the dictionary into the look-up table
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+ while (PyDict_Next(look_up_table_obj, &pos, &key, &value)) {
+ if (!PyUnicode_Check(key) || PyUnicode_GetLength(key) != 1 || !PyUnicode_Check(value) ||
+ PyUnicode_GetLength(value) != 1) {
+ PyErr_SetString(PyExc_TypeError, "Keys and values must be single characters");
+ return NULL;
+ }
+
+ char key_char = PyUnicode_AsUTF8(key)[0];
+ char value_char = PyUnicode_AsUTF8(value)[0];
+ look_up_table[(unsigned char)key_char] = value_char;
+ }
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError, "Second argument must be string-like or a dictionary");
+ return NULL;
+ }
+
+ int is_inplace = inplace_obj ? PyObject_IsTrue(inplace_obj) : 0;
+ if (is_inplace == -1) {
+ PyErr_SetString(PyExc_TypeError, "The inplace argument must be a boolean");
return NULL;
}
// Apply start and end arguments
str.start += start;
str.length -= start;
- if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+ if (end != PY_SSIZE_T_MAX && (sz_size_t)(end - start) < str.length) { str.length = (sz_size_t)(end - start); }
- if (look_up_table.length != 256) {
- PyErr_SetString(PyExc_ValueError, "The look-up table must be exactly 256 bytes long");
- return NULL;
+ // Perform the translation using the look-up table
+ if (is_inplace) {
+ sz_look_up_transform(str.start, str.length, look_up_table, str.start);
+ Py_RETURN_NONE;
}
+ // Allocate a string of the same size, get its raw pointer and transform the data into it
+ else {
+
+ // For binary inputs return bytes, for unicode return str
+ if (PyUnicode_Check(str_obj)) {
+ // Create a new Unicode object
+ PyObject *new_unicode_obj = PyUnicode_New(str.length, PyUnicode_MAX_CHAR_VALUE(str_obj));
+ if (!new_unicode_obj) {
+ PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for new Unicode string");
+ return NULL;
+ }
+
+ sz_ptr_t new_buffer = (sz_ptr_t)PyUnicode_DATA(new_unicode_obj);
+ sz_look_up_transform(str.start, str.length, look_up_table, new_buffer);
+ return new_unicode_obj;
+ }
+ else {
+ PyObject *new_bytes_obj = PyBytes_FromStringAndSize(NULL, str.length);
+ if (!new_bytes_obj) {
+ PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for new string");
+ return NULL;
+ }
- sz_look_up_transform(str.start, str.length, look_up_table.start, str.start);
- return Py_None;
+ // Get the buffer and perform the transformation
+ sz_ptr_t new_buffer = (sz_ptr_t)PyBytes_AS_STRING(new_bytes_obj);
+ sz_look_up_transform(str.start, str.length, look_up_table, new_buffer);
+ return new_bytes_obj;
+ }
+ }
}
+static char const doc_find_first_of[] = //
+ "Find the index of the first occurrence of any character from another string.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " chars (str): A string containing characters to search for.\n"
+ " start (int, optional): Starting index (default is 0).\n"
+ " end (int, optional): Ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: Index of the first matching character, or -1 if none found.";
+
static PyObject *Str_find_first_of(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1984,6 +2254,16 @@ static PyObject *Str_find_first_of(PyObject *self, PyObject *args, PyObject *kwa
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_find_first_not_of[] = //
+ "Find the index of the first character not in another string.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " chars (str): A string containing characters to exclude.\n"
+ " start (int, optional): Starting index (default is 0).\n"
+ " end (int, optional): Ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: Index of the first non-matching character, or -1 if all match.";
+
static PyObject *Str_find_first_not_of(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -1994,6 +2274,16 @@ static PyObject *Str_find_first_not_of(PyObject *self, PyObject *args, PyObject
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_find_last_of[] = //
+ "Find the index of the last occurrence of any character from another string.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " chars (str): A string containing characters to search for.\n"
+ " start (int, optional): Starting index (default is 0).\n"
+ " end (int, optional): Ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: Index of the last matching character, or -1 if none found.";
+
static PyObject *Str_find_last_of(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -2004,6 +2294,16 @@ static PyObject *Str_find_last_of(PyObject *self, PyObject *args, PyObject *kwar
return PyLong_FromSsize_t(signed_offset);
}
+static char const doc_find_last_not_of[] = //
+ "Find the index of the last character not in another string.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " chars (str): A string containing characters to exclude.\n"
+ " start (int, optional): Starting index (default is 0).\n"
+ " end (int, optional): Ending index (default is the string length).\n\n"
+ "Returns:\n"
+ " int: Index of the last non-matching character, or -1 if all match.";
+
static PyObject *Str_find_last_not_of(PyObject *self, PyObject *args, PyObject *kwargs) {
Py_ssize_t signed_offset;
sz_string_view_t text;
@@ -2302,38 +2602,131 @@ static PyObject *Str_split_with_known_callback(PyObject *self, PyObject *args, P
: Str_rsplit_(text_object, text, separator, keepseparator, maxsplit, finder, match_length);
}
+static char const doc_split[] = //
+ "Split a string by a separator.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to split by (cannot be empty).\n"
+ " maxsplit (int, optional): Maximum number of splits (default is no limit).\n"
+ " keepseparator (bool, optional): Include the separator in results (default is False).\n\n"
+ "Returns:\n"
+ " Strs: A list of strings split by the separator.\n\n"
+ "Raises:\n"
+ " ValueError: If the separator is an empty string.";
+
static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_find, 0, sz_false_k, sz_false_k);
}
+static char const doc_rsplit[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Split a string by a separator starting from the end.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to split by (cannot be empty).\n"
+ " maxsplit (int, optional): Maximum number of splits (default is no limit).\n"
+ " keepseparator (bool, optional): Include the separator in results (default is False).\n\n"
+ "Returns:\n"
+ " Strs: A list of strings split by the separator.\n\n"
+ "Raises:\n"
+ " ValueError: If the separator is an empty string.";
+
static PyObject *Str_rsplit(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_rfind, 0, sz_true_k, sz_false_k);
}
+static char const doc_split_charset[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Split a string by a set of character separators.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separators (str): A string containing separator characters.\n"
+ " maxsplit (int, optional): Maximum number of splits (default is no limit).\n"
+ " keepseparator (bool, optional): Include separators in results (default is False).\n\n"
+ "Returns:\n"
+ " Strs: A list of strings split by the character set.";
+
static PyObject *Str_split_charset(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_find_char_from, 1, sz_false_k, sz_false_k);
}
+static char const doc_rsplit_charset[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Split a string by a set of character separators in reverse order.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separators (str): A string containing separator characters.\n"
+ " maxsplit (int, optional): Maximum number of splits (default is no limit).\n"
+ " keepseparator (bool, optional): Include separators in results (default is False).\n\n"
+ "Returns:\n"
+ " Strs: A list of strings split by the character set.";
+
static PyObject *Str_rsplit_charset(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_char_from, 1, sz_true_k, sz_false_k);
}
+static char const doc_split_iter[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Create an iterator for splitting a string by a separator.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to split by (cannot be empty).\n"
+ " keepseparator (bool, optional): Include separator in results (default is False).\n\n"
+ "Returns:\n"
+ " iterator: An iterator yielding split substrings.\n\n"
+ "Raises:\n"
+ " ValueError: If the separator is an empty string.";
+
static PyObject *Str_split_iter(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_find, 0, sz_false_k, sz_true_k);
}
+static char const doc_rsplit_iter[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Create an iterator for splitting a string by a separator in reverse order.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separator (str): The separator to split by (cannot be empty).\n"
+ " keepseparator (bool, optional): Include separator in results (default is False).\n\n"
+ "Returns:\n"
+ " iterator: An iterator yielding split substrings in reverse.\n\n"
+ "Raises:\n"
+ " ValueError: If the separator is an empty string.";
+
static PyObject *Str_rsplit_iter(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_rfind, 0, sz_true_k, sz_true_k);
}
+static char const doc_split_charset_iter[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Create an iterator for splitting a string by a set of character separators.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separators (str): A string containing separator characters.\n"
+ " keepseparator (bool, optional): Include separators in results (default is False).\n\n"
+ "Returns:\n"
+ " iterator: An iterator yielding split substrings.";
+
static PyObject *Str_split_charset_iter(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_find_char_from, 1, sz_false_k, sz_true_k);
}
+static char const doc_rsplit_charset_iter[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Create an iterator for splitting a string by a set of character separators in reverse order.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " separators (str): A string containing separator characters.\n"
+ " keepseparator (bool, optional): Include separators in results (default is False).\n\n"
+ "Returns:\n"
+ " iterator: An iterator yielding split substrings in reverse.";
+
static PyObject *Str_rsplit_charset_iter(PyObject *self, PyObject *args, PyObject *kwargs) {
return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_char_from, 1, sz_true_k, sz_true_k);
}
+static char const doc_splitlines[] = // Docstring used by both `Str_methods` and `stringzilla_methods`.
+ "Split a string by line breaks.\n\n"
+ "Args:\n"
+ " self (Str or str or bytes): The string object.\n"
+ " keeplinebreaks (bool, optional): Include line breaks in the results (default is False).\n"
+ " maxsplit (int, optional): Maximum number of splits (default is no limit).\n\n"
+ "Returns:\n"
+ " Strs: A list of strings split by line breaks.";
+
static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) {
// Check minimum arguments
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
@@ -2479,65 +2872,52 @@ static PyGetSetDef Str_getsetters[] = {
#define SZ_METHOD_FLAGS METH_VARARGS | METH_KEYWORDS
static PyMethodDef Str_methods[] = {
- // Basic `str`, `bytes`, and `bytearray`-like functionality
- {"contains", Str_contains, SZ_METHOD_FLAGS, "Check if a string contains a substring."},
- {"count", Str_count, SZ_METHOD_FLAGS, "Count the occurrences of a substring."},
- {"splitlines", Str_splitlines, SZ_METHOD_FLAGS, "Split a string by line breaks."},
- {"startswith", Str_startswith, SZ_METHOD_FLAGS, "Check if a string starts with a given prefix."},
- {"endswith", Str_endswith, SZ_METHOD_FLAGS, "Check if a string ends with a given suffix."},
- {"translate", Str_translate, SZ_METHOD_FLAGS, "Look-Up Table in-place transformation of a byte-string."},
- {"decode", Str_decode, SZ_METHOD_FLAGS, "Decode the bytes into `str` with a given encoding"},
+ {"contains", (PyCFunction)Str_contains, SZ_METHOD_FLAGS, doc_contains},
+ {"count", (PyCFunction)Str_count, SZ_METHOD_FLAGS, doc_count},
+ {"splitlines", (PyCFunction)Str_splitlines, SZ_METHOD_FLAGS, doc_splitlines},
+ {"startswith", (PyCFunction)Str_startswith, SZ_METHOD_FLAGS, doc_startswith},
+ {"endswith", (PyCFunction)Str_endswith, SZ_METHOD_FLAGS, doc_endswith},
+ {"translate", (PyCFunction)Str_translate, SZ_METHOD_FLAGS, doc_translate},
+ {"decode", (PyCFunction)Str_decode, SZ_METHOD_FLAGS, doc_decode},
// Bidirectional operations
- {"find", Str_find, SZ_METHOD_FLAGS, "Find the first occurrence of a substring."},
- {"index", Str_index, SZ_METHOD_FLAGS, "Find the first occurrence of a substring or raise error if missing."},
- {"partition", Str_partition, SZ_METHOD_FLAGS, "Splits string into 3-tuple: before, first match, after."},
- {"split", Str_split, SZ_METHOD_FLAGS, "Split a string by a separator."},
- {"rfind", Str_rfind, SZ_METHOD_FLAGS, "Find the last occurrence of a substring."},
- {"rindex", Str_rindex, SZ_METHOD_FLAGS, "Find the last occurrence of a substring or raise error if missing."},
- {"rpartition", Str_rpartition, SZ_METHOD_FLAGS, "Splits string into 3-tuple: before, last match, after."},
- {"rsplit", Str_rsplit, SZ_METHOD_FLAGS, "Split a string by a separator in reverse order."},
+ {"find", (PyCFunction)Str_find, SZ_METHOD_FLAGS, doc_find},
+ {"index", (PyCFunction)Str_index, SZ_METHOD_FLAGS, doc_index},
+ {"partition", (PyCFunction)Str_partition, SZ_METHOD_FLAGS, doc_partition},
+ {"split", (PyCFunction)Str_split, SZ_METHOD_FLAGS, doc_split},
+ {"rfind", (PyCFunction)Str_rfind, SZ_METHOD_FLAGS, doc_rfind},
+ {"rindex", (PyCFunction)Str_rindex, SZ_METHOD_FLAGS, doc_rindex},
+ {"rpartition", (PyCFunction)Str_rpartition, SZ_METHOD_FLAGS, doc_rpartition},
+ {"rsplit", (PyCFunction)Str_rsplit, SZ_METHOD_FLAGS, doc_rsplit},
// Edit distance extensions
- {"hamming_distance", Str_hamming_distance, SZ_METHOD_FLAGS,
- "Hamming distance between two strings, as the number of replaced bytes, and difference in length."},
- {"hamming_distance_unicode", Str_hamming_distance_unicode, SZ_METHOD_FLAGS,
- "Hamming distance between two strings, as the number of replaced unicode characters, and difference in length."},
- {"edit_distance", Str_edit_distance, SZ_METHOD_FLAGS,
- "Levenshtein distance between two strings, as the number of inserted, deleted, and replaced bytes."},
- {"edit_distance_unicode", Str_edit_distance_unicode, SZ_METHOD_FLAGS,
- "Levenshtein distance between two strings, as the number of inserted, deleted, and replaced unicode characters."},
- {"alignment_score", Str_alignment_score, SZ_METHOD_FLAGS,
- "Needleman-Wunsch alignment score given a substitution cost matrix."},
+ {"hamming_distance", (PyCFunction)Str_hamming_distance, SZ_METHOD_FLAGS, doc_hamming_distance},
+ {"hamming_distance_unicode", (PyCFunction)Str_hamming_distance_unicode, SZ_METHOD_FLAGS,
+ doc_hamming_distance_unicode},
+ {"edit_distance", (PyCFunction)Str_edit_distance, SZ_METHOD_FLAGS, doc_edit_distance},
+ {"edit_distance_unicode", (PyCFunction)Str_edit_distance_unicode, SZ_METHOD_FLAGS, doc_edit_distance_unicode},
+ {"alignment_score", (PyCFunction)Str_alignment_score, SZ_METHOD_FLAGS, doc_alignment_score},
// Character search extensions
- {"find_first_of", Str_find_first_of, SZ_METHOD_FLAGS,
- "Finds the first occurrence of a character from another string."},
- {"find_last_of", Str_find_last_of, SZ_METHOD_FLAGS,
- "Finds the last occurrence of a character from another string."},
- {"find_first_not_of", Str_find_first_not_of, SZ_METHOD_FLAGS,
- "Finds the first occurrence of a character not present in another string."},
- {"find_last_not_of", Str_find_last_not_of, SZ_METHOD_FLAGS,
- "Finds the last occurrence of a character not present in another string."},
- {"split_charset", Str_split_charset, SZ_METHOD_FLAGS, "Split a string by a set of character separators."},
- {"rsplit_charset", Str_rsplit_charset, SZ_METHOD_FLAGS,
- "Split a string by a set of character separators in reverse order."},
+ {"find_first_of", (PyCFunction)Str_find_first_of, SZ_METHOD_FLAGS, doc_find_first_of},
+ {"find_last_of", (PyCFunction)Str_find_last_of, SZ_METHOD_FLAGS, doc_find_last_of},
+ {"find_first_not_of", (PyCFunction)Str_find_first_not_of, SZ_METHOD_FLAGS, doc_find_first_not_of},
+ {"find_last_not_of", (PyCFunction)Str_find_last_not_of, SZ_METHOD_FLAGS, doc_find_last_not_of},
+ {"split_charset", (PyCFunction)Str_split_charset, SZ_METHOD_FLAGS, doc_split_charset},
+ {"rsplit_charset", (PyCFunction)Str_rsplit_charset, SZ_METHOD_FLAGS, doc_rsplit_charset},
// Lazily evaluated iterators
- {"split_iter", Str_split_iter, SZ_METHOD_FLAGS, "Create an iterator for splitting a string by a separator."},
- {"rsplit_iter", Str_rsplit_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a separator in reverse order."},
- {"split_charset_iter", Str_split_charset_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a set of character separators."},
- {"rsplit_charset_iter", Str_rsplit_charset_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a set of character separators in reverse order."},
+ {"split_iter", (PyCFunction)Str_split_iter, SZ_METHOD_FLAGS, doc_split_iter},
+ {"rsplit_iter", (PyCFunction)Str_rsplit_iter, SZ_METHOD_FLAGS, doc_rsplit_iter},
+ {"split_charset_iter", (PyCFunction)Str_split_charset_iter, SZ_METHOD_FLAGS, doc_split_charset_iter},
+ {"rsplit_charset_iter", (PyCFunction)Str_rsplit_charset_iter, SZ_METHOD_FLAGS, doc_rsplit_charset_iter},
// Dealing with larger-than-memory datasets
- {"offset_within", Str_offset_within, SZ_METHOD_FLAGS,
- "Return the raw byte offset of one binary string within another."},
- {"write_to", Str_write_to, SZ_METHOD_FLAGS, "Return the raw byte offset of one binary string within another."},
+ {"offset_within", (PyCFunction)Str_offset_within, SZ_METHOD_FLAGS, doc_offset_within},
+ {"write_to", (PyCFunction)Str_write_to, SZ_METHOD_FLAGS, doc_write_to},
- {NULL, NULL, 0, NULL}};
+ {NULL, NULL, 0, NULL} // Sentinel: all-NULL entry that terminates the method table
+};
static PyTypeObject StrType = {
PyVarObject_HEAD_INIT(NULL, 0) //
@@ -3182,67 +3562,51 @@ static void stringzilla_cleanup(PyObject *m) {
static PyMethodDef stringzilla_methods[] = {
// Basic `str`, `bytes`, and `bytearray`-like functionality
- {"contains", Str_contains, SZ_METHOD_FLAGS, "Check if a string contains a substring."},
- {"count", Str_count, SZ_METHOD_FLAGS, "Count the occurrences of a substring."},
- {"splitlines", Str_splitlines, SZ_METHOD_FLAGS, "Split a string by line breaks."},
- {"startswith", Str_startswith, SZ_METHOD_FLAGS, "Check if a string starts with a given prefix."},
- {"endswith", Str_endswith, SZ_METHOD_FLAGS, "Check if a string ends with a given suffix."},
- {"translate", Str_translate, SZ_METHOD_FLAGS, "Look-Up Table in-place transformation of a byte-string."},
- {"decode", Str_decode, SZ_METHOD_FLAGS, "Decode the bytes into `str` with a given encoding"},
+ {"contains", (PyCFunction)Str_contains, SZ_METHOD_FLAGS, doc_contains},
+ {"count", (PyCFunction)Str_count, SZ_METHOD_FLAGS, doc_count},
+ {"splitlines", (PyCFunction)Str_splitlines, SZ_METHOD_FLAGS, doc_splitlines},
+ {"startswith", (PyCFunction)Str_startswith, SZ_METHOD_FLAGS, doc_startswith},
+ {"endswith", (PyCFunction)Str_endswith, SZ_METHOD_FLAGS, doc_endswith},
+ {"translate", (PyCFunction)Str_translate, SZ_METHOD_FLAGS, doc_translate},
+ {"decode", (PyCFunction)Str_decode, SZ_METHOD_FLAGS, doc_decode},
// Bidirectional operations
- {"find", Str_find, SZ_METHOD_FLAGS, "Find the first occurrence of a substring."},
- {"index", Str_index, SZ_METHOD_FLAGS, "Find the first occurrence of a substring or raise error if missing."},
- {"partition", Str_partition, SZ_METHOD_FLAGS, "Splits string into 3-tuple: before, first match, after."},
- {"split", Str_split, SZ_METHOD_FLAGS, "Split a string by a separator."},
- {"rfind", Str_rfind, SZ_METHOD_FLAGS, "Find the last occurrence of a substring."},
- {"rindex", Str_rindex, SZ_METHOD_FLAGS, "Find the last occurrence of a substring or raise error if missing."},
- {"rpartition", Str_rpartition, SZ_METHOD_FLAGS, "Splits string into 3-tuple: before, last match, after."},
- {"rsplit", Str_rsplit, SZ_METHOD_FLAGS, "Split a string by a separator in reverse order."},
+ {"find", (PyCFunction)Str_find, SZ_METHOD_FLAGS, doc_find},
+ {"index", (PyCFunction)Str_index, SZ_METHOD_FLAGS, doc_index},
+ {"partition", (PyCFunction)Str_partition, SZ_METHOD_FLAGS, doc_partition},
+ {"split", (PyCFunction)Str_split, SZ_METHOD_FLAGS, doc_split},
+ {"rfind", (PyCFunction)Str_rfind, SZ_METHOD_FLAGS, doc_rfind},
+ {"rindex", (PyCFunction)Str_rindex, SZ_METHOD_FLAGS, doc_rindex},
+ {"rpartition", (PyCFunction)Str_rpartition, SZ_METHOD_FLAGS, doc_rpartition},
+ {"rsplit", (PyCFunction)Str_rsplit, SZ_METHOD_FLAGS, doc_rsplit},
// Edit distance extensions
- {"hamming_distance", Str_hamming_distance, SZ_METHOD_FLAGS,
- "Hamming distance between two strings, as the number of replaced bytes, and difference in length."},
- {"hamming_distance_unicode", Str_hamming_distance_unicode, SZ_METHOD_FLAGS,
- "Hamming distance between two strings, as the number of replaced unicode characters, and difference in "
- "length."},
- {"edit_distance", Str_edit_distance, SZ_METHOD_FLAGS,
- "Levenshtein distance between two strings, as the number of inserted, deleted, and replaced bytes."},
- {"edit_distance_unicode", Str_edit_distance_unicode, SZ_METHOD_FLAGS,
- "Levenshtein distance between two strings, as the number of inserted, deleted, and replaced unicode "
- "characters."},
- {"alignment_score", Str_alignment_score, SZ_METHOD_FLAGS,
- "Needleman-Wunsch alignment score given a substitution cost matrix."},
+ {"hamming_distance", (PyCFunction)Str_hamming_distance, SZ_METHOD_FLAGS, doc_hamming_distance},
+ {"hamming_distance_unicode", (PyCFunction)Str_hamming_distance_unicode, SZ_METHOD_FLAGS, doc_hamming_distance_unicode},
+ {"edit_distance", (PyCFunction)Str_edit_distance, SZ_METHOD_FLAGS, doc_edit_distance},
+ {"edit_distance_unicode", (PyCFunction)Str_edit_distance_unicode, SZ_METHOD_FLAGS, doc_edit_distance_unicode},
+ {"alignment_score", (PyCFunction)Str_alignment_score, SZ_METHOD_FLAGS, doc_alignment_score},
// Character search extensions
- {"find_first_of", Str_find_first_of, SZ_METHOD_FLAGS,
- "Finds the first occurrence of a character from another string."},
- {"find_last_of", Str_find_last_of, SZ_METHOD_FLAGS,
- "Finds the last occurrence of a character from another string."},
- {"find_first_not_of", Str_find_first_not_of, SZ_METHOD_FLAGS,
- "Finds the first occurrence of a character not present in another string."},
- {"find_last_not_of", Str_find_last_not_of, SZ_METHOD_FLAGS,
- "Finds the last occurrence of a character not present in another string."},
- {"split_charset", Str_split_charset, SZ_METHOD_FLAGS, "Split a string by a set of character separators."},
- {"rsplit_charset", Str_rsplit_charset, SZ_METHOD_FLAGS,
- "Split a string by a set of character separators in reverse order."},
+ {"find_first_of", (PyCFunction)Str_find_first_of, SZ_METHOD_FLAGS, doc_find_first_of},
+ {"find_last_of", (PyCFunction)Str_find_last_of, SZ_METHOD_FLAGS, doc_find_last_of},
+ {"find_first_not_of", (PyCFunction)Str_find_first_not_of, SZ_METHOD_FLAGS, doc_find_first_not_of},
+ {"find_last_not_of", (PyCFunction)Str_find_last_not_of, SZ_METHOD_FLAGS, doc_find_last_not_of},
+ {"split_charset", (PyCFunction)Str_split_charset, SZ_METHOD_FLAGS, doc_split_charset},
+ {"rsplit_charset", (PyCFunction)Str_rsplit_charset, SZ_METHOD_FLAGS, doc_rsplit_charset},
// Lazily evaluated iterators
- {"split_iter", Str_split_iter, SZ_METHOD_FLAGS, "Create an iterator for splitting a string by a separator."},
- {"rsplit_iter", Str_rsplit_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a separator in reverse order."},
- {"split_charset_iter", Str_split_charset_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a set of character separators."},
- {"rsplit_charset_iter", Str_rsplit_charset_iter, SZ_METHOD_FLAGS,
- "Create an iterator for splitting a string by a set of character separators in reverse order."},
+ {"split_iter", (PyCFunction)Str_split_iter, SZ_METHOD_FLAGS, doc_split_iter},
+ {"rsplit_iter", (PyCFunction)Str_rsplit_iter, SZ_METHOD_FLAGS, doc_rsplit_iter},
+ {"split_charset_iter", (PyCFunction)Str_split_charset_iter, SZ_METHOD_FLAGS, doc_split_charset_iter},
+ {"rsplit_charset_iter", (PyCFunction)Str_rsplit_charset_iter, SZ_METHOD_FLAGS, doc_rsplit_charset_iter},
// Dealing with larger-than-memory datasets
- {"offset_within", Str_offset_within, SZ_METHOD_FLAGS,
- "Return the raw byte offset of one binary string within another."},
- {"write_to", Str_write_to, SZ_METHOD_FLAGS, "Return the raw byte offset of one binary string within another."},
+ {"offset_within", (PyCFunction)Str_offset_within, SZ_METHOD_FLAGS, doc_offset_within},
+ {"write_to", (PyCFunction)Str_write_to, SZ_METHOD_FLAGS, doc_write_to},
// Global unary extensions
- {"hash", Str_like_hash, SZ_METHOD_FLAGS, "Hash a string or a byte-array."},
+ {"hash", (PyCFunction)Str_like_hash, SZ_METHOD_FLAGS, doc_like_hash}, // cast: handlers take kwargs, slot is PyCFunction
{NULL, NULL, 0, NULL}};
diff --git a/scripts/bench_search.py b/scripts/bench_search.py
index 0f49ffad..e5759d35 100644
--- a/scripts/bench_search.py
+++ b/scripts/bench_search.py
@@ -60,11 +60,16 @@ def find_all_sets(haystack: Str, characters: str) -> int:
return count
+def translate(haystack: Str, look_up_table) -> str:  # Callback handed to `log` for both `str.translate` and `Str.translate`
+ return haystack.translate(look_up_table)
+
+
def log_functionality(
tokens: List[str],
pythonic_str: str,
stringzilla_str: Str,
):
+ # Read-only Search
log("str.find", pythonic_str, tokens, find_all)
log("Str.find", stringzilla_str, tokens, find_all)
log("str.rfind", pythonic_str, tokens, rfind_all)
@@ -72,6 +77,23 @@ def log_functionality(
log("re.finditer", pythonic_str, [r" \t\n\r"], find_all_regex)
log("Str.find_first_of", stringzilla_str, [r" \t\n\r"], find_all_sets)
+ # Search & Modify
+ identity = bytes(range(256))
+ reverse = bytes(reversed(identity))
+ repeated = bytes(range(64)) * 4
+ hex = b"0123456789abcdef" * 16
+ log(
+ "str.translate",
+ pythonic_str,
+ [
+ bytes.maketrans(identity, reverse),
+ bytes.maketrans(identity, repeated),
+ bytes.maketrans(identity, hex),
+ ],
+ translate,
+ )
+ log("Str.translate", stringzilla_str, [reverse, repeated, hex], translate)
+
def bench(
haystack_path: str = None,
diff --git a/scripts/test.py b/scripts/test.py
index b30689ae..b183b47e 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -422,6 +422,11 @@ def test_unit_globals():
assert sz.edit_distance("abababab", "aaaaaaaa", 2) == 2
assert sz.edit_distance("abababab", "aaaaaaaa", bound=2) == 2
+ assert sz.translate("ABC", {"A": "X", "B": "Y", "C": "Z"}) == "XYZ"
+ assert sz.translate("ABC", {"A": "X", "B": "Y"}) == "XYC"
+ assert sz.translate("ABC", {"A": "X", "B": "Y"}, 1, -1) == "YC"
+ assert sz.translate("ABC", bytes(range(256))) == "ABC"
+
def test_string_lengths():
assert 4 == len(sz.Str("abcd"))
diff --git a/setup.py b/setup.py
index 47e96e7f..85943a6d 100644
--- a/setup.py
+++ b/setup.py
@@ -109,7 +109,7 @@ def windows_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]:
macros_args = [
("SZ_USE_X86_AVX512", "1" if is_64bit_x86() else "0"),
("SZ_USE_X86_AVX2", "1" if is_64bit_x86() else "0"),
- ("SZ_USE_ARM_SVE", "1" if is_64bit_arm() else "0"),
+ ("SZ_USE_ARM_SVE", "0"),
("SZ_USE_ARM_NEON", "1" if is_64bit_arm() else "0"),
("SZ_DETECT_BIG_ENDIAN", "1" if is_big_endian() else "0"),
]