Switch from the `re` package to `regex`, which is slightly more performant (#1127)

JAORMX · web-flow · commit 105521696050 · 2025-02-20T16:34:22.000+02:00
`regex` is a drop-in replacement for `re` and provides better
performance.

Signed-off-by: Juan Antonio Osorio <ozz@stacklok.com>
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,7 @@ onnxruntime = "==1.20.1"
 onnx = "==1.17.0"
 spacy = "<3.8.0"
 en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"}
+regex = "==2024.11.6"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "==8.3.4"
diff --git a/src/codegate/api/v1_processing.py b/src/codegate/api/v1_processing.py
@@ -1,10 +1,10 @@
 import asyncio
 import json
-import re
 from collections import defaultdict
 from typing import AsyncGenerator, Dict, List, Optional, Tuple
 
 import cachetools.func
+import regex as re
 import requests
 import structlog
 
diff --git a/src/codegate/clients/detector.py b/src/codegate/clients/detector.py
@@ -1,8 +1,8 @@
-import re
 from abc import ABC, abstractmethod
 from functools import wraps
 from typing import List, Optional
 
+import regex as re
 import structlog
 from fastapi import Request
 
diff --git a/src/codegate/db/fim_cache.py b/src/codegate/db/fim_cache.py
@@ -1,9 +1,9 @@
 import datetime
 import hashlib
 import json
-import re
 from typing import Dict, List, Optional
 
+import regex as re
 import structlog
 from pydantic import BaseModel
 
@@ -21,6 +21,11 @@ class CachedFim(BaseModel):
     initial_id: str
 
 
+# Regular expression to match file paths in FIM messages.
+# Compiled regex to improve performance.
+filepath_matcher = re.compile(r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b", re.MULTILINE)
+
+
 class FimCache:
 
     def __init__(self):
@@ -55,8 +60,8 @@ def _match_filepath(self, message: str, provider: str) -> Optional[str]:
         # folder/testing_file.py
         # Path: file3.py
         # // Path: file3.js <-- Javascript
-        pattern = r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b"
-        matches = re.findall(pattern, message, re.MULTILINE)
+        matches = filepath_matcher.findall(message)
+
         # If no path is found, hash the entire prompt message.
         if not matches:
             return None
diff --git a/src/codegate/extract_snippets/message_extractor.py b/src/codegate/extract_snippets/message_extractor.py
@@ -1,8 +1,8 @@
-import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, List, Optional, Self
 
+import regex as re
 import structlog
 from pydantic import BaseModel, field_validator, model_validator
 from pygments.lexers import guess_lexer
diff --git a/src/codegate/pipeline/cli/cli.py b/src/codegate/pipeline/cli/cli.py
@@ -1,7 +1,7 @@
-import re
 import shlex
 from typing import Optional
 
+import regex as re
 from litellm import ChatCompletionRequest
 
 from codegate.clients.clients import ClientType
diff --git a/src/codegate/pipeline/codegate_context_retriever/codegate.py b/src/codegate/pipeline/codegate_context_retriever/codegate.py
@@ -1,6 +1,6 @@
 import json
-import re
 
+import regex as re
 import structlog
 from litellm import ChatCompletionRequest
 
@@ -19,6 +19,12 @@
 logger = structlog.get_logger("codegate")
 
 
+# Pre-compiled regex patterns for performance
+markdown_code_block = re.compile(r"```.*?```", flags=re.DOTALL)
+markdown_file_listing = re.compile(r"⋮...*?⋮...\n\n", flags=re.DOTALL)
+environment_details = re.compile(r"<environment_details>.*?</environment_details>", flags=re.DOTALL)
+
+
 class CodegateContextRetriever(PipelineStep):
     """
     Pipeline step that adds a context message to the completion request when it detects
@@ -95,11 +101,9 @@ async def process(  # noqa: C901
 
         # Remove code snippets and file listing from the user messages and search for bad packages
         # in the rest of the user query/messsages
-        user_messages = re.sub(r"```.*?```", "", user_message, flags=re.DOTALL)
-        user_messages = re.sub(r"⋮...*?⋮...\n\n", "", user_messages, flags=re.DOTALL)
-        user_messages = re.sub(
-            r"<environment_details>.*?</environment_details>", "", user_messages, flags=re.DOTALL
-        )
+        user_messages = markdown_code_block.sub("", user_message)
+        user_messages = markdown_file_listing.sub("", user_messages)
+        user_messages = environment_details.sub("", user_messages)
 
         # split messages into double newlines, to avoid passing so many content in the search
         split_messages = re.split(r"</?task>|\n|\\n", user_messages)
diff --git a/src/codegate/pipeline/pii/pii.py b/src/codegate/pipeline/pii/pii.py
@@ -1,6 +1,6 @@
-import re
 from typing import Any, Dict, List, Optional
 
+import regex as re
 import structlog
 from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
 from litellm.types.utils import Delta, StreamingChoices
diff --git a/src/codegate/pipeline/secrets/secrets.py b/src/codegate/pipeline/secrets/secrets.py
@@ -1,7 +1,7 @@
-import re
 from abc import abstractmethod
 from typing import List, Optional, Tuple
 
+import regex as re
 import structlog
 from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
 from litellm.types.utils import Delta, StreamingChoices
diff --git a/src/codegate/pipeline/secrets/signatures.py b/src/codegate/pipeline/secrets/signatures.py
@@ -1,10 +1,10 @@
 # signatures.py
 import math
-import re
 from pathlib import Path
 from threading import Lock
 from typing import ClassVar, Dict, List, NamedTuple, Optional, Union
 
+import regex as re
 import structlog
 import yaml
 
diff --git a/src/codegate/providers/copilot/provider.py b/src/codegate/providers/copilot/provider.py
@@ -1,13 +1,13 @@
 import asyncio
 import datetime
 import os
-import re
 import ssl
 import tempfile
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import unquote, urljoin, urlparse
 
+import regex as re
 import structlog
 from litellm.types.utils import Delta, ModelResponse, StreamingChoices
 
@@ -29,6 +29,9 @@
 setup_logging()
 logger = structlog.get_logger("codegate").bind(origin="copilot_proxy")
 
+# Pre-compiled regex patterns for performance
+proxy_ep_pattern = re.compile(r"proxy-ep=([^;]+)")
+
 
 TEMPDIR = None
 if os.getenv("CODEGATE_DUMP_DIR"):
@@ -613,7 +616,7 @@ async def _get_target_url(self, complete_request) -> Optional[str]:
         auth_header = headers_dict.get("authorization", "")
 
         if auth_header:
-            match = re.search(r"proxy-ep=([^;]+)", auth_header)
+            match = proxy_ep_pattern.search(auth_header)
             if match:
                 self.proxy_ep = match.group(1)
                 if not urlparse(self.proxy_ep).scheme:
diff --git a/src/codegate/storage/storage_engine.py b/src/codegate/storage/storage_engine.py
@@ -1,9 +1,9 @@
 import os
-import re
 import sqlite3
 from typing import List, Optional
 
 import numpy as np
+import regex as re
 import sqlite_vec_sl_tmp
 import structlog
 
@@ -21,6 +21,11 @@
 }
 
 
+# Pre-compiled regex patterns for performance
+alpha_numeric_pattern = re.compile(r"[^\w\s]*$")
+non_alphanumeric_pattern = re.compile(r"[^\w@\/\.-]")
+
+
 class StorageEngine:
     __storage_engine = None
 
@@ -231,11 +236,11 @@ async def search(
             query_words = None
             if query:
                 # Remove all non alphanumeric characters at the end of the string
-                cleaned_query = re.sub(r"[^\w\s]*$", "", query.lower())
+                cleaned_query = alpha_numeric_pattern.sub("", query.lower())
 
                 # Remove all non alphanumeric characters in the middle of the string
                 # except @, /, . and -
-                cleaned_query = re.sub(r"[^\w@\/\.-]", " ", cleaned_query)
+                cleaned_query = non_alphanumeric_pattern.sub(" ", cleaned_query)
 
                 # Tokenize the cleaned query
                 query_words = cleaned_query.split()