Skip to content

Commit 1055216

Browse files
authored
Switch from the `re` package to `regex`, which is slightly more performant (#1127)
`regex` is a drop-in replacement of `re` and provides better performance. Signed-off-by: Juan Antonio Osorio <[email protected]>
1 parent d686510 commit 1055216

File tree

13 files changed

+41
-23
lines changed

13 files changed

+41
-23
lines changed

poetry.lock

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ onnxruntime = "==1.20.1"
4040
onnx = "==1.17.0"
4141
spacy = "<3.8.0"
4242
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"}
43+
regex = "==2024.11.6"
4344

4445
[tool.poetry.group.dev.dependencies]
4546
pytest = "==8.3.4"

src/codegate/api/v1_processing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import asyncio
22
import json
3-
import re
43
from collections import defaultdict
54
from typing import AsyncGenerator, Dict, List, Optional, Tuple
65

76
import cachetools.func
7+
import regex as re
88
import requests
99
import structlog
1010

src/codegate/clients/detector.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import re
21
from abc import ABC, abstractmethod
32
from functools import wraps
43
from typing import List, Optional
54

5+
import regex as re
66
import structlog
77
from fastapi import Request
88

src/codegate/db/fim_cache.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import datetime
22
import hashlib
33
import json
4-
import re
54
from typing import Dict, List, Optional
65

6+
import regex as re
77
import structlog
88
from pydantic import BaseModel
99

@@ -21,6 +21,11 @@ class CachedFim(BaseModel):
2121
initial_id: str
2222

2323

24+
# Regular expression to match file paths in FIM messages.
25+
# Compiled regex to improve performance.
26+
filepath_matcher = re.compile(r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b", re.MULTILINE)
27+
28+
2429
class FimCache:
2530

2631
def __init__(self):
@@ -55,8 +60,8 @@ def _match_filepath(self, message: str, provider: str) -> Optional[str]:
5560
# folder/testing_file.py
5661
# Path: file3.py
5762
# // Path: file3.js <-- Javascript
58-
pattern = r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b"
59-
matches = re.findall(pattern, message, re.MULTILINE)
63+
matches = filepath_matcher.findall(message)
64+
6065
# If no path is found, hash the entire prompt message.
6166
if not matches:
6267
return None

src/codegate/extract_snippets/message_extractor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import re
21
from abc import ABC, abstractmethod
32
from pathlib import Path
43
from typing import Dict, List, Optional, Self
54

5+
import regex as re
66
import structlog
77
from pydantic import BaseModel, field_validator, model_validator
88
from pygments.lexers import guess_lexer

src/codegate/pipeline/cli/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import re
21
import shlex
32
from typing import Optional
43

4+
import regex as re
55
from litellm import ChatCompletionRequest
66

77
from codegate.clients.clients import ClientType

src/codegate/pipeline/codegate_context_retriever/codegate.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import json
2-
import re
32

3+
import regex as re
44
import structlog
55
from litellm import ChatCompletionRequest
66

@@ -19,6 +19,12 @@
1919
logger = structlog.get_logger("codegate")
2020

2121

22+
# Pre-compiled regex patterns for performance
23+
markdown_code_block = re.compile(r"```.*?```", flags=re.DOTALL)
24+
markdown_file_listing = re.compile(r"⋮...*?⋮...\n\n", flags=re.DOTALL)
25+
environment_details = re.compile(r"<environment_details>.*?</environment_details>", flags=re.DOTALL)
26+
27+
2228
class CodegateContextRetriever(PipelineStep):
2329
"""
2430
Pipeline step that adds a context message to the completion request when it detects
@@ -95,11 +101,9 @@ async def process( # noqa: C901
95101

96102
# Remove code snippets and file listing from the user messages and search for bad packages
97103
# in the rest of the user query/messages
98-
user_messages = re.sub(r"```.*?```", "", user_message, flags=re.DOTALL)
99-
user_messages = re.sub(r"⋮...*?⋮...\n\n", "", user_messages, flags=re.DOTALL)
100-
user_messages = re.sub(
101-
r"<environment_details>.*?</environment_details>", "", user_messages, flags=re.DOTALL
102-
)
104+
user_messages = markdown_code_block.sub("", user_message)
105+
user_messages = markdown_file_listing.sub("", user_messages)
106+
user_messages = environment_details.sub("", user_messages)
103107

104108
# split messages on <task> tags and newlines, to avoid passing too much content to the search
105109
split_messages = re.split(r"</?task>|\n|\\n", user_messages)

src/codegate/pipeline/pii/pii.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import re
21
from typing import Any, Dict, List, Optional
32

3+
import regex as re
44
import structlog
55
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
66
from litellm.types.utils import Delta, StreamingChoices

src/codegate/pipeline/secrets/secrets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import re
21
from abc import abstractmethod
32
from typing import List, Optional, Tuple
43

4+
import regex as re
55
import structlog
66
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
77
from litellm.types.utils import Delta, StreamingChoices

src/codegate/pipeline/secrets/signatures.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# signatures.py
22
import math
3-
import re
43
from pathlib import Path
54
from threading import Lock
65
from typing import ClassVar, Dict, List, NamedTuple, Optional, Union
76

7+
import regex as re
88
import structlog
99
import yaml
1010

src/codegate/providers/copilot/provider.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import asyncio
22
import datetime
33
import os
4-
import re
54
import ssl
65
import tempfile
76
from dataclasses import dataclass
87
from typing import Dict, List, Optional, Tuple, Union
98
from urllib.parse import unquote, urljoin, urlparse
109

10+
import regex as re
1111
import structlog
1212
from litellm.types.utils import Delta, ModelResponse, StreamingChoices
1313

@@ -29,6 +29,9 @@
2929
setup_logging()
3030
logger = structlog.get_logger("codegate").bind(origin="copilot_proxy")
3131

32+
# Pre-compiled regex patterns for performance
33+
proxy_ep_pattern = re.compile(r"proxy-ep=([^;]+)")
34+
3235

3336
TEMPDIR = None
3437
if os.getenv("CODEGATE_DUMP_DIR"):
@@ -613,7 +616,7 @@ async def _get_target_url(self, complete_request) -> Optional[str]:
613616
auth_header = headers_dict.get("authorization", "")
614617

615618
if auth_header:
616-
match = re.search(r"proxy-ep=([^;]+)", auth_header)
619+
match = proxy_ep_pattern.search(auth_header)
617620
if match:
618621
self.proxy_ep = match.group(1)
619622
if not urlparse(self.proxy_ep).scheme:

src/codegate/storage/storage_engine.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import os
2-
import re
32
import sqlite3
43
from typing import List, Optional
54

65
import numpy as np
6+
import regex as re
77
import sqlite_vec_sl_tmp
88
import structlog
99

@@ -21,6 +21,11 @@
2121
}
2222

2323

24+
# Pre-compiled regex patterns for performance
25+
alpha_numeric_pattern = re.compile(r"[^\w\s]*$")
26+
non_alphanumeric_pattern = re.compile(r"[^\w@\/\.-]")
27+
28+
2429
class StorageEngine:
2530
__storage_engine = None
2631

@@ -231,11 +236,11 @@ async def search(
231236
query_words = None
232237
if query:
233238
# Remove all trailing characters that are neither word characters nor whitespace
234-
cleaned_query = re.sub(r"[^\w\s]*$", "", query.lower())
239+
cleaned_query = alpha_numeric_pattern.sub("", query.lower())
235240

236241
# Replace every remaining non-alphanumeric character with a space,
237242
# except @, /, . and -
238-
cleaned_query = re.sub(r"[^\w@\/\.-]", " ", cleaned_query)
243+
cleaned_query = non_alphanumeric_pattern.sub(" ", cleaned_query)
239244

240245
# Tokenize the cleaned query
241246
query_words = cleaned_query.split()

0 commit comments

Comments
 (0)