
Commit 872e009

Change the string append logic to use StringIO
1 parent a797c91 commit 872e009

4 files changed: 41 additions, 35 deletions
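
For context on the commit message: repeated string concatenation with += can copy the whole accumulated buffer on each append, which degrades toward quadratic time, while io.StringIO writes into a growable internal buffer and produces the final string once via getvalue(). A minimal sketch, not part of this commit, contrasting the two patterns:

    from io import StringIO
    from timeit import timeit

    def concat(parts):
        # Repeated += may copy the entire accumulated string on each append
        text = ""
        for p in parts:
            text += p
        return text

    def buffered(parts):
        # StringIO appends into a resizable buffer; one copy at the end
        buf = StringIO()
        for p in parts:
            buf.write(p)
        return buf.getvalue()

    parts = ["select 1;\n"] * 100_000
    print(timeit(lambda: concat(parts), number=10))
    print(timeit(lambda: buffered(parts), number=10))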

src/SimpleReplay/audit_logs_parsing.py

Lines changed: 11 additions & 5 deletions

@@ -3,7 +3,7 @@
 
 This module parses various auditlogs
 """
-
+from io import StringIO
 
 logger = None
 
@@ -16,7 +16,13 @@ def __init__(self):
         self.database_name = ""
         self.pid = ""
         self.xid = ""
-        self.text = ""
+        self.text = StringIO()
+
+    def clear_and_set_text(self, new_value):
+        # Better to create a new instance, rather than truncate and seek - because it's faster
+        self.text.close()
+        self.text = StringIO()
+        self.text.write(new_value)
 
     def get_filename(self):
         base_name = (
@@ -44,7 +50,7 @@ def __str__(self):
                 self.database_name,
                 self.pid,
                 self.xid,
-                self.text,
+                self.text.getvalue(),
             )
         )
 
@@ -58,11 +64,11 @@ def __eq__(self, other):
             and self.database_name == other.database_name
             and self.pid == other.pid
             and self.xid == other.xid
-            and self.text == other.text
+            and self.text.getvalue() == other.text.getvalue()
         )
 
     def __hash__(self):
-        return hash((str(self.pid), str(self.xid), self.text.strip("\n")))
+        return hash((str(self.pid), str(self.xid), self.text.getvalue().strip("\n")))
 
 
 class ConnectionLog:
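
The comment in clear_and_set_text points at the alternative of reusing one buffer. For comparison, that rejected variant would look roughly like this (hypothetical, not in the commit):

    def clear_and_set_text(self, new_value):
        # Rejected alternative: reuse the existing buffer instead of reallocating
        self.text.seek(0)
        self.text.truncate()
        self.text.write(new_value)

truncate() only discards data after the current position, so the seek(0) must come first; the commit sidesteps that bookkeeping by closing the old buffer and allocating a fresh StringIO.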

src/SimpleReplay/extract/extractor/extract_parser.py

Lines changed: 10 additions & 10 deletions

@@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, user_activity_log.text):
+            if not is_duplicate(prev_query.text.getvalue(), user_activity_log.text.getvalue()):
                 if fetch_pattern.search(
-                    prev_query.text
-                ) and fetch_pattern.search(user_activity_log.text):
-                    user_activity_log.text = f"--{user_activity_log.text}"
+                    prev_query.text.getvalue()
+                ) and fetch_pattern.search(user_activity_log.text.getvalue()):
+                    user_activity_log.clear_and_set_text(f"--{user_activity_log.text.getvalue()}")
                 logs[filename].append(user_activity_log)
         else:
             logs[filename].append(user_activity_log)
@@ -87,9 +87,9 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
             user_activity_log.database_name = query_information[3][3:]
             user_activity_log.pid = query_information[5][4:]
             user_activity_log.xid = query_information[7][4:]
-            user_activity_log.text = line_split[1]
+            user_activity_log.clear_and_set_text(line_split[1])
         else:
-            user_activity_log.text += line
+            user_activity_log.text.write(line)
 
 
 def _parse_start_node_log(file, logs, databases, start_time, end_time):
@@ -107,7 +107,7 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, start_node_log.text):
+            if not is_duplicate(prev_query.text.getvalue(), start_node_log.text.getvalue()):
                 logs[filename].append(start_node_log)
         else:
             logs[filename] = [start_node_log]
@@ -132,14 +132,14 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
             start_node_log.username = query_information[4][3:].split(":")[0]
             start_node_log.pid = query_information[5][4:]
             start_node_log.xid = query_information[7][4:]
-            start_node_log.text = line_split[1].strip()
+            start_node_log.clear_and_set_text(line_split[1].strip())
         else:
-            start_node_log.text += line
+            start_node_log.text.write(line)
 
 
 def _parse_connection_log(file, connections, last_connections, start_time, end_time):
     for line in file.readlines():
-
+
         line = line.decode("utf-8")
 
         connection_information = line.split("|")

src/SimpleReplay/extract/extractor/extractor.py

Lines changed: 15 additions & 15 deletions

@@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items):
                 )
                 continue
 
-            query.text = remove_line_comments(query.text).strip()
+            query.clear_and_set_text(remove_line_comments(query.text.getvalue()).strip())
 
-            if "copy " in query.text.lower() and "from 's3:" in query.text.lower():
+            if "copy " in query.text.getvalue().lower() and "from 's3:" in query.text.getvalue().lower():
                 bucket = re.search(
-                    r"from 's3:\/\/[^']*", query.text, re.IGNORECASE
+                    r"from 's3:\/\/[^']*", query.text.getvalue(), re.IGNORECASE
                 ).group()[6:]
                 replacements.add(bucket)
-                query.text = re.sub(
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.text.getvalue(),
                     flags=re.IGNORECASE,
-                )
-            if "unload" in query.text.lower() and "to 's3:" in query.text.lower():
-                query.text = re.sub(
+                ))
+            if "unload" in query.text.getvalue().lower() and "to 's3:" in query.text.getvalue().lower():
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.text.getvalue(),
                     flags=re.IGNORECASE,
-                )
+                ))
 
-            query.text = f"{query.text.strip()}"
-            if not len(query.text) == 0:
-                if not query.text.endswith(";"):
-                    query.text += ";"
+            query.clear_and_set_text(f"{query.text.getvalue().strip()}")
+            if not len(query.text.getvalue()) == 0:
+                if not query.text.getvalue().endswith(";"):
+                    query.text.write(";")
 
-            query_info["text"] = query.text
+            query_info["text"] = query.text.getvalue()
             sql_json["transactions"][query.xid]["queries"].append(query_info)
 
             if not hash((query.database_name, query.username, query.pid)) in last_connections:
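
A subtlety the trailing-semicolon change relies on: StringIO.write appends at the current stream position, which sits at the end of the buffer right after the preceding clear_and_set_text, and getvalue() returns the full contents regardless of position. A quick check of those semantics:

    from io import StringIO

    buf = StringIO()
    buf.write("select 1")
    # getvalue() returns the entire buffer without moving the position
    assert buf.getvalue() == "select 1"
    buf.write(";")  # appends at the current position, i.e. the end
    assert buf.getvalue() == "select 1;"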

src/SimpleReplay/log_validation.py

Lines changed: 5 additions & 5 deletions

@@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time):
     if end_time and log.record_time > end_time:
         return False
 
-    if any(word in log.text for word in problem_keywords):
+    if any(word in log.text.getvalue() for word in problem_keywords):
         return False
 
-    if any(word in log.text for word in potential_problem_keywords) and not any(word in log.text for word in not_problem_keywords):
+    if any(word in log.text.getvalue() for word in potential_problem_keywords) and not any(word in log.text.getvalue() for word in not_problem_keywords):
         return False
 
     # filter internal statement rewrites with parameter markers
-    if re.search('\$\d',log.text):
+    if re.search('\$\d',log.text.getvalue()):
         # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ )
-        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text,flags=re.DOTALL)
+        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text.getvalue(),flags=re.DOTALL)
         # remove \$\d in single line quotes ( -- $1 )
-        if '--' in log.text:
+        if '--' in log.text.getvalue():
             text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers)
         # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid
         if re.search('\$\d',text_without_valid_parameter_markers):
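
For reference, here is how the parameter-marker filter behaves once getvalue() hands it a plain string; the sample query is hypothetical:

    import re

    text = "select '$1' from t where col = $2"  # hypothetical query text
    # Drop markers inside string literals or /* */ comment blocks, as above
    stripped = re.sub(r"""'.*\$\d.*'|\/\*.*\$\d.*\*\/""", '', text, flags=re.DOTALL)
    # $2 survives outside any literal, so this log would be rejected
    print(bool(re.search(r'\$\d', stripped)))  # True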
