-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexamine_user_payload.py
152 lines (137 loc) · 7.27 KB
/
examine_user_payload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import tarfile
import json
import struct
from collections import defaultdict
def _iter_json_objects(data):
    """Yield each JSON object that can be parsed out of a raw byte string.

    The bytes are scanned for ``{``; from there the matching ``}`` is located
    by counting brace depth. Braces that appear inside JSON string literals
    are ignored, so a value such as ``"a } b"`` does not end the object early.
    Spans that are not valid UTF-8 JSON are skipped silently, because the
    input is expected to be mostly binary noise.

    Args:
        data: Raw ``bytes`` to scan.

    Yields:
        dict: Every successfully decoded JSON object, in file order.
    """
    size = len(data)
    start = 0
    while start < size:
        json_start = data.find(b'{', start)
        if json_start == -1:
            return

        depth = 1
        pos = json_start + 1
        in_string = False
        escaped = False
        while depth > 0 and pos < size:
            byte = data[pos]
            if in_string:
                if escaped:
                    escaped = False
                elif byte == 0x5C:  # backslash: the next byte is escaped
                    escaped = True
                elif byte == 0x22:  # closing double quote
                    in_string = False
            elif byte == 0x22:  # opening double quote
                in_string = True
            elif byte == 0x7B:  # '{'
                depth += 1
            elif byte == 0x7D:  # '}'
                depth -= 1
            pos += 1

        if depth != 0:
            # Never balanced before the end of the data: this '{' was noise,
            # but a later '{' inside the same region may still start an object.
            start = json_start + 1
            continue

        # Whether or not the span parses, resume scanning after it.
        start = pos
        try:
            decoded = json.loads(data[json_start:pos].decode('utf-8'))
        except (ValueError, RecursionError):
            # ValueError covers both UnicodeDecodeError and JSONDecodeError.
            continue
        yield decoded


def _record_fields(field_stats, decoded, prefix=""):
    """Update per-field count, type set and latest example from one object.

    Args:
        field_stats: Mapping of field name to a dict with ``'count'``,
            ``'types'`` (a set) and ``'example'`` entries; missing keys must
            be created on access (e.g. a ``defaultdict``).
        decoded: The parsed JSON object whose top-level fields are recorded.
        prefix: Text prepended to every field name before it is used as a key.
    """
    for field, value in decoded.items():
        stats = field_stats[f"{prefix}{field}"]
        stats['count'] += 1
        stats['types'].add(type(value).__name__)
        # Containers are kept as-is; scalars are stored as their string form.
        stats['example'] = value if isinstance(value, (list, dict)) else str(value)


def examine_payload(segment_path, users_found, field_stats, *,
                    max_users=100, max_db_files=5, max_metadata=5):
    """Examine the payload data in a segment file.

    Opens the tar archive at ``segment_path`` and looks at two kinds of
    members:

    * members whose name contains ``payload_storage/page_0.dat``: the first
      16 bytes are skipped (presumed header) and every JSON object containing
      a ``user_id`` key is printed and counted as a user;
    * the first ``max_db_files`` members whose name contains
      ``db_backup/shared_checksum`` and ends in ``.sst``: up to
      ``max_metadata`` JSON objects per file are printed and recorded with a
      ``metadata_`` prefix.

    Args:
        segment_path: Path to the segment tar archive.
        users_found: One-element list holding the running user count; it is
            incremented in place.
        field_stats: Mapping updated in place with per-field statistics
            (see ``_record_fields``).
        max_users: Stop reporting users once ``users_found[0]`` reaches this.
        max_db_files: Maximum number of ``.sst`` members to inspect.
        max_metadata: Maximum number of JSON objects reported per ``.sst``.
    """
    from itertools import islice

    print(f"\nExamining payload in: {os.path.basename(segment_path)}")

    with tarfile.open(segment_path, 'r') as tar:
        members = tar.getmembers()

        # Try to find and read payload data
        payload_files = [m for m in members if 'payload_storage/page_0.dat' in m.name]
        for payload_file in payload_files:
            print(f"\nReading payload file: {payload_file.name}")
            f = tar.extractfile(payload_file)
            if f is None:  # not a regular file
                continue
            with f:
                data = f.read()
            print(f"Payload size: {len(data)} bytes")

            try:
                # Skip header (first 16 bytes)
                payload_data = data[16:]
                if users_found[0] < max_users:
                    for decoded in _iter_json_objects(payload_data):
                        if 'user_id' not in decoded:
                            continue
                        users_found[0] += 1
                        print(f"\nUser {users_found[0]}:")
                        print(json.dumps(decoded, indent=2))
                        _record_fields(field_stats, decoded)
                        if users_found[0] >= max_users:
                            break
            except Exception as e:
                # Best-effort scan: report and move on to the next member.
                print(f"Error reading payload data: {e}")

        # Also try to read the db_backup files which might contain metadata
        db_files = [
            m for m in members
            if 'db_backup/shared_checksum' in m.name and m.name.endswith('.sst')
        ]
        for db_file in db_files[:max_db_files]:  # Only check a few files to save time
            print(f"\nChecking database file: {db_file.name}")
            f = tar.extractfile(db_file)
            if f is None:
                continue
            with f:
                data = f.read()

            try:
                for decoded in islice(_iter_json_objects(data), max_metadata):
                    print("\nFound metadata:")
                    print(json.dumps(decoded, indent=2))
                    _record_fields(field_stats, decoded, prefix="metadata_")
            except Exception as e:
                print(f"Error reading database file: {e}")
def main(segments_dir="extracted_users_data/0/segments"):
    """Scan segment archives for user payloads and print field statistics.

    Every ``.tar`` file in ``segments_dir`` is passed to ``examine_payload``
    until 100 users have been found; the total and the per-field statistics
    gathered along the way are then printed.

    Args:
        segments_dir: Directory containing the segment ``.tar`` archives.

    Raises:
        OSError: If ``segments_dir`` cannot be listed.
    """
    # Counter for users found (a one-element list so the callee can mutate it)
    users_found = [0]

    # Track field statistics
    field_stats = defaultdict(lambda: {'count': 0, 'types': set(), 'example': None})

    # os.listdir returns names in arbitrary order; sort them so that the set
    # of users reported before the early stop is the same on every run.
    for filename in sorted(os.listdir(segments_dir)):
        if not filename.endswith('.tar'):
            continue
        segment_path = os.path.join(segments_dir, filename)
        examine_payload(segment_path, users_found, field_stats)
        if users_found[0] >= 100:
            break

    print(f"\nTotal users found: {users_found[0]}")

    # Print field statistics
    print("\nField Statistics:")
    print("=" * 50)
    for field, stats in field_stats.items():
        print(f"\nField: {field}")
        print(f"Count: {stats['count']}")
        # Sets have no stable iteration order; sort for reproducible output.
        print(f"Types: {', '.join(sorted(stats['types']))}")
        print(f"Example: {stats['example']}")


if __name__ == "__main__":
    main()