Skip to content

Commit 3fa0b03

Browse files
committed
Remove JSON dumps (unneeded for production code) due to excessive memory and disk space usage
1 parent 0cb7fe4 commit 3fa0b03

1 file changed

Lines changed: 13 additions & 57 deletions

File tree

dataset-generation/create_mingw_db.py

Lines changed: 13 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from dapper_python.databases_v2.database import Metadata
3838
from dapper_python.databases_v2.mingw_db import MinGWDatabase
3939
from dapper_python.databases_v2.mingw_db import Package, PackageFile, SourceFile
40-
from dapper_python.databases_v2.mingw_db import FunctionSymbol, PreprocessDefine, StringLiteral
40+
from dapper_python.databases_v2.mingw_db import FunctionSymbol
4141
from dapper_python.dataset_generation.parsing.cpp import CPPTreeParser
4242
from dapper_python.dataset_generation.utils.archive import SafeTarFile, SafeZipFile
4343

@@ -138,7 +138,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
138138
self._package_dir = None
139139
return self._exit_stack.__exit__(exc_type, exc_val, exc_tb)
140140

141-
def analyze_package(self) -> tuple[Package | None, dict[str, Any]]:
141+
def analyze_package(self) -> Package | None:
142142
"""Analyzes the package and returns the parsed data"""
143143
if self._temp_dir is None:
144144
raise RuntimeError("Must be used within context manager")
@@ -152,7 +152,7 @@ def analyze_package(self) -> tuple[Package | None, dict[str, Any]]:
152152
analyzed_package_sources = self._analyze_package_source()
153153
mingw_package.source_files = analyzed_package_sources
154154
except (zstd.ZstdError, tarfile.ReadError):
155-
return None, {}
155+
return None
156156

157157
with suppress(zstd.ZstdError, tarfile.ReadError):
158158
analyzed_package_files, symbols = self._analyze_package_contents()
@@ -165,7 +165,7 @@ def analyze_package(self) -> tuple[Package | None, dict[str, Any]]:
165165
# Such as std::string -> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>
166166
function_symbol.in_binary = function_symbol.qualified_symbol_name in symbols
167167

168-
return mingw_package, self._as_json_dict(mingw_package)
168+
return mingw_package
169169

170170
def _analyze_package_source(self) -> list[SourceFile]:
171171
if self._source_dir is None:
@@ -221,17 +221,6 @@ def _analyze_package_source(self) -> list[SourceFile]:
221221
)
222222
for x in tree.parse_functions()
223223
]
224-
# Monkey-patch in pre-process defines and string literals
225-
# Which are not included in the database but should be passed up the chain, associated with each SourceFile
226-
source_file.preproc_defines = [
227-
PreprocessDefine(name=x.name, value=x.value)
228-
for x in tree.parse_preproc_defs()
229-
]
230-
source_file.string_literals = [
231-
StringLiteral(value=x.value)
232-
for x in tree.parse_string_literals()
233-
]
234-
235224
source_files.append(source_file)
236225

237226
return source_files
@@ -287,29 +276,6 @@ def _analyze_package_contents(self) -> tuple[list[PackageFile], set[str]]:
287276

288277
return package_files, symbols
289278

290-
# noinspection Pydantic
291-
def _as_json_dict(self, package: Package) -> dict[str, Any]:
292-
# preproc_defines and string_literals attributes are monkey-patched onto each file
293-
# Since they are [currently] not part of the actual database schema
294-
return {
295-
"package_info": {
296-
"name": self._mysys_package.package_name,
297-
"version": self._mysys_package.package_version,
298-
"description": self._mysys_package.description,
299-
"package_url": self._mysys_package.package_url,
300-
"source_url": self._mysys_package.source_url,
301-
"contents_url": self._mysys_package.contents_url,
302-
},
303-
"contents": {
304-
str(_source_file.file_path): {
305-
"functions": [x.model_dump(exclude={"id", "file_id"}) for x in _source_file.functions],
306-
"preproc_defines": [x.model_dump(exclude={"id", "file_id"}) for x in _source_file.preproc_defines],
307-
"string_literals": [x.model_dump(exclude={"id", "file_id"}) for x in _source_file.string_literals],
308-
}
309-
for _source_file in package.source_files
310-
},
311-
}
312-
313279
_NON_FUNCTION_PREFIXES = (
314280
"sub_", # Special case since we don't want anonymous functions/code sections angr finds without a name
315281
"vtable for",
@@ -347,14 +313,9 @@ def disable_logging(highest_level=logging.CRITICAL):
347313

348314
def main():
349315
parser = argparse.ArgumentParser()
350-
parser.add_argument(
351-
"-d", "--dir",
352-
type=Path, default=Path.cwd(),
353-
help="Directory to save database + generated files to",
354-
)
355316
parser.add_argument(
356317
"-o", "--output",
357-
type=str, default="MinGWDB.db",
318+
type=Path, default=Path("MinGWDB.db"),
358319
help="Name of the output database file",
359320
)
360321
parser.add_argument(
@@ -364,9 +325,6 @@ def main():
364325
)
365326
args = parser.parse_args()
366327

367-
if not args.dir.exists() or not args.dir.is_dir():
368-
raise FileNotFoundError(f"No such directory: {args.dir}")
369-
370328
params = {"repo": Arch.MINGW_64}
371329
with suppress_warnings(), requests.get(PACKAGE_INDEX_URL, params=params, verify=False) as response:
372330
response.raise_for_status()
@@ -397,8 +355,7 @@ def main():
397355
package_url=package_link,
398356
)
399357

400-
db_path = args.dir.joinpath(args.output)
401-
mingw_db = MinGWDatabase.create_database(db_path, exist_ok=True)
358+
mingw_db = MinGWDatabase.create_database(args.output, exist_ok=True)
402359
with mingw_db.session() as session:
403360
# Remove any outdated packages
404361
with session.begin():
@@ -419,14 +376,12 @@ def main():
419376

420377
# noinspection PyTypeChecker, Pydantic
421378
saved_packages: set[str] = set(session.exec(select(Package.package_name)))
379+
to_update = sorted(list(set(package_list.keys()) - saved_packages))
422380
to_update = [
423381
package_list[package_name]
424-
for package_name in set(package_list.keys()) - saved_packages
382+
for package_name in to_update
425383
]
426384

427-
json_dir: Path = args.dir.joinpath("json_dump")
428-
json_dir.mkdir(exist_ok=True)
429-
430385
# Get new packages and add to the database
431386
progress_iter = tqdm(
432387
to_update,
@@ -437,15 +392,16 @@ def main():
437392
)
438393
for package in progress_iter:
439394
with PackageAnalyzer(package) as analyzer:
440-
mingw_package, json_dump = analyzer.analyze_package()
395+
mingw_package = analyzer.analyze_package()
441396
if not mingw_package:
442397
continue
443398

444-
dump_path = json_dir.joinpath(f"{mingw_package.package_name}.json")
445399
with session.begin():
446400
session.add(mingw_package)
447-
with open(dump_path, "w", encoding="utf-8") as f:
448-
json.dump(json_dump, f, indent="\t")
401+
402+
# Due to somewhat high memory usage, free up memory before the next loop starts
403+
del mingw_package
404+
mingw_package = None
449405

450406
# Reset the metadata if it already exists and set new version
451407
with session.begin():

0 commit comments

Comments
 (0)