3737from dapper_python .databases_v2 .database import Metadata
3838from dapper_python .databases_v2 .mingw_db import MinGWDatabase
3939from dapper_python .databases_v2 .mingw_db import Package , PackageFile , SourceFile
40- from dapper_python .databases_v2 .mingw_db import FunctionSymbol , PreprocessDefine , StringLiteral
40+ from dapper_python .databases_v2 .mingw_db import FunctionSymbol
4141from dapper_python .dataset_generation .parsing .cpp import CPPTreeParser
4242from dapper_python .dataset_generation .utils .archive import SafeTarFile , SafeZipFile
4343
@@ -138,7 +138,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
138138 self ._package_dir = None
139139 return self ._exit_stack .__exit__ (exc_type , exc_val , exc_tb )
140140
141- def analyze_package (self ) -> tuple [ Package | None , dict [ str , Any ]] :
141+ def analyze_package (self ) -> Package | None :
142142 """Analyzes the package and returns the parsed data"""
143143 if self ._temp_dir is None :
144144 raise RuntimeError ("Must be used within context manager" )
@@ -152,7 +152,7 @@ def analyze_package(self) -> tuple[Package | None, dict[str, Any]]:
152152 analyzed_package_sources = self ._analyze_package_source ()
153153 mingw_package .source_files = analyzed_package_sources
154154 except (zstd .ZstdError , tarfile .ReadError ):
155- return None , {}
155+ return None
156156
157157 with suppress (zstd .ZstdError , tarfile .ReadError ):
158158 analyzed_package_files , symbols = self ._analyze_package_contents ()
@@ -165,7 +165,7 @@ def analyze_package(self) -> tuple[Package | None, dict[str, Any]]:
165165 # Such as std::string -> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>
166166 function_symbol .in_binary = function_symbol .qualified_symbol_name in symbols
167167
168- return mingw_package , self . _as_json_dict ( mingw_package )
168+ return mingw_package
169169
170170 def _analyze_package_source (self ) -> list [SourceFile ]:
171171 if self ._source_dir is None :
@@ -221,17 +221,6 @@ def _analyze_package_source(self) -> list[SourceFile]:
221221 )
222222 for x in tree .parse_functions ()
223223 ]
224- # Monkey-patch in pre-process defines and string literals
225- # Which are not included in the database but should be passed up the chain, associated with each SourceFile
226- source_file .preproc_defines = [
227- PreprocessDefine (name = x .name , value = x .value )
228- for x in tree .parse_preproc_defs ()
229- ]
230- source_file .string_literals = [
231- StringLiteral (value = x .value )
232- for x in tree .parse_string_literals ()
233- ]
234-
235224 source_files .append (source_file )
236225
237226 return source_files
@@ -287,29 +276,6 @@ def _analyze_package_contents(self) -> tuple[list[PackageFile], set[str]]:
287276
288277 return package_files , symbols
289278
290- # noinspection Pydantic
291- def _as_json_dict (self , package : Package ) -> dict [str , Any ]:
292- # preproc_defines and string_literals attributes are monkey-patched onto each file
293- # Since they are [currently] not part of the actual database schema
294- return {
295- "package_info" : {
296- "name" : self ._mysys_package .package_name ,
297- "version" : self ._mysys_package .package_version ,
298- "description" : self ._mysys_package .description ,
299- "package_url" : self ._mysys_package .package_url ,
300- "source_url" : self ._mysys_package .source_url ,
301- "contents_url" : self ._mysys_package .contents_url ,
302- },
303- "contents" : {
304- str (_source_file .file_path ): {
305- "functions" : [x .model_dump (exclude = {"id" , "file_id" }) for x in _source_file .functions ],
306- "preproc_defines" : [x .model_dump (exclude = {"id" , "file_id" }) for x in _source_file .preproc_defines ],
307- "string_literals" : [x .model_dump (exclude = {"id" , "file_id" }) for x in _source_file .string_literals ],
308- }
309- for _source_file in package .source_files
310- },
311- }
312-
313279 _NON_FUNCTION_PREFIXES = (
314280 "sub_" , # Special case since we don't want anonymous functions/code sections angr finds without a name
315281 "vtable for" ,
@@ -347,14 +313,9 @@ def disable_logging(highest_level=logging.CRITICAL):
347313
348314def main ():
349315 parser = argparse .ArgumentParser ()
350- parser .add_argument (
351- "-d" , "--dir" ,
352- type = Path , default = Path .cwd (),
353- help = "Directory to save database + generated files to" ,
354- )
355316 parser .add_argument (
356317 "-o" , "--output" ,
357- type = str , default = "MinGWDB.db" ,
318+ type = Path , default = Path ( "MinGWDB.db" ) ,
358319 help = "Name of the output database file" ,
359320 )
360321 parser .add_argument (
@@ -364,9 +325,6 @@ def main():
364325 )
365326 args = parser .parse_args ()
366327
367- if not args .dir .exists () or not args .dir .is_dir ():
368- raise FileNotFoundError (f"No such directory: { args .dir } " )
369-
370328 params = {"repo" : Arch .MINGW_64 }
371329 with suppress_warnings (), requests .get (PACKAGE_INDEX_URL , params = params , verify = False ) as response :
372330 response .raise_for_status ()
@@ -397,8 +355,7 @@ def main():
397355 package_url = package_link ,
398356 )
399357
400- db_path = args .dir .joinpath (args .output )
401- mingw_db = MinGWDatabase .create_database (db_path , exist_ok = True )
358+ mingw_db = MinGWDatabase .create_database (args .output , exist_ok = True )
402359 with mingw_db .session () as session :
403360 # Remove any outdated packages
404361 with session .begin ():
@@ -419,14 +376,12 @@ def main():
419376
420377 # noinspection PyTypeChecker, Pydantic
421378 saved_packages : set [str ] = set (session .exec (select (Package .package_name )))
379+ to_update = sorted (list (set (package_list .keys ()) - saved_packages ))
422380 to_update = [
423381 package_list [package_name ]
424- for package_name in set ( package_list . keys ()) - saved_packages
382+ for package_name in to_update
425383 ]
426384
427- json_dir : Path = args .dir .joinpath ("json_dump" )
428- json_dir .mkdir (exist_ok = True )
429-
430385 # Get new packages and add to the database
431386 progress_iter = tqdm (
432387 to_update ,
@@ -437,15 +392,16 @@ def main():
437392 )
438393 for package in progress_iter :
439394 with PackageAnalyzer (package ) as analyzer :
440- mingw_package , json_dump = analyzer .analyze_package ()
395+ mingw_package = analyzer .analyze_package ()
441396 if not mingw_package :
442397 continue
443398
444- dump_path = json_dir .joinpath (f"{ mingw_package .package_name } .json" )
445399 with session .begin ():
446400 session .add (mingw_package )
447- with open (dump_path , "w" , encoding = "utf-8" ) as f :
448- json .dump (json_dump , f , indent = "\t " )
401+
402+ # Due to somewhat high memory usage, free up memory before the next loop starts
403+ del mingw_package
404+ mingw_package = None
449405
450406 # Reset the metadata if it already exists and set new version
451407 with session .begin ():
0 commit comments