2727sys .path .append ("/workspace/codonfm" )
2828from src .tokenizer import Tokenizer
2929
30- def main (data_dir : Path ):
31- data_path = data_dir / Path ("processed_unfiltered" )
30+ def main (pretraining_processed_data_dir : Path , data_dir : Path ):
3231 tax_ids_to_remove = json .load (open (data_dir / Path ("taxids_to_remove.json" )))
33- metadata = json .load (open (data_path / "metadata.json" ))
32+ metadata = json .load (open (pretraining_processed_data_dir / "metadata.json" ))
3433 tokenizer = Tokenizer ()
3534
3635 groups = set ([x ["file_name" ][:- 4 ] for x in metadata ["file_metadata" ]]) # noqa: C403
@@ -42,13 +41,13 @@ def main(data_dir: Path):
4241 else :
4342 curr_taxids_to_remove = set ()
4443 mmap = np .memmap (
45- data_path / cm ["sequences" ]["path" ],
44+ pretraining_processed_data_dir / cm ["sequences" ]["path" ],
4645 dtype = cm ["sequences" ]["dtype" ],
4746 mode = "r" ,
4847 shape = tuple (cm ["sequences" ]["shape" ]),
4948 )
5049 idx_mmap = np .memmap (
51- data_path / cm ["index" ]["path" ], dtype = cm ["index" ]["dtype" ], mode = "r" , shape = tuple (cm ["index" ]["shape" ])
50+ pretraining_processed_data_dir / cm ["index" ]["path" ], dtype = cm ["index" ]["dtype" ], mode = "r" , shape = tuple (cm ["index" ]["shape" ])
5251 )
5352 for start , end , taxid in idx_mmap :
5453 if taxid in curr_taxids_to_remove :
@@ -60,11 +59,12 @@ def main(data_dir: Path):
6059 # %%
6160 for g in counts :
6261 counts [g ] = counts [g ].tolist ()
63- json .dump (counts , open ("/data/ncbi/codon_counts_nopathogen.json" , "w" ))
62+ json .dump (counts , open (data_dir / "codon_counts_nopathogen.json" , "w" ))
6463
6564
6665if __name__ == "__main__" :
6766 parser = argparse .ArgumentParser (description = "Check codon frequency" )
67+ parser .add_argument ("--pretraining_processed_data_dir" , type = str , required = True )
6868 parser .add_argument ("--data_dir" , type = str , required = True )
6969 args = parser .parse_args ()
70- main (Path (args .data_dir ))
70+ main (Path (args .pretraining_processed_data_dir ), Path ( args . data_dir ))
0 commit comments