Skip to content

Commit 60c6422

Browse files
committed
updated nb
1 parent 617e82f commit 60c6422

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,9 @@
2727
sys.path.append("/workspace/codonfm")
2828
from src.tokenizer import Tokenizer
2929

30-
def main(data_dir: Path):
31-
data_path = data_dir / Path("processed_unfiltered")
30+
def main(pretraining_processed_data_dir: Path, data_dir: Path):
3231
tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json")))
33-
metadata = json.load(open(data_path / "metadata.json"))
32+
metadata = json.load(open(pretraining_processed_data_dir / "metadata.json"))
3433
tokenizer = Tokenizer()
3534

3635
groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403
@@ -42,13 +41,13 @@ def main(data_dir: Path):
4241
else:
4342
curr_taxids_to_remove = set()
4443
mmap = np.memmap(
45-
data_path / cm["sequences"]["path"],
44+
pretraining_processed_data_dir / cm["sequences"]["path"],
4645
dtype=cm["sequences"]["dtype"],
4746
mode="r",
4847
shape=tuple(cm["sequences"]["shape"]),
4948
)
5049
idx_mmap = np.memmap(
51-
data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"])
50+
pretraining_processed_data_dir / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"])
5251
)
5352
for start, end, taxid in idx_mmap:
5453
if taxid in curr_taxids_to_remove:
@@ -60,11 +59,12 @@ def main(data_dir: Path):
6059
# %%
6160
for g in counts:
6261
counts[g] = counts[g].tolist()
63-
json.dump(counts, open("/data/ncbi/codon_counts_nopathogen.json", "w"))
62+
json.dump(counts, open(data_dir / "codon_counts_nopathogen.json", "w"))
6463

6564

6665
if __name__ == "__main__":
6766
parser = argparse.ArgumentParser(description="Check codon frequency")
67+
parser.add_argument("--pretraining_processed_data_dir", type=str, required=True)
6868
parser.add_argument("--data_dir", type=str, required=True)
6969
args = parser.parse_args()
70-
main(Path(args.data_dir))
70+
main(Path(args.pretraining_processed_data_dir), Path(args.data_dir))

0 commit comments

Comments
 (0)