-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrim_model.py
More file actions
46 lines (36 loc) · 1.56 KB
/
trim_model.py
File metadata and controls
46 lines (36 loc) · 1.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Trim bm25_ru_default.json by removing rare terms.
Rare terms (low DF) have very high IDF but contribute almost nothing to
BM25 ranking in practice — they appear in too few documents to be useful
for retrieval. Removing them dramatically reduces memory usage.
Usage:
python trim_model.py --min-df 10
python trim_model.py --min-df 209 # 0.01% of docs
"""
import json
import os
import sys
def trim_model(input_path, output_path, min_df) -> None:
    """Write a copy of a BM25 model JSON with low-frequency terms removed.

    The input JSON must contain ``_doc_token_frequency`` (a mapping of term
    to its frequency value) and ``_doc_count``. Terms whose value is below
    ``min_df`` are dropped; every other key in the JSON is written back
    unchanged. A short summary is printed to stdout.

    Args:
        input_path: Path of the model JSON to read.
        output_path: Path the trimmed model JSON is written to.
        min_df: Minimum frequency value a term needs in order to be kept.

    Raises:
        KeyError: If a required key is missing from the input JSON.
    """
    # Explicit UTF-8: the file is written with ensure_ascii=False, so relying
    # on the platform default encoding can fail or produce non-UTF-8 output.
    with open(input_path, encoding="utf-8") as f:
        d = json.load(f)

    freq = d["_doc_token_frequency"]
    docs = d["_doc_count"]
    total = len(freq)

    trimmed = {k: v for k, v in freq.items() if v >= min_df}
    kept = len(trimmed)
    removed = total - kept
    d["_doc_token_frequency"] = trimmed

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(d, f, ensure_ascii=False)

    size_mb = os.path.getsize(output_path) / 1024 / 1024
    est_ram = kept * 80 / 1024 / 1024  # C++ unordered_map estimate

    # Guard the percentages so an empty model still gets a summary instead of
    # a ZeroDivisionError after the output file has already been written.
    df_pct = min_df / docs * 100 if docs else 0.0
    removed_pct = removed / total * 100 if total else 0.0

    print(f"Min DF: {min_df} ({df_pct:.3f}% of {docs:,} docs)")
    print(f"Terms: {total:,} -> {kept:,} (removed {removed:,}, {removed_pct:.1f}%)")
    print(f"File: {size_mb:.1f} MB")
    print(f"Est. C++ RAM: ~{est_ram:.0f} MB")
    print(f"Output: {output_path}")
if __name__ == "__main__":
    # Local import: argparse is only needed when run as a script.
    import argparse

    parser = argparse.ArgumentParser(
        description="Trim bm25_ru_default.json by removing rare terms."
    )
    # The module docstring documents `--min-df N`; the bare positional form
    # `trim_model.py N` is kept so existing invocations continue to work.
    parser.add_argument(
        "min_df_positional",
        nargs="?",
        type=int,
        default=None,
        metavar="MIN_DF",
        help="minimum frequency to keep a term (legacy positional form)",
    )
    parser.add_argument(
        "--min-df",
        dest="min_df",
        type=int,
        default=None,
        help="minimum frequency to keep a term (default: 10)",
    )
    args = parser.parse_args()

    # The flag wins over the positional; fall back to 10 when neither is given.
    if args.min_df is not None:
        min_df = args.min_df
    elif args.min_df_positional is not None:
        min_df = args.min_df_positional
    else:
        min_df = 10

    # Input and output both live next to this script.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_path = os.path.join(base_dir, "bm25_ru_default.json")
    output_path = os.path.join(base_dir, f"bm25_ru_min{min_df}.json")
    trim_model(input_path, output_path, min_df)