-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathgenerate_embeddings.py
More file actions
49 lines (36 loc) · 1.63 KB
/
generate_embeddings.py
File metadata and controls
49 lines (36 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from pathlib import Path
import os
import yaml
import argparse
from domain_orchestrator.utils import get_domain_args
# Value of the DETECTRON2_DATASETS environment variable (None when it is unset).
# NOTE(review): not referenced anywhere in the visible part of this script —
# confirm whether it is still needed.
DETECTRON2_DATASET_PATH = os.getenv("DETECTRON2_DATASETS")
def main():
    """Calculate embedding statistics for every source domain.

    Command-line arguments:
        --source_domains_file: Path to the YAML file listing the source
            domains (the file that leads to the domains' training data).
        --lora_library_path: Path to the LoRA library where the statistics
            will be stored.

    For each domain listed in the YAML file, the training dataset path is
    looked up with ``get_domain_args`` and passed, together with
    ``<lora_library_path>/<domain_name>``, to
    ``EmbeddingManager.calculate_statistics``.

    Raises:
        ValueError: If the source domains file is empty.
        FileNotFoundError: If a domain's training dataset path does not exist.
    """
    parser = argparse.ArgumentParser()
    # Path to the yaml file that contains the paths to the domains training data
    parser.add_argument("--source_domains_file", type=str, required=True)
    # Path to the lora library where the statistics will be stored
    parser.add_argument("--lora_library_path", type=str, required=True)
    cli_args = parser.parse_args()

    source_domains_file = Path(cli_args.source_domains_file)
    lora_library_path = Path(cli_args.lora_library_path)

    with open(source_domains_file, "r", encoding="utf-8") as f:
        source_domains = yaml.safe_load(f)

    # An empty YAML document loads as None; fail with a clear message instead
    # of an opaque "NoneType is not iterable" further down.
    if source_domains is None:
        raise ValueError(f"No source domains found in {source_domains_file}")

    embedding_manager = None
    print("Generating embeddings for all source domains ...")
    for domain_name in source_domains:
        # Kept under a separate name so the parsed CLI arguments are not
        # overwritten by the per-domain arguments.
        # NOTE(review): the keyword is spelled "get_cofing_only" — it must
        # match the signature of get_domain_args; confirm before renaming.
        domain_args = get_domain_args(domain_name, "train", get_cofing_only=True)
        train_dataset_path = Path(domain_args.train_dataset_path)
        print(train_dataset_path)

        # Explicit raise instead of assert: asserts are removed when Python
        # runs with -O, which would silently skip this validation.
        if not train_dataset_path.exists():
            raise FileNotFoundError(
                f"Path to training dataset {train_dataset_path} does not exist!"
            )

        if embedding_manager is None:
            # Imported and constructed only once the first domain has been
            # validated, as in the original script.
            from domain_orchestrator import embedding

            embedding_manager = embedding.EmbeddingManager()

        domain_path = lora_library_path / Path(domain_name)
        embedding_manager.calculate_statistics(
            domain_name=domain_name,
            domain_path=domain_path,
            train_path=train_dataset_path,
        )
    print("Finished generating embeddings for all domains")


if __name__ == "__main__":
    main()