-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
54 lines (51 loc) · 1.54 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python
import argparse
from src.link_records import LinkRecords
from src.dedupe_records import DedupeRecords
def build_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--file1``/``-f1``
        (required), ``--file2``/``-f2`` (optional, ``None`` when omitted)
        and ``--dir``/``-d`` (defaults to ``experiments_files_and_output``).
    """
    parser = argparse.ArgumentParser(
        prog="PyMarc Dedupe",
        description="""Script that compares two Marc XML or JSON files
        to find duplicates using machine learning""",
    )
    parser.add_argument(
        "--file1",
        "-f1",
        required=True,
        type=str,
        help="the path to the first Marc Xml file for comparison",
    )
    parser.add_argument(
        "--file2",
        "-f2",
        required=False,
        type=str,
        help="the path to the second Marc Xml file for comparison",
    )
    parser.add_argument(
        "--dir",
        "-d",
        default="experiments_files_and_output",
        type=str,
        help="""the directory where %(prog)s will save settings,
        training, and data output files. (default: %(default)s)""",
    )
    return parser


def main(argv=None):
    """Parse the command line and run either the link or the dedupe workflow.

    When ``--file2`` is supplied, ``LinkRecords`` is built from both files and
    its ``linker()`` result is passed to ``cluster()``. When it is omitted,
    ``DedupeRecords`` is built from the first file only and its ``deduper()``
    result is passed to ``cluster()``.

    Args:
        argv: optional list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    file1 = args.file1
    file2 = args.file2
    output_dir = args.dir

    # Choose the mode from the parsed arguments rather than from a TypeError:
    # catching TypeError around the whole pipeline would hide genuine errors
    # raised while linking and silently rerun the job as a single-file dedupe.
    single_file = file2 is None

    print("file1 is " + file1)
    if single_file:
        print("No second file provided, finding duplicates within first file")
    else:
        print("file2 is " + file2)
    print("dir is " + output_dir)
    print("importing data ...")

    if single_file:
        records = DedupeRecords(file1, output_dir)
        model = records.deduper()
    else:
        records = LinkRecords(file1, file2, output_dir)
        model = records.linker()
    records.cluster(model)


if __name__ == "__main__":
    main()