-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlibrary.py
61 lines (50 loc) · 1.77 KB
/
library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from fuzzywuzzy import process, fuzz
import requests
from unidecode import unidecode
def get_with_cache(url, cache_path, timeout=30):
    """Return the body of ``url`` as text, using ``cache_path`` as a file cache.

    If ``cache_path`` can be read, its contents are returned and no request
    is made.  Otherwise the page is downloaded with ``requests``, written to
    ``cache_path`` and returned.

    Args:
        url: Address to download when the cache file cannot be read.
        cache_path: Path of the cache file.  It is read and written as UTF-8
            so the result does not depend on the platform default encoding.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the caller forever.

    Returns:
        The page text, either from the cache file or freshly downloaded.

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an HTTP error status.  Nothing is cached then.
        OSError: If the cache file cannot be written.
    """
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            return f.read()
    except OSError:
        # Cache miss (or unreadable cache file): fall through and download.
        # The download happens outside the handler so that a network error
        # is not reported as a consequence of the cache miss.
        pass

    page = requests.get(url, timeout=timeout)
    # Refuse to cache error pages: once written they would be served from
    # the cache on every later call.
    page.raise_for_status()
    text = page.text
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text
def fuzzy_value(fuzzy_dict, fuzzy_key, scorer=None):
    """Return the value of ``fuzzy_dict`` whose key best matches ``fuzzy_key``.

    The key is resolved with ``fuzzy_name_match`` against the dictionary's
    keys; ``scorer`` is forwarded to it unchanged.  A ``KeyError`` raised by
    ``fuzzy_name_match`` (no sufficiently good match) propagates to the
    caller.
    """
    matched_key = fuzzy_name_match(fuzzy_dict.keys(), fuzzy_key, scorer=scorer)
    return fuzzy_dict[matched_key]
def fuzzy_name_match(fuzzy_set, name, scorer=None):
    """Return the entry of ``fuzzy_set`` that best matches ``name``.

    The best candidate is picked with ``process.extractOne`` and then
    accepted or rejected by its score:

    * below 86: rejected silently;
    * 86 to 89: rejected, and the near miss is printed;
    * 90 to 94: accepted, and the match is printed;
    * 95 and above: accepted silently.

    Args:
        fuzzy_set: Collection of candidate names to match against.
        name: The name to look up.
        scorer: Scoring function handed to ``process.extractOne``.  Defaults
            to ``fuzz.partial_token_sort_ratio``.

    Returns:
        The best-matching candidate from ``fuzzy_set``.

    Raises:
        KeyError: If there are no candidates or the best score is below 90.
            The exception carries ``name`` as its argument.
    """
    if scorer is None:
        scorer = fuzz.partial_token_sort_ratio

    best = process.extractOne(name, fuzzy_set, scorer=scorer)
    if best is None:
        # extractOne yields None when it has no candidates to score (for
        # example an empty fuzzy_set); report that as "no match" instead of
        # failing with a TypeError when indexing None.
        raise KeyError(name)

    candidate, score = best[0], best[1]
    if score < 90:
        if score >= 86:
            # Close enough to be worth a look by a human, but not trusted.
            print("Potential fuzzy match not used {} == {} ({})".format(
                candidate, name, score))
        raise KeyError(name)
    if score < 95:
        print("Fuzzy matched {} == {} ({})".format(
            candidate, name, score))
    return candidate
# Cache used by normalise_name: cleaned name -> resolved name, or None when
# no entry of name_filter matched.
normalised_names = {}
# Names registered through normalise_name(..., filter_add=True); lookups with
# filter_apply=True are matched against this set.
name_filter = set()
def normalise_name(name, filter_add=False, filter_apply=False):
    """Clean ``name`` and optionally resolve it against the name filter.

    Cleaning drops everything from the first "(" onwards, strips trailing
    whitespace and passes the result through ``unidecode``.

    Args:
        name: Raw name to normalise.
        filter_add: If true, register the cleaned name in the module-level
            ``name_filter`` set.
        filter_apply: If true, resolve the cleaned name against
            ``name_filter``: an exact member resolves to itself, anything
            else goes through ``fuzzy_name_match``.  The outcome, including
            a failed match, is remembered in ``normalised_names``.

    Returns:
        The remembered result if the cleaned name has one; otherwise, with
        ``filter_apply``, the matching filter entry or ``None`` when nothing
        matches; otherwise the cleaned name itself.
    """
    name = unidecode(name.partition("(")[0].rstrip())

    if filter_add:
        name_filter.add(name)
        # A result remembered before this name joined the filter (a fuzzy
        # match to another entry, or None) is now stale: exact members must
        # resolve to themselves.  Results remembered for other names are
        # left as they are.
        normalised_names.pop(name, None)

    if name in normalised_names:
        return normalised_names[name]
    if not filter_apply:
        return name

    if name in name_filter:
        matched_name = name
    else:
        try:
            matched_name = fuzzy_name_match(name_filter, name)
        except KeyError:
            # No acceptable match; remember the miss so the fuzzy search is
            # not repeated for this name.
            matched_name = None
    normalised_names[name] = matched_name
    return matched_name