-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfix_annotations.py
77 lines (64 loc) · 2.9 KB
/
fix_annotations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import argparse
import re
import omero.cli
# Command-line interface: two required positional arguments, the URL regex
# and the annotation namespace.
parser = argparse.ArgumentParser(
    description="Generate SQL script to fix duplicated URL annotations")
parser.add_argument(
    "url",
    # The example regex is written as a raw string: "\." and "\/" are not
    # valid escape sequences in a normal string literal and trigger a
    # warning on current Python versions. The resulting help text is
    # unchanged.
    help="Regex of the URL to check for (incl a named group to match the ID!)"
         r', e.g. ".+ncbi\.nlm\.nih\.gov\/gene\/(?P<ID>.+)"')
parser.add_argument(
    "namespace",
    help="The namespace of the annotations, e.g. openmicroscopy.org/mapr/gene")
# Maps an annotation namespace to the map-annotation key under which the
# URL is stored for that namespace.
url_names = {
    "openmicroscopy.org/mapr/gene": "Gene Identifier URL",
    "openmicroscopy.org/mapr/compound": "Compound Name URL",
    "openmicroscopy.org/mapr/phenotype": "Phenotype Term Accession URL",
    "openmicroscopy.org/mapr/antibody": "Antibody Identifier URL",
}
# URL patterns in order of preference: prefer https:// without www over
# https:// with www.
pref_urls = [
    re.compile(r"^https:\/\/(?!www).+"),  # https:// not followed by "www"
    re.compile(r"^https:\/\/.+"),  # any https:// URL
]
def get_annotations(conn, namespace):
    """
    Get all map annotations with the specific namespace

    Nothing is requested from the server until the returned generator is
    iterated for the first time.

    :param conn: Reference to the BlitzGateway
    :param namespace: The namespace
    :return: Generator for map annotations
    """
    metadata_service = conn.getMetadataService()
    # NOTE(review): the two trailing None arguments are presumably the
    # "exclude" namespaces and the query options - confirm against the
    # IMetadata API.
    annotations = metadata_service.loadSpecifiedAnnotations(
        'omero.model.MapAnnotation', [namespace], None, None)
    yield from annotations
def get_urls(ann, pattern):
    """
    Get all URLs from the map annotation matching the given pattern

    Only map entries whose name equals the URL key configured in
    ``url_names`` for the annotation's namespace are considered. Values
    that do not match the pattern are ignored.

    :param ann: The map annotation
    :param pattern: The compiled regex pattern; must define a named
                    group "ID"
    :return: Dictionary with lists of URLs (key: ID)
    :raises KeyError: If the annotation's namespace has no entry in
                      ``url_names``
    """
    # The key is the same for every entry of this annotation, so look it up
    # once.
    url_key = url_names[ann._ns._val]
    urls = {}
    for nv in ann._mapValue:
        if nv.name != url_key:
            continue
        m = pattern.match(nv.value)
        if m:
            # Group the full URL under the ID extracted from it.
            urls.setdefault(m.group("ID"), []).append(nv.value)
    return urls
def _select_url_to_keep(urls):
    """Return the URL matching the most preferred pattern in pref_urls, or None."""
    # Patterns form the outer loop so that their order is what decides:
    # a URL matching an earlier pattern wins over one matching a later
    # pattern, regardless of the order of the URLs themselves.
    for pref_url in pref_urls:
        for url in urls:
            if pref_url.match(url):
                return url
    return None


def check_annotations(conn, args):
    """
    Print SQL statements which remove duplicated URLs from map annotations

    For every annotation of the given namespace the URLs are grouped by
    their ID. If an ID has more than one URL, the preferred URL (see
    ``pref_urls``) is kept and a DELETE statement is printed for each of
    the others. If none of the URLs matches a preferred pattern, nothing is
    printed for that ID.

    :param conn: Reference to the BlitzGateway
    :param args: The parsed command line arguments (uses ``args.url`` and
                 ``args.namespace``)
    """
    pattern = re.compile(args.url)
    for ann in get_annotations(conn, args.namespace):
        for url_id, urls in get_urls(ann, pattern).items():
            if len(urls) < 2:
                continue  # no duplicates for this ID
            url_to_keep = _select_url_to_keep(urls)
            if not url_to_keep:
                continue
            print(f"-- ID: {url_id} - Annotation ID: {ann._id._val}")
            print(f"-- URLs: {urls} - keep: {url_to_keep}")
            for url in urls:
                if url != url_to_keep:
                    # Double single quotes so the value stays a valid SQL
                    # string literal.
                    sql_value = url.replace("'", "''")
                    print(f"DELETE FROM annotation_mapvalue mv WHERE mv.annotation_id = {ann._id._val} AND mv.value = '{sql_value}';\n")
def main():
    """
    Parse the command line, log in to OMERO and print the SQL script

    The check runs inside the ``cli_login`` context because it needs the
    client obtained from that context.
    """
    # Imported explicitly: the module-level "import omero.cli" does not
    # guarantee that the omero.gateway submodule has been loaded.
    from omero.gateway import BlitzGateway

    args = parser.parse_args()
    with omero.cli.cli_login() as c:
        conn = BlitzGateway(client_obj=c.get_client())
        check_annotations(conn, args)


if __name__ == "__main__":
    main()