Skip to content

Commit 369605e

Browse files
authored
Paper merging system (#68)
* Add AugmentedProxy, quality, merge algo * Update dependencies and lock file
1 parent f1fcaa3 commit 369605e

File tree

5 files changed

+395
-28
lines changed

5 files changed

+395
-28
lines changed

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,18 @@ readme = "README.md"
1313
license = "MIT"
1414
requires-python = ">=3.12"
1515
dependencies = [
16+
"ovld>=0.5.7",
17+
"serieux>=0.1.1",
18+
"gifnoc>=0.5.3",
1619
"backoff>=2.2.1",
1720
"beautifulsoup4>=4.13.4",
1821
"blessed>=1.21.0",
19-
"gifnoc>=0.5.3",
2022
"requests>=2.32.3",
2123
"openreview-py>=1.50.0",
2224
"requests-cache>=1.2.1",
2325
"lxml>=6.0.0",
26+
"wrapt>=1.17.2",
27+
"unidecode>=1.4.0",
2428
]
2529

2630
[project.urls]

src/paperoni/model/merge.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
from dataclasses import dataclass
2+
from difflib import SequenceMatcher
3+
from numbers import Number
4+
from typing import Any
5+
6+
from ovld import Dataclass, Medley, call_next, ovld, recurse
7+
from ovld.dependent import HasKey
8+
from serieux import Context, Serieux
9+
from wrapt import ObjectProxy
10+
11+
from ..model.classes import Institution, PaperAuthor
12+
from ..utils import associate, plainify
13+
14+
15+
@dataclass
class Annotations:
    """Metadata carried alongside a value wrapped in an ``AugmentedProxy``."""

    # Numeric score attached to a value; the merge overloads compare these
    # scores to decide which of two competing values is kept.
    quality: float
18+
19+
20+
class AugmentedProxy(ObjectProxy):
    """Proxy around ``wrapped`` that additionally carries an annotation object.

    The annotation passed as ``aug`` is kept on the proxy and exposed through
    the ``_`` property, e.g. ``proxy._.quality``.
    """

    def __init__(self, wrapped, aug):
        super().__init__(wrapped)
        # NOTE(review): wrapt documents the `_self_` prefix as the way to keep
        # an attribute on the proxy instead of forwarding it to the wrapped
        # object -- confirm against the wrapt version in use.
        self._self_ann = aug

    @property
    def _(self):
        """Return the annotation object given at construction time."""
        return self._self_ann

    def __repr__(self):  # pragma: no cover
        inner = repr(self.__wrapped__)
        return f"<{inner}>"
31+
32+
33+
@Serieux.extend
class HandleProxy(Medley):
    """Serieux extension that serializes and deserializes ``AugmentedProxy``.

    A proxy is written out as a mapping with two keys: ``"$ann"`` holds the
    serialized ``Annotations`` and ``"$value"`` holds the wrapped object
    serialized as type ``t``. Input that has an ``"$ann"`` key is turned back
    into an ``AugmentedProxy``.
    """

    @ovld(priority=1)
    def serialize(self, t: Any, obj: AugmentedProxy, ctx: Context):
        """Serialize the annotations, then the wrapped value as type ``t``."""
        annotations = recurse(Annotations, obj._self_ann, ctx)
        payload = recurse(t, obj.__wrapped__, ctx)
        return {"$ann": annotations, "$value": payload}

    @ovld(priority=1)
    def deserialize(self, t: Any, obj: HasKey["$ann"], ctx: Context):
        """Rebuild a proxy from a mapping produced by ``serialize``."""
        payload = recurse(t, obj["$value"], ctx)
        annotations = recurse(Annotations, obj["$ann"], ctx)
        return AugmentedProxy(payload, annotations)
48+
49+
50+
def qual(x, q):
    """Return ``x`` wrapped in an ``AugmentedProxy`` annotated with quality ``q``.

    If ``x`` is already an ``AugmentedProxy`` it is unwrapped first, so the
    new annotation replaces the old one instead of nesting proxies.
    """
    value = x.__wrapped__ if isinstance(x, AugmentedProxy) else x
    return AugmentedProxy(value, Annotations(quality=q))
54+
55+
56+
@ovld
def merge(x: object, y: object):
    """Merge ``x`` and ``y``, starting both sides at quality ``0.0``."""
    default_quality = 0.0
    return recurse(x, y, default_quality, default_quality)
59+
60+
61+
@ovld(priority=10)
def merge(x: object, y: object, qx: Number, qy: Number):
    """Discard a side whose quality is ``-10`` or lower.

    This overload has the highest priority, so it is consulted before the
    type-specific overloads. If ``qx <= -10`` the result is ``y``; otherwise
    if ``qy <= -10`` the result is ``x``; otherwise the call is forwarded to
    the next matching overload.
    """
    if qx <= -10:
        return y
    if qy <= -10:
        return x
    return call_next(x, y, qx, qy)
69+
70+
71+
@ovld
def merge(x: object, y: object, qx: Number, qy: Number):
    """Fallback: keep ``x`` only if its quality is strictly higher.

    On a tie (``qx == qy``) the result is ``y``.
    """
    if qx > qy:
        return x
    return y
74+
75+
76+
@ovld(priority=1)
def merge(x: AugmentedProxy, y: object, qx: Number, qy: Number):
    """Merge when ``x`` is annotated: its own quality overrides ``qx``.

    The wrapped value of ``x`` is merged with ``y`` and the result is
    re-annotated with the higher of the two qualities.

    If ``y`` is also an ``AugmentedProxy`` it is unwrapped here as well, so
    that ``max(qx, qy)`` below sees ``y``'s own quality. Previously the final
    annotation was computed from the inherited ``qy``, so merging a quality-3
    value with a quality-4 value produced a result annotated with 3.
    """
    qx = x._.quality
    if isinstance(y, AugmentedProxy):
        qy = y._.quality
        y = y.__wrapped__
    return qual(recurse(x.__wrapped__, y, qx, qy), max(qx, qy))
80+
81+
82+
@ovld
def merge(x: object, y: AugmentedProxy, qx: Number, qy: Number):
    """Merge when ``y`` is annotated: its own quality overrides ``qy``.

    The wrapped value of ``y`` is merged with ``x`` and the result is
    re-annotated with ``max(qx, qy)``.
    """
    qy = y._.quality
    merged = recurse(x, y.__wrapped__, qx, qy)
    best_quality = max(qx, qy)
    return qual(merged, best_quality)
86+
87+
88+
@ovld
def merge(x: dict, y: dict, qx: Number, qy: Number):
    """Merge two dicts key by key.

    The dict with the higher quality (``x`` on a tie) is the "main" one. The
    result starts as a copy of the other dict; every key of the main dict is
    then written into it, merging recursively when both dicts have the key.
    """
    if qx >= qy:
        main, other = x, y
    else:
        # Swap the qualities together with the dicts so they stay paired.
        main, other, qx, qy = y, x, qy, qx
    results = dict(other)
    for k, v in main.items():
        results[k] = recurse(v, other[k], qx, qy) if k in other else v
    return results
98+
99+
100+
@ovld
def merge(x: Dataclass, y: Dataclass, qx: Number, qy: Number):
    """Merge two dataclass instances field by field.

    The instances' attribute dicts are merged with the dict overload and the
    result is used as keyword arguments to build a new ``type(x)``.
    """
    merged_fields = recurse(vars(x), vars(y), qx, qy)
    cls = type(x)
    return cls(**merged_fields)
103+
104+
105+
@ovld
def merge(x: list, y: list, qx: Number, qy: Number):
    """Merge two lists, dispatching on the type of their elements.

    The list with the higher quality (``x`` on a tie) becomes ``main``. If
    either list is empty the other one is returned unchanged. Otherwise the
    call is forwarded to the five-argument overloads, passing the type of the
    first (unwrapped) element of ``main`` as the element type.
    """
    main, other, qx, qy = (x, y, qx, qy) if qx >= qy else (y, x, qy, qx)
    if not main:
        return other
    elif not other:
        return main
    first = main[0]
    if isinstance(first, AugmentedProxy):
        # type() of a proxy is AugmentedProxy; dispatch needs the real type.
        first = first.__wrapped__
    # qx/qy were reordered together with main/other, so forward the reordered
    # lists; forwarding the original x/y would pair each list with the other
    # list's quality whenever qy > qx.
    return recurse(main, other, qx, qy, type(first))
116+
117+
118+
@ovld
def merge(
    x: list, y: list, qx: Number, qy: Number, et: type[PaperAuthor] | type[Institution]
):
    """Merge lists of authors or institutions by fuzzy matching.

    Each element of ``x`` is paired by ``associate`` with an element of ``y``
    whose ``similarity`` exceeds 0.5. Paired elements are merged recursively;
    elements of ``x`` without a partner are kept as they are. The result has
    one entry per element of ``x``.
    """
    merged_items = []
    for left, right in associate(x, y, key=similarity, threshold=0.5):
        if right is None:
            merged_items.append(left)
        else:
            merged_items.append(recurse(left, right, qx, qy))
    return merged_items
128+
129+
130+
@ovld
def merge(x: list, y: list, qx: Number, qy: Number, et: type[object]):
    """Generic list merge: ``x`` followed by the elements of ``y`` not in ``x``.

    Membership is tested with ``in`` against the original ``x``, so duplicates
    that occur only within ``y`` are all kept.
    """
    extras = [item for item in y if item not in x]
    return x + extras
133+
134+
135+
@ovld(priority=1)
def similarity(a: AugmentedProxy, b: object):
    """Compare the value wrapped by ``a`` with ``b``, ignoring the annotation."""
    unwrapped = a.__wrapped__
    return recurse(unwrapped, b)
138+
139+
140+
@ovld
def similarity(a: object, b: AugmentedProxy):
    """Compare ``a`` with the value wrapped by ``b``, ignoring the annotation."""
    unwrapped = b.__wrapped__
    return recurse(a, unwrapped)
143+
144+
145+
@ovld
def similarity(a: PaperAuthor, b: PaperAuthor):
    """Score two paper authors by the similarity of their ``display_name``."""
    name_a, name_b = a.display_name, b.display_name
    return similarity(name_a, name_b)
148+
149+
150+
@ovld
def similarity(a: Institution, b: Institution):
    """Score two institutions by the similarity of their ``name``."""
    name_a, name_b = a.name, b.name
    return similarity(name_a, name_b)
153+
154+
155+
@ovld
def similarity(a: str, b: str):
    """Return the ``SequenceMatcher`` ratio of the two strings.

    Both strings are passed through ``plainify`` before being compared.
    """
    matcher = SequenceMatcher(a=plainify(a), b=plainify(b))
    return matcher.ratio()

src/paperoni/utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import itertools
2+
import re
13
import unicodedata
24

5+
from unidecode import unidecode
6+
37
link_generators = {
48
"arxiv": {
59
"abstract": "https://arxiv.org/abs/{}",
@@ -111,3 +115,29 @@ def mostly_latin(s: str, threshold: float = 0.9) -> bool:
111115
if "LATIN" in unicodedata.name(base, ""):
112116
good += 1
113117
return good / total >= threshold
118+
119+
120+
def plainify(name):
    """Normalize ``name`` for fuzzy comparison.

    The name is run through ``unidecode`` and lowercased, parentheses and
    hyphens are replaced by spaces, and apostrophes and periods are removed.
    """
    lowered = unidecode(name).lower()
    spaced = re.sub("[()-]", " ", lowered)
    return re.sub("['.]", "", spaced)
125+
126+
127+
def associate(l1, l2, key, threshold=0):
    """Greedily pair each element of ``l1`` with at most one element of ``l2``.

    Every ``(x1, x2)`` combination is scored with ``key(x1, x2)``; only
    scores strictly greater than ``threshold`` are candidates. Candidates
    are taken from the highest score down, and a pair is accepted only if
    neither of its elements has been paired yet. An element of ``l2`` can
    therefore no longer be assigned to several elements of ``l1``.

    Args:
        l1: Sequence whose elements each get one entry in the result.
        l2: Sequence of candidate partners (indexed by position).
        key: Callable ``key(x1, x2)`` returning a comparable score.
        threshold: Scores must be strictly above this value to count.

    Returns:
        A list of ``(x1, partner)`` tuples in the order of ``l1``, where
        ``partner`` is the matched element of ``l2`` or ``None``.
    """
    candidates = [
        (score, i1, i2)
        for (i1, x1), (i2, x2) in itertools.product(enumerate(l1), enumerate(l2))
        if (score := key(x1, x2)) > threshold
    ]
    # The sort is stable even with reverse=True, so equal scores keep their
    # original (l1-major) order.
    candidates.sort(key=lambda tup: tup[0], reverse=True)

    mapping = {}
    taken = set()  # indices of l2 that already have a partner
    limit = min(len(l1), len(l2))  # no more pairs than this can exist
    for _score, i1, i2 in candidates:
        if i1 in mapping or i2 in taken:
            continue
        mapping[i1] = l2[i2]
        taken.add(i2)
        if len(mapping) == limit:
            break
    return [(x1, mapping.get(i1)) for i1, x1 in enumerate(l1)]

tests/model/test_merge.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
from dataclasses import dataclass
2+
3+
from serieux import deserialize, serialize
4+
5+
from paperoni.model.classes import (
6+
Author,
7+
Institution,
8+
InstitutionCategory,
9+
Link,
10+
PaperAuthor,
11+
)
12+
from paperoni.model.merge import merge, qual, similarity
13+
14+
15+
@dataclass
class Point:
    """Two-field dataclass used as a serialization fixture."""

    x: int
    y: int
19+
20+
21+
@dataclass
class Person:
    """Two-field dataclass used as a merge fixture."""

    name: str
    job: str
25+
26+
27+
def test_similarity():
    """Equal strings, and strings differing only in punctuation, score 1."""
    identical = similarity("bonjour", "bonjour")
    assert identical == 1
    punctuated = similarity("bon jour", "bon-jour.")
    assert punctuated == 1
30+
31+
32+
def test_similarity_2():
    """Two unrelated names score below the 0.5 matching threshold."""
    score = similarity("Hugo Larochelle", "Marc Bellemare")
    assert score < 0.5
34+
35+
36+
def test_augment():
    """An annotated field survives a serialize/deserialize round trip."""
    point = Point(qual(3, 2.5), 4)
    serialized = serialize(Point, point)
    expected = {"x": {"$ann": {"quality": 2.5}, "$value": 3}, "y": 4}
    assert serialized == expected
    restored = deserialize(Point, serialized)
    assert restored.x._.quality == 2.5
42+
43+
44+
def test_merge_dicts():
    """Disjoint dicts are unioned; a side with quality -10 is discarded."""
    d1 = {"a": 1, "b": 2}
    d2 = {"c": 3}

    union = merge(d1, d2)
    assert union == {"a": 1, "b": 2, "c": 3}

    right_discarded = merge(d1, qual(d2, -10))
    assert right_discarded == {"a": 1, "b": 2}

    left_discarded = merge(qual(d1, -10), d2)
    assert left_discarded == {"c": 3}
51+
52+
53+
def test_merge():
    """Field-wise merging keeps, per field, the value with the higher quality."""
    john = Person(name=qual("John", 2), job="Carpenter")
    johnny = Person(name=qual("Johnny", 1), job=qual("Lawyer", 3))
    first_merge = merge(john, johnny)
    assert first_merge == Person(name="John", job="Lawyer")

    philosopher = Person(name=qual("NO!", 1.5), job=qual("Philosopher", 4))
    second_merge = merge(first_merge, philosopher)
    assert second_merge == Person(name="John", job="Philosopher")

    # Here the quality annotates the whole object rather than single fields.
    gunther = qual(Person(name="Gunther", job="Unemployed"), 3)
    third_merge = merge(philosopher, gunther)
    assert third_merge == Person(name="Gunther", job="Philosopher")
66+
67+
68+
def test_merge_lists():
    """Generic lists are concatenated, skipping elements already in the first."""
    carpenter = Person(name="John", job="Carpenter")
    lawyer = Person(name="John", job=qual("Lawyer", 3))
    gunther = qual(Person(name="Gunther", job="Unemployed"), 3)

    left = [gunther, carpenter]
    right = [lawyer, carpenter]

    expected = [gunther, carpenter, lawyer]
    assert merge(left, right) == expected
77+
78+
79+
def test_merge_lists_empty():
    """Merging with an empty list returns the non-empty one, on either side."""
    filled = [1, 2]
    empty = []
    assert merge(filled, empty) == filled
    assert merge(empty, filled) == filled
84+
85+
86+
def test_merge_author_lists():
    """Authors with the same display name are merged and their links combined."""
    baker = PaperAuthor(
        display_name="John",
        author=Author(name="John", links=[Link(type="job", link="baker")]),
    )
    hairy = PaperAuthor(
        display_name="John",
        author=Author(name="John", links=[Link(type="hair", link="yes")]),
    )
    gunther = qual(
        PaperAuthor(display_name="Gunther", author=Author(name="Gunther", links=[])), 3
    )

    left = [gunther, baker]
    right = [hairy]

    expected = [
        PaperAuthor(display_name="Gunther", author=Author(name="Gunther", links=[])),
        PaperAuthor(
            display_name="John",
            author=Author(
                name="John",
                links=[Link(type="job", link="baker"), Link(type="hair", link="yes")],
            ),
        ),
    ]
    assert merge(left, right) == expected
112+
113+
114+
def test_merge_author_lists_similarity():
    """Similar but different names match; the higher-quality entry's values win."""
    abbreviated = PaperAuthor(
        display_name="J. Smith",
        author=Author(name="J.", links=[Link(type="job", link="baker")]),
    )
    full_name = qual(
        PaperAuthor(
            display_name="John Smith",
            author=Author(name="John Smith", links=[Link(type="hair", link="yes")]),
        ),
        2,
    )
    gunther = qual(
        PaperAuthor(display_name="Gunther", author=Author(name="Gunther", links=[])), 3
    )

    left = [gunther, abbreviated]
    right = [full_name]

    expected = [
        PaperAuthor(display_name="Gunther", author=Author(name="Gunther", links=[])),
        PaperAuthor(
            display_name="John Smith",
            author=Author(
                name="John Smith",
                links=[Link(type="hair", link="yes"), Link(type="job", link="baker")],
            ),
        ),
    ]
    assert merge(left, right) == expected
143+
144+
145+
def test_merge_institution_lists():
    """Institutions with the same name merge; the annotated category is kept."""
    plain_mit = Institution(name="MIT")
    academic_mit = qual(
        Institution(name="MIT", category=InstitutionCategory.academia), 2
    )
    stanford = qual(Institution(name="Stanford University"), 3)

    left = [stanford, plain_mit]
    right = [academic_mit]

    merged = merge(left, right)
    expected = [
        Institution(name="Stanford University"),
        Institution(name="MIT", category=InstitutionCategory.academia),
    ]
    assert merged == expected

0 commit comments

Comments
 (0)