-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge.py
More file actions
95 lines (76 loc) · 2.33 KB
/
merge.py
File metadata and controls
95 lines (76 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from model import model
from spliting import split
def make_segments(text):
    """Split *text* into pieces and score each piece with the classifier.

    Each piece produced by ``split(text)`` is passed to ``model`` with
    ``top_k=None``; the predictions it returns are folded into a mapping
    from each prediction's ``'label'`` to its ``'score'``.

    Returns:
        A list of dicts, one per piece, with the keys ``'text'`` (the piece
        itself) and ``'scores'`` (the label -> score mapping).
    """
    scored_segments = []
    for piece in split(text):
        label_scores = {}
        for prediction in model(piece, top_k=None):
            label = prediction['label']
            label_scores[label] = prediction['score']
        scored_segments.append({'text': piece, 'scores': label_scores})
    return scored_segments
def merge_segments(segment1, segment2):
    """Join two segments' texts and re-score the combined text.

    The texts are concatenated with a single space between them and the
    result is passed to ``model`` with ``top_k=None``. The scores of the
    input segments are not reused.

    Returns:
        A new dict with the keys ``'text'`` (the joined text) and
        ``'scores'`` (a label -> score mapping for the joined text).
    """
    combined_text = ' '.join((segment1['text'], segment2['text']))
    label_scores = {}
    for prediction in model(combined_text, top_k=None):
        label = prediction['label']
        label_scores[label] = prediction['score']
    return {'text': combined_text, 'scores': label_scores}
def merge_by_sim(segments, bottom_threshold, top_threshold):
    """Merge adjacent segments in place when merging raises confidence.

    Segments ``i`` and ``i + 1`` are replaced by their merged segment when
    at least one of them has a top score below *bottom_threshold* and the
    merged segment's top score is above *top_threshold*. After a merge the
    same index is examined again, so the merged segment can go on to absorb
    the next neighbour.

    Args:
        segments: List of ``{'text': ..., 'scores': {label: score}}`` dicts.
            Modified in place; may be empty.
        bottom_threshold: A pair is only a merge candidate if one of its
            segments has a top score strictly below this value.
        top_threshold: The merged segment's top score must be strictly
            above this value for the merge to be kept.

    Returns:
        None. The result is the mutated *segments* list.
    """
    i = 0
    # `<` instead of `!=`: with an empty list `len(segments) - 1` is -1, so
    # `i != -1` would enter the loop and fail on `segments[0]`.
    while i < len(segments) - 1:
        conf_1 = max(segments[i]['scores'].values())
        conf_2 = max(segments[i + 1]['scores'].values())

        # If neither neighbour is below the bottom threshold the pair can
        # never be merged, so skip the merge (and its model call) entirely.
        if not (conf_1 < bottom_threshold or conf_2 < bottom_threshold):
            i += 1
            continue

        try:
            merged = merge_segments(segments[i], segments[i + 1])
        except RuntimeError:
            # NOTE(review): presumably raised by the model when it cannot
            # process the combined text (e.g. too long) - confirm. The pair
            # is left unmerged.
            i += 1
            continue

        if max(merged['scores'].values()) > top_threshold:
            segments[i] = merged
            del segments[i + 1]
            # Do not advance: re-check the merged segment against its new
            # right-hand neighbour.
        else:
            i += 1
def merge_by_same(segments):
    """Merge adjacent segments in place when their top labels agree.

    Segments ``i`` and ``i + 1`` are replaced by their merged segment when
    the highest-scoring label of each is the same. After a merge the same
    index is examined again, so runs of equally-labelled segments collapse
    into one.

    Args:
        segments: List of ``{'text': ..., 'scores': {label: score}}`` dicts.
            Modified in place; may be empty.

    Returns:
        None. The result is the mutated *segments* list.
    """
    i = 0
    # `<` instead of `!=`: with an empty list `len(segments) - 1` is -1, so
    # `i != -1` would enter the loop and fail on `segments[0]`.
    while i < len(segments) - 1:
        left_scores = segments[i]['scores']
        right_scores = segments[i + 1]['scores']
        same_label = (
            max(left_scores, key=left_scores.get) ==
            max(right_scores, key=right_scores.get)
        )

        # Compare labels before merging: merging re-runs the model, which is
        # wasted work when the pair is not going to be merged anyway.
        if not same_label:
            i += 1
            continue

        try:
            merged = merge_segments(segments[i], segments[i + 1])
        except RuntimeError:
            # NOTE(review): presumably raised by the model when it cannot
            # process the combined text (e.g. too long) - confirm. The pair
            # is left unmerged.
            i += 1
            continue

        segments[i] = merged
        # Do not advance: re-check the merged segment against its new
        # right-hand neighbour.
        segments.pop(i + 1)
def set_low_as(segments, as_, threshold):
    """Force a fallback label onto segments whose best score is too low.

    For every segment whose highest score is strictly below *threshold*,
    the score of label *as_* is set to ``threshold + 0.1`` in that
    segment's ``'scores'`` mapping, which makes *as_* the highest-scoring
    label of the segment. Segments are modified in place; nothing is
    returned.
    """
    for segment in segments:
        label_scores = segment['scores']
        best_score = max(label_scores.values())
        if best_score < threshold:
            label_scores[as_] = threshold + 0.1
def segmentize_and_merge(text):
    """Segment *text*, merge neighbouring segments and label the result.

    Steps, in order:
      1. ``make_segments`` builds the scored segments.
      2. ``merge_by_sim`` is applied with a bottom threshold of 0.93 and a
         top threshold of 0.96.
      3. ``merge_by_same`` joins neighbours that share a top label.
      4. ``set_low_as`` relabels segments whose best score is below 0.56
         as ``'notes'``.

    Returns:
        A list of ``[text, label]`` pairs, one per remaining segment, where
        *label* is the segment's highest-scoring label.
    """
    segments = make_segments(text)
    merge_by_sim(segments, 0.93, 0.96)
    merge_by_same(segments)
    set_low_as(segments, 'notes', 0.56)

    labelled = []
    for item in segments:
        item_text = item['text']
        label_scores = item['scores']
        labelled.append([item_text, max(label_scores, key=label_scores.get)])
    return labelled