# universal_ner.py
# Prepares unitxt catalog cards for the Universal NER (UNER) dataset,
# one card per dataset configuration.
import sys
from unitxt import add_to_catalog
from unitxt.blocks import LoadHF, TaskCard
from unitxt.operators import (
Copy,
GetItemByIndex,
Rename,
Set,
Shuffle,
)
from unitxt.span_lableing_operators import IobExtractor
from unitxt.test_utils.card import test_card
# Dataset configuration names. Each one is passed as ``name`` to the
# Hugging Face loader and yields its own catalog card; every name starts
# with a language code that also appears in the card's language tags.
sub_tasks = [
    "ceb_gja",
    # zh
    "zh_gsd",
    "zh_gsdsimp",
    "zh_pud",
    "hr_set",
    "da_ddt",
    # en
    "en_ewt",
    "en_pud",
    "de_pud",
    # pt
    "pt_bosque",
    "pt_pud",
    "ru_pud",
    "sr_set",
    "sk_snk",
    # sv
    "sv_pud",
    "sv_talbanken",
    # tl
    "tl_trg",
    "tl_ugnayan",
]
# IOB tag names, used as ``items_list`` for GetItemByIndex on the
# ``labels`` field. The position of each entry matters: presumably it must
# match the integer tag ids stored in the dataset (confirm against the
# dataset's feature definition before reordering).
classes = [
    "O",
    # person
    "B-PER",
    "I-PER",
    # organization
    "B-ORG",
    "I-ORG",
    # location
    "B-LOC",
    "I-LOC",
]
# Human-readable entity types. Defined once so that the IOB extractor and
# the ``entity_types`` field written into every instance always agree.
ENTITY_TYPES = ["Person", "Organization", "Location"]

# Build one TaskCard per dataset configuration and register it in the
# catalog under ``cards.universal_ner.<configuration with dots>``.
for sub_task in sub_tasks:
    card = TaskCard(
        loader=LoadHF(
            path="universalner/universal_ner",
            name=sub_task,
            requirements=["conllu"],
        ),
        preprocess_steps=[
            # The dataset is sorted by classes, so shuffle it first.
            # page_size=sys.maxsize presumably lets the shuffle span the
            # whole split rather than a limited window (confirm in unitxt).
            Shuffle(page_size=sys.maxsize),
            Rename(
                field_to_field={"ner_tags": "labels"},
            ),
            # Look up every value of ``labels`` in ``classes``.
            GetItemByIndex(
                field="labels", items_list=classes, process_every_value=True
            ),
            # The three label lists are parallel: entry i of ``labels``
            # goes with entry i of ``begin_labels`` / ``inside_labels``.
            IobExtractor(
                labels=list(ENTITY_TYPES),
                begin_labels=["B-PER", "B-ORG", "B-LOC"],
                inside_labels=["I-PER", "I-ORG", "I-LOC"],
                outside_label="O",
            ),
            # Spread the per-span attributes into flat fields; ``labels``
            # is overwritten with the span labels. Missing source paths
            # are tolerated and default to an empty list.
            Copy(
                field_to_field={
                    "spans/*/start": "spans_starts",
                    "spans/*/end": "spans_ends",
                    "spans/*/label": "labels",
                },
                get_default=[],
                not_exist_ok=True,
            ),
            Set(
                fields={
                    "entity_types": list(ENTITY_TYPES),
                }
            ),
        ],
        task="tasks.span_labeling.extraction",
        templates="templates.span_labeling.extraction.all",
        __tags__={
            "arxiv": "2311.09122",
            "language": [
                "ceb",
                "da",
                "de",
                "en",
                "hr",
                "pt",
                "ru",
                "sk",
                "sr",
                "sv",
                "tl",
                "zh",
            ],
            "license": "cc-by-sa-4.0",
            "region": "us",
            "task_categories": "token-classification",
        },
        __description__=(
            "Universal Named Entity Recognition (UNER) aims to fill a gap in multilingual NLP: high quality NER datasets in many languages with a shared tagset. UNER is modeled after the Universal Dependencies project, in that it is intended to be a large community annotation effort with language-universal guidelines. Further, we use the same text corpora as Universal Dependencies… See the full description on the dataset page: https://huggingface.co/datasets/universalner/universal_ner"
        ),
    )

    # Only the en_ewt card is run through test_card; the other cards are
    # built from the same recipe and are registered without being tested.
    if sub_task == "en_ewt":
        test_card(card)

    # Catalog names use dots where the configuration name uses underscores.
    # A separate local is used so the loop variable is never rebound.
    catalog_suffix = sub_task.replace("_", ".")
    add_to_catalog(card, f"cards.universal_ner.{catalog_suffix}", overwrite=True)