Skip to content

Commit 960cc14

Browse files
committed
add metadata.yaml plus some fixes
1 parent 6b1302b commit 960cc14

15 files changed

Lines changed: 2945 additions & 172 deletions

File tree

.editorconfig

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
# https://editorconfig.org/
2+
3+
root = true
4+
5+
[*]
6+
charset = utf-8
7+
end_of_line = lf
8+
insert_final_newline = true
9+
10+
[*.py]
11+
indent_style = space
12+
indent_size = 4
13+
max_line_length = 120
14+
trim_trailing_whitespace = true
15+
16+
[*.yaml]
17+
indent_style = space
18+
indent_size = 2
19+
20+
[*.html]
21+
indent_style = space
22+
indent_size = 2

.gitignore

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
# Python-generated files
2+
__pycache__/
3+
*.py[oc]
4+
build/
5+
dist/
6+
wheels/
7+
*.egg-info
8+
9+
# Virtual environments
10+
.venv
11+
12+
# Test files
13+
.coverage
14+
coverage.lcov
15+
coverage.xml
16+
17+
## Example files
18+
/examples/**/export
19+
/examples/**/logs
20+
/examples/**/.snakemake
21+
/examples/**/sparv-workdir

examples/metadata/config.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
metadata:
2+
id: metadata-generation
3+
language: swe
4+
5+
export:
6+
annotations:
7+
- text

examples/valmanifest/config.yaml

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,22 @@ export:
1414
default:
1515
- xml_export:pretty
1616
- sbx_superlim:predictions
17+
# - sbx_superlim:swenli_parallel
18+
# - sbx_superlim:swepar_parallel
1719
annotations:
1820
- <token>
1921
- <sentence>
2022
- <sentence>:sbx_superlim.migration_stance
2123
- <sentence>:sbx_superlim.nuclear_stance
24+
- <sentence>:sbx_superlim.dalaj-ged.label
25+
- <sentence>:sbx_superlim.dalaj-ged.certainty
26+
- <sentence>:sbx_superlim.swenli.label
27+
- <sentence>:sbx_superlim.swenli.certainty
2228
sbx_superlim:
2329
hf_model_path:
24-
absabank-imm: 'sbx/KB-bert-base-swedish-cased_absabank-imm'
25-
argumentation: 'sbx/KB-bert-base-swedish-argumentation_sent'
30+
absabank-imm: "sbx/KB-bert-base-swedish-cased_absabank-imm"
31+
argumentation: "sbx/KB-bert-base-swedish-cased_argumentation_sent"
32+
dalaj-ged: "sbx/bert-base-swedish-cased_dalaj-ged"
33+
swenli: "sbx/bert-base-swedish-cased_swenli"
2634
hf_inference_args:
2735
batch_size: 32

pyproject.toml

Lines changed: 26 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,34 @@
1-
[build-system]
2-
requires = ["hatchling"]
3-
build-backend = "hatchling.build"
41
[project]
52
name = "sparv-sbx-superlim"
63
version = "0.1.0"
74
description = "A Sparv plugin for classifying text using the Superlim baseline models "
85
readme = "README.md"
9-
requires-python = ">=3.9"
10-
license.text = "The GNU General Public License v3.0"
11-
authors = [
12-
{ name = "Felix Morger", email = "felix.morger@gu.se" }
13-
]
14-
dependencies = [
15-
"sparv-pipeline~=5.0",
16-
"transformers~=4.41.2",
17-
"datasets~=2.20.0"
6+
requires-python = ">=3.11,<3.14"
7+
license = "GPL-3.0-only"
8+
authors = [
9+
{ name = "Felix Morger", email = "felix.morger@gu.se" },
10+
{ name = "Språkbanken Text", email = "sbx-info@svenska.gu.se" }
1811
]
1912
entry-points."sparv.plugin" = { sbx_superlim = "sbx_superlim" }
13+
dependencies = [
14+
"datasets>=2.20.0,<2.21",
15+
"setuptools>=80,<81",
16+
"sparv>=5.3.1",
17+
"transformers>=4.41.2,<4.42",
18+
]
19+
20+
[dependency-groups]
21+
metadata = [
22+
"sparv-sbx-metadata",
23+
]
24+
25+
[build-system]
26+
requires = ["uv_build>=0.10,<0.11"]
27+
build-backend = "uv_build"
28+
29+
[tool.uv.build-backend]
30+
module-name = "sbx_superlim"
31+
module-root = ""
2032

21-
[tool.hatch]
22-
build.include = ["/sbx_superlim"]
33+
[tool.uv.sources]
34+
sparv-sbx-metadata = { git = "https://github.com/spraakbanken/sparv-sbx-metadata", tag = "v2.3.1" }

sbx_superlim/__init__.py

Lines changed: 48 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,57 @@
11
"""A Sparv plugin for classifying text using the Superlim baseline models."""
2+
23
from sparv.api import Config
3-
from .annotators import (
4-
absabank_imm,
5-
argumentation_sent,
6-
dalaj_ged,
7-
swenli
8-
)
4+
95
from . import exporters
6+
from .annotators import absabank_imm, argumentation_sent, dalaj_ged, swenli
7+
8+
__all__ = ["absabank_imm", "argumentation_sent", "dalaj_ged", "swenli", "exporters"]
109

1110
__config__ = [
1211
# TODO: Split this variable into many different variables according to superlim task
1312
# TODO: Make it configurable from config.yaml in corpus directory
14-
Config("sbx_superlim.hf_model_path.argumentation", "sbx/bert-base-swedish-cased_argumentation_sent", description="HuggingFace model fine-tuned on argumentation_sent"),
15-
Config("sbx_superlim.hf_model_path.absabank-imm", "sbx/bert-base-swedish-cased_absabank-imm", description="HuggingFace model fine-tuned on absabank-imm"),
16-
Config("sbx_superlim.hf_model_path.dalaj-ged", "sbx/bert-base-swedish-cased_dalaj-ged", description="HuggingFace model fine-tuned on dalaj-ged"),
17-
Config("sbx_superlim.hf_model_path.swenli", "sbx/bert-base-swedish-cased_swenli", description="HuggingFace model fine-tuned on swenli"),
18-
Config("sbx_superlim.hf_model_path.swepar", "sbx/bert-base-swedish-cased_swepar", description="HuggingFace model fine-tuned on swepar"),
19-
Config("sbx_superlim.hf_inference_args.batch_size", None, description="Batch size for inference. Required with large files and limited CPU/GPU memory."),
20-
Config("sbx_superlim.predictions.contains_words", [], description="Batch size for inference. Required with large files and limited CPU/GPU memory."),
21-
Config("sbx_superlim.predictions.source_files_spans", {}, description="Spans of the source files to analyze.")
13+
Config(
14+
"sbx_superlim.hf_model_path.argumentation",
15+
"sbx/bert-base-swedish-cased_argumentation_sent",
16+
description="HuggingFace model fine-tuned on argumentation_sent",
17+
),
18+
Config(
19+
"sbx_superlim.hf_model_path.absabank-imm",
20+
"sbx/bert-base-swedish-cased_absabank-imm",
21+
description="HuggingFace model fine-tuned on absabank-imm",
22+
),
23+
Config(
24+
"sbx_superlim.hf_model_path.dalaj-ged",
25+
"sbx/bert-base-swedish-cased_dalaj-ged",
26+
description="HuggingFace model fine-tuned on dalaj-ged",
27+
),
28+
Config(
29+
"sbx_superlim.hf_model_path.swenli",
30+
"sbx/bert-base-swedish-cased_swenli",
31+
description="HuggingFace model fine-tuned on swenli",
32+
),
33+
Config(
34+
"sbx_superlim.hf_model_path.swepar",
35+
"sbx/bert-base-swedish-cased_swepar",
36+
description="HuggingFace model fine-tuned on swepar",
37+
),
38+
Config(
39+
"sbx_superlim.hf_inference_args.batch_size",
40+
None,
41+
description="Batch size for inference. Required with large files and limited CPU/GPU memory.",
42+
),
43+
Config(
44+
"sbx_superlim.predictions.contains_words",
45+
[],
46+
description="Batch size for inference. Required with large files and limited CPU/GPU memory.",
47+
),
48+
Config(
49+
"sbx_superlim.predictions.source_files_spans",
50+
{},
51+
description="Spans of the source files to analyze.",
52+
),
2253
]
2354

24-
__description__ = "A Sparv plugin for classifying text using the Superlim baseline models."
55+
__description__ = (
56+
"A Sparv plugin for classifying text using the Superlim baseline models."
57+
)
Lines changed: 11 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,9 @@
1-
from sparv.api import (
2-
Annotation,
3-
Config,
4-
Output,
5-
Text,
6-
annotator
7-
)
8-
1+
from sparv.api import Annotation, Config, Output, Text, annotator
92
from transformers import (
10-
AutoTokenizer,
113
AutoModelForSequenceClassification,
12-
TextClassificationPipeline
13-
)
4+
AutoTokenizer,
5+
TextClassificationPipeline,
6+
)
147

158
from ..common import prepare_inputs
169

@@ -21,13 +14,16 @@ def postprocess(self, model_outputs):
2114
return best_class
2215

2316

24-
@annotator("Label the sentiment towards immigration on a continuous 1--5 scale", language="swe")
17+
@annotator(
18+
"Label the sentiment towards immigration on a continuous 1--5 scale",
19+
language=["swe"],
20+
)
2521
def migration_stance(
26-
text : Text = Text(),
22+
text: Text = Text(),
2723
sentence: Annotation = Annotation("<sentence>"),
2824
out_score: Output = Output("<sentence>:sbx_superlim.migration_stance"),
2925
hf_model_path: str = Config("sbx_superlim.hf_model_path.absabank-imm"),
30-
hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size")
26+
hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size"),
3127
):
3228
inputs = prepare_inputs(text, sentence)
3329
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
@@ -36,4 +32,4 @@ def migration_stance(
3632
pipe = ABSAbankPipeline(model=model, tokenizer=tokenizer)
3733
model_outputs = pipe(inputs, batch_size=hf_batch_size)
3834
scores = [str(float(o)) for o in model_outputs]
39-
out_score.write(scores)
35+
out_score.write(scores)
Lines changed: 31 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,54 +1,58 @@
11
"""Annotations for a model trained on argumentation_sentences."""
22

3-
43
from datasets import get_dataset_config_info
5-
from sparv.api import (
6-
Annotation,
7-
Config,
8-
Output,
9-
Text,
10-
annotator
11-
)
12-
4+
from sparv.api import Annotation, Config, Output, Text, annotator, get_logger
135
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
146

157
from ..common import prepare_inputs
168
from ..helpers import get_label_mapper
179

18-
1910
TOPIC_EN_SV = {
20-
'abortion': 'abort',
21-
'cloning': 'kloning',
22-
'death_penalty': 'dödsstraff',
23-
'marijuana_legalization': 'marijuanalegalisering',
24-
'minimum_wage': 'minimilön',
25-
'nuclear': 'kärnkraft'
11+
"abortion": "abort",
12+
"cloning": "kloning",
13+
"death_penalty": "dödsstraff",
14+
"marijuana_legalization": "marijuanalegalisering",
15+
"minimum_wage": "minimilön",
16+
"nuclear": "kärnkraft",
2617
}
2718

19+
logger = get_logger(__name__)
20+
21+
2822
def create_argsent_annotator(topic: str):
2923
if topic not in TOPIC_EN_SV:
3024
raise ValueError(f"{topic} is not a valid topic")
31-
@annotator(f"Identify the stance towards {topic}", topic)
25+
26+
@annotator(f"Identify the stance towards {topic}", topic, language=["swe"])
3227
def argsent_func(
3328
out_stance: Output = Output(f"<sentence>:sbx_superlim.{topic}_stance"),
34-
out_stance_certainty: Output = Output(f"<sentence>:sbx_superlim.{topic}_stance.certainty"),
29+
out_stance_certainty: Output = Output(
30+
f"<sentence>:sbx_superlim.{topic}_stance.certainty"
31+
),
3532
sentence: Annotation = Annotation("<sentence>"),
36-
text = Text(),
37-
hf_model_path = Config("sbx_superlim.hf_model_path.argumentation"),
38-
hf_batch_size = Config("sbx_superlim.hf_inference_args.batch_size")
33+
text=Text(),
34+
hf_model_path=Config("sbx_superlim.hf_model_path.argumentation"),
35+
hf_batch_size=Config("sbx_superlim.hf_inference_args.batch_size"),
3936
):
40-
ds_config = get_dataset_config_info('sbx/superlim-2', 'argumentation_sent')
37+
ds_config = get_dataset_config_info("sbx/superlim-2", "argumentation-sentences")
38+
logger.debug("ds_config=%s", ds_config)
4139
model = AutoModelForSequenceClassification.from_pretrained(hf_model_path)
4240
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
43-
sep_token = tokenizer.special_tokens_map['sep_token']
41+
sep_token = tokenizer.special_tokens_map["sep_token"]
4442
topic_sv = TOPIC_EN_SV[topic]
4543
inputs = prepare_inputs(text, sentence, f" {sep_token} {topic_sv}")
46-
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, batch_size=hf_batch_size)
44+
pipe = pipeline(
45+
"text-classification",
46+
model=model,
47+
tokenizer=tokenizer,
48+
batch_size=hf_batch_size,
49+
)
4750
output = pipe(inputs)
4851
label_mapper = get_label_mapper(ds_config, pipe.model.config)
49-
labels = [label_mapper[o['label']] for o in output]
52+
labels = [label_mapper[o["label"]] for o in output]
5053
out_stance.write([l for l in labels])
51-
out_stance_certainty.write([str(o['score']) for o in output])
54+
out_stance_certainty.write([str(o["score"]) for o in output])
55+
5256

5357
for t in TOPIC_EN_SV:
54-
create_argsent_annotator(t)
58+
create_argsent_annotator(t)
Lines changed: 10 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,35 +1,25 @@
11
from datasets import get_dataset_config_info
2-
from sparv.api import (
3-
Annotation,
4-
Config,
5-
Output,
6-
Text,
7-
annotator
8-
)
9-
10-
from transformers import pipeline
2+
from sparv.api import Annotation, Config, Output, Text, annotator
3+
from transformers import pipeline
114

125
from ..common import prepare_inputs
136
from ..helpers import get_label_mapper
147

158

16-
@annotator(
17-
"Determine whether a sentence is correct Swedish or not",
18-
language="swe"
19-
)
9+
@annotator("Determine whether a sentence is correct Swedish or not", language=["swe"])
2010
def dalaj_ged(
21-
text : Text = Text(),
11+
text: Text = Text(),
2212
sentence: Annotation = Annotation("<sentence>"),
23-
out_label : Output = Output("<sentence>:sbx_superlim.dalaj-ged.label"),
13+
out_label: Output = Output("<sentence>:sbx_superlim.dalaj-ged.label"),
2414
out_certainty: Output = Output("<sentence>:sbx_superlim.dalaj-ged.certainty"),
2515
hf_model_path: str = Config("sbx_superlim.hf_model_path.dalaj-ged"),
26-
hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size")
16+
hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size"),
2717
):
28-
ds_config = get_dataset_config_info('sbx/superlim-2', 'dalaj-ged')
18+
ds_config = get_dataset_config_info("sbx/superlim-2", "dalaj-ged-superlim")
2919
inputs = prepare_inputs(text, sentence)
3020
pipe = pipeline("text-classification", model=hf_model_path)
31-
output = pipe(inputs, batch_size = hf_batch_size)
21+
output = pipe(inputs, batch_size=hf_batch_size)
3222
label_mapper = get_label_mapper(ds_config, pipe.model.config)
33-
labels = [label_mapper[o['label']] for o in output]
23+
labels = [label_mapper[o["label"]] for o in output]
3424
out_label.write(labels)
35-
out_certainty.write([str(o['score']) for o in output])
25+
out_certainty.write([str(o["score"]) for o in output])

0 commit comments

Comments
 (0)