spraakbanken
diff --git a/‎.editorconfig‎
Lines changed: 22 additions & 0 deletions b/‎.editorconfig‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 21 additions & 0 deletions b/‎.gitignore‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎examples/metadata/config.yaml‎
Lines changed: 7 additions & 0 deletions b/‎examples/metadata/config.yaml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎examples/valmanifest/config.yaml‎
Lines changed: 10 additions & 2 deletions b/‎examples/valmanifest/config.yaml‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 26 additions & 14 deletions b/‎pyproject.toml‎
Lines changed: 26 additions & 14 deletions
diff --git a/‎sbx_superlim/__init__.py‎
Lines changed: 48 additions & 15 deletions b/‎sbx_superlim/__init__.py‎
Lines changed: 48 additions & 15 deletions
diff --git a/‎sbx_superlim/annotators/absabank_imm.py‎
Lines changed: 11 additions & 15 deletions b/‎sbx_superlim/annotators/absabank_imm.py‎
Lines changed: 11 additions & 15 deletions
diff --git a/‎sbx_superlim/annotators/argumentation_sent.py‎
Lines changed: 31 additions & 27 deletions b/‎sbx_superlim/annotators/argumentation_sent.py‎
Lines changed: 31 additions & 27 deletions
diff --git a/‎sbx_superlim/annotators/dalaj_ged.py‎
Lines changed: 10 additions & 20 deletions b/‎sbx_superlim/annotators/dalaj_ged.py‎
Lines changed: 10 additions & 20 deletions
@@ -0,0 +1,22 @@
+# https://editorconfig.org/
+
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+
+[*.py]
+indent_style = space
+indent_size = 4
+max_line_length = 120
+trim_trailing_whitespace = true
+
+[*.yaml]
+indent_style = space
+indent_size = 2
+
+[*.html]
+indent_style = space
+indent_size = 2
@@ -0,0 +1,21 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+# Test files
+.coverage
+coverage.lcov
+coverage.xml
+
+## Example files
+/examples/**/export
+/examples/**/logs
+/examples/**/.snakemake
+/examples/**/sparv-workdir
@@ -0,0 +1,7 @@
+metadata:
+  id: metadata-generation
+  language: swe
+
+export:
+  annotations:
+    - text
@@ -14,14 +14,22 @@ export:
   default:
     - xml_export:pretty
     - sbx_superlim:predictions
+    # - sbx_superlim:swenli_parallel
+    # - sbx_superlim:swepar_parallel
   annotations:
     - <token>
     - <sentence>
     - <sentence>:sbx_superlim.migration_stance
     - <sentence>:sbx_superlim.nuclear_stance
+    - <sentence>:sbx_superlim.dalaj-ged.label
+    - <sentence>:sbx_superlim.dalaj-ged.certainty
+    - <sentence>:sbx_superlim.swenli.label
+    - <sentence>:sbx_superlim.swenli.certainty
 sbx_superlim:
   hf_model_path:
-    absabank-imm: 'sbx/KB-bert-base-swedish-cased_absabank-imm'
-    argumentation: 'sbx/KB-bert-base-swedish-argumentation_sent'
+    absabank-imm: "sbx/KB-bert-base-swedish-cased_absabank-imm"
+    argumentation: "sbx/KB-bert-base-swedish-cased_argumentation_sent"
+    dalaj-ged: "sbx/bert-base-swedish-cased_dalaj-ged"
+    swenli: "sbx/bert-base-swedish-cased_swenli"
   hf_inference_args:
     batch_size: 32
@@ -1,22 +1,34 @@
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
 [project]
 name = "sparv-sbx-superlim"
 version = "0.1.0"
-description = "A Sparv plugin for classifying text using the Superlim baseline models "
+description = "A Sparv plugin for classifying text using the Superlim baseline models"
 readme = "README.md"
-requires-python = ">=3.9"
-license.text = "The GNU General Public License v3.0"
- authors = [
-     { name = "Felix Morger", email = "felix.morger@gu.se" }
- ]
-dependencies = [
-    "sparv-pipeline~=5.0",
-    "transformers~=4.41.2",
-    "datasets~=2.20.0"
+requires-python = ">=3.11,<3.14"
+license = "GPL-3.0-only"
+authors = [
+  { name = "Felix Morger", email = "felix.morger@gu.se" },
+  { name = "Språkbanken Text", email = "sbx-info@svenska.gu.se" }
 ]
 entry-points."sparv.plugin" = { sbx_superlim = "sbx_superlim" }
+dependencies = [
+  "datasets>=2.20.0,<2.21",
+  "setuptools>=80,<81",
+  "sparv>=5.3.1",
+  "transformers>=4.41.2,<4.42",
+]
+
+[dependency-groups]
+metadata = [
+  "sparv-sbx-metadata",
+]
+
+[build-system]
+requires = ["uv_build>=0.10,<0.11"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-name = "sbx_superlim"
+module-root = ""
 
-[tool.hatch]
-build.include = ["/sbx_superlim"]
+[tool.uv.sources]
+sparv-sbx-metadata = { git = "https://github.com/spraakbanken/sparv-sbx-metadata", tag = "v2.3.1" }
@@ -1,24 +1,59 @@
 """A Sparv plugin for classifying text using the Superlim baseline models."""
+
 from sparv.api import Config
-from .annotators import (
-    absabank_imm,
-    argumentation_sent,
-    dalaj_ged,
-    swenli
-    )
+
 from . import exporters
+from .annotators import absabank_imm, argumentation_sent, dalaj_ged, swenli
+
+__all__ = ["absabank_imm", "argumentation_sent", "dalaj_ged", "swenli", "exporters"]
 
 __config__ = [
     # TODO: Split this variable into many different variables according to superlim task
     # TODO: Make it configurable from config.yaml in corpus directory
-    Config("sbx_superlim.hf_model_path.argumentation", "sbx/bert-base-swedish-cased_argumentation_sent", description="HuggingFace model fine-tuned on argumentation_sent"),
-    Config("sbx_superlim.hf_model_path.absabank-imm", "sbx/bert-base-swedish-cased_absabank-imm", description="HuggingFace model fine-tuned on absabank-imm"),
-    Config("sbx_superlim.hf_model_path.dalaj-ged", "sbx/bert-base-swedish-cased_dalaj-ged", description="HuggingFace model fine-tuned on dalaj-ged"),
-    Config("sbx_superlim.hf_model_path.swenli", "sbx/bert-base-swedish-cased_swenli", description="HuggingFace model fine-tuned on swenli"),
-    Config("sbx_superlim.hf_model_path.swepar", "sbx/bert-base-swedish-cased_swepar", description="HuggingFace model fine-tuned on swepar"),
-    Config("sbx_superlim.hf_inference_args.batch_size", None, description="Batch size for inference. Required with large files and limited CPU/GPU memory."),
-    Config("sbx_superlim.predictions.contains_words", [], description="Batch size for inference. Required with large files and limited CPU/GPU memory."),
-    Config("sbx_superlim.predictions.source_files_spans", {}, description="Spans of the source files to analyze.")
+    Config(
+        "sbx_superlim.hf_model_path.argumentation",
+        "sbx/bert-base-swedish-cased_argumentation_sent",
+        description="HuggingFace model fine-tuned on argumentation_sent",
+    ),
+    Config(
+        "sbx_superlim.hf_model_path.absabank-imm",
+        "sbx/bert-base-swedish-cased_absabank-imm",
+        description="HuggingFace model fine-tuned on absabank-imm",
+    ),
+    Config(
+        "sbx_superlim.hf_model_path.dalaj-ged",
+        "sbx/bert-base-swedish-cased_dalaj-ged",
+        description="HuggingFace model fine-tuned on dalaj-ged",
+    ),
+    Config(
+        "sbx_superlim.hf_model_path.swenli",
+        "sbx/bert-base-swedish-cased_swenli",
+        description="HuggingFace model fine-tuned on swenli",
+    ),
+    Config(
+        "sbx_superlim.hf_model_path.swepar",
+        "sbx/bert-base-swedish-cased_swepar",
+        description="HuggingFace model fine-tuned on swepar",
+    ),
+    Config(
+        "sbx_superlim.hf_inference_args.batch_size",
+        None,
+        description="Batch size for inference. Required with large files and limited CPU/GPU memory.",
+    ),
+    # NOTE(review): the description below repeats the batch_size text verbatim; it looks copy-pasted
+    # and should presumably describe the contains_words setting -- confirm the intended wording.
+    Config(
+        "sbx_superlim.predictions.contains_words",
+        [],
+        description="Batch size for inference. Required with large files and limited CPU/GPU memory.",
+    ),
+    Config(
+        "sbx_superlim.predictions.source_files_spans",
+        {},
+        description="Spans of the source files to analyze.",
+    ),
 ]
 
-__description__ = "A Sparv plugin for classifying text using the Superlim baseline models."
+__description__ = (
+    "A Sparv plugin for classifying text using the Superlim baseline models."
+)
@@ -1,16 +1,9 @@
-from sparv.api import (
-    Annotation,
-    Config,
-    Output,
-    Text,
-    annotator
-    )
-
+from sparv.api import Annotation, Config, Output, Text, annotator
 from transformers import (
-    AutoTokenizer, 
     AutoModelForSequenceClassification,
-    TextClassificationPipeline
-    )
+    AutoTokenizer,
+    TextClassificationPipeline,
+)
 
 from ..common import prepare_inputs
 
@@ -21,13 +14,16 @@ def postprocess(self, model_outputs):
         return best_class
 
 
-@annotator("Label the sentiment towards immigration on a continuous 1--5 scale", language="swe")
+@annotator(
+    "Label the sentiment towards immigration on a continuous 1--5 scale",
+    language=["swe"],
+)
 def migration_stance(
-    text : Text = Text(),
+    text: Text = Text(),
     sentence: Annotation = Annotation("<sentence>"),
     out_score: Output = Output("<sentence>:sbx_superlim.migration_stance"),
     hf_model_path: str = Config("sbx_superlim.hf_model_path.absabank-imm"),
-    hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size")
+    hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size"),
 ):
     inputs = prepare_inputs(text, sentence)
     tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
@@ -36,4 +32,4 @@ def migration_stance(
     pipe = ABSAbankPipeline(model=model, tokenizer=tokenizer)
     model_outputs = pipe(inputs, batch_size=hf_batch_size)
     scores = [str(float(o)) for o in model_outputs]
-    out_score.write(scores)
+    out_score.write(scores)
@@ -1,54 +1,74 @@
 """Annotations for a model trained on argumentation_sentences."""
 
-
 from datasets import get_dataset_config_info
-from sparv.api import (
-    Annotation,
-    Config,
-    Output,
-    Text,
-    annotator
-    )
-
+from sparv.api import Annotation, Config, Output, Text, annotator, get_logger
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
 from ..common import prepare_inputs
 from ..helpers import get_label_mapper
 
-
 TOPIC_EN_SV = {
-    'abortion': 'abort',
-    'cloning': 'kloning',
-    'death_penalty': 'dödsstraff',
-    'marijuana_legalization': 'marijuanalegalisering',
-    'minimum_wage': 'minimilön',
-    'nuclear': 'kärnkraft'
+    "abortion": "abort",
+    "cloning": "kloning",
+    "death_penalty": "dödsstraff",
+    "marijuana_legalization": "marijuanalegalisering",
+    "minimum_wage": "minimilön",
+    "nuclear": "kärnkraft",
 }
 
+logger = get_logger(__name__)
+
+
 def create_argsent_annotator(topic: str):
+    """Define the ``@annotator``-decorated stance function for ``topic``.
+
+    Args:
+        topic: English topic name; must be a key of ``TOPIC_EN_SV``.
+
+    Raises:
+        ValueError: If ``topic`` is not a key of ``TOPIC_EN_SV``.
+    """
     if topic not in TOPIC_EN_SV:
-        raise ValueError(f"{t} is not a valid topic")
-    @annotator(f"Identify the stance towards {topic}", topic)
+        # Report the argument that was rejected (not the module-level loop variable ``t``).
+        raise ValueError(f"{topic} is not a valid topic")
+
+    @annotator(f"Identify the stance towards {topic}", topic, language=["swe"])
     def argsent_func(
         out_stance: Output = Output(f"<sentence>:sbx_superlim.{topic}_stance"),
-        out_stance_certainty: Output = Output(f"<sentence>:sbx_superlim.{topic}_stance.certainty"),
+        out_stance_certainty: Output = Output(
+            f"<sentence>:sbx_superlim.{topic}_stance.certainty"
+        ),
         sentence: Annotation = Annotation("<sentence>"),
-        text = Text(),
-        hf_model_path = Config("sbx_superlim.hf_model_path.argumentation"),
-        hf_batch_size = Config("sbx_superlim.hf_inference_args.batch_size")
+        text=Text(),
+        hf_model_path=Config("sbx_superlim.hf_model_path.argumentation"),
+        hf_batch_size=Config("sbx_superlim.hf_inference_args.batch_size"),
     ):
-        ds_config = get_dataset_config_info('sbx/superlim-2', 'argumentation_sent')
+        """Write the predicted stance label and its score for every sentence.
+
+        The tokenizer's separator token and the Swedish topic name are passed to
+        ``prepare_inputs`` together with the text and sentence spans (presumably
+        appended to each sentence -- confirm in ``..common``). Pipeline labels are
+        translated through ``get_label_mapper`` before being written.
+        """
+        ds_config = get_dataset_config_info("sbx/superlim-2", "argumentation-sentences")
+        logger.debug("ds_config=%s", ds_config)
         model = AutoModelForSequenceClassification.from_pretrained(hf_model_path)
         tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
-        sep_token = tokenizer.special_tokens_map['sep_token']
+        sep_token = tokenizer.special_tokens_map["sep_token"]
         topic_sv = TOPIC_EN_SV[topic]
         inputs = prepare_inputs(text, sentence, f" {sep_token} {topic_sv}")
-        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, batch_size=hf_batch_size)
+        pipe = pipeline(
+            "text-classification",
+            model=model,
+            tokenizer=tokenizer,
+            batch_size=hf_batch_size,
+        )
         output = pipe(inputs)
         label_mapper = get_label_mapper(ds_config, pipe.model.config)
-        labels = [label_mapper[o['label']] for o in output]
-        out_stance.write([l for l in labels])
-        out_stance_certainty.write([str(o['score']) for o in output])
+        labels = [label_mapper[o["label"]] for o in output]
+        out_stance.write(labels)
+        out_stance_certainty.write([str(o["score"]) for o in output])
+
 
 for t in TOPIC_EN_SV:
-    create_argsent_annotator(t)
+    create_argsent_annotator(t)
@@ -1,35 +1,32 @@
 from datasets import get_dataset_config_info
-from sparv.api import (
-    Annotation,
-    Config,
-    Output,
-    Text,
-    annotator
-    )
-
-from transformers import pipeline    
+from sparv.api import Annotation, Config, Output, Text, annotator
+from transformers import pipeline
 
 from ..common import prepare_inputs
 from ..helpers import get_label_mapper
 
 
-@annotator(
-        "Determine whether a sentence is correct Swedish or not",
-        language="swe"
-)
+@annotator("Determine whether a sentence is correct Swedish or not", language=["swe"])
 def dalaj_ged(
-    text : Text = Text(),
+    text: Text = Text(),
     sentence: Annotation = Annotation("<sentence>"),
-    out_label : Output = Output("<sentence>:sbx_superlim.dalaj-ged.label"),
+    out_label: Output = Output("<sentence>:sbx_superlim.dalaj-ged.label"),
     out_certainty: Output = Output("<sentence>:sbx_superlim.dalaj-ged.certainty"),
     hf_model_path: str = Config("sbx_superlim.hf_model_path.dalaj-ged"),
-    hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size")
+    hf_batch_size: int = Config("sbx_superlim.hf_inference_args.batch_size"),
 ):
-    ds_config = get_dataset_config_info('sbx/superlim-2', 'dalaj-ged')
+    """Write the predicted label and its score for every sentence.
+
+    Inputs come from ``prepare_inputs(text, sentence)`` and are run through a
+    ``text-classification`` pipeline loaded from ``hf_model_path``. Pipeline labels
+    are translated through ``get_label_mapper`` before being written to
+    ``out_label``; the scores are written as strings to ``out_certainty``.
+    """
+    ds_config = get_dataset_config_info("sbx/superlim-2", "dalaj-ged-superlim")
     inputs = prepare_inputs(text, sentence)
     pipe = pipeline("text-classification", model=hf_model_path)
-    output = pipe(inputs, batch_size = hf_batch_size)
+    output = pipe(inputs, batch_size=hf_batch_size)
     label_mapper = get_label_mapper(ds_config, pipe.model.config)
-    labels = [label_mapper[o['label']] for o in output]
+    labels = [label_mapper[o["label"]] for o in output]
     out_label.write(labels)
-    out_certainty.write([str(o['score']) for o in output])
+    out_certainty.write([str(o["score"]) for o in output])