diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 35336f26..97d65bfb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,7 +35,7 @@ jobs:
python -m poetry config virtualenvs.in-project true
- name: Set up cache
- uses: actions/cache@v2
+ uses: actions/cache@v4
id: cache
with:
path: .venv
@@ -53,4 +53,4 @@ jobs:
- name: Test with pytest
run: |
- poetry run pytest
+ poetry run pytest tests
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 2273ff4f..72d7fb74 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,67 +1,29 @@
-name: Docs2Pages
-
-on: [pull_request]
-
+name: docs2pages
+on:
+ push:
+ branches:
+ - dev
+ - main
+permissions:
+ contents: write
jobs:
- run:
+ deploy:
runs-on: ubuntu-latest
-
- strategy:
- matrix:
- os: [ubuntu-latest]
- python-version: [3.9]
- fail-fast: false
-
steps:
- - uses: actions/checkout@v2
- with:
- fetch-depth: 2
-
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Bootstrap poetry
- shell: bash
- run: |
- python -m ensurepip
- python -m pip install --upgrade pip
- python -m pip install poetry
-
- - name: Configure poetry
- shell: bash
+ - uses: actions/checkout@v4
+ - name: Configure Git Credentials
run: |
- python -m poetry config virtualenvs.in-project true
-
- # - name: Set up cache
- # uses: actions/cache@v2
- # id: cache
- # with:
- # path: .venv
- # key: venv-${{ runner.os }}-${{ steps.full-python-version.outputs.version }}-${{ hashFiles('**/poetry.lock') }}
-
- # - name: Ensure cache is healthy
- # if: steps.cache.outputs.cache-hit == 'true'
- # shell: bash
- # run: timeout 10s python -m poetry run pip --version || rm -rf .venv
-
- - name: Install dependencies
- shell: bash
- run: |
- python -m poetry install --with docs
-
- - name: Build documentation
- run: |
- mkdir gh-pages
- touch gh-pages/.nojekyll
- cd docs/
- poetry run sphinx-build -b html source _build
- cp -r _build/* ../gh-pages/
-
- - name: Deploy documentation
- # if: ${{ github.event_name == 'push' }}
- uses: JamesIves/github-pages-deploy-action@4.1.4
+ git config user.name github-actions[bot]
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+ - uses: actions/setup-python@v5
+ with:
+ python-version: 3.x
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+ - uses: actions/cache@v4
with:
- branch: gh-pages
- folder: gh-pages
+ key: mkdocs-material-${{ env.cache_id }}
+ path: .cache
+ restore-keys: |
+ mkdocs-material-
+ - run: pip install mkdocs-material mkdocs-material-extensions
+ - run: mkdocs gh-deploy --force
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index cad2bab3..25af8b38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,6 +55,9 @@ coverage.xml
*.mo
*.pot
+# OS stuff:
+.DS_Store
+
# Django stuff:
*.log
local_settings.py
@@ -150,4 +153,4 @@ preprocessing/toponymmatching/experiments/
# Docs
_build
-test.ipynb
\ No newline at end of file
+test.ipynb
diff --git a/app/app_template.py b/app/app_template.py
old mode 100644
new mode 100755
index 92e435ae..a4084e8b
--- a/app/app_template.py
+++ b/app/app_template.py
@@ -1,35 +1,45 @@
+import importlib
import os
import sys
import time
from pathlib import Path
-from typing import Union, Optional, List
+from typing import List, Optional, Union
import uvicorn
from fastapi import FastAPI, Request
from pydantic import BaseModel
-from config import CONFIG as pipeline_config
-
from t_res.geoparser import pipeline
+from t_res.utils.dataclasses import SentenceMentions, Candidates
+
+os.environ["APP_CONFIG_NAME"] = "t-res_deezy_reldisamb-wpubl-wmtops"
+
+config_mod = importlib.import_module(
+ ".t-res_deezy_reldisamb-wpubl-wmtops", "app.configs"
+)
+pipeline_config = config_mod.CONFIG
geoparser = pipeline.Pipeline(**pipeline_config)
-class APIQuery(BaseModel):
+class RecognitionAPIQuery(BaseModel):
text: str
- place: Optional[Union[str, None]] = None
- place_wqid: Optional[Union[str, None]] = None
class CandidatesAPIQuery(BaseModel):
- toponyms: List[dict]
+ sentence_mentions: List[dict]
+ place_of_pub_wqid: Optional[str] = None
+ place_of_pub: Optional[str] = None
class DisambiguationAPIQuery(BaseModel):
- dataset: List[dict]
- wk_cands: dict
- place: Optional[Union[str, None]] = None
- place_wqid: Optional[Union[str, None]] = None
+ candidates: dict
+
+
+class PipelineAPIQuery(BaseModel):
+ text: str
+ place_of_pub_wqid: Optional[str] = None
+ place_of_pub: Optional[str] = None
app_config_name = os.environ["APP_CONFIG_NAME"]
@@ -38,74 +48,61 @@ class DisambiguationAPIQuery(BaseModel):
@app.get("/")
async def read_root(request: Request):
- return {"Welcome to T-Res!": request.app.title}
-
-
-@app.get("/test")
-async def test_pipeline():
- resolved = geoparser.run_sentence(
- "Harvey, from London;Thomas and Elizabeth, Barnett.",
- place="Manchester",
- place_wqid="Q18125",
- )
-
- return resolved
-
-
-@app.get("/resolve_sentence")
-async def run_sentence(api_query: APIQuery, request_id: Union[str, None] = None):
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- resolved = geoparser.run_sentence(
- api_query.text, place=place, place_wqid=place_wqid
- )
-
- return resolved
-
-
-@app.get("/resolve_full_text")
-async def run_text(api_query: APIQuery):
-
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- resolved = geoparser.run_text(api_query.text, place=place, place_wqid=place_wqid)
-
- return resolved
-
+ return {
+ "Title": request.app.title,
+ "request.url": request.url,
+ "request.query_params": request.query_params,
+ "root_path": request.scope.get("root_path"),
+ "request.client": request.client,
+ "hostname": os.uname()[1],
+ "worker_id": os.getpid(),
+ }
@app.get("/run_ner")
-async def run_ner(api_query: APIQuery):
-
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+async def run_ner(api_query: RecognitionAPIQuery):
ner_output = geoparser.run_text_recognition(
- api_query.text, place=place, place_wqid=place_wqid
+ api_query.text
)
-
return ner_output
-
@app.get("/run_candidate_selection")
async def run_candidate_selection(cand_api_query: CandidatesAPIQuery):
-
- wk_cands = geoparser.run_candidate_selection(cand_api_query.toponyms)
- return wk_cands
-
+ sentence_mentions = SentenceMentions.from_json(cand_api_query.sentence_mentions)
+ candidates = geoparser.run_candidate_selection(
+ sentence_mentions,
+ place_of_pub_wqid=cand_api_query.place_of_pub_wqid,
+ place_of_pub=cand_api_query.place_of_pub,
+ )
+ return candidates
@app.get("/run_disambiguation")
async def run_disambiguation(api_query: DisambiguationAPIQuery):
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- disamb_output = geoparser.run_disambiguation(
- api_query.dataset, api_query.wk_cands, place, place_wqid
+ candidates = Candidates.from_dict(api_query.candidates)
+ predictions = geoparser.run_disambiguation(candidates)
+ return predictions
+
+@app.get("/run_pipeline")
+async def run_pipeline(api_query: PipelineAPIQuery):
+ predictions = geoparser.run(
+ text=api_query.text,
+ place_of_pub_wqid=api_query.place_of_pub_wqid,
+ place_of_pub=api_query.place_of_pub,
)
- return disamb_output
+ return predictions
+@app.get("/test")
+async def test_pipeline():
+ predictions = geoparser.run(
+ "Harvey, from London;Thomas and Elizabeth, Barnett.",
+ place_of_pub_wqid="Q18125",
+ place_of_pub="Manchester",
+ )
+ return predictions
@app.get("/health")
async def healthcheck():
return {"status": "ok"}
-
if __name__ == "__main__":
- uvicorn.run(app, host="0.0.0.0", port=8000)
+ # poetry run uvicorn app.run_local_app:app --port 8123
+ uvicorn.run(app, host="0.0.0.0", port=8123)
diff --git a/app/app_template_old.py b/app/app_template_old.py
new file mode 100644
index 00000000..3e9f0c7d
--- /dev/null
+++ b/app/app_template_old.py
@@ -0,0 +1,111 @@
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Union, Optional, List
+
+import uvicorn
+from fastapi import FastAPI, Request
+from pydantic import BaseModel
+
+from config import CONFIG as pipeline_config
+
+from t_res.geoparser import pipeline
+
+geoparser = pipeline.Pipeline(**pipeline_config)
+
+
+class APIQuery(BaseModel):
+ text: str
+ place: Optional[Union[str, None]] = None
+ place_wqid: Optional[Union[str, None]] = None
+
+
+class CandidatesAPIQuery(BaseModel):
+ toponyms: List[dict]
+
+
+class DisambiguationAPIQuery(BaseModel):
+ dataset: List[dict]
+ wk_cands: dict
+ place: Optional[Union[str, None]] = None
+ place_wqid: Optional[Union[str, None]] = None
+
+
+app_config_name = os.environ["APP_CONFIG_NAME"]
+app = FastAPI(title=f"Toponym Resolution Pipeline API ({app_config_name})")
+
+
+@app.get("/")
+async def read_root(request: Request):
+ return {"Welcome to T-Res!": request.app.title}
+
+
+@app.get("/test")
+async def test_pipeline():
+ resolved = geoparser.run_sentence_deprecated(
+ "Harvey, from London;Thomas and Elizabeth, Barnett.",
+ place="Manchester",
+ place_wqid="Q18125",
+ )
+
+ return resolved
+
+
+@app.get("/resolve_sentence")
+async def run_sentence(api_query: APIQuery, request_id: Union[str, None] = None):
+ place = "" if api_query.place is None else api_query.place
+ place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+ resolved = geoparser.run_sentence_deprecated(
+ api_query.text, place=place, place_wqid=place_wqid
+ )
+
+ return resolved
+
+
+@app.get("/resolve_full_text")
+async def run_text(api_query: APIQuery):
+
+ place = "" if api_query.place is None else api_query.place
+ place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+ resolved = geoparser.run_text_deprecated(api_query.text, place=place, place_wqid=place_wqid)
+
+ return resolved
+
+
+@app.get("/run_ner")
+async def run_ner(api_query: APIQuery):
+
+ place = "" if api_query.place is None else api_query.place
+ place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+ ner_output = geoparser.run_text_recognition_deprecated(
+ api_query.text, place=place, place_wqid=place_wqid
+ )
+
+ return ner_output
+
+
+@app.get("/run_candidate_selection")
+async def run_candidate_selection(cand_api_query: CandidatesAPIQuery):
+
+ wk_cands = geoparser.run_candidate_selection_deprecated(cand_api_query.toponyms)
+ return wk_cands
+
+
+@app.get("/run_disambiguation")
+async def run_disambiguation(api_query: DisambiguationAPIQuery):
+ place = "" if api_query.place is None else api_query.place
+ place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+ disamb_output = geoparser.run_disambiguation_deprecated(
+ api_query.dataset, api_query.wk_cands, place, place_wqid
+ )
+ return disamb_output
+
+
+@app.get("/health")
+async def healthcheck():
+ return {"status": "ok"}
+
+
+if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py b/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py
index e4e6468f..fea07f61 100644
--- a/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py
+++ b/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py
@@ -5,8 +5,7 @@
# --------------------------------------
# Instantiate the ranker:
-myranker = ranking.Ranker(
- method="deezymatch",
+ranker = ranking.DeezyMatchRanker(
resources_path="./resources/",
strvar_parameters={
# Parameters to create the string pair dataset:
@@ -37,9 +36,9 @@
with sqlite3.connect("./resources/rel_db/embeddings_database.db") as conn:
cursor = conn.cursor()
- mylinker = linking.Linker(
- method="reldisamb",
+ linker = linking.RelDisambLinker(
resources_path="./resources/",
+ ranker=ranker,
experiments_path="./experiments/",
linking_resources=dict(),
rel_params={
@@ -56,5 +55,5 @@
overwrite_training=False,
)
-# geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)
-CONFIG = {"myranker": myranker, "mylinker": mylinker}
+# geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)
+CONFIG = {"ranker": ranker, "linker": linker}
diff --git a/app/run_local_app.py b/app/run_local_app.py
index 9afd86b0..a4084e8b 100755
--- a/app/run_local_app.py
+++ b/app/run_local_app.py
@@ -10,6 +10,7 @@
from pydantic import BaseModel
from t_res.geoparser import pipeline
+from t_res.utils.dataclasses import SentenceMentions, Candidates
os.environ["APP_CONFIG_NAME"] = "t-res_deezy_reldisamb-wpubl-wmtops"
@@ -21,21 +22,24 @@
geoparser = pipeline.Pipeline(**pipeline_config)
-class APIQuery(BaseModel):
+class RecognitionAPIQuery(BaseModel):
text: str
- place: Optional[Union[str, None]] = None
- place_wqid: Optional[Union[str, None]] = None
class CandidatesAPIQuery(BaseModel):
- toponyms: List[dict]
+ sentence_mentions: List[dict]
+ place_of_pub_wqid: Optional[str] = None
+ place_of_pub: Optional[str] = None
class DisambiguationAPIQuery(BaseModel):
- dataset: List[dict]
- wk_cands: dict
- place: Optional[Union[str, None]] = None
- place_wqid: Optional[Union[str, None]] = None
+ candidates: dict
+
+
+class PipelineAPIQuery(BaseModel):
+ text: str
+ place_of_pub_wqid: Optional[str] = None
+ place_of_pub: Optional[str] = None
app_config_name = os.environ["APP_CONFIG_NAME"]
@@ -54,73 +58,51 @@ async def read_root(request: Request):
"worker_id": os.getpid(),
}
-
-@app.get("/test")
-async def test_pipeline():
- resolved = geoparser.run_sentence(
- "Harvey, from London;Thomas and Elizabeth, Barnett.",
- place="Manchester",
- place_wqid="Q18125",
- )
-
- return resolved
-
-
-@app.get("/resolve_sentence")
-async def run_sentence(api_query: APIQuery, request_id: Union[str, None] = None):
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- resolved = geoparser.run_sentence(
- api_query.text, place=place, place_wqid=place_wqid
- )
-
- return resolved
-
-
-@app.get("/resolve_full_text")
-async def run_text(api_query: APIQuery):
-
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- resolved = geoparser.run_text(api_query.text, place=place, place_wqid=place_wqid)
-
- return resolved
-
-
@app.get("/run_ner")
-async def run_ner(api_query: APIQuery):
-
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
+async def run_ner(api_query: RecognitionAPIQuery):
ner_output = geoparser.run_text_recognition(
- api_query.text, place=place, place_wqid=place_wqid
+ api_query.text
)
-
return ner_output
-
@app.get("/run_candidate_selection")
async def run_candidate_selection(cand_api_query: CandidatesAPIQuery):
-
- wk_cands = geoparser.run_candidate_selection(cand_api_query.toponyms)
- return wk_cands
-
+ sentence_mentions = SentenceMentions.from_json(cand_api_query.sentence_mentions)
+ candidates = geoparser.run_candidate_selection(
+ sentence_mentions,
+ place_of_pub_wqid=cand_api_query.place_of_pub_wqid,
+ place_of_pub=cand_api_query.place_of_pub,
+ )
+ return candidates
@app.get("/run_disambiguation")
async def run_disambiguation(api_query: DisambiguationAPIQuery):
- place = "" if api_query.place is None else api_query.place
- place_wqid = "" if api_query.place_wqid is None else api_query.place_wqid
- disamb_output = geoparser.run_disambiguation(
- api_query.dataset, api_query.wk_cands, place, place_wqid
+ candidates = Candidates.from_dict(api_query.candidates)
+ predictions = geoparser.run_disambiguation(candidates)
+ return predictions
+
+@app.get("/run_pipeline")
+async def run_pipeline(api_query: PipelineAPIQuery):
+ predictions = geoparser.run(
+ text=api_query.text,
+ place_of_pub_wqid=api_query.place_of_pub_wqid,
+ place_of_pub=api_query.place_of_pub,
)
- return disamb_output
+ return predictions
+@app.get("/test")
+async def test_pipeline():
+ predictions = geoparser.run(
+ "Harvey, from London;Thomas and Elizabeth, Barnett.",
+ place_of_pub_wqid="Q18125",
+ place_of_pub="Manchester",
+ )
+ return predictions
@app.get("/health")
async def healthcheck():
return {"status": "ok"}
-
if __name__ == "__main__":
# poetry run uvicorn app.run_local_app:app --port 8123
uvicorn.run(app, host="0.0.0.0", port=8123)
diff --git a/app/test_app.py b/app/test_app.py
deleted file mode 100644
index 3141465a..00000000
--- a/app/test_app.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-import pytest
-import requests
-
-
-# API_URL = "http://127.0.0.1:8123"
-API_URL = f"http://{os.getenv('HOST_URL')}:8000/v2/t-res_deezy_reldisamb-wpubl-wmtops"
-
-
-@pytest.mark.skip(reason="integration test")
-def test_health():
- response = requests.get(f'{API_URL}/health')
- assert response.status_code == 200
- assert response.json() == {'status': 'ok'}
-
-
-@pytest.mark.skip(reason="integration test")
-def test_t_res():
-
- test_body = {
- "sentence": "A remarkable case of rattening has just occurred in the building trade at Newtown.",
- "place": "Powys",
- "place_wqid": "Q156150"
- }
- expected_response = [{'mention': 'Newtown', 'ner_score': 0.996, 'pos': 74, 'sent_idx': 0, 'end_pos': 81, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Newtown.', 'prediction': 'Q669171', 'ed_score': 0.034, 'cross_cand_score': {'Q669171': 0.41, 'Q1851145': 0.298, 'Q5355774': 0.143, 'Q738356': 0.107, 'Q15262210': 0.024, 'Q7020654': 0.018, 'Q18748305': 0.0}, 'prior_cand_score': {'Q1851145': 0.86, 'Q669171': 0.734, 'Q5355774': 0.537, 'Q738356': 0.516, 'Q15262210': 0.485, 'Q7020654': 0.483, 'Q18748305': 0.476}, 'latlon': [52.5132, -3.3141], 'wkdt_class': 'Q3957'}]
-
- response = requests.get(f'{API_URL}/toponym_resolution', json=test_body)
- assert response.status_code == 200
- assert response.json() == expected_response
-
-
-@pytest.mark.skip(reason="integration test")
-def test_ner():
-
- test_body = {"sentence": "Harvey, from London;Thomas and Elizabeth, Barnett."}
- expected_response = [{"entity":"B-LOC","score":0.990628182888031,"word":"London","start":13,"end":19}]
- response = requests.get(f'{API_URL}/ner', json=test_body)
-
- assert response.status_code == 200
- assert response.json() == expected_response
-
-#
\ No newline at end of file
diff --git a/docs/Makefile b/docs-sphinx/Makefile
similarity index 100%
rename from docs/Makefile
rename to docs-sphinx/Makefile
diff --git a/docs/make.bat b/docs-sphinx/make.bat
similarity index 100%
rename from docs/make.bat
rename to docs-sphinx/make.bat
diff --git a/docs/source/conf.py b/docs-sphinx/source/conf.py
similarity index 100%
rename from docs/source/conf.py
rename to docs-sphinx/source/conf.py
diff --git a/docs/source/experiments/index.rst b/docs-sphinx/source/experiments/index.rst
similarity index 100%
rename from docs/source/experiments/index.rst
rename to docs-sphinx/source/experiments/index.rst
diff --git a/docs/source/getting-started/complete-tour.rst b/docs-sphinx/source/getting-started/complete-tour.rst
similarity index 100%
rename from docs/source/getting-started/complete-tour.rst
rename to docs-sphinx/source/getting-started/complete-tour.rst
diff --git a/docs/source/getting-started/index.rst b/docs-sphinx/source/getting-started/index.rst
similarity index 100%
rename from docs/source/getting-started/index.rst
rename to docs-sphinx/source/getting-started/index.rst
diff --git a/docs/source/getting-started/installation.rst b/docs-sphinx/source/getting-started/installation.rst
similarity index 100%
rename from docs/source/getting-started/installation.rst
rename to docs-sphinx/source/getting-started/installation.rst
diff --git a/docs/source/getting-started/resources.rst b/docs-sphinx/source/getting-started/resources.rst
similarity index 100%
rename from docs/source/getting-started/resources.rst
rename to docs-sphinx/source/getting-started/resources.rst
diff --git a/docs/source/index.rst b/docs-sphinx/source/index.rst
similarity index 100%
rename from docs/source/index.rst
rename to docs-sphinx/source/index.rst
diff --git a/docs/source/reference/geoparser/index.rst b/docs-sphinx/source/reference/geoparser/index.rst
similarity index 100%
rename from docs/source/reference/geoparser/index.rst
rename to docs-sphinx/source/reference/geoparser/index.rst
diff --git a/docs/source/reference/geoparser/linker.rst b/docs-sphinx/source/reference/geoparser/linker.rst
similarity index 100%
rename from docs/source/reference/geoparser/linker.rst
rename to docs-sphinx/source/reference/geoparser/linker.rst
diff --git a/docs/source/reference/geoparser/pipeline.rst b/docs-sphinx/source/reference/geoparser/pipeline.rst
similarity index 100%
rename from docs/source/reference/geoparser/pipeline.rst
rename to docs-sphinx/source/reference/geoparser/pipeline.rst
diff --git a/docs/source/reference/geoparser/ranker.rst b/docs-sphinx/source/reference/geoparser/ranker.rst
similarity index 100%
rename from docs/source/reference/geoparser/ranker.rst
rename to docs-sphinx/source/reference/geoparser/ranker.rst
diff --git a/docs/source/reference/geoparser/recogniser.rst b/docs-sphinx/source/reference/geoparser/recogniser.rst
similarity index 100%
rename from docs/source/reference/geoparser/recogniser.rst
rename to docs-sphinx/source/reference/geoparser/recogniser.rst
diff --git a/docs/source/reference/index.rst b/docs-sphinx/source/reference/index.rst
similarity index 100%
rename from docs/source/reference/index.rst
rename to docs-sphinx/source/reference/index.rst
diff --git a/docs/source/reference/utils/deezy_processing.rst b/docs-sphinx/source/reference/utils/deezy_processing.rst
similarity index 100%
rename from docs/source/reference/utils/deezy_processing.rst
rename to docs-sphinx/source/reference/utils/deezy_processing.rst
diff --git a/docs/source/reference/utils/get_data.rst b/docs-sphinx/source/reference/utils/get_data.rst
similarity index 100%
rename from docs/source/reference/utils/get_data.rst
rename to docs-sphinx/source/reference/utils/get_data.rst
diff --git a/docs/source/reference/utils/index.rst b/docs-sphinx/source/reference/utils/index.rst
similarity index 100%
rename from docs/source/reference/utils/index.rst
rename to docs-sphinx/source/reference/utils/index.rst
diff --git a/docs/source/reference/utils/ner.rst b/docs-sphinx/source/reference/utils/ner.rst
similarity index 100%
rename from docs/source/reference/utils/ner.rst
rename to docs-sphinx/source/reference/utils/ner.rst
diff --git a/docs/source/reference/utils/preprocess_data.rst b/docs-sphinx/source/reference/utils/preprocess_data.rst
similarity index 100%
rename from docs/source/reference/utils/preprocess_data.rst
rename to docs-sphinx/source/reference/utils/preprocess_data.rst
diff --git a/docs/source/reference/utils/process_data.rst b/docs-sphinx/source/reference/utils/process_data.rst
similarity index 100%
rename from docs/source/reference/utils/process_data.rst
rename to docs-sphinx/source/reference/utils/process_data.rst
diff --git a/docs/source/reference/utils/process_wikipedia.rst b/docs-sphinx/source/reference/utils/process_wikipedia.rst
similarity index 100%
rename from docs/source/reference/utils/process_wikipedia.rst
rename to docs-sphinx/source/reference/utils/process_wikipedia.rst
diff --git a/docs/source/reference/utils/rel/entity_disambiguation.rst b/docs-sphinx/source/reference/utils/rel/entity_disambiguation.rst
similarity index 100%
rename from docs/source/reference/utils/rel/entity_disambiguation.rst
rename to docs-sphinx/source/reference/utils/rel/entity_disambiguation.rst
diff --git a/docs/source/reference/utils/rel/index.rst b/docs-sphinx/source/reference/utils/rel/index.rst
similarity index 100%
rename from docs/source/reference/utils/rel/index.rst
rename to docs-sphinx/source/reference/utils/rel/index.rst
diff --git a/docs/source/reference/utils/rel/mulrel_ranker.rst b/docs-sphinx/source/reference/utils/rel/mulrel_ranker.rst
similarity index 100%
rename from docs/source/reference/utils/rel/mulrel_ranker.rst
rename to docs-sphinx/source/reference/utils/rel/mulrel_ranker.rst
diff --git a/docs/source/reference/utils/rel/utils.rst b/docs-sphinx/source/reference/utils/rel/utils.rst
similarity index 100%
rename from docs/source/reference/utils/rel/utils.rst
rename to docs-sphinx/source/reference/utils/rel/utils.rst
diff --git a/docs/source/reference/utils/rel/vocabulary.rst b/docs-sphinx/source/reference/utils/rel/vocabulary.rst
similarity index 100%
rename from docs/source/reference/utils/rel/vocabulary.rst
rename to docs-sphinx/source/reference/utils/rel/vocabulary.rst
diff --git a/docs/source/reference/utils/rel_e2e.rst b/docs-sphinx/source/reference/utils/rel_e2e.rst
similarity index 100%
rename from docs/source/reference/utils/rel_e2e.rst
rename to docs-sphinx/source/reference/utils/rel_e2e.rst
diff --git a/docs/source/reference/utils/rel_utils.rst b/docs-sphinx/source/reference/utils/rel_utils.rst
similarity index 100%
rename from docs/source/reference/utils/rel_utils.rst
rename to docs-sphinx/source/reference/utils/rel_utils.rst
diff --git a/docs/source/t-res-api/index.rst b/docs-sphinx/source/t-res-api/index.rst
similarity index 100%
rename from docs/source/t-res-api/index.rst
rename to docs-sphinx/source/t-res-api/index.rst
diff --git a/docs/source/t-res-api/installation.rst b/docs-sphinx/source/t-res-api/installation.rst
similarity index 100%
rename from docs/source/t-res-api/installation.rst
rename to docs-sphinx/source/t-res-api/installation.rst
diff --git a/docs/source/t-res-api/usage.rst b/docs-sphinx/source/t-res-api/usage.rst
similarity index 100%
rename from docs/source/t-res-api/usage.rst
rename to docs-sphinx/source/t-res-api/usage.rst
diff --git a/docs/assets/candidates-dataclass.svg b/docs/assets/candidates-dataclass.svg
new file mode 100644
index 00000000..c7cbbf5e
--- /dev/null
+++ b/docs/assets/candidates-dataclass.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/linker-classes.svg b/docs/assets/linker-classes.svg
new file mode 100644
index 00000000..a05a04dc
--- /dev/null
+++ b/docs/assets/linker-classes.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/mention-dataclass.svg b/docs/assets/mention-dataclass.svg
new file mode 100644
index 00000000..31439316
--- /dev/null
+++ b/docs/assets/mention-dataclass.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/mentioncandidates-dataclass.svg b/docs/assets/mentioncandidates-dataclass.svg
new file mode 100644
index 00000000..dfd81c91
--- /dev/null
+++ b/docs/assets/mentioncandidates-dataclass.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/pipeline-classes.svg b/docs/assets/pipeline-classes.svg
new file mode 100644
index 00000000..7b0a80f9
--- /dev/null
+++ b/docs/assets/pipeline-classes.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/predictions-dataclass.svg b/docs/assets/predictions-dataclass.svg
new file mode 100644
index 00000000..69b60aa5
--- /dev/null
+++ b/docs/assets/predictions-dataclass.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/ranker-classes.svg b/docs/assets/ranker-classes.svg
new file mode 100644
index 00000000..840ba478
--- /dev/null
+++ b/docs/assets/ranker-classes.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/recogniser-classes.svg b/docs/assets/recogniser-classes.svg
new file mode 100644
index 00000000..9877d18a
--- /dev/null
+++ b/docs/assets/recogniser-classes.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/sentencementions-dataclass.svg b/docs/assets/sentencementions-dataclass.svg
new file mode 100644
index 00000000..2e69d6e0
--- /dev/null
+++ b/docs/assets/sentencementions-dataclass.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/docs/experiments/evaluation.md b/docs/experiments/evaluation.md
new file mode 100644
index 00000000..88c5f01e
--- /dev/null
+++ b/docs/experiments/evaluation.md
@@ -0,0 +1,2 @@
+
+# Evaluation
\ No newline at end of file
diff --git a/docs/experiments/index.md b/docs/experiments/index.md
new file mode 100644
index 00000000..8d4f628d
--- /dev/null
+++ b/docs/experiments/index.md
@@ -0,0 +1,35 @@
+# Experiments and evaluation
+
+Follow these steps to reproduce the experiments in our paper.
+
+## 1. Obtain the external resources
+
+Follow the instructions on the "Resources" page of the documentation (getting-started/resources) to obtain the resources required for running the experiments.
+
+## 2. Preparing the data
+
+To create the datasets that we use in the experiments presented in the paper, run the following command from the `./experiments/` folder:
+
+``` bash
+$ python ./prepare_data.py -p ../resources
+```
+
+This script takes care of downloading the LwM and HIPE datasets and formatting them as needed for the experiments.
+
+## 3. Running the experiments
+
+To run the experiments, run the following script from the `./experiments/` folder:
+
+``` bash
+$ python ./toponym_resolution.py -p ../resources
+```
+
+This script runs all the different scenarios reported in the experiments in the paper.
+
+## 4. Evaluate
+
+To evaluate the different approaches and obtain a table with results such as the one provided in the paper, go to the `./evaluation/` directory. There, you should clone the [HIPE scorer](https://github.com/hipe-eval/HIPE-scorer). We are using the code version at commit `50dff4e`, and have added the line `return eval_stats` at the end of the `get_results()` function. From `./evaluation/`, run the following script to obtain the results in latex format:
+
+``` bash
+$ python display_results.py
+```
diff --git a/docs/getting-started/batch-jobs.md b/docs/getting-started/batch-jobs.md
new file mode 100644
index 00000000..391b2de7
--- /dev/null
+++ b/docs/getting-started/batch-jobs.md
@@ -0,0 +1,118 @@
+# Batch Jobs
+
+The T-Res pipeline may be run on individual blocks of text, or at scale on large input datasets. The latter case is referred to as a batch job.
+
+To facilitate batch jobs, there is a `BatchJob` class which ensures efficient processing of large datasets, particularly when GPU hardware is available.
+
+## Batch job CLI
+
+A command line interface is provided to make it easy to initiate batch jobs. This is called with the `batch-job` command. To see the help page for this tool, pass the `-h` flag:
+```bash
+batch-job -h
+```
+
+This prints the following information:
+```bash
+usage: batch-job [-h] config_file input_file resources_path results_path [place_of_pub_file]
+
+Run a T-Res batch job.
+
+positional arguments:
+ config_file Path to the YAML batch job config file.
+ input_file Path to the input CSV data file.
+ resources_path Path to the resources directory.
+ results_path Path to the results directory.
+ place_of_pub_file [Optional] Path to the place of publication CSV data file. Must include
+ columns named "Wikidata ID" and "Location"
+
+optional arguments:
+ -h, --help show this help message and exit
+```
+
+## Batch job configuration file
+
+The first required argument to the `batch-job` command is the `config_file`, which must specify the path to a valid batch job configuration file in YAML format.
+
+The following is an example of a batch job configuration file. It specifies a pretrained NER model, candidate selection using Deezy Match and Linking via the REL disambiguation method (with various sub-parameters). The batch size is set to zero, which means all of the input data will be treated as a single batch.
+```
+recogniser:
+ method_name: pretrained
+ model_name: Livingwithmachines/toponym-19thC-en
+ranker:
+ method_name: deezymatch
+ deezy_parameters:
+ ranking_metric: faiss
+ selection_threshold: 50
+ strvar_parameters:
+ ocr_threshold: 80
+linker:
+ method_name: reldisamb
+ rel_params:
+ with_publication: True
+ predict_place_of_publication: True
+ combined_score: True
+ without_microtoponyms: True
+ default_publname: United Kingdom
+ default_publwqid: Q145
+batch_size: 0
+```
+
+More details about the applicable configuration parameters are given in the [documentation](../getting-started/pipeline/index.md) for each stage of the T-Res pipeline.
+
+!!! title "The `batch_size` parameter"
+
+ The `batch_size` parameter is an integer that specifies the size of each batch of text inputs to be processed by T-Res. The options for this parameter are:
+
+ - 0: this indicates an unlimited batch, i.e. the complete input data file will be processed in a single batch.
+    - 1: this indicates that each input text should be processed individually by running the end-to-end pipeline.
+ - N > 1: an integer greater than 1 indicates the size of each processing batch.
+
+ Unless the input data file is very large, setting the `batch_size` to 0 (zero) is usually the best option. However if errors occur due to lack of system memory, a limited batch size should be chosen.
+
+## Batch job input data file
+
+The input data to a T-Res batch job must take the form of a CSV file with a column named `text`. That column will contain the passages of input text to be processed by T-Res.
+
+If place of publication information is also provided, via the `place_of_pub_file`, then the input data file must also contain a column named `NLP`. This column must contain a unique identifier for the publication in which the text originally appeared, and these same unique identifiers must be used in the `place_of_pub_file`.
+
+Other columns may also appear in the input data file. They will be read into memory during processing, and will therefore consume resources, but will be ignored by T-Res.
+
+## Batch job resources path
+
+The `resources_path` argument specifies the directory path containing the T-Res resources, as explained in the section on [Resources and file structure](../getting-started/resources.md).
+
+## Batch job results path
+
+The `results_path` argument specifies the directory in which the results of the batch job will be saved.
+
+Each batch job will create a new folder with a timestamp recording the time at which the job was executed. The name of this folder follows the following convention: `t-res_batch_YYYY-MM-DD_HH-mm-ss`.
+
+If the batch job is completed successfully, four files will be saved inside the timestamped folder:
+
+ - A CSV results file, containing the input data with an additional column named `predictions`.
+ - A CSV file named `predictions.csv` containing one row per toponym prediction.
+ - A Python pickle file named `predictions.pkl` containing the complete set of `Predictions` [data structures](../getting-started/data-structures.md#predictions) generated by T-Res during the batch processing.
+ - A timestamped log file containing the logging messages generated during processing. This includes a copy of the batch job configuration parameters.
+
+## Place of publication file
+
+The **optional** `place_of_pub_file` argument specifies a CSV data file containing the geographical location in which each input text was originally published.
+
+This file must contain two columns with the following names:
+
+ - "Wikidata ID": the Wikidata identifier (QID) for the place of publication
+ - "Location": the name of the place of publication
+
+## Running a batch job
+
+Use the `batch-job` command to initiate a batch job:
+```bash
+batch-job \
+  <config_file> \
+  <input_file> \
+  <resources_path> \
+  <results_path> \
+  [<place_of_pub_file>]
+```
+
+
diff --git a/docs/getting-started/data-structures.md b/docs/getting-started/data-structures.md
new file mode 100644
index 00000000..efef75dc
--- /dev/null
+++ b/docs/getting-started/data-structures.md
@@ -0,0 +1,77 @@
+# T-Res data structures
+
+T-Res defines a collection of interrelated data structures to manage the flow of information through the toponym resolution pipeline. These structures are implemented as [Python dataclasses](https://realpython.com/python-data-classes/). Understanding these dataclasses, and how to use them, will help you to understand the toponym resolution workflow and how to work with the results.
+
+Here we describe the purpose of the most important dataclasses, what they represent and where they appear within the pipeline.
+
+We also provide class diagrams that capture the relationships between the different dataclasses. Further details, including complete lists of attributes for each dataclass, and relevant snippets of source code, can be found in the [reference section](../reference/utils/dataclasses.md) of this site.
+
+## Mention
+
+The `Mention` dataclass represents a toponym mention in a piece of text. In addition to the toponym mention itself, found in the `mention` attribute, it records its location (i.e. its character position and token offset within the text), together with its NER confidence score and NER label (e.g. `LOC` for location).
+
+
+{ width="260" }
+
+
+There is a predicate method named `is_microtoponym` which returns `True` if the mention refers to a toponym which is *not* a location. For instance, mentions having the NER label `BUILDING` are microtoponyms.
+
+## SentenceMentions
+
+The `SentenceMentions` dataclass represents a collection of toponym mentions within a sentence. It consists of a sentence of text, together with a list of `Mention` instances.
+
+
+{ width="260" }
+
+
+The `is_empty` method returns `True` if there are no toponym mentions in the sentence. The `exclude_microtoponyms` method returns another `SentenceMentions` instance which is identical except all microtoponym mentions are omitted.
+
+!!! title "Note"
+
+ The output from the T-Res named entity recognition process is a list of `SentenceMentions` instances, one for each sentence in the text.
+
+## MentionCandidates
+
+The `MentionCandidates` dataclass represents a collection of candidate links for a given toponym mention. It consists of a `Mention` instance together with a list of `CandidateLinks` instances. Each `CandidateLinks` object contains a candidate string match for the toponym and, for each candidate string match, a list of candidate links in the knowledgebase.
+
+This dataclass also records the ranking and linking methods used to generate the candidates, and the place of publication information, if it was provided.
+
+
+{ width="260" }
+
+
+## Candidates
+
+The `Candidates` dataclass represents candidate matches for all toponym mentions in a block of text.
+
+
+{ width="260" }
+
+
+Internally, candidates are stored as a list of `SentenceCandidates` instances. This preserves the sentence structure of the text.
+
+If, however, this structure is not needed, the `candidates` method can be executed to obtain the list of candidates by toponym mention.
+
+!!! title "Note"
+
+ The output from the T-Res candidate selection process is an instance of the `Candidates` dataclass.
+
+## Predictions
+
+The `Predictions` dataclass is a subclass of `Candidates`, and represents predicted matches (in the knowledgebase) for all toponym mentions in a block of text.
+
+
+{ width="260" }
+
+
+Details of the predicted toponym matches can be obtained from a `Predictions` instance using the methods `best_wqids`, `best_coords` and `best_disambiguation_scores` which return (respectively), for each toponym mention, the predicted Wikidata link (by its Wikidata QID), the predicted geographical coordinates of the toponym and the highest disambiguation score.
+
+!!! tip
+
+ To obtain a list of toponym matches from an instance of the `Predictions` dataclass, always use the `candidates` method, which returns a list of `MentionCandidates` objects, one per toponym mention.
+
+!!! note
+
+ The output from the T-Res end-to-end pipeline is an instance of the `Predictions` dataclass.
+
+
diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md
new file mode 100644
index 00000000..32e3fdb4
--- /dev/null
+++ b/docs/getting-started/index.md
@@ -0,0 +1,11 @@
+# Getting started
+
+
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
new file mode 100644
index 00000000..4959d767
--- /dev/null
+++ b/docs/getting-started/installation.md
@@ -0,0 +1,138 @@
+# Installing T-Res
+
+If you want to work directly on the codebase, we suggest installing T-Res by following these instructions (which have been tested on Linux Ubuntu 20.04).
+
+## Update the system
+
+First, you need to make sure the system is up to date and all essential libraries are installed:
+```console
+$ sudo apt update
+$ sudo apt install build-essential curl libbz2-dev libffi-dev \
+ liblzma-dev libncursesw5-dev libreadline-dev libsqlite3-dev \
+ libssl-dev libxml2-dev libxmlsec1-dev llvm make tk-dev wget \
+ xz-utils zlib1g-dev
+```
+
+## Install pyenv
+
+Then you need to install pyenv, which we use to manage virtual environments:
+
+```console
+$ curl https://pyenv.run | bash
+```
+
+And also to make sure paths are properly exported:
+
+```console
+$ echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
+$ echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
+$ echo -e 'if command -v pyenv 1>/dev/null 2>&1; \
+ then\n eval "$(pyenv init --path)"\nfi' >> ~/.bashrc
+```
+
+Then you can restart your bash session, to make sure all changes are updated:
+
+```console
+$ source ~/.bashrc
+```
+
+And then you run the following commands to update ``pyenv`` and create the needed environment.
+
+```console
+$ pyenv update
+$ pyenv install 3.9.7
+$ pyenv global 3.9.7
+```
+
+## Install poetry
+
+To manage dependencies across libraries, we use Poetry. To install it, do the following:
+
+```console
+$ curl -sSL https://install.python-poetry.org | python3 -
+$ echo 'export PATH=$PATH:$HOME/.poetry/bin' >> ~/.bashrc
+```
+
+## Project Installation
+
+You can now clone the repo and ``cd`` into it:
+
+```console
+$ git clone git@github.com:Living-with-machines/T-Res.git
+$ cd T-Res
+```
+
+Explicitly tell poetry to use the python version defined above:
+
+```console
+$ poetry env use python
+```
+
+Install all dependencies using `poetry`:
+
+```console
+$ poetry update
+$ poetry install
+```
+
+Create an IPython (Jupyter) kernel:
+
+```console
+$ poetry run ipython kernel install --user --name=t_res
+```
+
+## How to use poetry
+
+To activate the environment:
+
+```console
+$ poetry shell
+```
+
+Now you can run a script as usual, for instance:
+
+```console
+$ python experiments/toponym_resolution.py
+```
+
+To add a package:
+
+```console
+$ poetry add [package name]
+```
+
+To run the Python unit tests:
+
+```console
+$ poetry run pytest tests
+```
+
+To run unit and integration tests, some of which depend on the [T-Res resources](../getting-started/resources.md):
+
+```console
+$ poetry run pytest tests --include-resources
+```
+
+If you want to use Jupyter notebook, run it as usual, and then select the created kernel in "Kernel" > "Change kernel".
+
+```console
+$ jupyter notebook
+```
+
+## Pre-commit hooks
+
+In order to guarantee style consistency across our codebase we use a few basic pre-commit hooks.
+
+To use them, first run:
+
+```console
+$ poetry run pre-commit install --install-hooks
+```
+
+To run the hooks on all files, you can do:
+
+```console
+$ poetry run pre-commit run --all-files
+```
+
+
diff --git a/docs/getting-started/pipeline/index.md b/docs/getting-started/pipeline/index.md
new file mode 100644
index 00000000..e67b8066
--- /dev/null
+++ b/docs/getting-started/pipeline/index.md
@@ -0,0 +1,249 @@
+# T-Res Pipeline
+
+The T-Res codebase contains three main classes:
+
+ - the **Recogniser** class (which performs toponym recognition, which is a named entity recognition task),
+ - the **Ranker** class (which performs candidate selection and ranking for the named entities identified by the Recogniser),
+ - the **Linker** class (which selects the most likely candidate from those provided by the Ranker).
+
+An additional class, the **Pipeline**, wraps these three components into one, therefore making it easier for the user to perform end-to-end entity linking.
+
+Here we provide a step-by-step guide to instantiating and using the T-Res Pipeline. We recommend that you first try to run T-Res using the default pipeline, and then change it according to your needs.
+
+!!! Warning
+
+ Before being able to run the pipeline, you will need to make sure you have all the required resources. Refer to the "[Resources & directory structure](resources.md)" page in the documentation.
+
+## Pipeline Class
+
+To perform toponym resolution with the T-Res pipeline you must first construct an instance of the `Pipeline` class, as explained in [Section 1](#1-instantiate-the-pipeline) below. The following diagram shows the class structure, which consists of a single `Pipeline` class which is composed of a [`Recogniser`](recogniser.md), [`Ranker`](ranker.md) and [`Linker`](linker.md) instance.
+
+The `run` method executes the end-to-end pipeline. The result of this is equivalent to running the three steps separately using the methods `run_text_recognition`, `run_candidate_selection` and `run_disambiguation`.
+
+
+
+
+{ width="260" }
+
+
+## 1. Instantiate the Pipeline
+
+By default, the Pipeline instantiates:
+
+- a Recogniser (from a HuggingFace model),
+- a Ranker (using the ``perfectmatch`` approach), and
+- a Linker (using the ``mostpopular`` approach).
+
+To instantiate the default T-Res pipeline, do:
+
+```python
+from t_res.geoparser import pipeline
+
+geoparser = pipeline.Pipeline(
+ resources_path="../resources/"
+)
+```
+
+!!! title "Note"
+
+ You should update the resources path argument to reflect your set up.
+
+You can also instantiate a pipeline using a customised Recogniser, Ranker and Linker. To see the different options, refer to the sections on instantiating each of them: [Recogniser](#recogniser), [Ranker](#ranker) and [Linker](#linker).
+
+In order to instantiate a pipeline using a customised Recogniser, Ranker and Linker, just instantiate them beforehand, and then pass them as arguments to the Pipeline, as follows:
+
+```python
+from geoparser import pipeline, ner, ranking, linking
+
+recogniser = ner.Recogniser(...)
+ranker = ranking.Ranker(...)
+linker = linking.Linker(...)
+
+geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+```
+
+!!! Warning
+
+ Note that the default Pipeline expects to be run from the `experiments/` or the `examples` folder (or any other folder in the same level). The Pipeline will look for the resources at `../resources/`. Make sure all the required resources are in the right locations.
+
+!!! title "Note"
+
+ If a model needs to be trained, the Pipeline itself will take care of it. Therefore, you should expect that the first time the Pipeline is used (or if you change certain input parameters) T-Res will take some time before it is ready to be used for prediction, as it will train the models if the approaches require so.
+
+## 2. Use the Pipeline
+
+Once instantiated (and once all the models have been trained or loaded, if needed), the Pipeline can be used to perform end-to-end toponym recognition and linking (given an input text) or to perform each of the three steps individually:
+
+ 1. toponym recognition given an input text,
+ 2. candidate selection given a toponym or list of toponyms, and
+ 3. toponym disambiguation given the output from the first two steps.
+
+### End-to-end pipeline
+
+The Pipeline can be used to perform end-to-end toponym recognition and linking given an input text, using the [`run()`][t_res.geoparser.pipeline.Pipeline.run] method (which takes care of splitting a text into sentences, before running the pipeline on each sentence).
+
+!!! example "Example: Pipeline `run()` method"
+ ```python
+ output = geoparser.run("Inspector Liddle said: I am an inspector of police, living in the city of Durham.")
+ ```
+
+The following parameters are optional:
+
+- `place_of_pub_wqid`: The Wikidata ID of the place of publication (e.g. `"Q84"`).
+- `place_of_pub`: The place of publication associated with the text document as a human-legible string (e.g. `"London"`).
+
+[](){#predictions-output}
+
+!!! example "Example: Pipeline `run()` method including place of publication"
+ ```python
+ output = geoparser.run("Inspector Liddle said: I am an inspector of police, living in the city of Durham.",
+ place_of_pub_wqid="Q2560190",
+ place_of_pub="Alston, Cumbria, England",
+ )
+ ```
+
+When printed, the output looks like this:
+```python
+Predictions for text: 'Inspector Liddle said:...city of Durham.':
+ Durham => Durham [1.000]: Q179815 (0.439), Q49229 (0.216), Q23082 (0.071), ...
+```
+
+In the above output, the first line indicates that this is an instance of the [`Predictions`][t_res.utils.dataclasses.Predictions] dataclass, and includes a snippet of the text that has been processed. Then there is a line for each toponym identified in the text (in this case only one, Durham).
+
+Each of these lines has the following format:
+```bash
+ mention => string_match [string_similarity]: WQID1 (score1), WQID2 (score2), WQID3 (score3), ...
+```
+where:
+
+ - `mention` is the identified toponym mention, exactly as found in the text
+ - `string_match` is the **best** string match found for the toponym mention
+ - `string_similarity` is the string matching similarity score
+ - `WQID1` is the Wikidata ID of the **best** link found in the knowledgebase
+ - `score1` is the disambiguation score (i.e. confidence) for the link `WQID1`
+ - `WQID2`, `score2` and `WQID3`, `score3` are the IDs and scores for the second- and third-best links, respectively
+ - if present, the ellipsis `...` indicates that additional (poorer) links were identified but are not shown.
+
+Thus, the printed output provides a summary of the toponyms resolved from the given text. To interrogate the output more closely, see the documentation for the [`Predictions`][t_res.utils.dataclasses.Predictions] dataclass for a list of all available methods.
+
+### Step-by-step pipeline
+
+**Step 1: Named Entity Recognition.** See how to perform toponym recognition with the Pipeline, with an example:
+
+```python
+mentions = geoparser.run_text_recognition(text="Inspector Liddle said: I am an inspector of police, living in the city of Durham.")
+```
+
+This call produces a list of instances of the [`SentenceMentions`][t_res.utils.dataclasses.SentenceMentions] dataclass, one for each sentence in the text. In this case there is a single sentence. When printed, the result looks like this:
+
+```python
+Toponym mentions for sentence: 'Inspector Liddle said: I am an inspector of police, living in the city of Durham.'
+ Durham LOC chars: 74-80 confidence: 0.999
+```
+
+In the above output there is a line for each toponym mention found in the text (in this case only one, Durham).
+
+Each of these lines has the following format:
+```bash
+ mention => ner_label chars: start-end confidence: string_similarity
+```
+where:
+
+ - `mention` is the identified toponym mention, exactly as found in the text
+ - `ner_label` is the NER label for this mention (e.g. `LOC` indicates this is a location)
+ - `chars: start-end` is the character span of the toponym mention within the sentence
+ - `confidence: string_similarity` is the similarity (confidence) score of the string match.
+
+To interrogate the output more closely, see the documentation for the [`SentenceMentions`][t_res.utils.dataclasses.SentenceMentions] dataclass for a list of all available methods.
+
+
+**Step 2: Candidate Selection.** See how to perform candidate selection given the `mentions` output from the previous step, with an example:
+
+```python
+candidates = geoparser.run_candidate_selection(
+ mentions,
+ place_of_pub_wqid="Q2560190",
+ place_of_pub="Alston, Cumbria, England",
+)
+```
+
+This is the printed output for this example:
+```python
+Candidates for text: 'Inspector Liddle said:...city of Durham.':
+ Durham => Durham [1.000]: Q1137286, Q5316477, Q752266, ...
+```
+It is an instance of the [`Candidates`][t_res.utils.dataclasses.Candidates] dataclass, and resembles the output [displayed above][predictions-output] for the [`Predictions`][t_res.utils.dataclasses.Predictions] dataclass (which is a subclass of `Candidates`), except that the entity linking disambiguation scores are omitted, because they have not yet been computed.
+
+**Step 3: Disambiguation.** Finally, see how to perform toponym disambiguation given the output from the two previous steps, with this example:
+
+```python
+predictions = geoparser.run_disambiguation(candidates)
+```
+
+This will produce the exact same output as we [obtained above][predictions-output] when running the pipeline end-to-end.
+
+#### Description of the output
+
+The output of running the pipeline (both using the end-to-end method or in a step-wise manner, regardless of the methods used for each of the three components), will have the following format:
+
+``` json
+[{"mention": "Durham",
+ "ner_score": 0.999,
+ "pos": 74,
+ "sent_idx": 0,
+ "end_pos": 80,
+ "tag": "LOC",
+ "sentence": "Inspector Liddle said: I am an inspector of police, living in the city of Durham.",
+ "prediction": "Q179815",
+ "ed_score": 0.039,
+ "cross_cand_score": {
+ "Q179815": 0.396,
+ "Q23082": 0.327,
+ "Q49229": 0.141,
+ "Q5316459": 0.049,
+ "Q458393": 0.045,
+ "Q17003433": 0.042,
+ "Q1075483": 0.0
+ },
+ "string_match_score": {"Durham": [1.0, ["Q1137286", "Q5316477", "Q752266", "..."]]},
+ "prior_cand_score": {
+ "Q179815": 0.881,
+ "Q49229": 0.522,
+ "Q5316459": 0.457,
+ "Q17003433": 0.455,
+ "Q23082": 0.313,
+ "Q458393": 0.295,
+ "Q1075483": 0.293
+ },
+ "latlon": [54.783333, -1.566667],
+ "wkdt_class": "Q515"}]
+```
+
+Description of the fields:
+
+- `mention`: The mention text.
+- `ner_score`: The NER confidence score of the mention.
+- `pos`: The starting position of the mention in the sentence.
+- `sent_idx`: The index of the sentence.
+- `end_pos`: The ending position of the mention in the sentence.
+- `tag`: The NER label of the mention.
+- `sentence`: The input sentence.
+- `prediction`: The predicted entity linking result (a Wikidata QID or NIL).
+- `ed_score`: The entity disambiguation score.
+- `string_match_score`: A dictionary of candidate entities and their string matching confidence scores.
+- `prior_cand_score`: A dictionary of candidate entities and their prior confidence scores.
+- `cross_cand_score`: A dictionary of candidate entities and their cross-candidate confidence scores.
+- `latlon`: The latitude and longitude coordinates of the predicted entity.
+- `wkdt_class`: The Wikidata class of the predicted entity.
+
+## Pipeline recommendations
+
+- To get started with T-Res, we recommend starting with the default pipeline, as it's significantly less complex than the better performing approaches.
+- The default pipeline may not be a bad option if you are planning to perform toponym recognition on modern global clean data. However, take into account that it uses context-agnostic approaches, which often perform quantitatively quite well just because of the higher probability of the most common sense to appear in texts.
+- Running T-Res with DeezyMatch for candidate selection (by choosing the [DeezyMatchRanker][t_res.geoparser.ranking.DeezyMatchRanker]) and the REL model for entity disambiguation (by choosing the [RelDisambLinker][t_res.geoparser.linking.RelDisambLinker]) leads to considerably longer execution times than using the default pipeline. If you want to run T-Res on a few sentences, you can use the end-to-end pipeline `run()` method. If, however, you have a large number of texts on which to run T-Res, then we recommend you use the step-wise approach. If done efficiently, this can save a lot of time. Using this approach, you should:
+
+ 1. Perform toponym recognition on all the texts,
+ 1. Obtain the set of all unique toponyms identified in the full dataset, and perform candidate selection on the unique set of toponyms,
+ 1. Perform toponym disambiguation on a per-text basis, passing as argument the dictionary of candidates returned in the previous step.
+
+
diff --git a/docs/getting-started/pipeline/linker.md b/docs/getting-started/pipeline/linker.md
new file mode 100644
index 00000000..be30a5b3
--- /dev/null
+++ b/docs/getting-started/pipeline/linker.md
@@ -0,0 +1,190 @@
+# Linker
+
+The Linker takes as input the set of candidates selected by the Ranker and disambiguates them, selecting the best matching entity depending on the approach selected for disambiguation.
+
+We provide two different strategies for disambiguation:
+
+- `mostpopular`: Unsupervised method, which, given a set of candidates for a given mention, returns as a prediction the candidate that is most popular in terms of inlink structure in Wikipedia.
+
+- `reldisamb`: Given a set of candidates, this approach uses the [REL re-implementation](https://github.com/informagi/REL/) of the [ment-norm algorithm](https://github.com/lephong/mulrel-nel) proposed by Le and Titov (2018) and partially based on Ganea and Hofmann (2017), and adapts it. To know more:
+
+ Van Hulst, Johannes M., Faegheh Hasibi, Koen Dercksen, Krisztian Balog, and
+ Arjen P. de Vries. "Rel: An entity linker standing on the shoulders of giants."
+ In Proceedings of the 43rd International ACM SIGIR Conference on Research and
+ Development in Information Retrieval, pp. 2197-2200. 2020.
+
+ Le, Phong, and Ivan Titov. "Improving Entity Linking by Modeling Latent Relations
+ between Mentions." In Proceedings of the 56th Annual Meeting of the Association
+ for Computational Linguistics (Volume 1: Long Papers), pp. 1595-1604. 2018.
+
+ Ganea, Octavian-Eugen, and Thomas Hofmann. "Deep Joint Entity Disambiguation
+ with Local Neural Attention." In Proceedings of the 2017 Conference on
+ Empirical Methods in Natural Language Processing, pp. 2619-2629. 2017.
+
+## Linker Classes
+
+To perform candidate linking and disambiguation with T-Res you must first construct an instance of the `Linker` class, as explained in [Section 1](#1-instantiate-the-linker) below. The following diagram shows the class structure, with the abstract base class `Linker` in green and its three concrete subclasses in orange. When constructing an instance, choose the appropriate subclass for your candidate selection method.
+
+
+
+{ width="560" }
+
+## 1. Instantiate the Linker
+
+### Most Popular Linker
+
+To use the Linker with the `mostpopular` approach, instantiate it as follows:
+
+```python
+from t_res.geoparser import linking
+
+linker = linking.MostPopularLinker(
+ resources_path="resources/"
+)
+```
+
+Description of the parameters:
+
+- `resources_path`: path to the resources directory.
+
+Note that `resources_path` should contain the path to the directory where the resources are stored.
+
+When using the `mostpopular` linking approach, the resources folder should at least contain the following resources:
+
+ T-Res/
+ └── resources/
+ └── wikidata/
+ ├── entity2class.txt
+ ├── mentions_to_wikidata.json
+ └── wikidata_gazetteer.csv
+
+### By Distance Linker
+
+To use the Linker with the `bydistance` approach, instantiate it as follows:
+
+```python
+from t_res.geoparser import linking
+
+linker = linking.ByDistanceLinker(
+ resources_path="resources/"
+)
+```
+
+Description of the parameters:
+
+- `resources_path`: path to the resources directory.
+
+Note that `resources_path` should contain the path to the directory where the resources are stored.
+
+When using the `bydistance` linking approach, the resources folder should at least contain the following resources:
+
+ T-Res/
+ └── resources/
+ └── wikidata/
+ ├── entity2class.txt
+ ├── mentions_to_wikidata.json
+ └── wikidata_gazetteer.csv
+
+### REL Disambiguation Linker
+
+To use the Linker with the `reldisamb` approach, instantiate it as follows:
+
+```python
+from t_res.geoparser import linking
+
+with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn:
+ cursor = conn.cursor()
+ linker = linking.RelDisambLinker(
+ resources_path="resources/",
+ rel_params={
+ "model_path": "resources/models/disambiguation/",
+ "data_path": "experiments/outputs/data/lwm/",
+ "training_split": "originalsplit",
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "without_microtoponyms": True,
+ "do_test": False,
+ "default_publname": "London",
+ "default_publwqid": "Q84",
+ },
+ overwrite_training=False,
+ )
+```
+
+Description of the parameters:
+
+- `method`: name of the method, in this case `reldisamb`.
+- `resources_path`: path to the resources directory.
+- `overwrite_training`: whether to overwrite the training of the entity disambiguation model provided a model with the same path and name already exists.
+- `rel_params`: set of parameters specific to the `reldisamb` method:
+ - `model_path`: Path to the entity disambiguation model.
+ - `data_path`: Path to the dataset file `linking_df_split.tsv` used for training a model (see information about the dataset in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation).
+ - `training_split`: Column from the `linking_df_split.tsv` file that indicates which documents are used for training, development, and testing (see more information about this in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation).
+ - `db_embeddings`: cursor for the embeddings database (see more information about this in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation).
+ - `with_publication`: whether place of publication should be used as a feature when disambiguating (by adding it as an already disambiguated entity).
+ - `without_microtoponyms`: whether to filter out microtoponyms or not (i.e. filter out all entities that are not `LOC`).
+ - `do_test`: Whether to train an entity disambiguation model in test mode.
+ - `default_publname`: The default value for the place of publication of the texts. For example, "London". This will be the default publication place name, but you will be able to override it when using the Linker to do predictions. This will be ignored if `with_publication` is `False`.
+ - `default_publwqid`: The wikidata ID of the place of publication. For example, `Q84` for London. As in `default_publname`, you will be able to override it at inference time, and it will be ignored if `with_publication` is `False`.
+
+In this way, an entity disambiguation model will be trained unless a model trained using the same characteristics already exists (i.e. same candidate ranker method, same `training_split` column name, and same values for `with_publication` and `without_microtoponyms`).
+
+When using the `reldisamb` linking approach, the resources folder should at least contain the following resources:
+
+ T-Res/
+ └── resources/
+ ├── wikidata/
+ | ├── entity2class.txt
+ | ├── mentions_to_wikidata.json
+ | └── wikidata_gazetteer.csv
+ └── rel_db/
+ └── embeddings_database.db
+
+## 2. Load the resources
+
+!!! title "Note"
+
+ Note that this step is already taken care of if you use the `Pipeline`.
+
+The following line of code loads the resources required by the Linker, regardless of the Linker method.
+
+```python
+linker.load()
+```
+
+## 3. Train an entity disambiguation model
+
+!!! title "Note: Only the `RelDisambLinker` requires training"
+
+ The training step is only possible if the `reldisamb` linking method is selected by instantiating a linker of type `RelDisambLinker`. The other linking methods (`mostpopular` and `bydistance`) are rule-based and therefore no model training is necessary.
+
+!!! title "Note"
+
+ Note that this step is already taken care of if you use the `Pipeline`.
+
+The following line will train a REL model for entity disambiguation, given the arguments specified when instantiating the `RelDisambLinker`.
+
+```python
+linker.train_load_model()
+```
+
+Note that if the model already exists and `overwrite_training` is set to `False`, the training will be skipped, even if you call the `train_load_model()` method.
+
+The resulting model will be stored in the location specified when instantiating the Linker (i.e. `resources/models/disambiguation/` in the example) in a new folder whose name combines information about the ranking and linking arguments used in training the method.
+
+## 4. Link & disambiguate candidates to obtain predictions
+
+This example demonstrates the use of the Linker to link toponym matches to entities in the knowledgebase. The Linker's `run` method requires that the input is a `CandidateMatches` instance, which can be obtained by executing the [`Ranker`](ranker.md) on a toponym `Mention`. Here we assume the `candidate_matches` variable is such an instance. The `run` method also takes two optional arguments relating to the place of publication of the text:
+```python
+mention_candidates = linker.run(candidate_matches, place_of_pub_wqid="Q84", place_of_pub="London")
+print(mention_candidates)
+```
+The result is a `MentionCandidates` instance.
+
+The final step is entity disambiguation, which produces a disambiguation score for each of the candidate links in the knowledgebase. This step is also performed by the Linker, via the `disambiguate` method, which takes a list of `SentenceCandidates` instances:
+```python
+predictions = linker.disambiguate(sentence_candidates)
+print(predictions)
+```
+
+
diff --git a/docs/getting-started/pipeline/ranker.md b/docs/getting-started/pipeline/ranker.md
new file mode 100644
index 00000000..e52f73c0
--- /dev/null
+++ b/docs/getting-started/pipeline/ranker.md
@@ -0,0 +1,281 @@
+# Ranker
+
+The Ranker takes the named entities detected by the Recogniser as input. Given a knowledge base, it ranks the entity names according to their string similarity to the target named entity, and selects a subset of candidates that will be passed on to the next component, the Linker, to do the disambiguation and select the most likely entity.
+
+In order to use the Ranker and the Linker, we need a knowledge base, a gazetteer. T-Res uses a gazetteer which combines data from Wikipedia and Wikidata. See how to obtain the Wikidata-based resources in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation.
+
+T-Res provides four different strategies for selecting candidates:
+
+- `perfectmatch` retrieves candidates from the knowledge base if one of their alternate names is identical to the detected named entity. For example, given the mention "Wiltshire", the following Wikidata entities will be retrieved: [Q23183](https://www.wikidata.org/wiki/Q23183), [Q55448990](https://www.wikidata.org/wiki/Q55448990), and [Q8023421](https://www.wikidata.org/wiki/Q8023421), because all these entities are referred to as "Wiltshire" in Wikipedia anchor texts.
+- `partialmatch` retrieves candidates from the knowledge base if there is a (partial) match between the query and the candidate names, based on string overlap. Therefore, the mention "Ashton-under" returns candidates for "Ashton-under-Lyne".
+- `levenshtein` retrieves candidates from the knowledge base if there is a fuzzy match between the query and the candidate names, based on levenshtein distance. Therefore, mention "Wiltshrre" would still return the candidates for "Wiltshire". This method is often quite accurate when it comes to OCR variations, but it is very slow.
+- `deezymatch` retrieves candidates from the knowledge base if there is a fuzzy match between the query and the candidate names, based on similarity between [DeezyMatch](https://github.com/Living-with-machines/DeezyMatch) embeddings. It is significantly more complex than the other methods to set up from scratch, and you will need to train a DeezyMatch model (which takes about two hours), but once it is set up, it is the fastest approach (except for `perfectmatch`).
+
+## Ranker Classes
+
+To perform candidate selection with T-Res you must first construct an instance of the `Ranker` class, as explained in [Section 1](#1-instantiate-the-ranker) below. The following diagram shows the class structure, with the abstract base class `Ranker` in green and its four concrete subclasses in orange. When constructing an instance, choose the appropriate subclass for your candidate selection method.
+
+It can be seen that all subclasses extend the `PerfectMatchRanker` class. This is because every ranking method begins by attempting to find a perfect string match in the Wikidata knowledgebase. Only if this attempt is unsuccessful will a more flexible string matching method be attempted.
+
+
+
+{ width="560" }
+
+## 1. Instantiate the Ranker
+
+### Perfect Match Ranker
+
+To use the Ranker for exact matching (`perfectmatch`), instantiate it as follows:
+```python
+from t_res.geoparser import ranking
+
+ranker = ranking.PerfectMatchRanker(
+ resources_path="resources/"
+)
+```
+
+Note that `resources_path` should contain the path to the directory where the Wikidata- and Wikipedia-based resources are stored, as described in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation.
+
+### Partial Match & Levenshtein Rankers
+
+To use the Ranker for partial string matching based on overlap distance (`partialmatch`), instantiate it as follows:
+```python
+from t_res.geoparser import ranking
+
+ranker = ranking.PartialMatchRanker(
+ resources_path="resources/"
+)
+```
+Or, for partial string matching based on Levenshtein distance (`levenshtein`), use:
+```python
+from t_res.geoparser import ranking
+
+ranker = ranking.LevenshteinRanker(
+ resources_path="resources/"
+)
+```
+Note that `resources_path` should contain the path to the directory where the Wikidata- and Wikipedia-based resources are stored, as described in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation.
+
+### Deezy Match Ranker
+
+DeezyMatch instantiation is trickier, as it requires training a model that, ideally, should capture the types of string variations that can be found in your data (such as OCR errors). Using the Ranker, you can:
+
+- **Option 1:** Train a DeezyMatch model, given an existing string pairs dataset.
+- **Option 2:** Train a DeezyMatch model from scratch, including generating a string pairs dataset.
+
+Once a DeezyMatch model has been trained, you can load it and use it. The following notebooks provide examples of each case:
+
+ ./examples/train_use_deezy_model_1.ipynb # Option 1
+ ./examples/train_use_deezy_model_2.ipynb # Option 2
+ ./examples/train_use_deezy_model_3.ipynb # Load an existing DeezyMatch model.
+
+See below each option in detail.
+
+#### Option 1. Train a DeezyMatch model, given an existing string pairs dataset
+
+To train a DeezyMatch model from scratch, using an existing string pairs dataset, you will need to have the following ``resources`` file structure (as described in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation):
+
+ T-RES/
+ ├── ...
+ ├── resources/
+ │ ├── deezymatch/
+ │ │ ├── data/
+ │ │ │ └── w2v_ocr_pairs.txt
+ │ │ └── inputs/
+ │ │ ├── characters_v001.vocab
+ │ │ └── input_dfm.yaml
+ │ ├── models/
+ │ ├── news_datasets/
+ │ ├── wikidata/
+ │ │ ├── mentions_to_wikidata_normalized.json
+ │ │ └── wikidata_to_mentions_normalized.json
+ │ └── wikipedia/
+ └── ...
+
+The Ranker can then be instantiated as follows:
+
+```python
+from t_res.geoparser import ranking
+from pathlib import Path
+
+ranker = ranking.DeezyMatchRanker(
+ # Generic Ranker parameters:
+ resources_path="resources/",
+ # Parameters to create the string pair dataset:
+ strvar_parameters=dict(),
+ # Parameters to train, load and use a DeezyMatch model:
+ deezy_parameters={
+ # Paths and filenames of DeezyMatch models and data:
+ "dm_path": str(Path("resources/deezymatch/").resolve()),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ # Ranking measures:
+ "ranking_metric": "faiss",
+ "selection_threshold": 50,
+ "num_candidates": 1,
+ "verbose": False,
+ # DeezyMatch training:
+ "overwrite_training": False,
+ "do_test": False,
+ },
+)
+```
+
+Description of the parameters (to learn more, refer to the [DeezyMatch readme](https://github.com/Living-with-machines/DeezyMatch/blob/master/README.md#candidate-ranking)):
+
+- `strvar_parameters` contains the parameters needed to generate the DeezyMatch training set. It can be left empty, since the training set already exists.
+- `deezy_parameters`: contains the set of parameters to train or load a DeezyMatch model:
+ - `dm_path`: The path to the folder where the DeezyMatch model and data will be stored.
+ - `dm_cands`: The name given to the set of alternate names from which DeezyMatch will try to find a match for a given mention.
+ - `dm_model`: Name of the DeezyMatch model to train (or load if the model already exists).
+ - `dm_output`: Name of the DeezyMatch output file (not really needed).
+ - `ranking_metric`: DeezyMatch parameter: the metric used to rank the string variations based on their vectors.
+ - `selection_threshold`: DeezyMatch parameter: selection threshold based on the ranking metric.
+ - `num_candidates`: DeezyMatch parameter: maximum number of string variations that will be retrieved.
+ - `verbose`: DeezyMatch parameter: verbose output or not.
+ - `overwrite_training`: Whether to overwrite the training of a DeezyMatch model provided it already exists.
+ - `do_test`: Whether to train a model in test mode.
+
+#### Option 2. Train a DeezyMatch model from scratch, including generating a string pairs dataset
+
+To train a DeezyMatch model from scratch, including generating a string pairs dataset, you will need to have the following `resources` file structure (as described in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation):
+
+ T-RES/
+ ├── ...
+ ├── resources/
+ │ ├── deezymatch/
+ │ ├── models/
+ │ │ └── w2v/
+ │ │ ├── w2v_1800s_news
+ │ │ │ ├── w2v.model
+ │ │ │ ├── w2v.model.syn1neg.npy
+ │ │ │ └── w2v.model.wv.vectors.npy
+ │ │ ├── ...
+ │ │ └── w2v_1860s_news
+ │ │ ├── w2v.model
+ │ │ ├── w2v.model.syn1neg.npy
+ │ │ └── w2v.model.wv.vectors.npy
+ │ ├── news_datasets/
+ │ ├── wikidata/
+ │ │ ├── mentions_to_wikidata_normalized.json
+ │ │ └── wikidata_to_mentions_normalized.json
+ │ └── wikipedia/
+ └── ...
+
+The Ranker can then be instantiated as follows:
+
+```python
+from t_res.geoparser import ranking
+from pathlib import Path
+
+ranker = ranking.DeezyMatchRanker(
+ # Generic Ranker parameters:
+ resources_path="resources/",
+ # Parameters to create the string pair dataset:
+ strvar_parameters={
+ "ocr_threshold": 60,
+ "top_threshold": 85,
+ "min_len": 5,
+ "max_len": 15,
+ "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()),
+ "w2v_ocr_model": "w2v_*_news",
+ "overwrite_dataset": False,
+ },
+ # Parameters to train, load and use a DeezyMatch model:
+ deezy_parameters={
+ # Paths and filenames of DeezyMatch models and data:
+ "dm_path": str(Path("resources/deezymatch/").resolve()),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ # Ranking measures:
+ "ranking_metric": "faiss",
+ "selection_threshold": 50,
+ "num_candidates": 1,
+ "verbose": False,
+ # DeezyMatch training:
+ "overwrite_training": False,
+ "do_test": False,
+ },
+)
+```
+
+Description of the parameters (to learn more, refer to the [DeezyMatch readme](https://github.com/Living-with-machines/DeezyMatch/blob/master/README.md#candidate-ranking)):
+
+- `strvar_parameters` contains the parameters needed to generate the DeezyMatch training set:
+ - `ocr_threshold`: Maximum [FuzzyWuzzy](https://pypi.org/project/fuzzywuzzy/) ratio for two strings to be considered negative variations of each other.
+ - `top_threshold`: Minimum [FuzzyWuzzy](https://pypi.org/project/fuzzywuzzy/) ratio for two strings to be considered positive variations of each other.
+ - `min_len`: Minimum length for a word to be included in the dataset.
+ - `max_len`: Maximum length for a word to be included in the dataset.
+ - `w2v_ocr_path`: The path to the word2vec embeddings folders.
+ - `w2v_ocr_model`: The folder name of the word2vec embeddings (it can be a regular expression).
+ - `overwrite_dataset`: Whether to overwrite the dataset if it already exists.
+- `deezy_parameters`: contains the set of parameters to train or load a DeezyMatch model:
+ - `dm_path`: The path to the folder where the DeezyMatch model and data will be stored.
+ - `dm_cands`: The name given to the set of alternate names from which DeezyMatch will try to find a match for a given mention.
+ - `dm_model`: Name of the DeezyMatch model to train or load.
+ - `dm_output`: Name of the DeezyMatch output file (not really needed).
+ - `ranking_metric`: DeezyMatch parameter: the metric used to rank the string variations based on their vectors.
+ - `selection_threshold`: DeezyMatch parameter: selection threshold based on the ranking metric.
+ - `num_candidates`: DeezyMatch parameter: maximum number of string variations that will be retrieved.
+ - `verbose`: DeezyMatch parameter: verbose output or not.
+ - `overwrite_training`: Whether to overwrite the training of a DeezyMatch model provided it already exists.
+ - `do_test`: Whether to train a model in test mode.
+
+## 2. Load the resources
+
+!!! title "Note"
+
+ Note that this step is already taken care of if you use the default [T-Res Pipeline](./index.md).
+
+The following line of code loads the resources (i.e. the `mentions_to_wikidata_normalized.json` and `wikidata_to_mentions_normalized.json` files into dictionaries). They are required in order to perform candidate selection and ranking, regardless of the Ranker method.
+
+```python
+ranker.load()
+```
+
+## 3. Train a DeezyMatch model
+
+!!! title "Note"
+
+ Note that this step is already taken care of if you use the default [T-Res Pipeline](./index.md).
+
+The following line will train a DeezyMatch model, given the arguments specified when instantiating the Ranker.
+
+```python
+ranker.train()
+```
+
+Note that if the model already exists and `overwrite_training` is set to `False`, the training will be skipped, even if you call the `train()` method. The training will also be skipped if the Ranker is instantiated for a different method than DeezyMatch.
+
+The resulting model will be stored in the specified path. In this case, the resulting DeezyMatch model that the Ranker will use is called `w2v_ocr`:
+
+ T-RES/
+ ├── ...
+ ├── resources/
+ │ ├── deezymatch/
+ │ │ └── models/
+ │ │ └── w2v_ocr/
+ │ │ ├── input_dfm.yaml
+ │ │ ├── w2v_ocr.model
+ │ │ ├── w2v_ocr.model_state_dict
+ │ │ └── w2v_ocr.vocab
+ │ ├── models/
+ │ ├── news_datasets/
+ │ ├── wikidata/
+ │ │ ├── mentions_to_wikidata_normalized.json
+ │ │ └── wikidata_to_mentions_normalized.json
+ │ └── wikipedia/
+ └── ...
+
+## 4. Retrieve candidates for a given mention
+
+In order to use the Ranker to retrieve candidates for a given mention, follow the example. The Ranker's `run` method requires that the input is a list of `Mention` instances. Here we assume the `mentions` variable is such a list, obtained by executing the [`Recogniser`](recogniser.md) on a sentence of text:
+```python
+candidate_matches = ranker.run(mentions)
+print(candidate_matches)
+```
+The result is a list of `CandidateMatches` instances.
+
+
diff --git a/docs/getting-started/pipeline/recogniser.md b/docs/getting-started/pipeline/recogniser.md
new file mode 100644
index 00000000..8a4b1f68
--- /dev/null
+++ b/docs/getting-started/pipeline/recogniser.md
@@ -0,0 +1,118 @@
+# Recogniser
+
+The Recogniser performs toponym recognition (i.e. geographic named entity recognition), using HuggingFace's `transformers` library. Users can either:
+
+1. Load an existing model (either directly downloading a model from the HuggingFace hub or loading a locally stored NER model), or
+2. Fine-tune a new model on top of a base model and load it, or directly load it if it is already pre-trained.
+
+!!! example "Recogniser example notebooks"
+
+ The following notebooks provide examples of both training or loading a NER model using the Recogniser, and using it for detecting entities:
+
+ ./examples/train_use_ner_model.ipynb
+ ./examples/load_use_ner_model.ipynb
+
+## Recogniser Classes
+
+To perform toponym recognition with T-Res you must first construct an instance of the `Recogniser` class, as explained in [Section 1](#1-instantiate-the-recogniser) below. The following diagram shows the class structure, with the abstract base class `Recogniser` in green and its two concrete subclasses in orange. When constructing an instance, choose the appropriate subclass for your NER model.
+
+
+
+{ width="560" }
+
+## 1. Instantiate the Recogniser
+
+### Pretrained Recogniser
+
+To load an already trained model (both from HuggingFace or a locally stored pre-trained model), you can just instantiate the recogniser as follows:
+
+```python
+from t_res.geoparser import ner
+
+recogniser = ner.PretrainedRecogniser(
+ model_name="path-to-model"
+)
+```
+
+For example, in order to load the [Livingwithmachines/toponym-19thC-en](https://huggingface.co/Livingwithmachines/toponym-19thC-en) NER model from the HuggingFace hub, initialise the Recogniser as follows:
+
+```python
+from t_res.geoparser import ner
+
+recogniser = ner.PretrainedRecogniser(
+ model_name="Livingwithmachines/toponym-19thC-en"
+)
+```
+
+You can also load a model that is stored locally in the same way. For example, let's suppose the user has a NER model stored in the relative location `../resources/models/blb_lwm-ner-fine`. The user could load it as follows:
+
+```python
+from t_res.geoparser import ner
+
+recogniser = ner.PretrainedRecogniser(
+ model_name="resources/models/blb_lwm-ner-fine"
+)
+```
+
+### Custom Recogniser
+
+Alternatively, you can use the Recogniser to train a new model (and load it, once it's trained). The model will be trained using HuggingFace's `transformers` library. To instantiate the Recogniser for training a new model and loading it once it's trained, you can do it as in the example (see the description of each parameter below):
+
+```python
+from t_res.geoparser import ner
+
+recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
+ train_dataset="experiments/outputs/data/lwm/ner_fine_train.json",
+ test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json",
+ base_model="Livingwithmachines/bert_1760_1900",
+ model_path="resources/models/",
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 10,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False,
+ do_test=False,
+)
+```
+
+Description of the parameters:
+
+- `overwrite_training`: it indicates whether a model should be re-trained, even if there already is a model with the same name in the pre-specified output folder. If `load_from_hub` is set to `False` and `overwrite_training` is also set to `False`, then the Recogniser will be prepared to first try to load the model and---if it does not exist---to train it. If `overwrite_training` is set to `True`, it will prepare the Recogniser to train a model, even if a model with the same name already exists.
+- `base_model`: the path to the model that will be used as base to train our NER model. This can be the path to a HuggingFace model (for example, we are using [Livingwithmachines/bert_1760_1900](https://huggingface.co/Livingwithmachines/bert_1760_1900), a BERT model trained on nineteenth-century texts) or the path to a pre-trained model from a local folder.
+- `train_dataset` and `test_dataset`: the path to the train and test data sets necessary for training the NER model. You can find more information about the format of this data in the "[Resources and file structure](../../getting-started/resources.md)" page in the documentation.
+- `model_path`: the path folder where the Recogniser will store the model (and try to load it from).
+- `model_name`: the name of the NER model.
+- `training_args`: the training arguments: the user can change the learning rate, batch size, number of training epochs, and weight decay.
+- `do_test`: it allows the user to train a mock model and then load it (note that the suffix `_test` will be added to the model name).
+
+## 2. Train the NER model
+
+!!! title "Note"
+
+ Note that this step is already taken care of if you use the default [T-Res Pipeline](./index.md).
+
+Once the Recogniser has been initialised, you can train the model by running:
+
+```python
+recogniser.train()
+```
+
+Note that if `load_from_hub` is set to `True` or the model already exists (and `overwrite_training` is set to `False`), the training will be skipped, even if you call the `train()` method.
+
+## 3. Resolve toponyms in a sentence of text.
+
+In order to use the Recogniser to resolve toponyms from text, follow the example. The Recogniser's `run` method takes as input a sentence (of type `str`):
+```python
+sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield."
+sentence_mentions = recogniser.run(sentence)
+print(sentence_mentions)
+```
+The `run` method returns an instance of the `SentenceMentions` dataclass, containing all of the toponym mentions found in the given sentence. To obtain a list of `Mention` instances, access the `mentions` attribute:
+```python
+list_of_mentions = sentence_mentions.mentions
+```
+
+
diff --git a/docs/getting-started/resources.md b/docs/getting-started/resources.md
new file mode 100644
index 00000000..0bc8a139
--- /dev/null
+++ b/docs/getting-started/resources.md
@@ -0,0 +1,437 @@
+# Resources & file structure
+
+T-Res requires several resources to work. Some resources can be downloaded and loaded directly from the web. Others will need to be generated, following the instructions provided in this section. In this page, we describe the format of the files that are required by T-Res, therefore also giving the user the option to use their own resources (adapted to T-Res).
+
+## Toponym recognition and disambiguation training data
+
+We provide the dataset we used to train T-Res for the tasks of toponym recognition (i.e. a named entity recognition task) and toponym disambiguation (i.e. an entity linking task focused on geographical entities) in English. The dataset is based on the [TopRes19th dataset](https://openhumanitiesdata.metajnl.com/articles/10.5334/johd.56).
+
+!!! title "Note"
+
+ You can download the data (in the format required by T-Res) from the [British Library research repository](https://bl.iro.bl.uk/concern/datasets/ef537c70-87cb-495a-86c8-edffefa6bdc6).
+
+By default, T-Res assumes the files are stored in the following location:
+
+```
+T-Res/
+└── experiments/
+ └── outputs/
+ └── data/
+ └── lwm/
+ ├── ner_fine_dev.json
+ ├── ner_fine_test.json
+ └── linking_df_split.tsv
+```
+
+Continue reading the sections below to learn more about the datasets, and for a description of the format expected by T-Res.
+
+### 1. Toponym recognition dataset
+
+!!! title "Note"
+
+ You don't need a toponym recognition dataset if you load a NER model directly from the HuggingFace hub, or from a local folder. In that case, you can skip this section.
+
+T-Res allows directly loading a pre-trained BERT-based NER model, either locally or from the HuggingFace models hub. If this is your option, you can skip this section. Otherwise, if you want to train your own NER model using either our dataset or a different dataset, you should continue reading.
+
+T-Res requires that the data for training a NER model is provided as two json files (one for training, one for testing) in the JSON Lines format, where each line corresponds to a sentence. Each sentence is a dictionary with three key-value pairs: `id` (an identifier of the sentence, a string), `tokens` (the list of tokens into which the sentence has been split), and `ner_tags` (the list of annotations per token, in the BIO format). The length of `tokens` and `ner_tags` is therefore always the same. See below an example of three lines from one of the JSON files, corresponding to three annotated sentences:
+
+```json
+{"id":"3896239_29","ner_tags":["O","B-STREET","I-STREET","O","O","O","B-BUILDING","I-BUILDING","O","O","O","O","O","O","O","O","O","O"],"tokens":[",","Old","Millgate",",","to","the","Collegiate","Church",",","where","they","arrived","a","little","after","ten","oclock","."]}
+{"id":"8262498_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","B-LOC","O","O","O","O","O","O"],"tokens":["On","the","'","JSth","November","the","ship","Santo","Christo",",","from","Monteveido","to","Cadiz",",","with","hides","and","copper","."]}
+{"id":"10715509_7","ner_tags":["O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["A","COACH","to","SOUTHAMPTON",",","every","morning","at","a","quarter","before","6",",","Sundays","excepted","."]}
+```
+
+Note that the list of NER labels will be automatically detected from the training data.
+
+### 2. Toponym disambiguation dataset
+
+!!! title "Note"
+
+ You won't need a toponym disambiguation dataset if you use the unsupervised approach for linking (such as the `mostpopular` or `bydistance` linking methods). You will need a toponym disambiguation dataset if you want to use one of the REL-based approaches.
+
+Train and test data examples are required for training a new entity disambiguation (ED) model. They should be provided in a single tsv file, named `linking_df_split.tsv`, one document per row, with the following required columns:
+
+ - `article_id`: article identifier, which consists of the number in the document file name in the original dataset (for example, the `article_id` of `1218_Poole1860.tsv` is `1218`).
+ - `sentences`: list of dictionaries, each dictionary corresponding to a sentence in the article, with two fields: `sentence_pos` (the position of the sentence in the article) and `sentence_text` (the text of the sentence). For example:
+ ```json
+ [
+ {
+ "sentence_pos": 1,
+ "sentence_text": "DUKINFIELD. "
+ },
+ {
+ "sentence_pos": 2,
+ "sentence_text": "Knutsford Sessions."
+ },
+ {
+ "sentence_pos": 3,
+ "sentence_text": "—The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months’ imprisonment. "
+ }
+ ]
+ ```
+ - `annotations`: list of dictionaries containing the annotated place names. Each dictionary corresponds to a named entity mentioned in the text, with (at least) the following fields: `mention_pos` (order of the mention in the article), `mention` (the actual mention), `entity_type` (the type of named entity), `wkdt_qid` (the Wikidata ID of the resolved entity), `mention_start` (the character start position of the mention in the sentence), `mention_end` (the character end position of the mention in the sentence), `sent_pos` (the sentence index in which the mention is found). For example:
+ ```json
+ [
+ {
+ "mention_pos": 0,
+ "mention": "DUKINFIELD",
+ "entity_type": "LOC",
+ "wkdt_qid": "Q1976179",
+ "mention_start": 0,
+ "mention_end": 10,
+ "sent_pos": 1
+ },
+ {
+ "mention_pos": 1,
+ "mention": "Knutsford",
+ "entity_type": "LOC",
+ "wkdt_qid": "Q1470791",
+ "mention_start": 0,
+ "mention_end": 9,
+ "sent_pos": 2
+ },
+ {
+ "mention_pos": 2,
+ "mention": "Dukiafield",
+ "entity_type": "LOC",
+ "wkdt_qid": "Q1976179",
+ "mention_start": 104,
+ "mention_end": 114,
+ "sent_pos": 3
+ }
+ ]
+ ```
+ - `place_wqid`: A string with the Wikidata ID of the place of publication. For example, if `place` is London UK, then `place_wqid` should be `Q84`.
+
+Finally, the TSV contains a set of columns which can be used to indicate how to split the dataset into training (`train`), development (`dev`), testing (`test`), or documents to leave out (`left_out`). The Linker requires that the user specifies which column should be used for training the ED model. The code assumes the following columns:
+
+ - `originalsplit`: The articles maintain the `test` set of the original dataset. Train is split into `train` (0.66) and `dev` (0.33).
+ - `apply`: The articles are divided into `train` and `dev`, with no articles left for testing. This split can be used to train the final entity disambiguation model, after the experiments.
+ - `withouttest`: This split can be used for development. The articles in the test set of the original dataset are left out. The training set is split into `train`, `dev` and `test`.
+
+## Wikipedia- and Wikidata-based resources
+
+T-Res requires a series of Wikipedia- and Wikidata-based resources:
+
+- `mentions_to_wikidata.json`
+- `mentions_to_wikidata_normalized.json`
+- `wikidata_to_mentions_normalized.json`
+- `wikidata_gazetteer.csv`
+- `entity2class.txt`
+
+!!! title "Note"
+
+ These files can be generated using the [wiki2gaz](https://github.com/Living-with-machines/wiki2gaz) GitHub repository. For more information on how they are built, refer to the `wiki2gaz` documentation.
+
+T-Res assumes these files exist in the following default location:
+
+```
+T-Res/
+└── resources/
+ └── wikidata/
+ ├── entity2class.txt
+ ├── mentions_to_wikidata_normalized.json
+ ├── mentions_to_wikidata.json
+ ├── wikidata_gazetteer.csv
+ └── wikidata_to_mentions_normalized.json
+```
+
+The sections below describe the contents of the files, as well as their format, in case you prefer to provide your own resources (which should have the same format).
+
+### Resource file: `mentions_to_wikidata.json`
+
+A JSON file consisting of a python dictionary in which the key is a mention of a place in Wikipedia (by means of an anchor text) and the value is an inner dictionary, where the inner keys are the QIDs of all Wikidata entities that can be referred to by the mention in question, and the inner values are the absolute counts (i.e. the number of times such mention is used in Wikipedia to refer to this particular entity).
+
+You can load the dictionary, and access it, as follows:
+
+```python
+>>> import json
+>>> with open('mentions_to_wikidata.json', 'r') as f:
+... mentions_to_wikidata = json.load(f)
+...
+>>> mentions_to_wikidata["Wiltshire"]
+```
+
+In the example, the value assigned to the key "Wiltshire" is:
+
+```json
+{
+ "Q23183": 4457,
+ "Q55448990": 5,
+ "Q8023421": 1
+}
+```
+
+In the example, we see that the mention "Wiltshire" is assigned a mapping between key `Q23183` and value 4457. This means that, on Wikipedia, "Wiltshire" appears 4457 times to refer to entity [Q23183](https://www.wikidata.org/wiki/Q23183) (through the mapping between Wikidata entity `Q23183` and its [corresponding Wikipedia page](https://en.wikipedia.org/wiki/Wiltshire)).
+
+### Resource file: `mentions_to_wikidata_normalized.json`
+
+A JSON file containing the normalised version of the `mentions_to_wikidata.json` dictionary. For example, the value of the mention "Wiltshire" is now:
+
+```json
+{
+ "Q23183": 0.9767696690773614,
+ "Q55448990": 1.0,
+ "Q8023421": 0.03125
+}
+```
+
+Note that these scores do not add up to one, as they are normalised by entity, not by mention. They are a measure of how likely an entity is to be referred to by a mention. In the example, we see that entity `Q55448990` is always referred to as `Wiltshire`.
+
+### Resource file: `wikidata_to_mentions_normalized.json`
+
+A JSON file consisting of a python dictionary in which the key is a Wikidata QID and the value is an inner dictionary, in which the inner keys are the mentions used in Wikipedia to refer to such Wikidata entity, and the values are their relative frequencies.
+
+You can load the dictionary, and access it, as follows:
+
+```python
+>>> import json
+>>> with open('wikidata_to_mentions_normalized.json', 'r') as f:
+... wikidata_to_mentions_normalized = json.load(f)
+...
+>>> wikidata_to_mentions_normalized["Q23183"]
+```
+
+In this example, the value of entity [Q23183](https://www.wikidata.org/wiki/Q23183) is:
+
+```json
+{
+ "Wiltshire, England": 0.005478851632697786,
+ "Wilton": 0.00021915406530791147,
+ "Wiltshire": 0.9767696690773614,
+ "College": 0.00021915406530791147,
+ "Wiltshire Council": 0.0015340784571553803,
+ "West Wiltshire": 0.00021915406530791147,
+ "North Wiltshire": 0.00021915406530791147,
+ "Wilts": 0.0015340784571553803,
+ "County of Wilts": 0.0026298487836949377,
+ "County of Wiltshire": 0.010081087004163929,
+ "Wilts.": 0.00021915406530791147,
+ "Wiltshire county": 0.00021915406530791147,
+ "Wiltshire, United Kingdom": 0.00021915406530791147,
+ "Wiltshire plains": 0.00021915406530791147,
+ "Wiltshire England": 0.00021915406530791147
+}
+```
+
+In this example, we can see that entity `Q23183` is referred to as "Wiltshire, England" in Wikipedia 0.5% of the times and as "Wiltshire" 97.7% of the times. These values add up to one.
+
+### Resource file: `wikidata_gazetteer.csv`
+
+A CSV file consisting of (at least) the following four columns:
+
+- a Wikidata ID (QID) of a location,
+- its English label,
+- its latitude, and
+- its longitude.
+
+You can load the csv, and show the first five rows, as follows:
+
+```python
+>>> import pandas as pd
+>>> df = pd.read_csv("wikidata_gazetteer.csv")
+>>> df.head()
+ wikidata_id english_label latitude longitude
+0 Q5059107 Centennial 40.01140 -87.24330
+1 Q5059144 Centennial Grounds 39.99270 -75.19380
+2 Q5059153 Centennial High School 40.06170 -83.05780
+3 Q5059162 Centennial High School 38.30440 -104.63800
+4 Q5059178 Centennial Memorial Samsung Hall 37.58949 127.03434
+```
+
+Each row corresponds to a Wikidata geographic entity (i.e. a Wikidata entity with coordinates).
+
+!!! title "Note"
+
+ Note that the latitude and longitude are not used by the REL disambiguation method: they are only provided as a post-processing step when rendering the output of the linking. Therefore, the columns can have dummy values (of type `float`) if the user is not interested in obtaining the coordinates: the linking to Wikidata will not be affected. Column `english_label` can likewise be left empty.
+
+### Resource file: `entity2class.txt`
+
+A python dictionary in which each entity in Wikidata is mapped to its most common Wikidata class.
+
+You can load the dictionary, and access it, as follows:
+
+```python
+>>> with open('entity2class.txt', 'r') as f:
+... entity2class = json.load(f)
+...
+>>> entity2class["Q23183"]
+'Q180673'
+>>> entity2class["Q84"]
+'Q515'
+```
+
+For example, Wiltshire ([Q23183](https://www.wikidata.org/wiki/Q23183)) is mapped to [Q180673](https://www.wikidata.org/wiki/Q180673), i.e. "ceremonial county of England", whereas London ([Q84](https://www.wikidata.org/wiki/Q84)) is mapped to [Q515](https://www.wikidata.org/wiki/Q515), i.e. "city".
+
+!!! title "Note"
+
+ Note that the entity2class mapping is not used by the disambiguation method: the Wikidata class is only provided as a post-processing step when rendering the output of the linking. T-Res will complain if the file is not there, but values can be left empty if the user is not interested in obtaining the wikidata class of the predicted entity. The linking to Wikidata will not be affected.
+
+## Entity and word embeddings
+
+!!! title "Note"
+
+ Note that you will not need this if you use the `mostpopular` or `bydistance` disambiguation approach.
+
+In order to perform toponym linking and resolution using the REL-based approaches, T-Res requires a database of word2vec and wiki2vec embeddings.
+
+By default, T-Res expects a database file called `embeddings_database.db` with, at least, one table (`entity_embeddings`) with at least the following columns:
+
+- `word`: Either a lower-cased token (i.e. a word on Wikipedia) or a Wikidata QID preceded by `ENTITY/`. The database should also contain the following two wildcard tokens: `#ENTITY/UNK#` and `#WORD/UNK#`.
+- `emb`: The corresponding word or entity embedding.
+
+In our experiments, we derived the embeddings database from REL's shared resources.
+
+!!! title "Note"
+
+ We are working towards improving this step in the pipeline. Meanwhile, to generate the `embeddings_database.db`, please follow these steps:
+
+ - Make sure you have `wikidata_gazetteer.csv` in `./resources/wikidata/` (see [above](#wikipedia-and-wikidata-based-resources)).
+ - Generate a Wikipedia-to-Wikidata index, following [these instructions](https://github.com/jcklie/wikimapper#create-your-own-index), save it as: `./resources/wikipedia/index_enwiki-latest.db`.
+ - Run [this script](https://github.com/Living-with-machines/wiki2gaz/blob/main/download_and_merge_embeddings_databases.py) to create the embeddings database.
+
+You can load the file, and access a token embedding, as follows:
+
+```python
+>>> import sqlite3
+>>> from array import array
+>>> with sqlite3.connect("embeddings_database.db") as conn:
+... cursor = conn.cursor()
+... result = cursor.execute("SELECT emb FROM entity_embeddings WHERE word='lerwick'").fetchone()
+... result = result if result is None else array("f", result[0]).tolist()
+...
+>>> result
+[-0.3257000148296356, -0.00989999994635582, -0.13420000672340393, ...]
+```
+
+You can load the file, and access an entity embedding, as follows:
+
+```python
+>>> import sqlite3
+>>> from array import array
+>>> with sqlite3.connect("embeddings_database.db") as conn:
+... cursor = conn.cursor()
+... result = cursor.execute("SELECT emb FROM entity_embeddings WHERE word='ENTITY/Q84'").fetchone()
+... result = result if result is None else array("f", result[0]).tolist()
+...
+>>> result
+[-0.014700000174343586, 0.007899999618530273, -0.1808999925851822, ...]
+```
+
+T-Res expects the `embeddings_database.db` file to be stored as follows:
+
+```
+T-Res/
+└── resources/
+ └── rel_db/
+ └── embeddings_database.db
+```
+
+## DeezyMatch training set
+
+In order to train a DeezyMatch model, a training set consisting of positive and negative string pairs is required. We provide a dataset of positive and negative OCR variations, which can be used to train a DeezyMatch model, which can then be used to perform fuzzy string matching to find candidates for entity linking.
+
+!!! title "Note"
+
+ The DeezyMatch training set can be downloaded from the [British Library research repository](https://bl.iro.bl.uk/concern/datasets/12208b77-74d6-44b5-88f9-df04db881d63). This dataset is only necessary if you want to use the DeezyMatch approach to perform candidate selection. This is not needed if you use the `perfectmatch` or `partialmatch` ranking methods.
+
+T-Res assumes by default the DeezyMatch training set to be named `w2v_ocr_pairs.txt` and to be in the following location:
+
+```
+T-Res/
+└── resources/
+ └── deezymatch/
+ └── data/
+ └── w2v_ocr_pairs.txt
+```
+
+Optionally, T-Res also provides the option to generate a DeezyMatch training set from word2vec embeddings trained on digitised texts. Continue reading the sections below for more information about both types of resources.
+
+### 1. DeezyMatch training set
+
+T-Res can directly load the string pairs dataset required to train a new DeezyMatch model. By default, the code assumes the dataset to be called `w2v_ocr_pairs.txt`. The dataset consists of three columns: `word1`, `word2`, and a boolean describing whether `word2` is an OCR variation of `word1`. For example:
+
+```
+could might FALSE
+could wished FALSE
+could hardly FALSE
+could didnot FALSE
+could never FALSE
+could reusing FALSE
+could could TRUE
+could coeld TRUE
+could could TRUE
+could conld TRUE
+could could TRUE
+could couid TRUE
+```
+
+This dataset has been automatically generated from word2vec embeddings trained on digitised historical news texts (i.e. with OCR noise), and has been expanded with toponym alternate names extracted from Wikipedia.
+
+The dataset we provide consists of 1,085,514 string pairs.
+
+### 2. Word2Vec embeddings trained on noisy data
+
+!!! title "Note"
+
+ The 19thC word2vec embeddings **are not needed** if you already have the DeezyMatch training set `w2v_ocr_pairs.txt` (described in the [section above](#deezymatch-training-set)).
+
+To create a new DeezyMatch training set using T-Res, you need to provide Word2Vec models that have been trained on digitised historical news texts. In our experiments, we used the embeddings trained on a 4.2-billion-word corpus of 19th-century British newspapers using Word2Vec (you can download them from [Zenodo](https://doi.org/10.5281/zenodo.7887305)), but you can also do this with your own word2vec embeddings. The embeddings are divided into periods of ten years each. By default, T-Res assumes that the word2vec models are stored in `./resources/models/w2v/`, in directories named `w2v_xxxxs_news/`, where `xxxx` corresponds to the decade (e.g. 1800 or 1810) of the models.
+
+See the expected directory structure below:
+
+```
+T-Res/
+└── resources/
+ └── models/
+ └── w2v/
+ ├── w2v_1800_news/
+ │ ├── w2v.model
+ │ ├── w2v.model.syn1neg.npy
+ │ └── w2v.model.wv.vectors.npy
+ ├── w2v_1810_news/
+ │ ├── w2v.model
+ │ ├── w2v.model.syn1neg.npy
+ │ └── w2v.model.wv.vectors.npy
+ └── .../
+```
+
+## Summary of resources and directory structure
+
+In the code and our tutorials, we assume the following directory structure for the mentioned resources that are required in order to run the pipeline.
+
+```
+T-Res/
+├── t-res/
+│ ├── geoparser/
+│ └── utils/
+├── app/
+├── evaluation/
+├── examples/
+├── experiments/
+│ └── outputs/
+│ └── data/
+│ └── lwm/
+│ ├── linking_df_split.tsv [*?]
+│ ├── ner_fine_dev.json [*+?]
+│ └── ner_fine_train.json [*+?]
+├── resources/
+│ ├── deezymatch/
+│ │ └── data/
+│ │ └── w2v_ocr_pairs.txt [?]
+│ ├── models/
+│ ├── news_datasets/
+│ ├── rel_db/
+│ │ └── embeddings_database.db [*+?]
+│ └── wikidata/
+│ ├── entity2class.txt [*]
+│ ├── mentions_to_wikidata_normalized.json [*]
+│ ├── mentions_to_wikidata.json [*]
+│ ├── wikidata_gazetteer.csv [*]
+│ └── wikidata_to_mentions_normalized.json [*]
+└── tests/
+```
+
+A question mark (`?`) is used to indicate resources which are only required for some approaches (for example, the `rel_db/embeddings_database.db` file is only required by the REL-based disambiguation approaches). Note that an asterisk (`*`) next to the resource means that the path can be changed when instantiating the T-Res objects, and a plus sign (`+`) if the name of the file can be changed in the instantiation.
+
+
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..7f50278f
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,51 @@
+# T-Res: A Toponym Resolution Pipeline for Digitised Historical Newspapers
+
+[](https://github.com/Living-with-machines/T-Res/blob/master/LICENSE)
+
+T-Res is an end-to-end pipeline for toponym resolution for digitised historical newspapers. Given an input text, T-Res identifies the places that are mentioned in it, links them to their corresponding Wikidata IDs, and provides their geographic coordinates. T-Res has been designed to tackle common problems of working with digitised historical newspapers.
+
+The pipeline has three main components:
+
+1. **The Recogniser** performs named entity recognition.
+2. **The Ranker** performs candidate selection and ranking.
+3. **The Linker** performs entity linking and resolution.
+
+These three components are used in combination in the **Pipeline** class.
+
+We also provide the code to deploy T-Res as an HTTP API, and show how to use it. Each of these elements are described in this documentation.
+
+
+
+- :material-cog-outline:{ .lg .middle } __Installation & Setup__
+
+ ---
+
+ Install T-Res and get up and running
+
+ [:octicons-arrow-right-24: Getting started](getting-started/index.md)
+
+- :material-text-box-outline:{ .lg .middle } __Reference__
+
+ ---
+
+ Complete reference for the T-Res codebase
+
+ [:octicons-arrow-right-24: Reference](reference/index.md)
+
+- :material-swap-vertical:{ .lg .middle } __HTTP API__
+
+ ---
+
+ Deploy & use T-Res via an HTTP API
+
+ [:octicons-arrow-right-24: Customization](t-res-api/index.md)
+
+- :material-flask-outline:{ .lg .middle } __Experiments__
+
+ ---
+
+ Reproduce experimental benchmarks
+
+ [:octicons-arrow-right-24: Experiments](experiments/index.md)
+
+
\ No newline at end of file
diff --git a/docs/javascripts/extra.js b/docs/javascripts/extra.js
new file mode 100644
index 00000000..26aa5d21
--- /dev/null
+++ b/docs/javascripts/extra.js
@@ -0,0 +1,34 @@
+document.addEventListener("DOMContentLoaded", function() {
+ fixCopyOnlyUserSelectable();
+ });
+
+ function fixCopyOnlyUserSelectable() {
+ buttonsToFix = document.querySelectorAll(
+ '.language-console button.md-clipboard');
+ if (buttonsToFix.length)
+ console.log('Fixing copy-to-clipboard text of console code-blocks.');
+ buttonsToFix.forEach((btn) => {
+ var content = extractUserSelectable(btn.dataset.clipboardTarget);
+ btn.dataset.clipboardText = content;
+ });
+ }
+
+ function extractUserSelectable(selector) {
+ var result = '';
+ var element = document.querySelector(selector);
+ element.childNodes.forEach((child) => {
+ if (child instanceof Element) {
+ var s=window.getComputedStyle(child);
+ if (s.getPropertyValue('user-select') == 'none' ||
+ s.getPropertyValue('-webkit-user-select') == 'none' ||
+ s.getPropertyValue('-ms-user-select') == 'none')
+ {
+ return;
+ }
+ }
+ result += child.textContent;
+ });
+ // remove empty lines
+ result = result.replace(/^\s+|\s+$/g, '');
+ return result;
+ }
diff --git a/docs/javascripts/katex.js b/docs/javascripts/katex.js
new file mode 100644
index 00000000..103be6be
--- /dev/null
+++ b/docs/javascripts/katex.js
@@ -0,0 +1,10 @@
+document$.subscribe(({ body }) => {
+ renderMathInElement(body, {
+ delimiters: [
+ { left: "$$", right: "$$", display: true },
+ { left: "$", right: "$", display: false },
+ { left: "\\(", right: "\\)", display: false },
+ { left: "\\[", right: "\\]", display: true }
+ ],
+ })
+})
diff --git a/docs/reference/geoparser/index.md b/docs/reference/geoparser/index.md
new file mode 100644
index 00000000..032c2d0d
--- /dev/null
+++ b/docs/reference/geoparser/index.md
@@ -0,0 +1,10 @@
+# `geoparser` module
+
+
diff --git a/docs/reference/utils/ner_utils.md b/docs/reference/utils/ner_utils.md
new file mode 100644
index 00000000..8fe27cf4
--- /dev/null
+++ b/docs/reference/utils/ner_utils.md
@@ -0,0 +1,5 @@
+# NER Utils module
+
+::: t_res.utils.ner_utils
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/preprocess_data.md b/docs/reference/utils/preprocess_data.md
new file mode 100644
index 00000000..002a5fa0
--- /dev/null
+++ b/docs/reference/utils/preprocess_data.md
@@ -0,0 +1,5 @@
+# Preprocess Data module
+
+::: t_res.utils.preprocess_data
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/process_data.md b/docs/reference/utils/process_data.md
new file mode 100644
index 00000000..b5fd0e09
--- /dev/null
+++ b/docs/reference/utils/process_data.md
@@ -0,0 +1,5 @@
+# Process Data module
+
+::: t_res.utils.process_data
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/process_wikipedia.md b/docs/reference/utils/process_wikipedia.md
new file mode 100644
index 00000000..2c328742
--- /dev/null
+++ b/docs/reference/utils/process_wikipedia.md
@@ -0,0 +1,5 @@
+# Process Wikipedia module
+
+::: t_res.utils.process_wikipedia
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel/entity_disambiguation.md b/docs/reference/utils/rel/entity_disambiguation.md
new file mode 100644
index 00000000..2eaae71a
--- /dev/null
+++ b/docs/reference/utils/rel/entity_disambiguation.md
@@ -0,0 +1,5 @@
+# REL: Entity Disambiguation module
+
+::: t_res.utils.REL.entity_disambiguation
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel/index.md b/docs/reference/utils/rel/index.md
new file mode 100644
index 00000000..095b2c6c
--- /dev/null
+++ b/docs/reference/utils/rel/index.md
@@ -0,0 +1,23 @@
+# `utils.REL` module
+
+The scripts included in this module are taken and have been adapted from the [REL: Radboud Entity Linker](https://github.com/informagi/REL/) Github repository: Copyright (c) 2020 Johannes Michael van Hulst. See the [permission notice](https://github.com/informagi/REL/blob/main/LICENSE).
+
+ Reference:
+
+ @inproceedings{vanHulst:2020:REL,
+ author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
+ title = {REL: An Entity Linker Standing on the Shoulders of Giants},
+ booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
+ series = {SIGIR '20},
+ year = {2020},
+ publisher = {ACM}
+ }
+
+
diff --git a/docs/reference/utils/rel/mulrel_ranker.md b/docs/reference/utils/rel/mulrel_ranker.md
new file mode 100644
index 00000000..29724035
--- /dev/null
+++ b/docs/reference/utils/rel/mulrel_ranker.md
@@ -0,0 +1,5 @@
+# REL: MulRel Ranker module
+
+::: t_res.utils.REL.mulrel_ranker
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel/utils.md b/docs/reference/utils/rel/utils.md
new file mode 100644
index 00000000..4c8003bd
--- /dev/null
+++ b/docs/reference/utils/rel/utils.md
@@ -0,0 +1,5 @@
+# REL: Utils module
+
+::: t_res.utils.REL.utils
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel/vocabulary.md b/docs/reference/utils/rel/vocabulary.md
new file mode 100644
index 00000000..1f41a37f
--- /dev/null
+++ b/docs/reference/utils/rel/vocabulary.md
@@ -0,0 +1,5 @@
+# REL: Vocabulary module
+
+::: t_res.utils.REL.vocabulary
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel_e2e.md b/docs/reference/utils/rel_e2e.md
new file mode 100644
index 00000000..416aea15
--- /dev/null
+++ b/docs/reference/utils/rel_e2e.md
@@ -0,0 +1,5 @@
+# REL e2e module
+
+::: t_res.utils.rel_e2e
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/reference/utils/rel_utils.md b/docs/reference/utils/rel_utils.md
new file mode 100644
index 00000000..e360cfa7
--- /dev/null
+++ b/docs/reference/utils/rel_utils.md
@@ -0,0 +1,5 @@
+# REL Utils module
+
+::: t_res.utils.rel_utils
+ options:
+ members_order: source
\ No newline at end of file
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 00000000..482b091c
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,17 @@
+:root {
+ --md-primary-fg-color: #701F33;
+ --md-primary-fg-color--light: #701F33;
+ }
+.md-grid {
+ max-width: 87%;
+}
+.md-header__button.md-logo {
+ margin: 0;
+ /* padding: 0; */
+}
+/* prevent selection of prefix and output for console syntax */
+.language-console .gp, .language-console .go {
+ user-select: none;
+ -webkit-user-select: none; /* Chrome/Safari */
+ -ms-user-select: none; /* IE10+ */
+}
diff --git a/docs/t-res-api/index.md b/docs/t-res-api/index.md
new file mode 100644
index 00000000..0da06be4
--- /dev/null
+++ b/docs/t-res-api/index.md
@@ -0,0 +1,21 @@
+# The T-Res API
+
+T-Res can also be deployed as a [FastAPI](https://fastapi.tiangolo.com) via [Docker](https://www.docker.com), allowing remote users to access your T-Res pipeline instead of their own local installation.
+
+The API consists of the following files:
+
+- `app/app_template.py`
+- `app/configs/<config_name>.py`
+- `app/template.Dockerfile`
+- `docker-compose.yml`
+
+Example configuration files are provided in this repository, which can be adapted to fit your needs.
+
+
+
+- [Deploying the T-Res API](installation.md)
+- [Using the T-Res API](usage.md)
+
+
diff --git a/docs/t-res-api/installation.md b/docs/t-res-api/installation.md
new file mode 100644
index 00000000..b8502475
--- /dev/null
+++ b/docs/t-res-api/installation.md
@@ -0,0 +1,65 @@
+# Deploying the T-Res API
+
+The T-Res API can be deployed either as a standalone docker container, or via docker compose to deploy multiple configurations of the pipeline simultaneously behind a reverse-proxy ([traefik](https://traefik.io/traefik/)).
+
+Docker and Docker Compose should be installed on your server according to the [official installation guide](https://docs.docker.com/engine/install/ubuntu/) before proceeding with the following steps to build and deploy the containers.
+
+## 1. Building the container
+
+To build a docker image for the app using the default configuration provided (`t-res_deezy_reldisamb-wpubl-wmtops.py`), run the following bash commands from the root of the repository:
+
+``` bash
+export CONTAINER_NAME=t-res_deezy_reldisamb-wpubl-wmtops
+sudo -E docker build -f app/template.Dockerfile --no-cache --build-arg APP_NAME=${CONTAINER_NAME} -t ${CONTAINER_NAME}_image .
+```
+
+## 2. Deploying the container
+
+The docker image built in step 1 can then be deployed by running the following command, providing the required resources are available according to the [Resources and directory structure](../getting-started/resources.md) section.
+
+``` bash
+sudo docker run -p 80:80 -it \
+ -v ${HOME}/T-Res/resources/:/app/resources/ \
+ -v ${HOME}/T-Res/geoparser/:/app/geoparser/ \
+ -v ${HOME}/T-Res/utils/:/app/utils/ \
+ -v ${HOME}/T-Res/preprocessing/:/app/preprocessing/ \
+ -v ${HOME}/T-Res/experiments/:/app/experiments/ \
+ -v ${HOME}/T-Res/app/configs/:/app/configs/ \
+ ${CONTAINER_NAME}_image:latest
+```
+
+## 3. Deploying multiple containers via Docker Compose
+
+To deploy the example configuration behind a traefik load-balancing server:
+``` bash
+HOST_URL=<your-host-url> sudo -E docker-compose up -d
+```
+
+## 4. Configuring your deployment
+
+1. Add your T-Res pipeline configuration file to the `app/config` directory. This file should instantiate the `Recogniser`, `Linker`, and `Ranker` to be used in your pipeline and store them in a dictionary called `CONFIG`, which is then imported and used by the app.
+
+2. Optionally, you can add or edit endpoints or app behaviour in the `app/app_template.py` file
+
+3. Build your docker container as in step 1, setting the `CONTAINER_NAME` environment variable to your new configuration's name
+
+4. Add a section to the docker-compose.yml, updating the service name, image and labels as follows:
+
+    ```yaml
+    <container_name>:
+      image: <container_name>_image:latest
+      restart: always
+      expose:
+        - 80
+      volumes:
+        - ${HOME}/T-Res/resources/:/app/resources/
+        - ${HOME}/T-Res/geoparser/:/app/geoparser/
+        - ${HOME}/T-Res/utils/:/app/utils/
+        - ${HOME}/T-Res/preprocessing/:/app/preprocessing/
+        - ${HOME}/T-Res/experiments/:/app/experiments/
+      labels:
+        - traefik.enable=true
+        - traefik.http.services.<container_name>.loadbalancer.server.port=80
+        - traefik.http.routers.<container_name>_router.service=<container_name>
+        - traefik.http.routers.<container_name>_router.rule=Host(`<host_url>`, `0.0.0.0`) && PathPrefix(`/v2/t-res_<container_name>`)
+        - traefik.http.middlewares.test-stripprefix-rwop.stripprefix.prefixes=/v2/t-res
+        - traefik.http.routers.<container_name>_router.middlewares=test-stripprefix-rwop
+      command: ["poetry", "run", "uvicorn", "app:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80", "--root-path", "/v2/t-res_<container_name>"]
+    ```
diff --git a/docs/t-res-api/usage.md b/docs/t-res-api/usage.md
new file mode 100644
index 00000000..6283f9ec
--- /dev/null
+++ b/docs/t-res-api/usage.md
@@ -0,0 +1,13 @@
+# Using the T-Res API
+
+If you deploy the T-Res API according to the steps in the previous section, it should now be available on your server as an HTTP API (be sure to expose the correct ports - by default, the app is deployed to port 8000). Automatically generated, interactive documentation (created by `Swagger`) is available at the `/docs` endpoint.
+
+The following example shows how to query the API via curl to resolve the toponyms in a single sentence:
+
+``` bash
+curl -X GET http://20.0.184.45:8000/v2/t-res_deezy_reldisamb-wpubl-wmtops/toponym_resolution \
+-H "Content-Type: application/json" \
+-d '{"text": "Harvey, from London;Thomas and Elizabeth, Barnett.", "place": "Manchester", "place_wqid": "Q18125"}'
+```
+
+See the `app/api_usage.ipynb` notebook for more examples of how to use the API's various endpoints via Python.
diff --git a/examples/load_use_ner_model.ipynb b/examples/load_use_ner_model.ipynb
index c4be7f47..9deabacd 100644
--- a/examples/load_use_ner_model.ipynb
+++ b/examples/load_use_ner_model.ipynb
@@ -9,7 +9,7 @@
"\n",
"This notebook shows how to load an existing named entity recognition (NER) model from the HuggingFace hub, using T-Res.\n",
"\n",
- "We start by importing some libraries, and the `recogniser` script from the `geoparser` folder:"
+ "We start by importing the `ner` module from the `geoparser` folder:"
]
},
{
@@ -18,11 +18,7 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
- "\n",
- "\n",
- "from t_res.geoparser import recogniser"
+ "from t_res.geoparser import ner"
]
},
{
@@ -30,9 +26,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Create a `myner` object of the `Recogniser` class.\n",
+ "Create an instance of the `PretrainedRecogniser` class.\n",
"\n",
- "We only need to pass the path to the model in `model` and set `load_from_hub` to True, as follows:"
+ "We only need to pass the name of the model in `model`, as follows:"
]
},
{
@@ -41,9 +37,8 @@
"metadata": {},
"outputs": [],
"source": [
- "myner = recogniser.Recogniser(\n",
- " model=\"Livingwithmachines/toponym-19thC-en\",\n",
- " load_from_hub=True,\n",
+ "recogniser = ner.PretrainedRecogniser(\n",
+ " model_name=\"Livingwithmachines/toponym-19thC-en\",\n",
")"
]
},
@@ -52,7 +47,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Print the Recogniser (see that most fields are empty, because they are not needed):"
+ "Print the Recogniser:"
]
},
{
@@ -61,7 +56,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(myner)"
+ "print(recogniser)"
]
},
{
@@ -69,7 +64,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "If we try to train the model, nothing happens, because we're loading an existing model:"
+ "To use the model, you'll need to first load the NER pipeline:"
]
},
{
@@ -78,7 +73,7 @@
"metadata": {},
"outputs": [],
"source": [
- "myner.train()"
+ "recogniser.load()"
]
},
{
@@ -86,24 +81,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now, to use the model you want to use, you'll need to load it into a NER pipeline:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "myner.pipe = myner.create_pipeline()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And, finally, use the newly trained model to predict the named entities in a sentence."
+ "And, finally, use the pre-trained model to identify the named entities in a sentence.\n",
+ "\n",
+ "The output shows the toponymn mention `Sheffield`, its NER label `LOC`, the toponym's character span in the sentence and the NER confidence score:"
]
},
{
@@ -113,15 +93,14 @@
"outputs": [],
"source": [
"sentence = \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\"\n",
- "\n",
- "predictions = myner.ner_predict(sentence)\n",
- "print(predictions)"
+ "mentions = recogniser.run(sentence)\n",
+ "print(mentions)"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -135,7 +114,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/examples/run_pipeline_basic.ipynb b/examples/run_pipeline_basic.ipynb
index aa8aba10..340e9857 100644
--- a/examples/run_pipeline_basic.ipynb
+++ b/examples/run_pipeline_basic.ipynb
@@ -16,9 +16,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
- "\n",
"from t_res.geoparser import pipeline"
]
},
@@ -51,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The pipeline can take either a sentence (`run_sentence`) or a document (`run_text`). If the latter, the text is split into sentences using the `sentence-splitter` library. See an example of how to run each:"
+ "The pipeline can take on individual sentences or longer passages of text. If the latter, the text is split into sentences using the `sentence-splitter` library. In both cases, pass the text string to the `run` method on the `Pipeline`:"
]
},
{
@@ -60,8 +57,8 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_text(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "print(resolved)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
+ "print(predictions)"
]
},
{
@@ -70,8 +67,9 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "print(resolved)"
+ "text = \"A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.\"\n",
+ "predictions = geoparser.run(text)\n",
+ "print(predictions)"
]
},
{
@@ -85,7 +83,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Instead of using the end-to-end pipeline, the pipeline can be used step-wise.\n",
+ "Instead of using the end-to-end pipeline, the pipeline can be executed one step at a time.\n",
"\n",
"Therefore, it can be used to just perform toponym recognition (i.e. NER):"
]
@@ -96,15 +94,18 @@
"metadata": {},
"outputs": [],
"source": [
- "mentions = geoparser.run_text_recognition(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "print(mentions)"
+ "mentions = geoparser.run_text_recognition(text)\n",
+ "for m in mentions:\n",
+ " print(m)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The pipeline can then be used to just perform candidate selection given the output of NER:"
+ "The pipeline can then be used to just perform candidate selection given the output of NER. \n",
+ "\n",
+ "Each candidate is a string match (with string similarity score), plus a list of potential links in Wikidata given by their Wikidata IDs:"
]
},
{
@@ -114,14 +115,17 @@
"outputs": [],
"source": [
"candidates = geoparser.run_candidate_selection(mentions)\n",
- "print(candidates)"
+ "for c in candidates.candidates():\n",
+ " print(c)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "And finally, the pipeline can be used to perform entity disambiguation, given the output from the previous two steps:"
+ "And finally, the pipeline can be used to perform entity disambiguation, given the output from the previous two steps. \n",
+ "\n",
+ "This produces predictions, each with a disambiguation confidence score:"
]
},
{
@@ -130,17 +134,10 @@
"metadata": {},
"outputs": [],
"source": [
- "disamb_output = geoparser.run_disambiguation(mentions, candidates)\n",
- "print(disamb_output)"
+ "predictions = geoparser.run_disambiguation(candidates)\n",
+ "print(predictions)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": null,
@@ -151,7 +148,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -165,7 +162,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/examples/run_pipeline_deezy_mostpopular.ipynb b/examples/run_pipeline_deezy_mostpopular.ipynb
index 9129b8b3..17e18760 100644
--- a/examples/run_pipeline_deezy_mostpopular.ipynb
+++ b/examples/run_pipeline_deezy_mostpopular.ipynb
@@ -14,8 +14,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"from pathlib import Path\n",
"\n",
"from t_res.geoparser import pipeline, ranking, linking"
@@ -29,8 +27,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" strvar_parameters={\n",
" # Parameters to create the string pair dataset:\n",
@@ -66,8 +63,7 @@
"metadata": {},
"outputs": [],
"source": [
- "mylinker = linking.Linker(\n",
- " method=\"mostpopular\",\n",
+ "linker = linking.MostPopularLinker(\n",
" resources_path=\"../resources/\",\n",
")"
]
@@ -78,7 +74,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -87,22 +83,14 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
+ "print(predictions)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -116,7 +104,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb b/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb
index 8f89e400..3fb7621a 100644
--- a/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb
+++ b/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb
@@ -16,8 +16,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"import sqlite3\n",
"from pathlib import Path\n",
"\n",
@@ -32,8 +30,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" strvar_parameters=dict(),\n",
" deezy_parameters={\n",
@@ -62,12 +59,12 @@
"source": [
"with sqlite3.connect(\"../resources/rel_db/embeddings_database.db\") as conn:\n",
" cursor = conn.cursor()\n",
- " mylinker = linking.Linker(\n",
- " method=\"reldisamb\",\n",
+ " linker = linking.RelDisambLinker(\n",
" resources_path=\"../resources/\",\n",
+ " ranker=ranker,\n",
" rel_params={\n",
" \"model_path\": \"../resources/models/disambiguation/\",\n",
- " \"data_path\": \"../experiments/outputs/data/lwm/\",\n",
+ " \"data_path\": \"../tests/sample_files/experiments/outputs/data/lwm/\",\n",
" \"training_split\": \"originalsplit\",\n",
" \"db_embeddings\": cursor,\n",
" \"with_publication\": False,\n",
@@ -86,7 +83,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -95,10 +92,8 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_text(\"A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!\")\n",
- " \n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!\")\n",
+ "print(predictions)"
]
},
{
@@ -107,15 +102,14 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
+ "print(predictions)"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
diff --git a/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb b/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb
index 6e74593f..c77da601 100644
--- a/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb
+++ b/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb
@@ -16,8 +16,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"import sqlite3\n",
"from pathlib import Path\n",
"\n",
@@ -32,8 +30,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" strvar_parameters={\n",
" # Parameters to create the string pair dataset:\n",
@@ -71,9 +68,9 @@
"source": [
"with sqlite3.connect(\"../resources/rel_db/embeddings_database.db\") as conn:\n",
" cursor = conn.cursor()\n",
- " mylinker = linking.Linker(\n",
- " method=\"reldisamb\",\n",
+ " linker = linking.RelDisambLinker(\n",
" resources_path=\"../resources/\",\n",
+ " ranker=ranker,\n",
" rel_params={\n",
" \"model_path\": \"../resources/models/disambiguation/\",\n",
" \"data_path\": \"../experiments/outputs/data/lwm/\",\n",
@@ -95,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -104,13 +101,12 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\n",
+ "predictions = geoparser.run(\n",
" \"A remarkable case of rattening has just occurred in the building trade at Newtown. It started in Fairfield.\", \n",
- " place=\"Connecticut\", \n",
- " place_wqid=\"Q779\"\n",
+ " place_of_pub=\"Connecticut\", \n",
+ " place_of_pub_wqid=\"Q779\"\n",
")\n",
- "for r in resolved:\n",
- " print(r)"
+ "print(predictions)"
]
},
{
@@ -119,19 +115,18 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\n",
+ "predictions = geoparser.run(\n",
" \"A remarkable case of rattening has just occurred in the building trade at Newtown. It started in Powys.\", \n",
- " place=\"Powys\", \n",
- " place_wqid=\"Q156150\"\n",
+ " place_of_pub=\"Powys\", \n",
+ " place_of_pub_wqid=\"Q156150\"\n",
")\n",
- "for r in resolved:\n",
- " print(r)"
+ "print(predictions)"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
diff --git a/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb b/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb
index 688a81de..202725dd 100644
--- a/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb
+++ b/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb
@@ -16,8 +16,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"import sqlite3\n",
"from pathlib import Path\n",
"\n",
@@ -32,8 +30,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" strvar_parameters=dict(),\n",
" deezy_parameters={\n",
@@ -62,9 +59,9 @@
"source": [
"with sqlite3.connect(\"../resources/rel_db/embeddings_database.db\") as conn:\n",
" cursor = conn.cursor()\n",
- " mylinker = linking.Linker(\n",
- " method=\"reldisamb\",\n",
+ " linker = linking.RelDisambLinker(\n",
" resources_path=\"../resources/\",\n",
+ " ranker=ranker,\n",
" rel_params={\n",
" \"model_path\": \"../resources/models/disambiguation/\",\n",
" \"data_path\": \"../experiments/outputs/data/lwm/\",\n",
@@ -86,7 +83,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -95,14 +92,12 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_text(\n",
+ "predictions = geoparser.run(\n",
" \"A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!\", \n",
- " place=\"Manchester\", \n",
- " place_wqid=\"Q18125\"\n",
+ " place_of_pub=\"Manchester\", \n",
+ " place_of_pub_wqid=\"Q18125\"\n",
")\n",
- " \n",
- "for r in resolved:\n",
- " print(r)"
+ "print(predictions)"
]
},
{
@@ -111,19 +106,25 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\n",
+ "predictions = geoparser.run(\n",
" \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\", \n",
- " place=\"Manchester\", \n",
- " place_wqid=\"Q18125\"\n",
+ " place_of_pub=\"Manchester\", \n",
+ " place_of_pub_wqid=\"Q18125\"\n",
")\n",
- "for r in resolved:\n",
- " print(r)"
+ "print(predictions)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
diff --git a/examples/run_pipeline_deezy_reldisamb.ipynb b/examples/run_pipeline_deezy_reldisamb.ipynb
index 445c1a7d..80601995 100644
--- a/examples/run_pipeline_deezy_reldisamb.ipynb
+++ b/examples/run_pipeline_deezy_reldisamb.ipynb
@@ -16,8 +16,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"import sqlite3\n",
"from pathlib import Path\n",
"\n",
@@ -32,8 +30,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" mentions_to_wikidata=dict(),\n",
" wikidata_to_mentions=dict(),\n",
@@ -74,13 +71,13 @@
"source": [
"with sqlite3.connect(\"../resources/rel_db/embeddings_database.db\") as conn:\n",
" cursor = conn.cursor()\n",
- " mylinker = linking.Linker(\n",
- " method=\"reldisamb\",\n",
+ " linker = linking.RelDisambLinker(\n",
" resources_path=\"../resources/\",\n",
+ " ranker=ranker,\n",
" linking_resources=dict(),\n",
" rel_params={\n",
" \"model_path\": \"../resources/models/disambiguation/\",\n",
- " \"data_path\": \"../experiments/outputs/data/lwm/\",\n",
+ " \"data_path\": \"../tests/sample_files/experiments/outputs/data/lwm/\",\n",
" \"training_split\": \"originalsplit\",\n",
" \"context_length\": 100,\n",
" \"db_embeddings\": cursor,\n",
@@ -100,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -109,10 +106,8 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_text(\"A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!\")\n",
- " \n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!\")\n",
+ "print(predictions)"
]
},
{
@@ -121,9 +116,8 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
+ "print(predictions)"
]
},
{
@@ -136,7 +130,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -150,7 +144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/examples/run_pipeline_modular.ipynb b/examples/run_pipeline_modular.ipynb
index 40e5aac1..aa7b545b 100644
--- a/examples/run_pipeline_modular.ipynb
+++ b/examples/run_pipeline_modular.ipynb
@@ -6,8 +6,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
"import sqlite3\n",
"from pathlib import Path\n",
"\n",
@@ -22,8 +20,7 @@
"source": [
"# --------------------------------------\n",
"# Instantiate the ranker:\n",
- "myranker = ranking.Ranker(\n",
- " method=\"deezymatch\",\n",
+ "ranker = ranking.DeezyMatchRanker(\n",
" resources_path=\"../resources/\",\n",
" strvar_parameters={\n",
" # Parameters to create the string pair dataset:\n",
@@ -61,9 +58,9 @@
"source": [
"with sqlite3.connect(\"../resources/rel_db/embeddings_database.db\") as conn:\n",
" cursor = conn.cursor()\n",
- " mylinker = linking.Linker(\n",
- " method=\"reldisamb\",\n",
+ " linker = linking.RelDisambLinker(\n",
" resources_path=\"../resources/\",\n",
+ " ranker=ranker,\n",
" rel_params={\n",
" \"model_path\": \"../resources/models/disambiguation/\",\n",
" \"data_path\": \"../experiments/outputs/data/lwm/\",\n",
@@ -85,7 +82,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -102,11 +99,11 @@
"metadata": {},
"outputs": [],
"source": [
- "output = geoparser.run_text_recognition(\n",
+ "mentions = geoparser.run_text_recognition(\n",
" \"A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop! Maybe in Lancaster actually.\", \n",
- " place=\"Manchester\", \n",
- " place_wqid=\"Q18125\"\n",
- " )"
+ " )\n",
+ "for m in mentions:\n",
+ " print(m)"
]
},
{
@@ -115,7 +112,12 @@
"metadata": {},
"outputs": [],
"source": [
- "cands = geoparser.run_candidate_selection(output)"
+ "candidates = geoparser.run_candidate_selection(\n",
+ " mentions,\n",
+ " place_of_pub=\"Manchester\", \n",
+ " place_of_pub_wqid=\"Q18125\",\n",
+ " )\n",
+ "print(candidates)"
]
},
{
@@ -124,29 +126,14 @@
"metadata": {},
"outputs": [],
"source": [
- "output_disamb = geoparser.run_disambiguation(output, cands)"
+ "predictions = geoparser.run_disambiguation(candidates)\n",
+ "print(predictions)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "output_disamb"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -160,7 +147,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/examples/run_pipeline_perfect_mostpopular.ipynb b/examples/run_pipeline_perfect_mostpopular.ipynb
index 4a11aa63..b7da2855 100644
--- a/examples/run_pipeline_perfect_mostpopular.ipynb
+++ b/examples/run_pipeline_perfect_mostpopular.ipynb
@@ -14,9 +14,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import sys\n",
- "\n",
"from t_res.geoparser import pipeline, ranking, linking"
]
},
@@ -26,10 +23,9 @@
"metadata": {},
"outputs": [],
"source": [
- "myranker = ranking.Ranker(\n",
- " method=\"perfectmatch\",\n",
+ "ranker = ranking.PerfectMatchRanker(\n",
" resources_path=\"../resources/\",\n",
- ")\n"
+ ")"
]
},
{
@@ -38,8 +34,7 @@
"metadata": {},
"outputs": [],
"source": [
- "mylinker = linking.Linker(\n",
- " method=\"mostpopular\",\n",
+ "linker = linking.MostPopularLinker(\n",
" resources_path=\"../resources/\",\n",
")"
]
@@ -50,7 +45,7 @@
"metadata": {},
"outputs": [],
"source": [
- "geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker)"
+ "geoparser = pipeline.Pipeline(ranker=ranker, linker=linker)"
]
},
{
@@ -59,22 +54,14 @@
"metadata": {},
"outputs": [],
"source": [
- "resolved = geoparser.run_sentence(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
- "for r in resolved:\n",
- " print(r)"
+ "predictions = geoparser.run(\"A remarkable case of rattening has just occurred in the building trade at Sheffield.\")\n",
+ "print(predictions)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "resolution-cNmUJBkC-py3.9",
+ "display_name": "t-res-LL5RBBZ3-py3.9",
"language": "python",
"name": "python3"
},
@@ -88,7 +75,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.7"
},
"orig_nbformat": 4
},
diff --git a/experiments/experiment.py b/experiments/experiment.py
index 1ab1412d..704362f6 100644
--- a/experiments/experiment.py
+++ b/experiments/experiment.py
@@ -2,13 +2,14 @@
import os
import sys
from pathlib import Path
-from typing import Literal, Optional
+from typing import Literal, Optional, List
import pandas as pd
from tqdm import tqdm
-from t_res.geoparser import linking, ranking, recogniser
+from t_res.geoparser import ner, ranking, linking
from t_res.utils import process_data, rel_utils
+from t_res.utils.dataclasses import SentenceMentions, SentenceCandidates, Mention
class Experiment:
@@ -25,10 +26,10 @@ class Experiment:
be stored. If it does not exist, it will be created.
dataset_df (pandas.DataFrame): The dataframe representing the
resulting, preprocessed, dataset.
- myner (recogniser.Recogniser): An instance of the NER model to use.
- myranker (ranking.Ranker): An instance of the candidate ranking model
+ ner (recogniser.Recogniser): An instance of the NER model to use.
+ ranker (ranking.Ranker): An instance of the candidate ranking model
to use.
- mylinker (linking.Linker): An instance of the linking model to use.
+ linker (linking.Linker): An instance of the linking model to use.
overwrite_processing (bool, optional): Whether to overwrite the
processed data if it already exists (default is ``True``).
processed_data (dict, optional): A dictionary to store the processed
@@ -48,9 +49,9 @@ def __init__(
data_path: str,
results_path: str,
dataset_df: pd.DataFrame,
- myner: recogniser.Recogniser,
- myranker: ranking.Ranker,
- mylinker: linking.Linker,
+ recogniser: ner.Recogniser,
+ ranker: ranking.Ranker,
+ linker: linking.Linker,
overwrite_processing: Optional[bool] = True,
processed_data: Optional[dict] = dict(),
test_split: Optional[str] = "",
@@ -63,9 +64,9 @@ def __init__(
self.dataset = dataset
self.data_path = data_path
self.results_path = results_path
- self.myner = myner
- self.myranker = myranker
- self.mylinker = mylinker
+ self.recogniser = recogniser
+ self.ranker = ranker
+ self.linker = linker
self.overwrite_processing = overwrite_processing
self.dataset_df = dataset_df
self.processed_data = processed_data
@@ -119,14 +120,14 @@ def load_data(self) -> dict:
dict: A dictionary where the processed data is stored.
"""
- output_path = os.path.join(self.data_path, self.dataset, self.myner.model)
+ output_path = os.path.join(self.data_path, self.dataset, self.recogniser.model_name)
# Add the candidate experiment info to the path:
- cand_approach = self.myranker.method
- if self.myranker.method == "deezymatch":
- cand_approach += "+" + str(self.myranker.deezy_parameters["num_candidates"])
+ cand_approach = self.ranker.method_name
+ if self.ranker.method_name == "deezymatch":
+ cand_approach += "+" + str(self.ranker.deezy_parameters["num_candidates"])
cand_approach += "+" + str(
- self.myranker.deezy_parameters["selection_threshold"]
+ self.ranker.deezy_parameters["selection_threshold"]
)
output_processed_data = dict()
@@ -168,19 +169,6 @@ def prepare_data(self) -> dict:
# ----------------------------------
# Coherence checks:
- # Some scenarios do not make sense. Warn and exit:
- if self.myranker.method not in [
- "perfectmatch",
- "partialmatch",
- "levenshtein",
- "deezymatch",
- ]:
- print(
- "\n!!! Coherence check failed. "
- "This is because the candidate ranking method does not exist.\n"
- )
- sys.exit(0)
-
# ----------------------------------
# If data is processed and overwrite is set to False, then do nothing,
# otherwise process the data.
@@ -201,7 +189,7 @@ def prepare_data(self) -> dict:
# Parse with NER in the LwM way
print("\nPerform NER with our model:")
output_lwm_ner = process_data.ner_and_process(
- dSentences, dAnnotated, self.myner
+ dSentences, dAnnotated, self.recogniser
)
dPreds = output_lwm_ner[0]
@@ -223,10 +211,7 @@ def prepare_data(self) -> dict:
# Obtain candidates per sentence:
for sentence_id in tqdm(dMentionsPred):
pred_mentions_sent = dMentionsPred[sentence_id]
- (
- wk_cands,
- self.myranker.already_collected_cands,
- ) = self.myranker.find_candidates(pred_mentions_sent)
+ wk_cands = self.find_candidates(pred_mentions_sent)
dCandidates[sentence_id] = wk_cands
# -------------------------------------------
@@ -249,6 +234,71 @@ def prepare_data(self) -> dict:
return self.processed_data
+ # Method retained from a previous version of the Ranker, for backwards
+ # compatibility (specifically, to support the prepare_data method).
+ def find_candidates(self, mentions: List[dict]) -> dict:
+ """
+ Find candidates for the given mentions using the selected ranking
+ method.
+
+ Arguments:
+ mentions (list): A list of predicted mentions as dictionaries.
+
+ Returns:
+ dict: A dictionary that maps each original mention to a
+ sub-dictionary, where the sub-dictionary maps the mention
+ variations to a sub-sub-dictionary with two keys: ``"Score"``
+ (the string matching similarity score) and ``"Candidates"``
+ (a dictionary containing the Wikidata candidates, where the
+ key is the Wikidata ID and value is the the relative mention-
+ to-wikidata frequency).
+
+ The variation is found by the candidate ranker in the knowledge
+ base, and for each variation, the candidate ranking score and
+ the candidates from Wikidata are provided. E.g. for mention
+ "Guadaloupe" in sentence "sn83030483-1790-03-31-a-i0004_1", the
+ candidates will show as follows:
+
+ .. code-block:: json
+
+ {
+ "Guadaloupe": {
+ "Score": 1.0,
+ "Candidates": {
+ "Q17012": 0.003935458480913026,
+ "Q3153836": 0.07407407407407407
+ }
+ }
+ }
+
+ Note:
+ This method takes a list of mentions and finds candidates for each
+ mention using the selected Ranker instance.
+
+ The method returns a dictionary that maps each original mention to
+ a sub-dictionary containing the mention variations as keys and
+ their corresponding Wikidata match scores as values.
+ """
+ # Extract the mentions
+ mentions = [Mention.from_dict(d) for d in mentions]
+
+ # Pass the mentions to the Ranker run method.
+ cands = self.ranker.run(mentions)
+
+ # Get Wikidata candidates
+ wk_cands = dict()
+ for cand in cands:
+ wk_cands[cand.mention.mention] = dict()
+ for match in cand.matches:
+ found_cands = self.ranker.mentions_to_wikidata.get(match.variation, dict())
+ if found_cands:
+ wk_cands[cand.mention.mention][cand.mention.mention] = {
+ "Score": match.string_similarity,
+ "Candidates": found_cands,
+ }
+
+ return wk_cands
+
def store_processed_data(
self,
preds: dict,
@@ -295,14 +345,14 @@ def store_processed_data(
"""
data_path = self.data_path
dataset = self.dataset
- model_name = self.myner.model
+ model_name = self.recogniser.model_name
output_path = data_path + dataset + "/" + model_name
- cand_approach = self.myranker.method
- if self.myranker.method == "deezymatch":
- cand_approach += "+" + str(self.myranker.deezy_parameters["num_candidates"])
+ cand_approach = self.ranker.method_name
+ if self.ranker.method_name == "deezymatch":
+ cand_approach += "+" + str(self.ranker.deezy_parameters["num_candidates"])
cand_approach += "+" + str(
- self.myranker.deezy_parameters["selection_threshold"]
+ self.ranker.deezy_parameters["selection_threshold"]
)
# Store NER predictions using a specific NER model:
@@ -374,11 +424,11 @@ def create_mentions_df(self) -> pd.DataFrame:
dMetadata = self.processed_data["dMetadata"]
dCandidates = self.processed_data["dCandidates"]
- cand_approach = self.myranker.method
- if self.myranker.method == "deezymatch":
- cand_approach += "+" + str(self.myranker.deezy_parameters["num_candidates"])
+ cand_approach = self.ranker.method_name
+ if self.ranker.method_name == "deezymatch":
+ cand_approach += "+" + str(self.ranker.deezy_parameters["num_candidates"])
cand_approach += "+" + str(
- self.myranker.deezy_parameters["selection_threshold"]
+ self.ranker.deezy_parameters["selection_threshold"]
)
rows = []
@@ -466,9 +516,9 @@ def create_mentions_df(self) -> pd.DataFrame:
data=rows,
)
- print(f"Saving to {os.path.join(self.data_path,self.dataset,f'{self.myner.model}_{cand_approach}')}")
+ print(f"Saving to {os.path.join(self.data_path,self.dataset,f'{self.recogniser.model_name}_{cand_approach}')}")
output_path = (
- os.path.join(self.data_path,self.dataset,f"{self.myner.model}_{cand_approach}")
+ os.path.join(self.data_path,self.dataset,f"{self.recogniser.model_name}_{cand_approach}")
)
@@ -540,7 +590,7 @@ def store_results(
scenario_name = ""
if task == "ner":
- scenario_name += task + "_" + self.myner.model + "_"
+ scenario_name += task + "_" + self.recogniser.model_name + "_"
# Store predictions results formatted for CLEF-HIPE scorer:
preds_name = "preds"
@@ -560,24 +610,24 @@ def store_results(
)
if task == "linking":
- scenario_name += task + "_" + self.myner.model + "_"
- cand_approach = self.myranker.method
- if self.myranker.method == "deezymatch":
+ scenario_name += task + "_" + self.recogniser.model_name + "_"
+ cand_approach = self.ranker.method_name
+ if self.ranker.method_name == "deezymatch":
cand_approach += "+" + str(
- self.myranker.deezy_parameters["num_candidates"]
+ self.ranker.deezy_parameters["num_candidates"]
)
cand_approach += "+" + str(
- self.myranker.deezy_parameters["selection_threshold"]
+ self.ranker.deezy_parameters["selection_threshold"]
)
scenario_name += cand_approach + "_" + how_split + "_"
- link_approach = self.mylinker.method
- if self.mylinker.method == "reldisamb":
- if self.mylinker.rel_params["with_publication"]:
+ link_approach = self.linker.method_name
+ if self.linker.method_name == "reldisamb":
+ if self.linker.rel_params["with_publication"]:
link_approach += "+wpubl"
- if self.mylinker.rel_params["without_microtoponyms"]:
+ if self.linker.rel_params["without_microtoponyms"]:
link_approach += "+wmtops"
- if self.mylinker.rel_params["do_test"]:
+ if self.linker.rel_params["do_test"]:
link_approach += "_test"
# Store predictions results formatted for CLEF-HIPE scorer:
@@ -672,11 +722,6 @@ def linking_experiments(self) -> None:
# Get ids of articles in each split:
test_article_ids = list(test_original.article_id.astype(str))
- # Train a linking model if needed (it requires myranker to generate potential
- # candidates to the training set):
- print("Train EL model using:", split)
- linking_model = self.mylinker.train_load_model(self.myranker, split=split)
-
# Dictionary of sentences:
# {k1 : {k2 : v}}, where k1 is article id, k2 is
# sentence pos, and v is the sentence text.
@@ -722,9 +767,9 @@ def linking_experiments(self) -> None:
prediction["sentence"] = mention_data["sentence"]
prediction["place"] = mention_data["place"]
prediction["place_wqid"] = mention_data["place_wqid"]
- if self.mylinker.method == "reldisamb":
+ if self.linker.method_name == "reldisamb":
if (
- self.mylinker.rel_params["without_microtoponyms"]
+ self.linker.rel_params["without_microtoponyms"]
and mention_data["pred_ner_label"] != "LOC"
):
prediction["candidates"] = dict()
@@ -734,20 +779,30 @@ def linking_experiments(self) -> None:
mentions_dataset[sentence_id] = [prediction]
all_cands.update({prediction["mention"]: prediction["candidates"]})
- if self.mylinker.method == "reldisamb":
+ if self.linker.method_name == "reldisamb":
rel_resolved = dict()
for sentence_id in mentions_dataset:
article_dataset = {sentence_id: mentions_dataset[sentence_id]}
+
+ dict_mentions = [{"candidates": wk, "place_wqid": None} for wk in all_cands.values()]
+ all_cands = {d["candidates"].mention : self.linker.run(d) for d in dict_mentions}
+
article_dataset = rel_utils.rank_candidates(
article_dataset,
all_cands,
- self.mylinker.linking_resources["mentions_to_wikidata"],
+ # self.linker.linking_resources["mentions_to_wikidata"],
)
- if self.mylinker.rel_params["with_publication"]:
+ if self.linker.rel_params["with_publication"]:
# If "publ", add an artificial publication entry:
article_dataset = rel_utils.add_publication(article_dataset)
+
+ # Train a linking model if needed (it requires ranker to generate potential
+ # candidates to the training set):
+ print("Train EL model using:", split)
+ linking_model = self.linker.train_load_model(self.ranker, split=split)
+
predicted = linking_model.predict(article_dataset)
- if self.mylinker.rel_params["with_publication"]:
+ if self.linker.rel_params["with_publication"]:
# ... and if "publ", now remove the artificial publication entry!
predicted[sentence_id].pop()
for i in range(len(predicted[sentence_id])):
@@ -774,21 +829,30 @@ def linking_experiments(self) -> None:
):
prediction = mention
- if self.mylinker.method in ["mostpopular", "bydistance"]:
- # Run entity linking per mention:
- selected_cand = self.mylinker.run(
- {
- "candidates": prediction["candidates"],
- "place_wqid": prediction["place_wqid"],
- }
- )
- prediction["prediction"] = selected_cand[0]
- prediction["ed_score"] = round(selected_cand[1], 3)
+ if self.linker.method_name in ["mostpopular", "bydistance"]:
+
+ # Convert `prediction` dictionary into a Predictions instance.
+ sentence_mentions = SentenceMentions.from_list([prediction])
+
+ if len(sentence_mentions.mentions) != 1:
+ raise Exception("Expected precisely one mention.")
+
+ matches = self.ranker.run([sentence_mentions.mentions[0]])
+ candidates = self.linker.run(matches[0], prediction["place_wqid"], prediction["place"])
+ sentence_candidates = SentenceCandidates(sentence_mentions.sentence, [candidates])
+ pred = self.linker.disambiguate([sentence_candidates])
+
+ if pred.candidates() and pred.candidates()[0].best_match():
+ prediction["ed_score"] = round(pred.candidates()[0].best_match().best_disambiguation_score(), 3)
+ prediction["prediction"] = pred.candidates()[0].best_wqid()
+ else:
+ prediction["ed_score"] = None
+ prediction["prediction"] = None
to_append.append(
[
prediction["prediction"],
- round(prediction["ed_score"], 3),
+ prediction["ed_score"]
]
)
diff --git a/experiments/toponym_resolution.py b/experiments/toponym_resolution.py
index dee4af56..feb06ef8 100644
--- a/experiments/toponym_resolution.py
+++ b/experiments/toponym_resolution.py
@@ -7,7 +7,7 @@
import experiment
import pandas as pd
-from t_res.geoparser import linking, ranking, recogniser
+from t_res.geoparser import ner, ranking, linking
parser = ArgumentParser()
parser.add_argument(
@@ -64,8 +64,8 @@
# --------------------------------------
# Instantiate the recogniser:
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-" + granularity,
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-" + granularity,
train_dataset=str(current_dir)
+ "/outputs/data/lwm/ner_"
+ granularity
@@ -92,42 +92,17 @@
}, # Training arguments: you can change them. These are selected based on: https://github.com/dbmdz/clef-hipe/tree/main/experiments/clef-hipe-2022#topres19th
overwrite_training=False, # Set to True if you want to overwrite an existing model with the same name.
do_test=False, # Set to True if you want to perform the training on test mode (the string "_test" will be appended to your model name).
- load_from_hub=False, # Whether the model should be loaded from the HuggingFace hub
)
# --------------------------------------
# Instantiate the ranker:
- myranker = ranking.Ranker(
- method=cand_select_method,
- resources_path=resources_dir,
- mentions_to_wikidata=dict(),
- wikidata_to_mentions=dict(),
- strvar_parameters={
- # Parameters to create the string pair dataset:
- "ocr_threshold": 60,
- "top_threshold": 85,
- "min_len": 5,
- "max_len": 15,
- "w2v_ocr_path": os.path.join(resources_dir, "models/w2v/"),
- "w2v_ocr_model": "w2v_*_news",
- "overwrite_dataset": False,
- },
- deezy_parameters={
- # Paths and filenames of DeezyMatch models and data:
- "dm_path": os.path.join(resources_dir, "deezymatch/"),
- "dm_cands": "wkdtalts",
- "dm_model": "w2v_ocr",
- "dm_output": "deezymatch_on_the_fly",
- # Ranking measures:
- "ranking_metric": "faiss",
- "selection_threshold": 50,
- "num_candidates": 1,
- "verbose": False,
- # DeezyMatch training:
- "overwrite_training": False,
- "do_test": False,
- },
- )
+ kwargs = {
+ 'method_name': cand_select_method,
+ 'resources_path': resources_dir
+ }
+ # If deezymatch ranking is selected, use the default parameters,
+ # so no `strvar_parameters` or `deezy_parameters` are needed in the kwargs.
+ ranker = ranking.Ranker.new(**kwargs)
# --------------------------------------
# Instantiate the linker:
@@ -135,23 +110,27 @@
os.path.join(resources_dir, "rel_db/embeddings_database.db")
) as conn:
cursor = conn.cursor()
- mylinker = linking.Linker(
- method=top_res_method,
- resources_path=resources_dir,
- linking_resources=dict(),
- rel_params={
- "model_path": os.path.join(resources_dir, "models/disambiguation/"),
- "data_path": os.path.join(current_dir, "outputs/data/lwm/"),
- "training_split": "",
- "db_embeddings": cursor,
- "with_publication": wpubl,
- "without_microtoponyms": wmtops,
- "do_test": False,
- "default_publname": "",
- "default_publwqid": "",
- },
- overwrite_training=False,
- )
+ rel_params={
+ "model_path": os.path.join(resources_dir, "models/disambiguation/"),
+ "data_path": os.path.join(current_dir, "outputs/data/lwm/"),
+ "training_split": "",
+ "db_embeddings": cursor,
+ "with_publication": wpubl,
+ "without_microtoponyms": wmtops,
+ "do_test": False,
+ "default_publname": "",
+ "default_publwqid": "",
+ }
+ kwargs = {
+ 'method_name': top_res_method,
+ 'resources_path': resources_dir
+ }
+ # Only include the `rel_params` if the linking method is `reldisamb`.
+ if top_res_method == 'reldisamb':
+ kwargs['ranker'] = ranker
+ kwargs['rel_params'] = rel_params
+
+ linker = linking.Linker.new(**kwargs)
# --------------------------------------
# Instantiate the experiment:
@@ -160,9 +139,9 @@
data_path=os.path.join(current_dir, "outputs/data/"),
dataset_df=pd.DataFrame(),
results_path=os.path.join(current_dir, "outputs/results/"),
- myner=myner,
- myranker=myranker,
- mylinker=mylinker,
+ recogniser=recogniser,
+ ranker=ranker,
+ linker=linker,
overwrite_processing=False, # If True, do data processing, else load existing processing, if exists.
processed_data=dict(), # Dictionary where we'll keep the processed data for the experiments.
test_split=test_scenario, # "dev" while experimenting, "test" when running final experiments.
@@ -172,28 +151,26 @@
# Print experiment information:
print(myexperiment)
- print(myner)
- print(myranker)
- print(mylinker)
+ print(recogniser)
+ print(ranker)
+ print(linker)
# -----------------------------------------
# NER training and creating pipeline:
# Train the NER models if needed:
- myner.train()
+ recogniser.train()
# Load the NER pipeline:
- myner.pipe = myner.create_pipeline()
+ recogniser.pipe = recogniser.load()
# -----------------------------------------
# Ranker loading resources and training a model:
- # Load the resources:
- myranker.mentions_to_wikidata = myranker.load_resources()
- # Train a DeezyMatch model if needed:
- myranker.train()
+ # Load the resources (and train a DeezyMatch model if needed):
+ ranker.load()
# -----------------------------------------
# Linker loading resources:
# Load linking resources:
- mylinker.linking_resources = mylinker.load_resources()
+ linker.load()
# -----------------------------------------
# Prepare experiment:
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..95882382
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,97 @@
+site_name: T-Res Docs
+site_url: https://living-with-machines.github.io/T-Res
+theme:
+ name: material
+ palette:
+ scheme: default
+ primary: custom
+ features:
+ - navigation.sections
+ - navigation.footer
+ - navigation.instant
+ # - toc.integrate
+ - navigation.top
+ - content.code.annotation
+ - content.code.copy
+nav:
+ - 'T-Res toponym resolution pipeline': index.md
+ - Getting Started:
+ - getting-started/index.md
+ - getting-started/installation.md
+ - getting-started/resources.md
+ - getting-started/data-structures.md
+ - T-Res pipeline:
+ - getting-started/pipeline/index.md
+ - getting-started/pipeline/recogniser.md
+ - getting-started/pipeline/ranker.md
+ - getting-started/pipeline/linker.md
+ - getting-started/batch-jobs.md
+ - Reference:
+ - reference/index.md
+ - geoparser module:
+ - reference/geoparser/index.md
+ - reference/geoparser/ner.md
+ - reference/geoparser/ranking.md
+ - reference/geoparser/linking.md
+ - reference/geoparser/pipeline.md
+ - utils module:
+ - reference/utils/index.md
+ - 'Dataclasses': reference/utils/dataclasses.md
+ - 'Deezy Processing': reference/utils/deezy_processing.md
+ - 'Get Data': reference/utils/get_data.md
+ - 'NER Utils': reference/utils/ner_utils.md
+ - 'Preprocess Data': reference/utils/preprocess_data.md
+ - 'Process Data': reference/utils/process_data.md
+ - 'Process Wikipedia': reference/utils/process_wikipedia.md
+ - 'REL e2e': reference/utils/rel_e2e.md
+ - 'REL Utils': reference/utils/rel_utils.md
+ - utils.REL module:
+ - reference/utils/REL/index.md
+ - 'Entity Disambiguation': reference/utils/REL/entity_disambiguation.md
+ - 'MulRel Ranker': reference/utils/REL/mulrel_ranker.md
+ - 'Utils': reference/utils/REL/utils.md
+ - 'Vocabulary': reference/utils/REL/vocabulary.md
+ # TODO:
+ # - T-Res API:
+ # - t-res-api/index.md
+ # - t-res-api/installation.md
+ # - t-res-api/usage.md
+ # - Experiments:
+ # - experiments/index.md
+ # - experiments/evaluation.md
+
+markdown_extensions:
+ - pymdownx.highlight:
+ anchor_linenums: true
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.superfences
+ - pymdownx.keys
+ - admonition
+ - codehilite
+ - tables
+ - attr_list
+ - md_in_html
+ - toc:
+ toc_depth: 2 # Avoids TOC entries for class methods
+ - pymdownx.emoji:
+ emoji_index: !!python/name:material.extensions.emoji.twemoji
+ emoji_generator: !!python/name:material.extensions.emoji.to_svg
+
+extra_css:
+ - stylesheets/extra.css
+
+extra_javascript:
+ - javascripts/extra.js
+
+plugins:
+ - search
+ - section-index
+ - autorefs
+ - mkdocstrings:
+ default_handler: python
+ handlers:
+ python:
+ options:
+ show_root_toc_entry: false
diff --git a/pyproject.toml b/pyproject.toml
index 5bc7d1d3..fa2a9b33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ pandas = "^1.3.4"
wget = "^3.2"
DeezyMatch = "^1.3.4"
datasets = "^1.18.0"
-transformers = "^4.15.0"
+transformers = ">=4.16.1, <=4.40.2"
pydash = "^5.1.0"
wikimapper = "^0.1.5"
numpy = "^1.22.1"
@@ -51,6 +51,15 @@ Sphinx = "^5.0.0"
sphinx-rtd-theme = "^1.0.0"
sphinxcontrib-napoleon = "^0.7"
sphinx-copybutton = "^0.5.2"
+mkdocs-material = "^9.5.49"
+mkdocs-include-markdown-plugin= "^6.0.4"
+mkdocs-exclude = "^1.0.2"
+mkdocs-macros-plugin = "^1.0.4"
+mkdocs-awesome-pages-plugin = "^2.10.1"
+mkdocstrings-python = "^1.12.2"
+
+[tool.poetry.scripts]
+batch-job = "t_res.utils.batch_job:run"
[build-system]
requires = ["poetry-core>=1.0.0"]
@@ -65,5 +74,7 @@ profile = "black"
[tool.pytest.ini_options]
markers = [
- "deezy: tests which need a deezy model",
+ "resources: tests which depend on files in the resources directory",
+ "train: tests which train a model",
+ "app: tests which require access to the HTTP API",
]
diff --git a/t_res/geoparser/linking.py b/t_res/geoparser/linking.py
index 1731a903..c1d85165 100644
--- a/t_res/geoparser/linking.py
+++ b/t_res/geoparser/linking.py
@@ -1,12 +1,13 @@
import json
import os
-import sys
from pathlib import Path
-from typing import Literal, Optional, Tuple
+from typing import Optional, List, Dict, Tuple
import numpy as np
import pandas as pd
+import torch
from haversine import haversine
+from math import exp
from tqdm import tqdm
tqdm.pandas()
@@ -17,7 +18,7 @@
from ..utils import rel_utils
from ..utils.REL import entity_disambiguation
from . import ranking
-
+from ..utils.dataclasses import Mention, MentionCandidates, StringMatchLinks, WikidataLink, MostPopularLink, ByDistanceLink, RelDisambLink, CandidateMatches, CandidateLinks, SentenceCandidates, Predictions, RelPredictions, CombinedScores
class Linker:
"""
@@ -26,147 +27,158 @@ class Linker:
knowledge base.
Arguments:
- method (Literal["mostpopular", "reldisamb", "bydistance"]): The
- linking method to use.
resources_path (str): The path to the linking resources.
experiments_path (str, optional): The path to the experiments
- directory. Default is "../experiments/".
+ directory.
linking_resources (dict, optional): Dictionary containing the
- necessary linking resources. Defaults to ``dict()`` (an empty
- dictionary).
- overwrite_training (bool): Flag indicating whether to overwrite the
- training. Defaults to ``False``.
- rel_params (dict, optional): Dictionary containing the parameters
- for performing entity disambiguation using the ``reldisamb``
- approach (adapted from the Radboud Entityt Linker, REL).
- For the default settings, see Notes below.
-
- Example:
-
- .. code-block:: python
-
- linker = Linker(
- method="mostpopular",
- resources_path="/path/to/resources/",
- experiments_path="/path/to/experiments/",
- linking_resources={},
- overwrite_training=True,
- rel_params={"with_publication": True, "do_test": True}
- )
+ necessary linking resources.
Note:
-
- * Note that, in order to instantiate the Linker with the ``reldisamb``
- method, the Linker needs to be wrapped by a context manager in which
- a connection to the entity embeddings database is established and a
- cursor is created:
-
- .. code-block:: python
-
- with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn:
- cursor = conn.cursor()
- mylinker = linking.Linker(
- method="reldisamb",
- resources_path="../resources/",
- experiments_path="../experiments/",
- linking_resources=dict(),
- rel_params={
- "model_path": "../resources/models/disambiguation/",
- "data_path": "../experiments/outputs/data/lwm/",
- "training_split": "",
- "db_embeddings": cursor,
- "with_publication": wpubl,
- "without_microtoponyms": wmtops,
- "do_test": False,
- "default_publname": "",
- "default_publwqid": "",
- },
- overwrite_training=False,
- )
-
- See below the default settings for ``rel_params``. Note that
- `db_embeddings` defaults to None, but it should be assigned a
- cursor to the entity embeddings database, as described above:
-
- .. code-block:: python
-
- rel_params: Optional[dict] = {
- "model_path": "../resources/models/disambiguation/",
- "data_path": "../experiments/outputs/data/lwm/",
- "training_split": "originalsplit",
- "db_embeddings": None,
- "with_publication": True,
- "without_microtoponyms": True,
- "do_test": False,
- "default_publname": "United Kingdom",
- "default_publwqid": "Q145",
- }
-
+ This base class should not be instantiated directly. Instead use a subclass
+ constructor.
"""
+ # Class attribute for the name of the linking method.
+ method_name: str = None
def __init__(
self,
- method: Literal["mostpopular", "reldisamb", "bydistance"],
resources_path: str,
experiments_path: Optional[str] = "../experiments",
linking_resources: Optional[dict] = dict(),
- overwrite_training: Optional[bool] = False,
- rel_params: Optional[dict] = None,
):
"""
Initialises a Linker object.
"""
- self.method = method
self.resources_path = resources_path
self.experiments_path = experiments_path
- self.linking_resources = linking_resources
- self.overwrite_training = overwrite_training
-
- if rel_params is None:
- rel_params = {
- "model_path": os.path.join(resources_path, "models/disambiguation/"),
- "data_path": os.path.join(experiments_path, "outputs/data/lwm/"),
- "training_split": "originalsplit",
- "db_embeddings": None, # The cursor to the embeddings database.
- "with_publication": True,
- "without_microtoponyms": True,
- "do_test": False,
- "default_publname": "United Kingdom",
- "default_publwqid": "Q145",
- }
-
- self.rel_params = rel_params
+ self.resources = linking_resources
def __str__(self) -> str:
"""
Returns a string representation of the Linker object.
Returns:
- str: String representation of the Linker object.
+ String representation of the Linker object.
"""
s = ">>> Entity Linking:\n"
- s += f" * Method: {self.method}\n"
- s += f" * Overwrite training: {self.overwrite_training}\n"
+ s += f" * Method: {self.method_name}\n"
return s
- def load_resources(self) -> dict:
+ def new(**kwargs) -> 'Linker':
"""
- Loads the linking resources.
+ Static constructor.
+
+ Args:
+ kwargs (dict): A dictionary of keyword arguments matching the
+ arguments to a subclass __init__ constructor, plus a
+ `method_name` argument to specify the desired subclass.
Returns:
- dict: Dictionary containing loaded necessary linking resources.
+ A Linker (subclass) instance.
- Note:
- Different methods will require different resources.
+ """
+ if not 'method_name' in kwargs.keys():
+ raise ValueError("Expected `method_name` keyword argument.")
+ method_name = kwargs['method_name']
+ del kwargs['method_name']
+ if method_name == 'mostpopular':
+ return MostPopularLinker(**kwargs)
+ if method_name == 'bydistance':
+ return ByDistanceLinker(**kwargs)
+ if method_name == 'reldisamb':
+ return RelDisambLinker(**kwargs)
+ raise ValueError(f"Invalid linking method: {method_name}")
+
+ def wkdt_class(self, wqid: str) -> Optional[str]:
+ """
+ Returns the Wikidata class for the given Wikidata entry, if available.
+
+ Returns:
+ The corresponding Wikidata class if available, otherwise `None`.
+ """
+ return self.resources["entity2class"].get(wqid, None)
+
+ def wkdt_coords(self, wqid: str) -> Optional[Tuple[float, float]]:
+ """
+ Returns the lat-lon coordinates for the given Wikidata entry, if available.
+
+ Returns:
+ Latitude and longitude coordinates for the given Wikidata entry, if
+ available.
+ """
+ return self.resources["wqid_to_coords"].get(wqid, None)
+
+ def haversine(self,
+ origin_coords: Optional[Tuple[float, float]],
+ coords: Optional[Tuple[float, float]]) -> Optional[float]:
+ """
+ Calculates the great circle distance between two points on Earth's surface.
+
+ Args:
+ origin_coords (Optional[Tuple[float, float]]): coordinates of the origin
+ coords (Optional[Tuple[float, float]]): coordinates of the other point
+
+ Returns:
+ The great circle distance between the points, or `None` if either pair
+ of coordinates is unavailable.
+ """
+ if not origin_coords:
+ print("Missing place of publication coordinates.")
+ return None
+ try:
+ return haversine(origin_coords, coords, normalize=True)
+ except ValueError:
+ # We have one candidate with coordinates in Venus!
+ print(f"Failed to compute haversine distance from {origin_coords} to {coords}")
+ return None
+
+ def empty_candidates(self,
+ mention: Mention,
+ ranking_method: str,
+ place_of_pub_wqid: str,
+ place_of_pub: str) -> MentionCandidates:
+ """
+ Constructs an empty `MentionCandidates` instance.
+
+ Returns:
+ A `MentionCandidates` instance with an empty list of candidate links.
+ """
+ return MentionCandidates(
+ mention,
+ ranking_method,
+ self.method_name,
+ list(),
+ place_of_pub_wqid,
+ place_of_pub)
+
+ def load(self):
+ """
+ Loads the linking resources and assigns them to instance variables.
"""
print("*** Load linking resources.")
+ # TODO: make this more consistent with the Ranker (which has a mentions_to_wikidata attribute).
+
# Load Wikidata mentions-to-QID with absolute counts:
print(" > Loading mentions to wikidata mapping.")
with open(
os.path.join(self.resources_path, "wikidata/mentions_to_wikidata.json"), "r"
) as f:
- self.linking_resources["mentions_to_wikidata"] = json.load(f)
+ self.resources["mentions_to_wikidata"] = json.load(f)
+
+ # Load Wikidata mentions-to-QID with normalized counts:
+ print(" > Loading mentions to normalized wikidata mapping.")
+ with open(
+ os.path.join(self.resources_path, "wikidata/mentions_to_wikidata_normalized.json"), "r"
+ ) as f:
+ self.resources["mentions_to_wikidata_normalized"] = json.load(f)
+
+ # The entity2class.txt file is created as the last step in
+ # wikipedia processing:
+ with open(
+ os.path.join(self.resources_path, "wikidata/entity2class.txt"), "r"
+ ) as f:
+ self.resources["entity2class"] = json.load(f)
print(" > Loading gazetteer.")
gaz = pd.read_csv(
@@ -177,193 +189,629 @@ def load_resources(self) -> dict:
gaz["longitude"] = gaz["longitude"].astype(float)
gaz["coords"] = gaz[["latitude", "longitude"]].to_numpy().tolist()
wqid_to_coords = dict(zip(gaz.wikidata_id, gaz.coords))
- self.linking_resources["wqid_to_coords"] = wqid_to_coords
+ self.resources["wqid_to_coords"] = wqid_to_coords
gaz_ids = set(gaz["wikidata_id"].tolist())
# Keep only wikipedia entities in the gazetteer:
- self.linking_resources["wikidata_locs"] = gaz_ids
+ self.resources["wikidata_locs"] = gaz_ids
gaz_ids = ""
gaz = ""
- # The entity2class.txt file is created as the last step in
- # wikipedia processing:
- with open(
- os.path.join(self.resources_path, "wikidata/entity2class.txt"), "r"
- ) as f:
- self.linking_resources["entity2class"] = json.load(f)
-
print("*** Linking resources loaded!\n")
- return self.linking_resources
- def run(self, dict_mention: dict) -> Tuple[str, float, dict]:
+ def run(
+ self,
+ matches: CandidateMatches,
+ place_of_pub_wqid: Optional[str]=None,
+ place_of_pub: Optional[str]=None,
+ ) -> MentionCandidates:
"""
- Executes the linking process based on the specified unsupervised
- method.
+ Executes the linking process.
+
+ Arguments:
+ matches: A `CandidateMatches` instance containing string matches to be linked.
+ place_of_pub_wqid (Optional[str]): The Wikidata ID of the place of publication.
+ place_of_pub (Optional[str]): The place of publication.
+
+ Returns:
+ The candidates identified by the linking process.
+ """
+ if matches.is_empty():
+ return self.empty_candidates(matches.mention, matches.ranking_method, place_of_pub_wqid, place_of_pub)
+
+ candidate_links = [CandidateLinks(m.as_string_match(), self.wikidata_links(m, place_of_pub_wqid))
+ for m in matches.matches]
+
+ return MentionCandidates(
+ matches.mention,
+ matches.ranking_method,
+ self.method_name,
+ candidate_links,
+ place_of_pub_wqid,
+ place_of_pub,
+ )
+
+ def wikidata_links(
+ self,
+ match: StringMatchLinks,
+ place_of_pub_wqid: Optional[str]=None,
+ ) -> List[WikidataLink]:
+ """
+ Identifies candidate links in the Wikidata knowledgebase.
+
+ Args:
+ match (StringMatchLinks): The toponym string match to be linked.
+ place_of_pub_wqid (Optional[str], optional): The Wikidata ID of
+ the place of publication, if available.
+
+ Raises:
+ NotImplementedError: If not implemented in a subclass.
+
+ Returns:
+ A list of candidate links in Wikidata.
+
+ Note:
+ Each Linker subclass must implement a linking algorithm by
+ overriding the `wikidata_links` method.
+ """
+ raise NotImplementedError("Subclass implementation required.")
+
+ def disambiguate(self, candidates: List[SentenceCandidates]) -> Predictions:
+ """
+ Performs entity disambiguation given a list of already identified
+ toponyms and selected candidates.
Arguments:
- dict_mention: Dictionary containing the mention information.
+ candidates: A list of SentenceCandidates instances.
Returns:
- Tuple[str, float, dict]:
- The result of the linking process. For details, see below:
+ A `Predictions` instance representing the identified and
+ linked toponyms.
+ """
+ if len(candidates) == 0:
+ return Predictions(list())
+ # Replace each CandidateLinks instance with a PredictedLinks instance.
+ for scs in candidates:
+ for cs in scs.candidates:
+ for i, links in enumerate(cs.links):
+ scores = self.disambiguation_scores(links.wikidata_links, links.string_match.string_similarity)
+ cs.links[i] = links.attach_scores(scores)
+ return Predictions(candidates)
+
+ def disambiguation_scores(self, links: List[WikidataLink], string_similarity: float) -> Dict[str, float]:
+ """
+ Computes disambiguation scores for a given list of Wikidata links.
- - If the ``method`` provided when initialising the
- :py:meth:`~geoparser.linking.Linker` object was
- ``"mostpopular"``, see
- :py:meth:`~geoparser.linking.Linker.most_popular`.
- - If the ``method`` provided when initialising the
- :py:meth:`~geoparser.linking.Linker` object was
- ``"bydistance"``, see
- :py:meth:`~geoparser.linking.Linker.by_distance`.
+ Arguments:
+ links: A list of `WikidataLink` instances.
+ string_similarity (float): the string similarity score for the candidate match.
+
+ Raises:
+ NotImplementedError: If not implemented in a subclass.
+
+ Returns:
+ A dictionary containing disambiguation scores, keyed by Wikidata ID.
+ Note:
+ Each Linker subclass must implement a linking algorithm by
+ overriding the `disambiguation_scores` method.
"""
- if self.method == "mostpopular":
- return self.most_popular(dict_mention)
+ raise NotImplementedError("Subclass implementation required.")
+
+class MostPopularLinker(Linker):
+ """
+ An entity linking method that selects the candidate that is most
+ popular in the Wikipedia knowledgebase.
+
+ Example:
+ ```python
+ linker = MostPopularLinker(
+ resources_path="/path/to/resources/",
+ experiments_path="/path/to/experiments/",
+ linking_resources={},
+ )
+ ```
- if self.method == "bydistance":
- return self.by_distance(dict_mention)
+ """
+ # Override the method_name class attribute.
+ method_name: str = "mostpopular"
+
+ def wikidata_links(
+ self,
+ match: StringMatchLinks,
+ place_of_pub_wqid: Optional[str]=None,
+ ) -> List[WikidataLink]:
+ """
+ Identifies candidate links in the Wikidata knowledgebase.
- raise SyntaxError(f"Unknown method provided: {self.method}")
+ Args:
+ match (StringMatchLinks): The toponym string match to be linked.
+ place_of_pub_wqid (Optional[str], optional): The Wikidata ID of
+ the place of publication, if available. **Not used** in this
+ linking method.
- def most_popular(self, dict_mention: dict) -> Tuple[str, float, dict]:
+ Returns:
+ A list of candidate links in Wikidata, each of type
+ [`MostPopularLink`][t_res.utils.dataclasses.MostPopularLink].
"""
- Select most popular candidate, given Wikipedia's in-link structure.
+ links = [MostPopularLink(
+ wqid=wqid,
+ wkdt_class=self.wkdt_class(wqid),
+ coords=self.wkdt_coords(wqid),
+ freq=self.resources["mentions_to_wikidata"][match.variation][wqid])
+ for wqid in match.wqid_links]
+ return links
+
+ def disambiguation_scores(self, links: List[MostPopularLink], string_similarity=None) -> Dict[str, float]:
+ """
+ Computes disambiguation scores by using the relative mention-to-wikidata
+ link frequencies as a proxy for popularity of the toponym in Wikidata.
Arguments:
- dict_mention (dict): dictionary with all the relevant information
- needed to disambiguate a certain mention.
+ links: A list of `WikidataLink` instances.
+ string_similarity (float): the string similarity score for the candidate match.
Returns:
- Tuple[str, float, dict]:
- A tuple containing the most popular candidate's Wikidata ID
- (e.g. ``"Q84"``) or ``"NIL"``, the confidence score of the
- predicted link as a float, and a dictionary of all candidates
- and their confidence scores.
-
- .. note::
-
- Applying the "most popular" disambiguation method for linking
- entities. Given a set of candidates for a given mention, the
- function returns as a prediction the more relevant Wikidata
- candidate, determined from the in-link structure of Wikipedia.
- """
- cands = dict_mention["candidates"]
- most_popular_candidate_id = "NIL"
- keep_highest_score = 0.0
- total_score = 0.0
- final_score = 0.0
- all_candidates = {}
- if cands:
- for variation in cands:
- for candidate in cands[variation]["Candidates"]:
- score = self.linking_resources["mentions_to_wikidata"][variation][
- candidate
- ]
- total_score += score
- all_candidates[candidate] = score
- if score > keep_highest_score:
- keep_highest_score = score
- most_popular_candidate_id = candidate
-
- # Return the predicted and the score (overall the total):
- final_score = keep_highest_score / total_score
-
- # Compute scores for all candidates
- all_candidates = {
- cand: (score / total_score) for cand, score in all_candidates.items()
- }
+ A dictionary containing disambiguation scores, keyed by Wikidata ID.
+ """
+ total = sum([m.freq for m in links])
+ return {link.wqid: link.freq / total for link in links}
- return most_popular_candidate_id, final_score, all_candidates
+class ByDistanceLinker(Linker):
+ """
+ An entity linking method that selects the candidate based on its
+ proximity to the place of publication.
- def by_distance(
- self, dict_mention: dict, origin_wqid: Optional[str] = ""
- ) -> Tuple[str, float, dict]:
+ Example:
+ ```python
+ linker = ByDistanceLinker(
+ resources_path="/path/to/resources/",
+ experiments_path="/path/to/experiments/",
+ linking_resources={},
+ )
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "bydistance"
+
+ def wikidata_links(
+ self,
+ match: StringMatchLinks,
+ place_of_pub_wqid: Optional[str]=None,
+ ) -> List[WikidataLink]:
+ """
+ Identifies candidate links in the Wikidata knowledgebase.
+
+ Args:
+ match (StringMatchLinks): The toponym string match to be linked.
+ place_of_pub_wqid (Optional[str], optional): The Wikidata ID of
+ the place of publication, if available.
+
+ Returns:
+ A list of candidate links in Wikidata, each of type
+ [`ByDistanceLink`][t_res.utils.dataclasses.ByDistanceLink].
+ """
+ origin_coords = self.wkdt_coords(place_of_pub_wqid)
+ links = [ByDistanceLink(
+ wqid=wqid,
+ wkdt_class=self.wkdt_class(wqid),
+ coords=self.wkdt_coords(wqid),
+ place_of_pub_coords=origin_coords,
+ geodist=self.haversine(origin_coords, self.wkdt_coords(wqid)),
+ normalized_score=self.resources["mentions_to_wikidata_normalized"][match.variation][
+ wqid
+ ]) for wqid in match.wqid_links]
+ return links
+
+ def disambiguation_scores(self,
+ wikidata_links: List[ByDistanceLink],
+ string_similarity: float) -> Dict[str, float]:
"""
- Select candidate based on distance to the place of publication.
+ Computes disambiguation scores based on the physical proximity of the candidate
+ to the place of publication of the source text, also taking into account the
+ string similarity of the match and the relative popularity of the Wikidata entry.
Arguments:
- dict_mention (dict): dictionary with all the relevant information
- needed to disambiguate a certain mention.
- origin_wqid (str, optional): The origin Wikidata ID for distance
- calculation. Defaults to ``""``.
+ links: A list of `WikidataLink` instances.
+ string_similarity (float): the string similarity score for the candidate match.
Returns:
- Tuple[str, float, dict]:
- A tuple containing the Wikidata ID of the closest candidate
- to the place of publication (e.g. ``"Q84"``) or ``"NIL"``,
- the confidence score of the predicted link as a float (rounded
- to 3 decimals), and a dictionary of all candidates and their
- confidence scores.
-
- .. note::
-
- Applying the "by distance" disambiguation method for linking
- entities, based on geographical distance. It undertakes an
- unsupervised disambiguation, which returns a prediction of a
- location closest to the place of publication, for a provided set
- of candidates and the place of publication of the original text.
- """
- cands = dict_mention["candidates"]
- origin_coords = self.linking_resources["wqid_to_coords"].get(origin_wqid)
- if not origin_coords:
- origin_coords = self.linking_resources["wqid_to_coords"].get(
- dict_mention["place_wqid"]
- )
- closest_candidate_id = "NIL"
+ A dictionary containing disambiguation scores, keyed by Wikidata ID.
+ """
max_on_gb = 1000 # 1000 km, max on GB
- keep_lowest_distance = max_on_gb # 20000 km, max on Earth
- keep_lowest_relv = 1.0
- all_candidates = {}
-
- if cands:
- for x in cands:
- matching_score = cands[x]["Score"]
- for candidate, score in cands[x]["Candidates"].items():
- cand_coords = self.linking_resources["wqid_to_coords"][candidate]
- geodist = 20000
- # if origin_coords and cand_coords: # If there are coordinates
- try:
- geodist = haversine(origin_coords, cand_coords)
- all_candidates[candidate] = geodist
- except ValueError:
- # We have one candidate with coordinates in Venus!
- pass
- if geodist < keep_lowest_distance:
- keep_lowest_distance = geodist
- closest_candidate_id = candidate
- keep_lowest_relv = (matching_score + score) / 2.0
-
- if keep_lowest_distance == 0.0:
- keep_lowest_distance = 1.0
- else:
- keep_lowest_distance = (
- max_on_gb if keep_lowest_distance > max_on_gb else keep_lowest_distance
+ ret = dict()
+ for link in wikidata_links:
+
+ distance = min(max_on_gb, link.geodist if link.geodist is not None else max_on_gb)
+
+ if distance == 0.0:
+ distance_score = 1.0
+ else:
+ distance = (max_on_gb if distance > max_on_gb else distance)
+ distance_score = 1.0 - (distance / max_on_gb)
+
+ relv_score = min(1.0, (string_similarity + link.normalized_score) / 2.0)
+ final_score = round((relv_score + distance_score) / 2, 3) if link.geodist is not None else 0.0
+
+ ret[link.wqid] = final_score
+ return ret
+
+class RelDisambLinker(MostPopularLinker):
+ """
+ An entity linking method that selects the candidate using the [Radboud
+ Entity Linker](https://github.com/informagi/REL/) (REL) model.
+
+ This is a subclass of the MostPopularLinker so that the disambiguation
+ score based on Wikidata popularity may be used to compute a combined
+ disambiguation score (if configured to do so).
+
+ Arguments:
+ resources_path (str): The path to the linking resources.
+ ranker (Ranker): A `Ranker` instance.
+ experiments_path (str, optional): The path to the experiments
+ directory.
+ linking_resources (dict): Dictionary containing the
+ necessary linking resources.
+ overwrite_training (bool): Flag indicating whether to overwrite the
+ training.
+ rel_params (dict, optional): Dictionary containing the parameters
+ for performing entity disambiguation using the ``reldisamb``
+ approach (adapted from the Radboud Entity Linker, REL).
+ For the default settings, see Notes below.
+
+ Example:
+ ```python
+ linker = Linker(
+ resources_path="/path/to/resources/",
+ ranker=PerfectMatchRanker(resources_path="/path/to/resources/"),
+ experiments_path="/path/to/experiments/",
+ linking_resources={},
+ overwrite_training=True,
+ rel_params={"with_publication": True, "do_test": True}
+ )
+ ```
+
+ Note:
+ Note that, in order to instantiate the Linker with the ``reldisamb``
+ method, the Linker needs to be wrapped by a context manager in which
+ a connection to the entity embeddings database is established and a
+ cursor is created:
+
+ ```python
+ with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn:
+ cursor = conn.cursor()
+ linker = RelDisambLinker(
+ resources_path="../resources/",
+ ranker=PerfectMatchRanker(resources_path="../resources/"),
+ experiments_path="../experiments/",
+ linking_resources=dict(),
+ overwrite_training=False,
+ rel_params={
+ "model_path": "../resources/models/disambiguation/",
+ "data_path": "../experiments/outputs/data/lwm/",
+ "training_split": "",
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "without_microtoponyms": True,
+ "do_test": False,
+ "default_publname": "",
+ "default_publwqid": "",
+ },
)
- keep_lowest_distance = 1.0 - (keep_lowest_distance / max_on_gb)
+ ```
+
+ See below the default settings for ``rel_params``. Note that
+ `db_embeddings` defaults to None, but it should be assigned a
+ cursor to the entity embeddings database, as described above:
+
+ ```python
+ rel_params: Optional[dict] = {
+ "model_path": "../resources/models/disambiguation/",
+ "data_path": "../experiments/outputs/data/lwm/",
+ "training_split": "originalsplit",
+ "db_embeddings": None,
+ "with_publication": True,
+ "without_microtoponyms": True,
+ "do_test": False,
+ "default_publname": "United Kingdom",
+ "default_publwqid": "Q145",
+ "reference_separation": ((49.956739, -8.17751), (60.87, 1.762973))
+ }
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "reldisamb"
+
+ # Override the constructor to include REL model parameters.
+ def __init__(
+ self,
+ resources_path: str,
+ ranker: ranking.Ranker,
+ experiments_path: Optional[str] = "../experiments",
+ linking_resources: Optional[dict] = dict(),
+ overwrite_training: Optional[bool] = False,
+ rel_params: Optional[dict] = None,
+ ):
+ super().__init__(resources_path, experiments_path, linking_resources)
+
+ self.overwrite_training = overwrite_training
+
+ # Default linking parameters:
+ params = {
+ "model_path": os.path.join(resources_path, "models/disambiguation/"),
+ "data_path": os.path.join(experiments_path, "outputs/data/lwm/"),
+ "training_split": "originalsplit",
+ "db_embeddings": None, # The cursor to the embeddings database.
+ "with_publication": True,
+ "predict_place_of_publication": True,
+ "combined_score": True,
+ "without_microtoponyms": True,
+ "do_test": False,
+ "default_publname": "United Kingdom",
+ "default_publwqid": "Q145",
+ "reference_separation": ((49.956739, -8.17751), (60.87, 1.762973)),
+ "device": "cuda" if torch.cuda.is_available() else "cpu"
+ }
+ if rel_params is not None:
+ if not set(rel_params) <= set(params):
+ raise ValueError("Invalid REL config parameters.")
+ # Update the default parameters with any given parameters.
+ params.update(rel_params)
+
+ self.rel_params = params
+ self.ranker = ranker
+ self.entity_disambiguation_model = None
+
+ reference_separation = self.rel_params['reference_separation']
+ self.reference_distance = self.haversine(reference_separation[0], reference_separation[1])
+
+ def __str__(self) -> str:
+ """
+ Returns a string representation of the Linker object.
+
+ Returns:
+ String representation of the Linker object.
+ """
+ s = super().__str__()
+ s += f" * Overwrite training: {self.overwrite_training}\n"
+ return s
+
+ # Override the load method to load the entity disambiguation model.
+ def load(
+ self, split: Optional[str] = "originalsplit"
+ ):
+ """
+ Loads the linking resources and assigns them to instance variables.
+ """
+ super().load()
+ self.train_load_model(split=split)
+
+ # Override the run method to include handling of REL config parameters.
+ def run(
+ self,
+ matches: CandidateMatches,
+ place_of_pub_wqid: Optional[str]=None,
+ place_of_pub: Optional[str]=None,
+ ) -> MentionCandidates:
+ """
+ Executes the linking process.
+
+ Arguments:
+ matches: A `CandidatesMatches` instance containing string matches to be linked.
+ place_of_pub_wqid (Optional[str]): The Wikidata ID of the place of publication.
+ place_of_pub (Optional[str]): The place of publication.
- final_score = 0.0
- if not closest_candidate_id == "NIL":
- final_score = round((keep_lowest_relv + keep_lowest_distance) / 2, 3)
+ Returns:
+ The candidates identified by the linking process.
+ """
+ # If configured to link "with publication" (i.e. with an additional sentence
+ # containing an artificial mention of the place of publication), use default
+ # values for place_of_pub_wqid and place_of_pub unless they are already populated.
+ if self.rel_params["with_publication"]:
+ if not (place_of_pub_wqid and place_of_pub):
+ place_of_pub_wqid = self.rel_params["default_publwqid"]
+ place_of_pub = self.rel_params["default_publname"]
+
+ # Skip microtoponyms if configured to do so.
+ if self.rel_params["without_microtoponyms"]:
+ if matches.mention.is_microtoponym():
+ return self.empty_candidates(matches.mention, matches.ranking_method, place_of_pub_wqid, place_of_pub)
+
+ return super().run(matches, place_of_pub_wqid, place_of_pub)
+
+ def wikidata_links(
+ self,
+ match: StringMatchLinks,
+ place_of_pub_wqid: Optional[str]=None,
+ ) -> List[WikidataLink]:
+ """
+ Identifies candidate links in the Wikidata knowledgebase.
+
+ Args:
+ match (StringMatchLinks): The toponym string match to be linked.
+ place_of_pub_wqid (Optional[str], optional): The Wikidata ID of
+ the place of publication, if available.
+
+ Returns:
+ A list of candidate links in Wikidata, each of type
+ [`RelDisambLink`][t_res.utils.dataclasses.RelDisambLink].
+ """
+ links = [RelDisambLink(
+ wqid=wqid,
+ wkdt_class=self.wkdt_class(wqid),
+ coords=self.wkdt_coords(wqid),
+ freq=self.resources["mentions_to_wikidata"][match.variation][wqid],
+ normalized_score=self.resources["mentions_to_wikidata_normalized"][match.variation][
+ wqid
+ ]) for wqid in match.wqid_links]
+ return links
+
+ # Override the disambiguate method to include REL linking.
+ def disambiguate(self,
+ candidates: List[SentenceCandidates],
+ apply_rel: bool=True) -> Predictions:
+ """
+ Performs entity disambiguation given a list of already identified
+ toponyms and selected candidates. This method overrides the base
+ class implementation to include REL model linking.
- return closest_candidate_id, final_score, all_candidates
+ Arguments:
+ candidates: A list of SentenceCandidates instances.
- def train_load_model(
- self, myranker: ranking.Ranker, split: Optional[str] = "originalsplit"
- ) -> entity_disambiguation.EntityDisambiguation:
+ Returns:
+ A `Predictions` instance representing the identified and
+ linked toponyms.
+ """
+ # Generate prior predictions as inputs to the REL model.
+ predictions = super().disambiguate(candidates)
+
+ # Remove any microtoponyms from the predictions, if configured to do so.
+ if self.rel_params["without_microtoponyms"]:
+ micro_candidates = [sc for sc in predictions.sentence_candidates for c in sc.candidates if c.mention.is_microtoponym()]
+ for sc in micro_candidates:
+ sc.remove_microtoponyms()
+
+ if not apply_rel:
+ return predictions
+
+        if not self.entity_disambiguation_model:
+            raise ValueError("Entity disambiguation model not yet loaded. Call `load` method.")
+
+ # Apply the REL model to the interim predictions.
+ rel_predictions_dict = self.entity_disambiguation_model.predict(
+ predictions.as_dict(self.rel_params["with_publication"]))
+
+ # Incorporate the REL model predictions.
+ rel_predictions = predictions.apply_rel_disambiguation(rel_predictions_dict, self.rel_params["with_publication"])
+
+ # Take into account the `predict_place_of_pub` config parameter.
+ if self.rel_params['predict_place_of_publication']:
+ self.predict_place_of_publication(rel_predictions)
+
+ # Take into account the `combined_score` config parameter.
+ if self.rel_params['combined_score']:
+ self.apply_combined_score(rel_predictions)
+
+ return rel_predictions
+
+ # Computes disambiguation scores for a collection of potential Wikidata links.
+ # (Note: this replaces the rank_candidates function from rel_utils.py)
+ def disambiguation_scores(self,
+ links: List[RelDisambLink],
+ string_similarity: float) -> Dict[str, float]:
"""
- Trains or loads the entity disambiguation model.
+ Computes *interim* disambiguation scores (i.e. before applying the REL model)
+ by taking into account the string similarity of the match and the relative
+ popularity of the Wikidata entry.
Arguments:
- myranker (geoparser.ranking.Ranker): The ranker object used for
- training.
- split (str, optional): The split type for training. Defaults to
- ``"originalsplit"``.
+ links: A list of `WikidataLink` instances.
+ string_similarity (float): the string similarity score for the candidate match.
Returns:
- entity_disambiguation.EntityDisambiguation:
- A trained Entity Disambiguation model.
+ A dictionary containing disambiguation scores, keyed by Wikidata ID.
+ """
+ ret = dict()
+ if not links:
+ return ret
+ max_cand_freq = max([m.freq for m in links])
+ for wikidata_link in links:
+
+ # Normalize absolute mention-to-Wikidata relevance by entity:
+ candidate_score_1 = wikidata_link.freq / max_cand_freq
+ # Average of string similarity and mention-to-Wikidata normalized relevance:
+ candidate_score_2 = (wikidata_link.normalized_score + string_similarity) / 2
+
+ # Average of two candidate scores, normalized between 0 and 0.9:
+ score = ((candidate_score_1 + candidate_score_2) / 2) * 0.9
+ ret[wikidata_link.wqid] = score
+
+ return ret
+
+ def predict_place_of_publication(self, rel_predictions: RelPredictions):
+ """
+ Sets the disambiguation scores for the place of publication to 1.0 inside the given
+ REL predictions, provided the place of publication is known and exists as a candidate link.
+
+ Arguments:
+ rel_predictions: An instance of the `RelPredictions` dataclass.
+ """
+ place_of_pub_wqid = rel_predictions.place_of_pub_wqid()
+ if not place_of_pub_wqid:
+ return
+ for rs in rel_predictions.rel_scores:
+            # Skip this mention if the place of publication is not among its scored candidates.
+            if place_of_pub_wqid not in rs.scores:
+                continue
+ rs.scores[place_of_pub_wqid] = 1.0
+
+ def apply_combined_score(self, rel_predictions: RelPredictions):
+ """
+ Updates all disambiguation scores in the given REL predictions by
+ combining the REL score with place of publication information, if known.
+
+ Arguments:
+ rel_predictions: An instance of the `RelPredictions` dataclass.
+ """
+ place_of_pub_wqid = rel_predictions.place_of_pub_wqid()
+ if not place_of_pub_wqid:
+ return
+
+ def combined_score(rel_score, popularity, proximity):
+ if not proximity:
+ return rel_score
+ return rel_score * max(popularity, proximity)
+
+ # Iterate over the mention candidates (and their corresponding REL scores by the same index).
+ for i, mc in enumerate(rel_predictions.candidates(ignore_empty_candidates=False)):
+ # Iterate over the predicted Wikidata links.
+ for cl in mc.links:
+ # Compute popularity and proximity scores for all Wikidata links.
+ wqids = [wl.wqid for wl in cl.wikidata_links]
+ # Use the MostPopularLinker superclass to compute popularity.
+ popularity = super().disambiguation_scores(cl.wikidata_links)
+ proximity = {wqid: self.proximity(
+ origin_coords=self.wkdt_coords(place_of_pub_wqid),
+ coords=self.wkdt_coords(wqid)) for wqid in wqids}
+ # Compute the combined scores.
+ rs = rel_predictions.rel_scores[i]
+ combined_scores = {wqid: combined_score(rs.scores[wqid], popularity[wqid], proximity[wqid]) for wqid in wqids}
+ # Update the REL predictions (retaining the original REL scores).
+ rel_predictions.rel_scores[i] = CombinedScores(
+ mention=rs.mention,
+ scores=combined_scores,
+ confidence=rs.confidence,
+ rel_scores=rs.scores,
+ )
+
+ def proximity(self,
+ origin_coords: Optional[Tuple[float, float]],
+ coords: Optional[Tuple[float, float]]) -> Optional[float]:
+ """Computes the proximity measure between pairs of lat-long coordinates.
+
+ Args:
+            origin_coords (Optional[Tuple[float, float]]): Lat-long coordinates of the origin (e.g. the place of publication).
+            coords (Optional[Tuple[float, float]]): Lat-long coordinates of the candidate location.
+
+        Returns:
+            Optional[float]: Proximity score in (0, 1], or None if coordinates are missing or the distance cannot be computed.
+ """
+ if not coords:
+ return None
+ distance = self.haversine(origin_coords, coords)
+ # Handle caught error in the haversine method.
+ if not distance:
+ return None
+ return exp(-(distance/self.reference_distance)**2)
+
+ def train_load_model(self, split: Optional[str] = "originalsplit"):
+ """
+ Trains or loads the entity disambiguation model and assigns to the
+ `entity_disambiguation_model` field.
- .. note::
+ Arguments:
+ split (str, optional): The split type for training.
+ Note:
The training will be skipped if the model already exists and
``overwrite_training`` was set to False when initiating the Linker
object, or if the disambiguation method is unsupervised. The
@@ -371,115 +819,111 @@ def train_load_model(
``do_test`` key's value set to True when initiating the Linker
object.
- .. note::
-
- **Credit:**
+ Note: Credit:
+ This class and its methods are adapted from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- This method is adapted from the `REL: Radboud Entity Linker
- `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
+ ```
+ Reference:
- ::
-
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
- """
- if self.method == "reldisamb":
- # Generate ED model name:
- linker_name = myranker.method
- if myranker.method == "deezymatch":
- linker_name += "+" + str(myranker.deezy_parameters["num_candidates"])
- linker_name += "+" + str(
- myranker.deezy_parameters["selection_threshold"]
- )
- linker_name += f"_{split}"
- if self.rel_params["with_publication"]:
- linker_name += "+wpubl"
- if self.rel_params["without_microtoponyms"]:
- linker_name += "+wmtops"
- if self.rel_params["do_test"]:
- linker_name += "_test"
- linker_name = os.path.join(self.rel_params["model_path"], linker_name)
-
- if self.overwrite_training == True or not Path(linker_name).is_dir():
- print(
- "The entity disambiguation model does not exist or overwrite_training is set to True."
- )
-
- print("Creating the dataset.")
- # Create the folder where to store the resulting
- # disambiguation models:
- Path(linker_name).mkdir(parents=True, exist_ok=True)
-
- # Load the linking dataset, separate training and dev:
- linking_df_path = os.path.join(
- self.rel_params["data_path"], "linking_df_split.tsv"
- )
- linking_df = pd.read_csv(linking_df_path, sep="\t")
- train_df = linking_df[linking_df[split] == "train"]
- dev_df = linking_df[linking_df[split] == "dev"]
-
- # If this is a test, use only the first 20 rows of the train
- # and dev sets:
- if self.rel_params["do_test"] == True:
- train_df = train_df.iloc[:20]
- dev_df = dev_df.iloc[:20]
-
- # Prepare the dataset into the format required by REL:
- train_json = rel_utils.prepare_rel_trainset(
- train_df,
- self.rel_params,
- self.linking_resources["mentions_to_wikidata"],
- myranker,
- "train",
- )
- dev_json = rel_utils.prepare_rel_trainset(
- dev_df,
- self.rel_params,
- self.linking_resources["mentions_to_wikidata"],
- myranker,
- "dev",
- )
-
- # Set ED configuration to train mode:
- config_rel = {
- "mode": "train",
- "model_path": os.path.join(linker_name, "model"),
- }
-
- # Instantiate the entity disambiguation model:
- model = entity_disambiguation.EntityDisambiguation(
- self.rel_params["db_embeddings"],
- config_rel,
- )
- print("Training the model.")
-
- # Train the model using lwm_train:
- model.train(train_json, dev_json)
-
- # Train and predict using LR (to obtain confidence scores)
- model.train_LR(train_json, dev_json, linker_name)
-
- return model
- else:
- # Setting disambiguation model mode to "eval":
- config_rel = {
- "mode": "eval",
- "model_path": os.path.join(linker_name, "model"),
- }
-
- model = entity_disambiguation.EntityDisambiguation(
- self.rel_params["db_embeddings"],
- config_rel,
- )
-
- return model
+ }
+ ```
+ """
+ # Generate ED model name:
+ linker_name = self.ranker.method_name
+ if self.ranker.method_name == "deezymatch":
+ linker_name += "+" + str(self.ranker.deezy_parameters["num_candidates"])
+ linker_name += "+" + str(
+ self.ranker.deezy_parameters["selection_threshold"]
+ )
+ linker_name += f"_{split}"
+ if self.rel_params["with_publication"]:
+ linker_name += "+wpubl"
+ if self.rel_params["without_microtoponyms"]:
+ linker_name += "+wmtops"
+ if self.rel_params["do_test"]:
+ linker_name += "_test"
+ linker_name = os.path.join(self.rel_params["model_path"], linker_name)
+
+ if self.overwrite_training == True or not Path(linker_name).is_dir() or len(os.listdir(linker_name)) == 0:
+ print(
+ f"The entity disambiguation model {Path(linker_name)} does not exist or overwrite_training is set to True."
+ )
+
+ print("Creating the dataset.")
+ # Create the folder where to store the resulting
+ # disambiguation models:
+ Path(linker_name).mkdir(parents=True, exist_ok=True)
+
+ # Load the linking dataset, separate training and dev:
+ linking_df_path = os.path.join(
+ self.rel_params["data_path"], "linking_df_split.tsv"
+ )
+ linking_df = pd.read_csv(linking_df_path, sep="\t")
+ train_df = linking_df[linking_df[split] == "train"]
+ dev_df = linking_df[linking_df[split] == "dev"]
+
+ # If this is a test, use only the first 20 rows of the train
+ # and dev sets:
+ if self.rel_params["do_test"] == True:
+ train_df = train_df.iloc[:20]
+ dev_df = dev_df.iloc[:20]
+
+ # Prepare the dataset into the format required by REL:
+ train_json = rel_utils.prepare_rel_trainset(
+ train_df,
+ self.rel_params,
+ self.ranker,
+ self,
+ "train",
+ )
+ dev_json = rel_utils.prepare_rel_trainset(
+ dev_df,
+ self.rel_params,
+ self.ranker,
+ self,
+ "dev",
+ )
+
+ # Set ED configuration to train mode:
+ config_rel = {
+ "mode": "train",
+ "model_path": os.path.join(linker_name, "model"),
+ "device": self.rel_params["device"],
+ }
+
+ # Instantiate the entity disambiguation model:
+ model = entity_disambiguation.EntityDisambiguation(
+ self.rel_params["db_embeddings"],
+ config_rel,
+ )
+ print("Training the model.")
+
+ # Train the model using lwm_train:
+ model.train(train_json, dev_json)
+
+ # Train and predict using LR (to obtain confidence scores)
+ model.train_LR(train_json, dev_json, linker_name)
+ else:
+ # Setting disambiguation model mode to "eval":
+ config_rel = {
+ "mode": "eval",
+ "model_path": os.path.join(linker_name, "model"),
+ "device": self.rel_params["device"],
+ }
+
+ model = entity_disambiguation.EntityDisambiguation(
+ self.rel_params["db_embeddings"],
+ config_rel,
+ )
+
+ self.entity_disambiguation_model = model
diff --git a/t_res/geoparser/recogniser.py b/t_res/geoparser/ner.py
similarity index 50%
rename from t_res/geoparser/recogniser.py
rename to t_res/geoparser/ner.py
index 975f0010..61f03ef7 100644
--- a/t_res/geoparser/recogniser.py
+++ b/t_res/geoparser/ner.py
@@ -5,6 +5,7 @@
from typing import List, Optional, Tuple
import numpy as np
+import torch
from datasets import load_dataset, load_metric
from transformers import (
AutoModelForTokenClassification,
@@ -16,16 +17,263 @@
pipeline,
)
-from ..utils import ner
-
+from ..utils import ner_utils
+from ..utils.dataclasses import Mention, Sentence, SentenceMentions
class Recogniser:
"""
- A class for training and using a toponym recogniser with the specified
- parameters.
+ The Recogniser class provides methods for named entity recognition
+ applied to toponyms.
Arguments:
- model (str): The name of the NER model.
+ model_name (str): The name of the NER model.
+ device (str, optional): GPU device name (default: ``None``).
+
+ Note:
+        This base class should not be instantiated directly. Instead use a subclass
+ constructor.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ device: Optional[str]=None,
+ ):
+ """
+ Initialises a Recogniser object.
+ """
+ self.model_name = model_name
+ if device is None:
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ self.device = device
+
+ def __str__(self) -> str:
+ """
+ Returns a string representation of the Recogniser object.
+
+ Returns:
+ str: String representation of the Recogniser object.
+ """
+ s = "\n>>> Toponym recogniser:\n"
+ s += f" * Model: {self.model()}\n"
+ return s
+
+ def new(**kwargs) -> 'Recogniser':
+ """
+ Static constructor.
+
+ Args:
+ kwargs (dict): A dictionary of keyword arguments matching the
+ arguments to a subclass __init__ constructor, plus a
+ `method_name` argument to specify the desired subclass.
+
+ Returns:
+ A Recogniser (subclass) instance.
+
+ """
+ if not 'method_name' in kwargs.keys():
+ raise ValueError("Expected `method_name` keyword argument.")
+ method_name = kwargs['method_name']
+ del kwargs['method_name']
+ if method_name == 'pretrained':
+ return PretrainedRecogniser(**kwargs)
+ if method_name == 'custom':
+ return CustomRecogniser(**kwargs)
+ raise ValueError(f"Invalid NER method: {method_name}")
+
+ def model(self) -> str:
+ """
+ Returns the ``model`` parameter to be passed to the Pipeline factory
+ method in the ``transformers`` package.
+
+ Returns:
+ str: The ``model`` parameter
+ """
+ raise NotImplementedError("Subclass implementation required.")
+
+ def load(self):
+ """
+ Creates a Named Entity Recognition (NER) pipeline and assigns it
+ to the ``pipe`` attribute.
+
+ Note:
+ This method creates and loads a NER pipeline for performing named
+ entity recognition tasks. The created pipeline is stored in the
+ ``pipe`` attribute of the ``Recogniser`` instance.
+ """
+
+ print("*** Creating and loading a NER pipeline.")
+ self.pipe = pipeline("ner", model=self.model(), ignore_labels=[], device=self.device)
+
+ # The run method combines `ner_predict` with the `aggregate_mentions`
+ # function from `ner_utils.py` (eventually making those redundant).
+ def run(self, sentence: str) -> SentenceMentions:
+ """
+ Identifies named entities in a given sentence using the NER pipeline.
+
+ Arguments:
+ sentence (str): The input sentence.
+
+ Returns:
+ SentenceMentions: An instance of the SentenceMentions dataclass, containing
+ a list of toponym mentions found in the given sentence.
+
+ Note:
+ Any n-dash characters (``—``) in the provided sentence are
+ replaced with a comma (``,``) to handle parsing issues related to
+ the n-dash in OCR from historical newspapers.
+ """
+ sentence = str(sentence)
+ if len(sentence) <= 1:
+ return SentenceMentions(Sentence(sentence), [])
+
+ # The n-dash is a very frequent character in historical newspapers,
+ # but the NER pipeline does not process it well: Plymouth—Kingston
+ # is parsed as "Plymouth (B-LOC), — (B-LOC), Kingston (B-LOC)", instead
+ # of the n-dash being interpreted as a word separator. Therefore, we
+ # replace it by a comma, except when the n-dash occurs in the opening
+ # position of a sentence.
+ sentence = sentence[0] + sentence[1:].replace("—", ",")
+
+ # Run the NER pipeline to predict mentions:
+ if not hasattr(self, 'pipe'):
+ raise ValueError("Missing NER pipeline. Try calling the load() method.")
+ ner_preds = self.pipe(sentence)
+ return self.post_process(ner_preds, sentence)
+
+ def post_process(self, ner_predictions, sentence: str) -> SentenceMentions:
+
+ sentence = str(sentence)
+ if len(sentence) <= 1:
+ return SentenceMentions(Sentence(sentence), [])
+
+ # Post-process the predictions, fixing potential grouping errors:
+ lEntities = []
+ predictions = []
+ for pred_ent in ner_predictions:
+ pred_ent["score"] = float(pred_ent["score"])
+ pred_ent["entity"] = pred_ent["entity"]
+ pred_ent = ner_utils.fix_capitalization(pred_ent, sentence)
+ predictions = ner_utils.aggregate_entities(pred_ent, lEntities)
+
+ if len(predictions) > 0:
+ predictions = ner_utils.fix_hyphens(predictions)
+ predictions = ner_utils.fix_nested(predictions)
+ predictions = ner_utils.fix_startEntity(predictions)
+
+ # Process predictions (moved from pipeline.py::run_sentence_recognition):
+ procpreds = [
+ [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]]
+ for x in predictions
+ ]
+
+ # Aggregate mentions:
+ mentions = ner_utils.aggregate_mentions(procpreds, "pred")
+
+ mentions = [Mention.from_dict(m) for m in mentions]
+ return SentenceMentions(Sentence(sentence), mentions=mentions)
+
+ # Deprecated: use the `run` method instead.
+ def ner_predict(self, sentence: str) -> List[dict]:
+ """
+ Predicts named entities in a given sentence using the NER pipeline.
+
+ Arguments:
+ sentence (str): The input sentence.
+
+ Returns:
+ A list of dictionaries representing the predicted named
+ entities. Each dictionary contains the keys ``"word"``,
+ ``"entity"``, ``"score"``, ``"start"`` , and ``"end"``
+ representing the entity text, entity label, confidence
+ score and start and end character position of the text
+ respectively. For example:
+
+ ```json
+ {
+ "word": "From",
+ "entity": "O",
+ "score": 0.99975187,
+ "start": 0,
+ "end": 4
+ }
+ ```
+
+ Note:
+ This method takes a sentence as input and uses the NER pipeline to
+ predict named entities in the sentence.
+
+ Any n-dash characters (``—``) in the provided sentence are
+ replaced with a comma (``,``) to handle parsing issues related to
+ the n-dash in OCR from historical newspapers.
+ """
+ # Error if the sentence is too short.
+ if len(sentence) <= 1:
+ return []
+
+ # The n-dash is a very frequent character in historical newspapers,
+ # but the NER pipeline does not process it well: Plymouth—Kingston
+ # is parsed as "Plymouth (B-LOC), — (B-LOC), Kingston (B-LOC)", instead
+ # of the n-dash being interpreted as a word separator. Therefore, we
+ # replace it by a comma, except when the n-dash occurs in the opening
+ # position of a sentence.
+ sentence = sentence[0] + sentence[1:].replace("—", ",")
+
+ # Run the NER pipeline to predict mentions:
+ ner_preds = self.pipe(sentence)
+
+ # Post-process the predictions, fixing potential grouping errors:
+ lEntities = []
+ predictions = []
+ for pred_ent in ner_preds:
+ pred_ent["score"] = float(pred_ent["score"])
+ pred_ent["entity"] = pred_ent["entity"]
+ pred_ent = ner_utils.fix_capitalization(pred_ent, sentence)
+ predictions = ner_utils.aggregate_entities(pred_ent, lEntities)
+
+ if len(predictions) > 0:
+ predictions = ner_utils.fix_hyphens(predictions)
+ predictions = ner_utils.fix_nested(predictions)
+ predictions = ner_utils.fix_startEntity(predictions)
+
+ return predictions
+
+class PretrainedRecogniser(Recogniser):
+ """
+ A pretrained toponym recogniser loaded from HuggingFace.
+
+ Example:
+ ```
+ # Create an instance of the PretrainedRecogniser class
+ recogniser = PretrainedRecogniser(
+ model_name="Livingwithmachines/toponym-19thC-en",
+ )
+
+ # Create and load the NER pipeline
+ recogniser.load()
+
+ # Predict named entities in a sentence
+ sentence = "I live in London."
+ predictions = recogniser.ner_predict(sentence)
+ print(predictions)
+ ```
+ """
+
+ def model(self) -> str:
+ """
+ Returns the name of the model loaded from HuggingFace.
+
+ Returns:
+ The name of the pretrained HuggingFace model.
+ """
+ return self.model_name
+
+class CustomRecogniser(Recogniser):
+ """
+ A toponym recogniser with data and parameters for custom training.
+
+ Arguments:
+ model_name (str): The name of the NER model.
train_dataset (str, optional): Path to the training dataset
(default: ``""``).
test_dataset (str, optional): Path to the testing dataset
@@ -43,43 +291,39 @@ class Recogniser:
trained model (default: ``False``).
do_test (bool, optional): Whether to train in test mode
(default: ``False``).
- load_from_hub (bool, optional): Whether to load the model from
- HuggingFace model hub or locally (default: ``False``).
Example:
- >>> # Create an instance of the Recogniser class
- >>> recogniser = Recogniser(
- model="ner-model",
- train_dataset="train.json",
- test_dataset="test.json",
- base_model="bert-base-uncased",
- model_path="/path/to/model/",
- training_args={
- "batch_size": 8,
- "num_train_epochs": 10,
- "learning_rate": 0.00005,
- "weight_decay": 0.0,
- },
- overwrite_training=False,
- do_test=False,
- load_from_hub=False
- )
-
- >>> # Create and load the NER pipeline
- >>> pipeline = recogniser.create_pipeline()
+ ```
+ # Create an instance of the CustomRecogniser class
+ recogniser = CustomRecogniser(
+ model_name="ner-model",
+ train_dataset="train.json",
+ test_dataset="test.json",
+ base_model="bert-base-uncased",
+ model_path="/path/to/model/",
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 10,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False,
+ do_test=False,
+ )
- >>> # Train the model
- >>> recogniser.train()
+ # Create and load the NER pipeline
+ recogniser.load()
- >>> # Predict named entities in a sentence
- >>> sentence = "I live in London."
- >>> predictions = recogniser.ner_predict(sentence)
- >>> print(predictions)
+ # Predict named entities in a sentence
+ sentence = "I live in London."
+ predictions = recogniser.ner_predict(sentence)
+ print(predictions)
+ ```
"""
def __init__(
self,
- model: str,
+ model_name: str,
train_dataset: Optional[str] = "",
test_dataset: Optional[str] = "",
pipe: Optional[Pipeline] = None,
@@ -93,12 +337,12 @@ def __init__(
},
overwrite_training: Optional[bool] = False,
do_test: Optional[bool] = False,
- load_from_hub: Optional[bool] = False,
+ device: Optional[str]=None,
):
"""
Initialises a Recogniser object.
"""
- self.model = model
+ super().__init__(model_name, device)
self.train_dataset = train_dataset
self.test_dataset = test_dataset
self.pipe = pipe
@@ -107,47 +351,66 @@ def __init__(
self.training_args = training_args
self.overwrite_training = overwrite_training
self.do_test = do_test
- self.load_from_hub = load_from_hub
- # Add "_test" to the model name if do_test is True, unless
- # the model is downloaded from Huggingface, in which case
- # we keep the name inputed by the user.
- if self.do_test == True and self.load_from_hub == False:
- self.model += "_test"
+ # Add "_test" to the model name if do_test is True.
+ if self.do_test:
+ self.model_name += "_test"
- # -------------------------------------------------------------
def __str__(self) -> str:
"""
Returns a string representation of the Recogniser object.
Returns:
- str: String representation of the Recogniser object.
+ A string representation of the Recogniser object.
"""
- s = "\n>>> Toponym recogniser:\n"
- s += f" * Model path: {self.model_path}\n"
- s += f" * Model name: {self.model}\n"
+ s = super().__str__()
s += f" * Base model: {self.base_model}\n"
s += f" * Overwrite model if exists: {self.overwrite_training}\n"
s += f" * Train in test mode: {self.do_test}\n"
- s += f" * Load from hub: {self.load_from_hub}\n"
s += f" * Training args: {self.training_args}\n"
return s
- # -------------------------------------------------------------
- def train(self) -> None:
+ def model(self) -> str:
"""
- Trains a NER model.
+ Returns the path and filename of the trained model.
Returns:
- None.
+ The path and filename of the trained model
+ """
+ return os.path.join(self.model_path, f"{self.model_name}.model")
+
+ # Override the load method to train the model if necessary.
+ def load(self):
+ """
+ Creates a Named Entity Recognition (NER) pipeline and assigns it
+ to the ``pipe`` attribute.
Note:
- If the model is obtained from the HuggingFace model hub
- (``load_from_hub=True``) or if the model already exists at the
- specified model path and ``overwrite_training`` is False,
- training is skipped.
+ This method creates and loads a NER pipeline for performing named
+ entity recognition tasks. Unless a trained model already exists and
+ overwrite_training is False, it calls the ``train`` method to
+ train a custom model and saves it using the specified model name
+ and model path. It then creates the pipeline from that model.
+ The created pipeline is stored in the ``pipe`` attribute of the
+ ``Recogniser`` object.
+ """
+
+ if Path(self.model()).exists() and not self.overwrite_training:
+ s = "\n** Note: Model "
+ s += f"{self.model()} is already trained.\n"
+ s += "Set overwrite_training to True if needed.\n"
+ print(s)
+ else:
+ self.train()
+
+ super().load()
- Otherwise, the training process is executed, including the
+ def train(self):
+ """
+ Trains an NER model and saves it under the model path.
+
+ Note:
+ Training process is executed, including the
loading of datasets, model, and tokenizer, tokenization and
alignment of labels, computation of evaluation metrics,
training using the Trainer object, evaluation, and saving the
@@ -157,22 +420,9 @@ def train(self) -> None:
True when the Recogniser object was initiated.
Credit:
- This function is adapted from `a HuggingFace tutorial `_.
+ This function is adapted from a [HuggingFace tutorial](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb).
"""
- # Skip training if the model is obtained from the hub:
- if self.load_from_hub == True:
- return None
-
- # If model exists and overwrite is set to False, skip training:
- model_path = os.path.join(self.model_path,f"{self.model}.model")
- if Path(model_path).exists() and self.overwrite_training == False:
- s = "\n** Note: Model "
- s += f"{model_path} is already trained.\n"
- s += "Set overwrite to True if needed.\n"
- print(s)
- return None
-
print("*** Training the toponym recognition model...")
# Create a path to store the model if it does not exist:
@@ -227,7 +477,7 @@ def train(self) -> None:
# Align tokens and labels when training:
lwm_train_tok = lwm_train.map(
partial(
- ner.training_tokenize_and_align_labels,
+ ner_utils.training_tokenize_and_align_labels,
tokenizer=tokenizer,
label_encoding_dict=label2id,
),
@@ -235,7 +485,7 @@ def train(self) -> None:
)
lwm_test_tok = lwm_test.map(
partial(
- ner.training_tokenize_and_align_labels,
+ ner_utils.training_tokenize_and_align_labels,
tokenizer=tokenizer,
label_encoding_dict=label2id,
),
@@ -270,7 +520,7 @@ def compute_metrics(p: Tuple[list, list]) -> dict:
training_args = TrainingArguments(
output_dir=self.model_path,
evaluation_strategy="epoch",
- logging_dir=os.path.join(self.model_path,"runs/",self.model),
+ logging_dir=os.path.join(self.model_path,"runs/",self.model_name),
learning_rate=self.training_args["learning_rate"],
per_device_train_batch_size=self.training_args["batch_size"],
per_device_eval_batch_size=self.training_args["batch_size"],
@@ -295,102 +545,4 @@ def compute_metrics(p: Tuple[list, list]) -> dict:
trainer.evaluate()
# Save the model:
- trainer.save_model(os.path.join(self.model_path,f"{self.model}.model"))
-
- # -------------------------------------------------------------
- def create_pipeline(self) -> Pipeline:
- """
- Creates and loads a Named Entity Recognition (NER) pipeline.
-
- Returns:
- geoparser.pipeline.Pipeline: The created NER pipeline.
-
- Note:
- This method creates and loads a NER pipeline for performing named
- entity recognition tasks. It uses the specified model name and
- model path (if the model is not obtained from the HuggingFace
- model hub or from a local path) to initialise the pipeline.
- The created pipeline is stored in the ``pipe`` attribute of the
- ``Recogniser`` object. It is also returned by the method.
- """
-
- print("*** Creating and loading a NER pipeline.")
-
- # Path to NER Model:
- model_name = self.model
-
- # If the model is local (has not been obtained from the hub),
- # pre-append the model path and the extension of the model
- # to obtain the model name.
- if self.load_from_hub == False:
- model_name = os.path.join(self.model_path, f"{self.model}.model")
-
- # Load a NER pipeline:
- self.pipe = pipeline("ner", model=model_name, ignore_labels=[])
- return self.pipe
-
- # -------------------------------------------------------------
- def ner_predict(self, sentence: str) -> List[dict]:
- """
- Predicts named entities in a given sentence using the NER pipeline.
-
- Arguments:
- sentence (str): The input sentence.
-
- Returns:
- List[dict]:
- A list of dictionaries representing the predicted named
- entities. Each dictionary contains the keys ``"word"``,
- ``"entity"``, ``"score"``, ``"start"`` , and ``"end"``
- representing the entity text, entity label, confidence
- score and start and end character position of the text
- respectively. For example:
-
- .. code-block:: json
-
- {
- "word": "From",
- "entity": "O",
- "score": 0.99975187,
- "start": 0,
- "end": 4
- }
-
- Note:
- This method takes a sentence as input and uses the NER pipeline to
- predict named entities in the sentence.
-
- Any n-dash characters (``—``) in the provided sentence are
- replaced with a comma (``,``) to handle parsing issues related to
- the n-dash in OCR from historical newspapers.
- """
- # Error if the sentence is too short.
- if len(sentence) <= 1:
- return []
-
- # The n-dash is a very frequent character in historical newspapers,
- # but the NER pipeline does not process it well: Plymouth—Kingston
- # is parsed as "Plymouth (B-LOC), — (B-LOC), Kingston (B-LOC)", instead
- # of the n-dash being interpreted as a word separator. Therefore, we
- # replace it by a comma, except when the n-dash occurs in the opening
- # position of a sentence.
- sentence = sentence[0] + sentence[1:].replace("—", ",")
-
- # Run the NER pipeline to predict mentions:
- ner_preds = self.pipe(sentence)
-
- # Post-process the predictions, fixing potential grouping errors:
- lEntities = []
- predictions = []
- for pred_ent in ner_preds:
- pred_ent["score"] = float(pred_ent["score"])
- pred_ent["entity"] = pred_ent["entity"]
- pred_ent = ner.fix_capitalization(pred_ent, sentence)
- predictions = ner.aggregate_entities(pred_ent, lEntities)
-
- if len(predictions) > 0:
- predictions = ner.fix_hyphens(predictions)
- predictions = ner.fix_nested(predictions)
- predictions = ner.fix_startEntity(predictions)
-
- return predictions
+ trainer.save_model(self.model())
diff --git a/t_res/geoparser/pipeline.py b/t_res/geoparser/pipeline.py
index 371c8e05..63a5052e 100644
--- a/t_res/geoparser/pipeline.py
+++ b/t_res/geoparser/pipeline.py
@@ -1,901 +1,193 @@
-import os
-import sys
-from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Optional
from sentence_splitter import split_text_into_sentences
-from ..utils import ner, rel_utils
-from . import linking, ranking, recogniser
-
+from . import ner, ranking, linking
+from ..utils.dataclasses import SentenceContext, SentenceMentions, SentenceCandidates, Candidates, Predictions
class Pipeline:
"""
- Represents a pipeline for processing a text using natural language
- processing, including Named Entity Recognition (NER), Ranking, and Linking,
- to geoparse any entities in the text.
+ The Pipeline class constitutes an end-to-end pipeline for toponym resolution
+ using natural language processing, including Named Entity Recognition (NER),
+ candidate ranking, and linking to geolocated entities in the Wikidata
+ knowledgebase.
Arguments:
- myner (recogniser.Recogniser, optional): The NER (Named Entity
+ ner (ner.Recogniser, optional): The NER (Named Entity
Recogniser) object to use in the pipeline. If None, a default
``Recogniser`` will be instantiated. For the default settings, see
Notes below.
- myranker (ranking.Ranker, optional): The ``Ranker`` object to use in
+ ranker (ranking.Ranker, optional): The ``Ranker`` object to use in
the pipeline. If None, the default ``Ranker`` will be instantiated.
For the default settings, see Notes below.
- mylinker (linking.Linker, optional): The ``Linker`` object to use in
+ linker (linking.Linker, optional): The ``Linker`` object to use in
the pipeline. If None, the default ``Linker`` will be instantiated.
For the default settings, see Notes below.
resources_path (str, optional): The path to your resources directory.
experiments_path (str, optional): The path to the experiments directory.
- Default is "../experiments".
Example:
- >>> # Instantiate the Pipeline object with a default setup
- >>> pipeline = Pipeline()
+ ```python
+ # Instantiate the Pipeline object with a default setup
+ pipeline = Pipeline()
- >>> # Now you can use the pipeline for processing text or sentences
- >>> text = "I visited Paris and New York City last summer."
- >>> processed_data = pipeline.run_text(text)
+ # Now you can use the pipeline for processing text or sentences
+ text = "I visited Paris and New York City last summer."
+ predictions = pipeline.run(text)
+ print(predictions)
- >>> # Access the processed mentions in the document
- >>> for mention in processed_data:
- >>> print(mention)
+ # Access the results for each toponym mention in the text
+ for toponym in predictions.candidates():
+ print(toponym)
+ ```
Note:
- * The default settings for the ``Recogniser``:
-
- .. code-block:: python
-
- recogniser.Recogniser(
- model="Livingwithmachines/toponym-19thC-en",
- load_from_hub=True,
- )
-
- * The default settings for the ``Ranker``:
-
- .. code-block:: python
-
- ranking.Ranker(
- method="perfectmatch",
- resources_path=resources_path,
- )
+ The default settings for the ``Recogniser``:
- * The default settings for the ``Linker``:
+ ```python
+ ner.PretrainedRecogniser(
+ model="Livingwithmachines/toponym-19thC-en",
+ )
+ ```
- .. code-block:: python
+ The default settings for the ``Ranker``:
+ ```python
+ ranking.PerfectMatchRanker(
+ resources_path=resources_path,
+ )
+ ```
- linking.Linker(
- method="mostpopular",
- resources_path=resources_path,
- )
+ The default settings for the ``Linker``:
+ ```python
+ linking.MostPopularLinker(
+ resources_path=resources_path,
+ experiments_path=experiments_path,
+ )
+ ```
"""
def __init__(
self,
- myner: Optional[recogniser.Recogniser] = None,
- myranker: Optional[ranking.Ranker] = None,
- mylinker: Optional[linking.Linker] = None,
+ recogniser: Optional[ner.Recogniser] = None,
+ ranker: Optional[ranking.Ranker] = None,
+ linker: Optional[linking.Linker] = None,
resources_path: Optional[str] = None,
- experiments_path: Optional[str] = None,
+ experiments_path: Optional[str] = "../experiments",
):
"""
Instantiates a Pipeline object.
"""
- self.myner = myner
- self.myranker = myranker
- self.mylinker = mylinker
+ self.recogniser = recogniser
+ self.ranker = ranker
+ self.linker = linker
- # If myner is None, instantiate the default Recogniser.
- if not self.myner:
- self.myner = recogniser.Recogniser(
- model="Livingwithmachines/toponym-19thC-en",
- load_from_hub=True,
+ # If ner is None, instantiate the default Recogniser.
+ if not self.recogniser:
+ self.recogniser = ner.PretrainedRecogniser(
+ model_name="Livingwithmachines/toponym-19thC-en",
)
- # If myranker is None, instantiate the default Ranker.
- if not self.myranker:
+ # If ranker is None, instantiate the default Ranker.
+ if not self.ranker:
if not resources_path:
raise ValueError("[ERROR] Please specify path to resources directory.")
- self.myranker = ranking.Ranker(
- method="perfectmatch",
+ self.ranker = ranking.PerfectMatchRanker(
resources_path=resources_path,
)
- # If mylinker is None, instantiate the default Linker.
- if not self.mylinker:
+ # If linker is None, instantiate the default Linker.
+ if not self.linker:
if not resources_path:
raise ValueError("[ERROR] Please specify path to resources directory.")
-
- if experiments_path:
- self.mylinker = linking.Linker(
- method="mostpopular",
- resources_path=resources_path,
- experiments_path=experiments_path,
- )
- else:
- self.mylinker = linking.Linker(
- method="mostpopular",
- resources_path=resources_path,
- )
-
- # -----------------------------------------
- # NER training and creating pipeline:
-
- # Train the NER models if needed:
- self.myner.train()
-
- # Load the NER pipeline:
- self.myner.pipe = self.myner.create_pipeline()
-
- # -----------------------------------------
- # Ranker loading resources and training a model:
-
- # Load the resources:
- self.myranker.mentions_to_wikidata = self.myranker.load_resources()
-
- # Train a DeezyMatch model if needed:
- self.myranker.train()
-
- # -----------------------------------------
- # Linker loading resources:
-
- # Load linking resources:
- self.mylinker.linking_resources = self.mylinker.load_resources()
-
- # Train a linking model if needed (it requires myranker to generate
- # potential candidates to the training set):
- self.mylinker.rel_params["ed_model"] = self.mylinker.train_load_model(
- self.myranker
- )
-
- def run_sentence(
- self,
- sentence: str,
- sent_idx: Optional[int] = 0,
- context: Optional[Tuple[str, str]] = ("", ""),
- place: Optional[str] = "",
- place_wqid: Optional[str] = "",
- postprocess_output: Optional[bool] = True,
- without_microtoponyms: Optional[bool] = False,
- ) -> List[dict]:
- """
- Runs the pipeline on a single sentence.
-
- Arguments:
- sentence (str): The input sentence to process.
- sent_idx (int, optional): Index position of the target sentence in
- a larger text. Defaults to ``0``.
- context (tuple, optional): A tuple containing the previous and
- next sentences as context. Defaults to ``("", "")``.
- place (str, optional): The place of publication associated with
- the sentence as a human-legible string (e.g. "London").
- Defaults to ``""``.
- place_wqid (str, optional): The Wikidata ID of the place of
- publication provided in ``place`` (e.g. "Q84"). Defaults to
- ``""``.
- postprocess_output (bool, optional): Whether to postprocess the
- output, adding geographic coordinates. Defaults to ``True``.
- without_microtoponyms (bool, optional): Specifies whether to
- exclude microtoponyms during processing. Defaults to ``False``.
-
- Returns:
- List[dict]:
- A list of dictionaries representing the processed identified
- and linked toponyms in the sentence. Each dictionary contains
- the following keys:
-
- - ``sent_idx`` (int): The index of the sentence.
- - ``mention`` (str): The mention text.
- - ``pos`` (int): The starting position of the mention in the
- sentence.
- - ``end_pos`` (int): The ending position of the mention in the
- sentence.
- - ``tag`` (str): The NER label of the mention.
- - prediction`` (str): The predicted entity linking result.
- - ner_score`` (float): The NER score of the mention.
- - ed_score`` (float): The entity disambiguation score.
- - sentence`` (str): The input sentence.
- - prior_cand_score`` (dict): A dictionary of candidate
- entities and their string matching confidence scores.
- - ``cross_cand_score`` (dict): A dictionary of candidate
- entities and their cross-candidate confidence scores.
-
- If ``postprocess_output`` is set to True, the dictionaries
- will also contain the following two keys:
-
- - ``latlon`` (tuple): The latitude and longitude coordinates of
- the predicted entity.
- - ``wkdt_class`` (str): The Wikidata class of the predicted
- entity.
-
- Note:
- The ``run_sentence`` method processes a single sentence through the
- pipeline, performing tasks such as Named Entity Recognition (NER),
- ranking, and linking. It takes the input sentence along with
- optional parameters like the sentence index, context, the place of
- publication and its related Wikidata ID. By default, the method
- performs post-processing on the output.
-
- It first identifies toponyms in the sentence, then finds relevant
- candidates and ranks them, and finally links them to the Wikidata
- ID.
- """
-
- mentions = self.run_sentence_recognition(sentence)
-
- # List of mentions for the ranker:
- rmentions = []
- if without_microtoponyms:
- rmentions = [
- {"mention": y["mention"]} for y in mentions if y["ner_label"] == "LOC"
- ]
- else:
- rmentions = [{"mention": y["mention"]} for y in mentions]
-
- # Perform candidate ranking:
- wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates(
- rmentions
- )
-
- mentions_dataset = dict()
- mentions_dataset["linking"] = []
- for m in mentions:
- prediction = self.format_prediction(
- m,
- sentence,
- wk_cands=wk_cands,
- context=context,
- sent_idx=sent_idx,
- place=place,
- place_wqid=place_wqid,
- )
- mentions_dataset["linking"].append(prediction)
-
- # If the linking method is "reldisamb", rank and format candidates,
- # and produce a prediction:
- if self.mylinker.method == "reldisamb":
- mentions_dataset = rel_utils.rank_candidates(
- mentions_dataset,
- wk_cands,
- self.mylinker.linking_resources["mentions_to_wikidata"],
+ self.linker = linking.MostPopularLinker(
+ resources_path=resources_path,
+ experiments_path=experiments_path,
)
- if self.mylinker.rel_params["with_publication"]:
- if place_wqid == "" or place == "":
- place_wqid = self.mylinker.rel_params["default_publwqid"]
- place = self.mylinker.rel_params["default_publname"]
-
- # If "publ", add an artificial publication entry:
- mentions_dataset = rel_utils.add_publication(
- mentions_dataset,
- place,
- place_wqid,
- )
+ self.recogniser.load()
+ self.ranker.load()
+ self.linker.load()
- predicted = self.mylinker.rel_params["ed_model"].predict(mentions_dataset)
-
- if self.mylinker.rel_params["with_publication"]:
- # ... and if "publ", now remove the artificial publication entry!
- mentions_dataset["linking"].pop()
-
- for i in range(len(mentions_dataset["linking"])):
- mentions_dataset["linking"][i]["prediction"] = predicted["linking"][i][
- "prediction"
- ]
- mentions_dataset["linking"][i]["ed_score"] = round(
- predicted["linking"][i]["conf_ed"], 3
- )
-
- # Get cross-candidate confidence scores per candidate:
- mentions_dataset["linking"][i]["cross_cand_score"] = {
- cand: score
- for cand, score in zip(
- predicted["linking"][i]["candidates"],
- predicted["linking"][i]["scores"],
- )
- if cand != "#UNK#"
- }
-
- # Sort candidates and round scores:
- mentions_dataset["linking"][i]["cross_cand_score"] = {
- k: round(v, 3)
- for k, v in sorted(
- mentions_dataset["linking"][i]["cross_cand_score"].items(),
- key=lambda item: item[1],
- reverse=True,
- )
- }
-
- # Get string matching confidence scores per candidate:
- dCs = mentions_dataset["linking"][i]["string_match_candidates"]
- mentions_dataset["linking"][i]["string_match_score"] = {
- x: (
- round(dCs[x]["Score"], 3),
- [wqc for wqc in dCs[x]["Candidates"]],
- )
- for x in dCs
- }
- # Get linking prior confidence scores per candidate:
- mentions_dataset["linking"][i]["prior_cand_score"] = {
- cand: score
- for cand, score in mentions_dataset["linking"][i]["candidates"]
- if cand in mentions_dataset["linking"][i]["cross_cand_score"]
- }
-
- # Sort candidates and round scores:
- mentions_dataset["linking"][i]["prior_cand_score"] = {
- k: round(v, 3)
- for k, v in sorted(
- mentions_dataset["linking"][i]["prior_cand_score"].items(),
- key=lambda item: item[1],
- reverse=True,
- )
- }
-
- if self.mylinker.method in ["mostpopular", "bydistance"]:
- for i in range(len(mentions_dataset["linking"])):
- mention = mentions_dataset["linking"][i]
-
- # Run entity linking per mention:
- selected_cand = self.mylinker.run(
- {
- "candidates": wk_cands[mention["mention"]],
- "place_wqid": place_wqid,
- }
- )
- mentions_dataset["linking"][i]["prediction"] = selected_cand[0]
- mentions_dataset["linking"][i]["ed_score"] = round(selected_cand[1], 3)
- dCs = mentions_dataset["linking"][i]["string_match_candidates"]
- mentions_dataset["linking"][i]["string_match_score"] = {
- x: (
- round(dCs[x]["Score"], 3),
- [wqc for wqc in dCs[x]["Candidates"]],
- )
- for x in dCs
- }
- mentions_dataset["linking"][i]["prior_cand_score"] = dict()
-
- # Return candidates scores for top n=7 candidates
- # (same returned by REL):
- tmp_cands = {k: round(selected_cand[2][k], 3) for k in selected_cand[2]}
- mentions_dataset["linking"][i]["cross_cand_score"] = dict(
- sorted(tmp_cands.items(), key=lambda x: x[1], reverse=True)[:7]
- )
-
- if not postprocess_output:
- return mentions_dataset
-
- if postprocess_output:
- # Process output, add coordinates and wikidata class from
- # prediction:
- keys = [
- "sent_idx",
- "mention",
- "pos",
- "end_pos",
- "tag",
- "prediction",
- "ner_score",
- "ed_score",
- "sentence",
- "string_match_score",
- "prior_cand_score",
- "cross_cand_score",
- ]
- sentence_dataset = []
- for md in mentions_dataset["linking"]:
- md = dict((k, md[k]) for k in md if k in keys)
- md["latlon"] = self.mylinker.linking_resources["wqid_to_coords"].get(
- md["prediction"]
- )
- md["wkdt_class"] = self.mylinker.linking_resources["entity2class"].get(
- md["prediction"]
- )
- sentence_dataset.append(md)
- return sentence_dataset
-
- def run_text(
- self,
- text: str,
- place: Optional[str] = "",
- place_wqid: Optional[str] = "",
- postprocess_output: Optional[bool] = True,
- ) -> List[dict]:
+ # TODO: docstring
+ def run(self,
+ text: str,
+ place_of_pub_wqid: Optional[str]=None,
+ place_of_pub: Optional[str]=None,
+ ) -> Predictions:
"""
- Runs the pipeline on a text document.
+ Runs the end-to-end pipeline.
- Arguments:
- text (str): The input text document to process.
- place (str, optional): The place of publication associated with
- the text document as a human-legible string (e.g.
- ``"London"``). Defaults to ``""``.
- place_wqid (str, optional): The Wikidata ID of the place of
- publication provided in ``place`` (e.g. ``"Q84"``). Defaults
- to ``""``.
- postprocess_output (bool, optional): Whether to postprocess the
- output, adding geographic coordinates. Defaults to ``True``.
+ Args:
+ text (str): A block of text.
+ place_of_pub_wqid (Optional[str]): The Wikidata ID of the
+ place of publication of the text, if available.
+ place_of_pub (Optional[str]): The place of publication of
+ the text, if available.
Returns:
- List[dict]:
- A list of dictionaries representing the identified and linked
- toponyms in the sentence. Each dictionary contains the following
- keys:
-
- * "sent_idx" (int): The index of the sentence.
- * "mention" (str): The mention text.
- * "pos" (int): The starting position of the mention in the
- sentence.
- * "end_pos" (int): The ending position of the mention in the
- sentence.
- * "tag" (str): The NER label of the mention.
- * "prediction" (str): The predicted entity linking result.
- * "ner_score" (float): The NER score of the mention.
- * "ed_score" (float): The entity disambiguation score.
- * "sentence" (str): The input sentence.
- * "prior_cand_score" (dict): A dictionary of candidate
- entities and their string matching confidence scores.
- * "cross_cand_score" (dict): A dictionary of candidate
- entities and their cross-candidate confidence scores.
-
- If ``postprocess_output`` is set to True, the dictionaries
- will also contain the following two keys:
-
- * "latlon" (tuple): The latitude and longitude coordinates of
- the predicted entity.
- * "wkdt_class" (str): The Wikidata class of the predicted
- entity.
-
- Note:
- The ``run_text`` method processes an entire text through the
- pipeline, after splitting it into sentences, performing the tasks
- of Named Entity Recognition (NER), ranking, and linking. It takes
- the input text document along with optional parameters like the
- place of publication and its related Wikidata ID and splits it
- into sentences. By default, the method performs post-processing
- on the output.
-
- It first identifies toponyms in each of the text document's
- sentences, then finds relevant candidates and ranks them, and
- finally links them to the Wikidata ID.
-
- This method runs the
- :py:meth:`~geoparser.pipeline.Pipeline.run_sentence` method for
- each of the document's sentences. The ``without_microtoponyms``
- keyword, passed to ``run_sentence`` comes from the ``Linker``'s
- (passed when initialising the
- :py:meth:`~geoparser.pipeline.Pipeline` object) ``rel_params``
- parameter. See :py:class:`geoparser.linking.Linker` for
- instructions on how to set that up.
-
+ Toponyms identified in the text, linked to the Wikidata
+ knowledgebase.
"""
- # Split the text into its sentences:
- sentences = split_text_into_sentences(text, language="en")
-
- document_dataset = []
- for idx, sentence in enumerate(sentences):
- # Get context (prev and next sentence)
- context = ["", ""]
- if idx - 1 >= 0:
- context[0] = sentences[idx - 1]
- if idx + 1 < len(sentences):
- context[1] = sentences[idx + 1]
-
- # Run pipeline on sentence:
- sentence_dataset = self.run_sentence(
- sentence,
- sent_idx=idx,
- context=context,
- place=place,
- place_wqid=place_wqid,
- postprocess_output=postprocess_output,
- without_microtoponyms=self.mylinker.rel_params.get(
- "without_microtoponyms", False
- ),
- )
- # Collect results from all sentences:
- for sd in sentence_dataset:
- document_dataset.append(sd)
+ mentions = self.run_text_recognition(text)
+ candidates = self.run_candidate_selection(mentions, place_of_pub_wqid, place_of_pub)
+ return self.run_disambiguation(candidates)
- return document_dataset
+ ### Modular/stepwise methods:
- def run_sentence_recognition(self, sentence) -> List[dict]:
- # Get predictions:
- predictions = self.myner.ner_predict(sentence)
-
- # Process predictions:
- procpreds = [
- [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]]
- for x in predictions
- ]
-
- # Aggregate mentions:
- mentions = ner.aggregate_mentions(procpreds, "pred")
- return mentions
-
- def format_prediction(
- self,
- mention,
- sentence: str,
- wk_cands: Optional[dict] = None,
- context: Optional[Tuple[str, str]] = ("", ""),
- sent_idx: Optional[int] = 0,
- place: Optional[str] = "",
- place_wqid: Optional[str] = "",
- ) -> dict:
- prediction = dict()
- prediction["mention"] = mention["mention"]
- prediction["context"] = context
- prediction["candidates"] = []
- prediction["gold"] = ["NONE"]
- prediction["ner_score"] = mention["ner_score"]
- prediction["pos"] = mention["start_char"]
- prediction["sent_idx"] = sent_idx
- prediction["end_pos"] = mention["end_char"]
- prediction["ngram"] = mention["mention"]
- prediction["conf_md"] = mention["ner_score"]
- prediction["tag"] = mention["ner_label"]
- prediction["sentence"] = sentence
- prediction["place"] = place
- prediction["place_wqid"] = place_wqid
- if wk_cands:
- prediction["string_match_candidates"] = wk_cands.get(
- mention["mention"], dict()
- )
- prediction["candidates"] = wk_cands.get(mention["mention"], dict())
- return prediction
-
- def run_text_recognition(
- self,
- text: str,
- place: Optional[str] = "",
- place_wqid: Optional[str] = "",
- ) -> List[dict]:
+ def run_text_recognition(self, text: str) -> List[SentenceMentions]:
"""
- Runs the NER on a text document and returns the recognised entities
- in the format required by future steps: candidate selection and
- entity disambiguation.
-
- Arguments:
- text (str): The input text document to process.
- place (str, optional): The place of publication associated with
- the text document as a human-legible string (e.g.
- ``"London"``). Defaults to ``""``.
- place_wqid (str, optional): The Wikidata ID of the place of
- publication provided in ``place`` (e.g. ``"Q84"``). Defaults
- to ``""``.
+ Runs the named entity recognition step of the pipeline.
+
+ Args:
+ text (str): A block of text.
Returns:
- List[dict]:
- A list of dictionaries representing the identified toponyms
- in the sentence, in the format required by the following
- steps in the pipeline: candidate selection and entity
- disambiguation. Each dictionary contains the following keys:
-
- - ``mention`` (str): The mention text.
- - ``context`` (list): List of two strings corresponding to
- the context (i.e. previous and next sentence).
- - ``candidates`` (list): List of candidates, which at this
- point will be empty.
- - ``gold`` (list): List containing the gold standard entity,
- which is and will remain ``['NONE']``.
- - ``ner_score`` (float): The NER score of the mention.
- - ``pos`` (int): The starting position of the mention in the
- sentence.
- - ``sent_idx`` (int): The index of the sentence.
- - ``end_pos`` (int): The ending position of the mention in the
- sentence.
- - ``ngram`` (str): The mention text (redundant).
- - ``conf_md`` (str): The NER score of the mention (redundant).
- - ``tag`` (str): The NER label of the mention.
- - ``prediction`` (str): The predicted entity linking result.
- - ``sentence`` (str): The input sentence.
-
- Note:
- The ``run_text_recognition`` method runs Named Entity Recognition
- (NER) on a full text, one sentence at a time. It takes the input text
- (along with optional parameters like the place of publication
- and its related Wikidata ID) and splits it into sentences, and
- after that finds mentions for each sentence.
+ A list of `SentenceMentions` instances, one for each sentence in
+ the text.
"""
-
- # Split the text into its sentences:
- sentences = split_text_into_sentences(text, language="en")
-
- document_dataset = []
- for idx, sentence in enumerate(sentences):
- # Get context (prev and next sentence)
- context = ["", ""]
- if idx - 1 >= 0:
- context[0] = sentences[idx - 1]
- if idx + 1 < len(sentences):
- context[1] = sentences[idx + 1]
-
- mentions = self.run_sentence_recognition(sentence)
-
- mentions_dataset = []
- for m in mentions:
- prediction = self.format_prediction(
- m,
- sentence,
- wk_cands=None,
- context=context,
- sent_idx=idx,
- place=place,
- place_wqid=place_wqid,
- )
- # mentions_dataset["linking"].append(prediction)
- if not len(m["mention"]) == 1 and not m["mention"].islower():
- mentions_dataset.append(prediction)
-
- # Collect results from all sentences:
- for sd in mentions_dataset:
- document_dataset.append(sd)
-
- return document_dataset
-
- def run_candidate_selection(self, document_dataset: List[dict]) -> dict:
+ # Split the text into sentences.
+ text = str(text)
+ sentences = SentenceContext.from_text(text, language="en")
+ return [self.recogniser.run(sentence.sentence) for sentence in sentences]
+
+ def run_candidate_selection(self,
+ sentence_mentions: List[SentenceMentions],
+ place_of_pub_wqid: Optional[str]=None,
+ place_of_pub: Optional[str]=None,
+ ) -> Candidates:
"""
- Performs candidate selection on already identified toponyms,
- resulting from the ``run_text_recognition`` method. Given a
- list of dictionaries corresponding to mentions, this method
- first extracts the subset of mentions for which to try to find
- candidates and then runs the ``find_candidates`` function from
- the Ranker object. This method returns a dictionary of all
- mentions and their candidates, with a similarity score.
-
- Arguments:
- document_dataset (List[dict]): The list of mentions identified,
- formatted as dictionaries.
+ Runs the candidate selection step of the pipeline.
+
+ Args:
+ sentence_mentions (List[SentenceMentions]): A list of
+ `SentenceMentions` instances, as produced by the
+ `run_text_recognition` method.
+ place_of_pub_wqid (Optional[str]): The Wikidata ID of the
+ place of publication of the text, if available.
+ place_of_pub (Optional[str]): The place of publication of
+ the text, if available.
Returns:
- dict:
- A three-level nested dictionary, as show in the example
- in the Note below. The outermost key is the mention as
- has been identified in the text, the first-level nested
- keys are candidate mentions found in Wikidata (i.e. potential
- matches for the original mention). The second-level nested
- keys are the match confidence score and the Wikidata entities
- that correspond to the candidate mentions, each with its
- associated normalised mention-to-wikidata relevance score.
-
- Note:
-
- .. code-block:: python
-
- {'Salop': {
- 'Salop': {
- 'Score': 1.0,
- 'Candidates': {
- 'Q201970': 0.0006031363088057901,
- 'Q23103': 0.0075279261777561925
- }
- }
- }
- }
-
+ A `Candidates` instance containing toponym candidates.
"""
-
- # Get without_microtoponyms value (whether to resolve microtoponyms or not):
- without_microtoponyms = self.mylinker.rel_params.get(
- "without_microtoponyms", False
- )
-
- # List of mentions for the ranker:
- rmentions = []
- if without_microtoponyms:
- rmentions = [y["mention"] for y in document_dataset if y["tag"] == "LOC"]
- else:
- rmentions = [y["mention"] for y in document_dataset]
-
- # Make list of mentions unique:
- mentions = list(set(rmentions))
-
- # Prepare list of mentions as required by candidate selection and ranking:
- mentions = [{"mention": m} for m in mentions]
-
- # Perform candidate ranking:
- wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates(
- mentions
- )
- return wk_cands
-
- def run_disambiguation(
- self,
- dataset,
- wk_cands,
- place: Optional[str] = "",
- place_wqid: Optional[str] = "",
- ):
+ sentence_candidates = list()
+ for sms in sentence_mentions:
+ matches = self.ranker.run(sms.mentions)
+ candidates = [self.linker.run(m, place_of_pub_wqid, place_of_pub) for m in matches]
+ sentence_candidates.append(SentenceCandidates(sms.sentence, candidates))
+ return Candidates(sentence_candidates)
+
+ def run_disambiguation(self, candidates: Candidates) -> Predictions:
"""
- Performs entity disambiguation given a list of already identified
- toponyms and selected candidates.
-
- Arguments:
- dataset (List[dict]): The list of mentions identified,
- formatted as dictionaries.
- wk_cands (dict): A three-level nested dictionary mapping
- mentions to potential Wikidata entities.
- place (str, optional): The place of publication associated with
- the text document as a human-legible string (e.g.
- ``"London"``). Defaults to ``""``.
- place_wqid (str, optional): The Wikidata ID of the place of
- publication provided in ``place`` (e.g. ``"Q84"``). Defaults
- to ``""``.
+ Runs the entity disambiguation step of the pipeline.
+
+ Args:
+ candidates (Candidates): A `Candidates` instance, as produced by
+ the `run_candidate_selection` method.
Returns:
- List[dict]:
- A list of dictionaries representing the identified and linked
- toponyms in the sentence. Each dictionary contains the following
- keys:
-
- * "mention" (str): The mention text.
- * "ner_score" (float): The NER score of the mention.
- * "pos" (int): The starting position of the mention in the
- sentence.
- * "sent_idx" (int): The index of the sentence.
- * "end_pos" (int): The ending position of the mention in the
- sentence.
- * "tag" (str): The NER label of the mention.
- * "sentence" (str): The input sentence.
- * "prediction" (str): The predicted entity linking result.
- * "ed_score" (float): The entity disambiguation score.
- * "string_match_score" (dict): A dictionary of candidate
- entities and their string matching confidence scores.
- * "prior_cand_score" (dict): A dictionary of candidate
- entities and their prior confidence scores.
- * "cross_cand_score" (dict): A dictionary of candidate
- entities and their cross-candidate confidence scores.
- * "latlon" (tuple): The latitude and longitude coordinates of
- the predicted entity.
- * "wkdt_class" (str): The Wikidata class of the predicted
- entity.
+ A `Predictions` instance containing linked toponym predictions.
"""
-
- mentions_dataset = dict()
- mentions_dataset["linking"] = []
- for prediction in dataset:
- prediction["candidates"] = wk_cands.get(prediction["mention"], dict())
- prediction["string_match_candidates"] = prediction["candidates"]
- mentions_dataset["linking"].append(prediction)
-
- # If the linking method is "reldisamb", rank and format candidates,
- # and produce a prediction:
- if self.mylinker.method == "reldisamb":
- mentions_dataset = rel_utils.rank_candidates(
- mentions_dataset,
- wk_cands,
- self.mylinker.linking_resources["mentions_to_wikidata"],
- )
-
- if self.mylinker.rel_params["with_publication"]:
- if place_wqid == "" or place == "":
- place_wqid = self.mylinker.rel_params["default_publwqid"]
- place = self.mylinker.rel_params["default_publname"]
-
- # If "publ", add an artificial publication entry:
- mentions_dataset = rel_utils.add_publication(
- mentions_dataset,
- place,
- place_wqid,
- )
-
- predicted = self.mylinker.rel_params["ed_model"].predict(mentions_dataset)
-
- if self.mylinker.rel_params["with_publication"]:
- # ... and if "publ", now remove the artificial publication entry!
- mentions_dataset["linking"].pop()
-
- for i in range(len(mentions_dataset["linking"])):
- mentions_dataset["linking"][i]["prediction"] = predicted["linking"][i][
- "prediction"
- ]
- mentions_dataset["linking"][i]["ed_score"] = round(
- predicted["linking"][i]["conf_ed"], 3
- )
-
- # Get cross-candidate confidence scores per candidate:
- mentions_dataset["linking"][i]["cross_cand_score"] = {
- cand: score
- for cand, score in zip(
- predicted["linking"][i]["candidates"],
- predicted["linking"][i]["scores"],
- )
- if cand != "#UNK#"
- }
-
- # Sort candidates and round scores:
- mentions_dataset["linking"][i]["cross_cand_score"] = {
- k: round(v, 3)
- for k, v in sorted(
- mentions_dataset["linking"][i]["cross_cand_score"].items(),
- key=lambda item: item[1],
- reverse=True,
- )
- }
-
- # Get string matching confidence scores per candidate:
- dCs = mentions_dataset["linking"][i]["string_match_candidates"]
- mentions_dataset["linking"][i]["string_match_score"] = {
- x: (
- round(dCs[x]["Score"], 3),
- [wqc for wqc in dCs[x]["Candidates"]],
- )
- for x in dCs
- }
- # Get linking prior confidence scores per candidate:
- mentions_dataset["linking"][i]["prior_cand_score"] = {
- cand: score
- for cand, score in mentions_dataset["linking"][i]["candidates"]
- if cand in mentions_dataset["linking"][i]["cross_cand_score"]
- }
-
- # Sort candidates and round scores:
- mentions_dataset["linking"][i]["prior_cand_score"] = {
- k: round(v, 3)
- for k, v in sorted(
- mentions_dataset["linking"][i]["prior_cand_score"].items(),
- key=lambda item: item[1],
- reverse=True,
- )
- }
-
- if self.mylinker.method in ["mostpopular", "bydistance"]:
- for i in range(len(mentions_dataset["linking"])):
- mention = mentions_dataset["linking"][i]
-
- # Run entity linking per mention:
- selected_cand = self.mylinker.run(
- {
- "candidates": wk_cands[mention["mention"]],
- "place_wqid": "",
- }
- )
- mentions_dataset["linking"][i]["prediction"] = selected_cand[0]
- mentions_dataset["linking"][i]["ed_score"] = round(selected_cand[1], 3)
- dCs = mentions_dataset["linking"][i]["string_match_candidates"]
- mentions_dataset["linking"][i]["string_match_score"] = {
- x: (
- round(dCs[x]["Score"], 3),
- [wqc for wqc in dCs[x]["Candidates"]],
- )
- for x in dCs
- }
- mentions_dataset["linking"][i]["prior_cand_score"] = dict()
-
- # Return candidates scores for top n=7 candidates
- # (same returned by REL):
- tmp_cands = {k: round(selected_cand[2][k], 3) for k in selected_cand[2]}
- mentions_dataset["linking"][i]["cross_cand_score"] = dict(
- sorted(tmp_cands.items(), key=lambda x: x[1], reverse=True)[:7]
- )
-
- # Process output, add coordinates and wikidata class from
- # prediction:
- keys = [
- "sent_idx",
- "mention",
- "pos",
- "end_pos",
- "tag",
- "prediction",
- "ner_score",
- "ed_score",
- "sentence",
- "string_match_score",
- "prior_cand_score",
- "cross_cand_score",
- ]
- sentence_dataset = []
- for md in mentions_dataset["linking"]:
- md = dict((k, md[k]) for k in md if k in keys)
- md["latlon"] = self.mylinker.linking_resources["wqid_to_coords"].get(
- md["prediction"]
- )
- md["wkdt_class"] = self.mylinker.linking_resources["entity2class"].get(
- md["prediction"]
- )
- sentence_dataset.append(md)
- return sentence_dataset
+ return self.linker.disambiguate(candidates.sentence_candidates)
diff --git a/t_res/geoparser/ranking.py b/t_res/geoparser/ranking.py
index a314003c..483fd929 100644
--- a/t_res/geoparser/ranking.py
+++ b/t_res/geoparser/ranking.py
@@ -1,8 +1,7 @@
import json
import os
-import sys
from pathlib import Path
-from typing import List, Literal, Optional, Tuple
+from typing import Optional, List
import pandas as pd
from DeezyMatch import candidate_ranker
@@ -10,204 +9,122 @@
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance
from ..utils import deezy_processing
-
+from ..utils.dataclasses import StringMatch, StringMatchLinks, CandidateMatches, Mention
class Ranker:
"""
The Ranker class implements a system for candidate selection through string
- variation ranking. It provides methods to select candidates based on different
- matching approaches, such as perfect match, partial match, Levenshtein distance,
- and DeezyMatch. The class also handles loading and processing of resources
- related to candidate selection.
+ variation ranking. Its subclasses provide methods to select candidates based
+ on different matching approaches, such as perfect match, partial match,
+ Levenshtein distance, and DeezyMatch. The base class handles loading and
+ processing of resources related to candidate selection.
Arguments:
- method (str): The candidate selection and ranking method to use.
resources_path (str): Relative path to the resources directory
(containing Wikidata resources).
mentions_to_wikidata (dict, optional): An empty dictionary which
will store the mapping between mentions and Wikidata IDs,
- which will be loaded through the
- :py:meth:`~geoparser.ranking.Ranker.load_resources` method.
+ which will be loaded through the Ranker's
+ [load method][t_res.geoparser.ranking.Ranker.load].
wikidata_to_mentions (dict, optional): An empty dictionary which
will store the mapping between Wikidata IDs and mentions,
- which will be loaded through the
- :py:meth:`~geoparser.ranking.Ranker.load_resources` method.
- strvar_parameters (dict, optional): Dictionary of string variation
- parameters required to create a DeezyMatch training dataset.
- For the default settings, see Notes below.
- deezy_parameters (dict, optional): Dictionary of DeezyMatch parameters
- for model training. For the default settings, see Notes below.
- already_collected_cands (dict, optional): Dictionary of already
- collected candidates. Defaults to ``dict()`` (an empty dictionary).
-
- Example:
- >>> # Create a Ranker object:
- >>> ranker = Ranker(
- method="perfectmatch",
- resources_path="/path/to/resources/",
- )
-
- >>> # Load resources
- >>> ranker.mentions_to_wikidata = ranker.load_resources()
-
- >>> # Train the ranker (if applicable)
- >>> ranker.train()
-
- >>> # Perform candidate selection
- >>> queries = ['London', 'Paraguay']
- >>> candidates, already_collected = ranker.run(queries)
-
- >>> # Find candidates for mentions
- >>> mentions = [{'mention': 'London'}, {'mention': 'Paraguay'}]
- >>> mention_candidates, mention_already_collected = ranker.find_candidates(mentions)
-
- >>> # Print the results
- >>> print("Candidate Selection Results:")
- >>> print(candidates)
- >>> print(already_collected)
- >>> print("Find Candidates Results:")
- >>> print(mention_candidates)
- >>> print(mention_already_collected)
+ which will be loaded through the Ranker's
+ [load method][t_res.geoparser.ranking.Ranker.load].
Note:
- * The default settings for ``strvar_parameters``:
-
- .. code-block:: python
-
- strvar_parameters: Optional[dict] = {
- # Parameters to create the string pair dataset:
- "ocr_threshold": 60,
- "top_threshold": 85,
- "min_len": 5,
- "max_len": 15,
- "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()),
- "w2v_ocr_model": "w2v_*_news",
- "overwrite_dataset": False,
- }
-
- * The default settings for ``deezy_parameters``:
-
- .. code-block:: python
+        This base class should not be instantiated directly. Instead use a subclass
+ constructor.
- deezy_parameters: Optional[dict] = {
- "dm_path": str(Path("resources/deezymatch/").resolve()),
- "dm_cands": "wkdtalts",
- "dm_model": "w2v_ocr",
- "dm_output": "deezymatch_on_the_fly",
- "ranking_metric": "faiss",
- "selection_threshold": 50,
- "num_candidates": 1,
- "verbose": False,
- "overwrite_training": False,
- "do_test": False,
- }
+ Example:
+ ```python
+ # Create a Ranker object:
+ ranker = PerfectMatchRanker(resources_path="/path/to/resources/")
+ # Load resources
+ ranker.load()
+ # Perform candidate selection
+ queries = ['London', 'Paraguay']
+ results = [ranker.matches(query) for query in queries]
+ # Print the results
+ print("Candidate Selection Results:")
+ for matches in results:
+ print(matches)
+ ```
"""
+ # Class attribute for the name of the ranking method.
+ method_name: str = None
+ # TODO: move wikidata_to_mentions arg to the DeezyMatchRanker __init__ only.
def __init__(
self,
- method: Literal["perfectmatch", "partialmatch", "levenshtein", "deezymatch"],
resources_path: str,
mentions_to_wikidata: Optional[dict] = dict(),
wikidata_to_mentions: Optional[dict] = dict(),
- strvar_parameters: Optional[dict] = None,
- deezy_parameters: Optional[dict] = None,
- already_collected_cands: Optional[dict] = dict(),
):
"""
Initialize a Ranker object.
"""
- self.method = method
self.resources_path = resources_path
self.mentions_to_wikidata = mentions_to_wikidata
self.wikidata_to_mentions = wikidata_to_mentions
-
- # set paths based on resources path
- if strvar_parameters is None:
- strvar_parameters = {
- # Parameters to create the string pair dataset:
- "ocr_threshold": 60,
- "top_threshold": 85,
- "min_len": 5,
- "max_len": 15,
- "w2v_ocr_path": os.path.join(resources_path, "models/w2v/"),
- "w2v_ocr_model": "w2v_*_news",
- "overwrite_dataset": False,
- }
-
- if deezy_parameters is None:
- deezy_parameters = {
- # Paths and filenames of DeezyMatch models and data:
- "dm_path": os.path.join(resources_path, "deezymatch/"),
- "dm_cands": "wkdtalts",
- "dm_model": "w2v_ocr",
- "dm_output": "deezymatch_on_the_fly",
- # Ranking measures:
- "ranking_metric": "faiss",
- "selection_threshold": 50,
- "num_candidates": 1,
- "verbose": False,
- # DeezyMatch training:
- "overwrite_training": False,
- "do_test": False,
- }
-
- self.strvar_parameters = strvar_parameters
- self.deezy_parameters = deezy_parameters
- self.already_collected_cands = already_collected_cands
+ self.cache = dict()
def __str__(self) -> str:
"""
- Returns a string representation of the Ranker object.
-
- Note:
- The string will, at minimum, include the method name, and if the
- ``method`` was set to "deezymatch" in the Ranker initialiser, the
- string will also include the training parameters provided.
+ Returns a string representation of the Ranker object, including the method name.
"""
s = ">>> Candidate selection:\n"
- s += f" * Method: {self.method}\n"
-
- if self.method == "deezymatch":
- s += " * DeezyMatch details:\n"
- s += f" * Model: {self.deezy_parameters['dm_model']}\n"
- s += f" * Ranking metric: {self.deezy_parameters['ranking_metric']}\n"
- s += f" * Selection threshold: {self.deezy_parameters['selection_threshold']}\n"
- s += f" * Num candidates: {self.deezy_parameters['num_candidates']}\n"
- s += f" * Overwrite training: {self.deezy_parameters['overwrite_training']}\n"
- s += f" * Overwrite dataset: {self.strvar_parameters['overwrite_dataset']}\n"
- s += f" * Test mode: {self.deezy_parameters['do_test']}\n"
-
+ s += f" * Method: {self.method_name}\n"
return s
-
- def load_resources(self) -> dict:
+
+ def new(**kwargs) -> 'Ranker':
"""
- Load the ranker resources.
+ Static constructor.
+
+ Args:
+ kwargs (dict): A dictionary of keyword arguments matching the
+ arguments to a subclass __init__ constructor, plus a
+ `method_name` argument to specify the desired subclass.
Returns:
- dict:
- The loaded mentions-to-wikidata dictionary, which maps a
- mention (e.g. ``"London"``) to the Wikidata entities that are
- referred to by this mention on Wikipedia (e.g. ``Q84``,
- ``Q2477346``). The data also includes, for each entity, their
- normalized "relevance", i.e. number of in-links across Wikipedia.
+ Ranker: A Ranker subclass instance.
+
+ """
+ if not 'method_name' in kwargs.keys():
+ raise ValueError("Expected `method_name` keyword argument.")
+ method_name = kwargs['method_name']
+ del kwargs['method_name']
+ if method_name == 'perfectmatch':
+ return PerfectMatchRanker(**kwargs)
+ if method_name == 'partialmatch':
+ return PartialMatchRanker(**kwargs)
+ if method_name == 'levenshtein':
+ return LevenshteinRanker(**kwargs)
+ if method_name == 'deezymatch':
+ return DeezyMatchRanker(**kwargs)
+ raise ValueError(f"Invalid ranking method: {method_name}")
+
+ def load(self):
+ """
+ Loads the ranker resources.
Note:
This method loads the mentions-to-wikidata and
wikidata-to-mentions dictionaries from the resources directory,
- specified when initialising the
- :py:meth:`~geoparser.ranking.Ranker`. They are required for
- performing candidate selection and ranking.
+ specified when initialising the [Ranker][t_res.geoparser.ranking.Ranker].
+ They are required for performing candidate selection and ranking.
- It filters the dictionaries to remove noise and updates the class
- attributes accordingly.
+ The loaded mentions-to-wikidata dictionary maps a toponym
+ (e.g. ``"London"``) to the Wikidata entities that are
+ referred to by this toponym on Wikipedia (e.g. ``Q84``,
+ ``Q2477346``). The data also includes, for each entity, its
+ normalized "relevance", i.e. number of in-links across Wikipedia.
- The method also initialises ``pandarallel`` if needed by the
- candidate ranking method (if the ``method`` set in the initialiser
- of the ``Ranker`` was set to "partialmatch" or "levenshtein").
+ The loaded dictionaries are filtered to remove noise and the class
+ attributes are updated accordingly.
"""
+
print("*** Loading the ranker resources.")
-
- # Load files
+ # NOTE: these are the *normalized* mentions, *not* relative frequencies.
files = {
"mentions_to_wikidata": os.path.join(
self.resources_path, "wikidata/mentions_to_wikidata_normalized.json"
@@ -259,142 +176,209 @@ def load_resources(self) -> dict:
del mentions_to_wikidata_filtered
del wikidata_to_mentions_filtered
- # Parallelize if ranking method is one of the following:
- if self.method in ["partialmatch", "levenshtein"]:
- pandarallel.initialize(nb_workers=10)
- os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ def run(self, mentions: List[Mention]) -> List[CandidateMatches]:
+ """
+ Executes the ranking process for a given list of toponym mentions.
+
+ Arguments:
+ mentions (List[Mention]): A list of instances of the Mention
+                dataclass containing toponyms to be matched.
- return self.mentions_to_wikidata
+ Returns:
+ A list of instances of the CandidateMatches dataclass
+ containing potential string matches for each toponym.
- def train(self) -> None:
+ Note:
+ String matches are added to the cache for efficient retrieval.
"""
- Training a DeezyMatch model. The training will be skipped if the model
- already exists and the ``overwrite_training`` key in the
- ``deezy_parameters`` passed when initialising the
- :py:meth:`~geoparser.ranking.Ranker` object is set to ``False``. The
- training will be run on test mode if the ``do_test`` key in the
- ``deezy_parameters`` passed when initialising the
- :py:meth:`~geoparser.ranking.Ranker` object is set to ``True``.
+ r = range(len(mentions))
+
+ # Identify which mentions are in the cache.
+ cache_hits = {i: CandidateMatches(mentions[i], self.method_name, self.cache[mentions[i].mention])
+ for i in r if mentions[i].mention in self.cache}
+
+ uncached_indices = list(set(r).difference(set(cache_hits.keys())))
+ uncached_indices.sort()
+ uncached_mentions = [mentions[i].mention for i in uncached_indices]
+ uncached_string_matches = self.matches(uncached_mentions)
+
+ if len(uncached_string_matches) != len(uncached_mentions):
+ raise ValueError(f"Got {len(uncached_string_matches)} lists of matches from {len(uncached_mentions)} queries.")
+
+ dict_string_matches = {uncached_indices[i]: uncached_string_matches[i] for i in range(len(uncached_indices))}
+
+ # Get the potential Wikidata links for each string match.
+ all_candidates = list()
+ for i in r:
+ if i in cache_hits.keys():
+ all_candidates.append(cache_hits[i])
+ else:
+ matches = list()
+ for match in dict_string_matches[i]:
+ wqid_links = list(self.mentions_to_wikidata.get(match.variation, dict()).keys())
+ matches.append(StringMatchLinks(match.variation, match.string_similarity, wqid_links))
+
+ candidates = CandidateMatches(mentions[i], self.method_name, matches)
+ all_candidates.append(candidates)
+
+ # Update the cache.
+ self.cache[mentions[i].mention] = matches
+
+ return all_candidates
+
+ def matches(self, queries: List[str]) -> List[List[StringMatch]]:
+ """
+ Identifies string matching candidates for each of the given toponym queries.
+
+ Each Ranker subclass must implement a ranking method by overriding
+ this function.
+
+ Args:
+ queries (List[str]): A list of toponyms to be matched.
+
+ Raises:
+ NotImplementedError: If this method is not overridden in a subclass.
Returns:
- None.
+ A list of lists of StringMatch instances containing potential
+ matches for each given toponym.
"""
+ raise NotImplementedError("Subclass implementation required.")
+
+class PerfectMatchRanker(Ranker):
+ """
+ A ranking method using perfect string matching.
- if self.method == "deezymatch":
- Path(self.deezy_parameters["dm_path"]).mkdir(parents=True, exist_ok=True)
- if self.deezy_parameters["do_test"] == True:
- self.deezy_parameters["dm_model"] += "_test"
- self.deezy_parameters["dm_cands"] += "_test"
- deezy_processing.train_deezy_model(
- self.deezy_parameters, self.strvar_parameters, self.wikidata_to_mentions
- )
- deezy_processing.generate_candidates(
- self.deezy_parameters, self.mentions_to_wikidata
- )
+ Example:
+ ```python
+ ranker = PerfectMatchRanker(resources_path="/path/to/resources/")
+ ranker.load()
+ queries = ['London', 'Barcelona', 'Bologna']
+ results = [ranker.matches(query) for query in queries]
+ # Print the results
+ print("Candidate Selection Results:")
+ for matches in results:
+ print(matches)
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "perfectmatch"
- # This dictionary is not used anymore:
- self.wikidata_to_mentions = dict()
+ def matches(self, queries: List[str]) -> List[List[StringMatch]]:
+ return [self.match_query(query) for query in queries]
- def perfect_match(self, queries: List[str]) -> Tuple[dict, dict]:
+ def match_query(self, query: str) -> List[StringMatch]:
"""
- Perform perfect matching between a provided list of mentions
- (``queries``) and the altnames in the knowledge base.
+ Performs perfect matching between a provided toponym (`query`) and the
+ altnames in the knowledge base.
Arguments:
- queries (list): A list of mentions (string) identified in a text
- to match.
+ query: A toponym query (string) to be matched.
Returns:
- Tuple[dict, dict]: A tuple containing two dictionaries:
-
- #. The first dictionary maps each mention to its candidate
- list, where the candidate list is a dictionary with the
- mention itself as the key and a perfect match score of
- ``1.0``.
-
- #. The second dictionary stores the already collected
- candidates for each mention. It is an updated version of the
- Ranker's ``already_collected_cands`` attribute.
+ A list of StringMatch instances, containing
+ potential matches for the given toponym. In the case of
+ perfect string matching, all candidates have string_similarity
+ equal to 1.0.
Note:
- This method checks if each mention has an exact match in the
+ This method checks if the query has an exact match in the
mentions_to_wikidata dictionary. If a match is found, it assigns a
- perfect match score of ``1.0`` to the mention. Otherwise, an empty
- dictionary is assigned as the candidate list for the mention.
+ perfect match score of ``1.0`` to the query. Otherwise, an empty
+ dictionary is assigned as the list of matches for the query.
"""
- candidates = {}
- for query in queries:
- if query in self.already_collected_cands:
- candidates[query] = self.already_collected_cands[query]
- else:
- if query in self.mentions_to_wikidata:
- candidates[query] = {query: 1.0}
- self.already_collected_cands[query] = {query: 1.0}
- else:
- candidates[query] = {}
- self.already_collected_cands[query] = {}
+ if query in self.mentions_to_wikidata:
+ return [StringMatch(query, 1.0)]
+ # If no match exists, assign an empty list to matches.
+ return list()
- return candidates, self.already_collected_cands
+class PartialMatchRanker(PerfectMatchRanker):
+ """
+ A ranking method using partial string matching.
+
+ This class extends PerfectMatchRanker because perfect matches are sought
+ before attempting a partial match.
+
+ Example:
+ ```python
+ # Create a Ranker object:
+ ranker = PartialMatchRanker(resources_path="/path/to/resources/")
+ # Load resources
+ ranker.load()
+ # Perform candidate selection
+ queries = ['London', 'Paraguay']
+ results = [ranker.matches(query) for query in queries]
+ # Print the results
+ print("Candidate Selection Results:")
+ for matches in results:
+ print(matches)
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "partialmatch"
+
+    # Override the load method to initialise ``pandarallel`` for parallelization.
+ def load(self):
+ super().load()
- def damlev_dist(self, query: str, row: pd.Series) -> float:
+ pandarallel.initialize(nb_workers=10)
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+ def match_query(self, query: str) -> List[StringMatch]:
"""
- Calculate the Damerau-Levenshtein distance between a mention and a row
- in the dataset.
+ Performs partial string matching for a given toponym query.
Arguments:
- query (str): A mention identified in a text.
- row (Series): A pandas Series representing a row in the dataset
- with a "mentions" column, corresponding to an alternate name
- of an etity in the knowledge base.
+ query (str): A toponym to be matched.
Returns:
- float:
- The similarity score between the query and the row, ranging
- from ``0.0`` to ``1.0``.
+ A list of StringMatch instances, containing potential matches for
+ the given toponym.
Note:
- This method computes the Damerau-Levenshtein distance between the
- lowercase versions of a mention and the "mentions" column value in
- the given row.
-
- The distance is then normalized to a similarity score by
- subtracting it from ``1.0``.
-
- Example:
- >>> ranker = Ranker(...)
- >>> query = 'apple'
- >>> row = pd.Series({'mentions': 'orange'})
- >>> similarity = ranker.damlev_dist(query, row)
- >>> print(similarity)
- 0.1666666865348816
+ This method identifies candidates via partial string matching.
+ If a perfect match exists, partial matching is skipped.
"""
- return 1.0 - normalized_damerau_levenshtein_distance(
- query.lower(), row["mentions"].lower()
+ # First attempt a perfect string match.
+ candidates = super().match_query(query)
+ if candidates:
+ return candidates
+
+ # Seek partial string matches.
+ mention_df = pd.DataFrame({"mentions": self.mentions_to_wikidata.keys()})
+ mention_df["score"] = mention_df.parallel_apply(
+ lambda row: self.matching_score(query, row), axis=1
)
+ mention_df = mention_df.dropna()
+ mention_df = mention_df.query('score == score.max()')
+ cands_dict = mention_df.set_index("mentions").to_dict()["score"]
+ matches = [StringMatch(k, v) for (k, v) in cands_dict.items()]
+ return matches
- def check_if_contained(self, query: str, row: pd.Series) -> float:
+ def matching_score(self, query: str, row: pd.Series) -> float:
"""
- Returns the amount of overlap, if a mention is contained within a row
- in the dataset.
+ Calculate the partial string matching score as the amount of overlap,
+ if a toponym is contained within a row in the dataset.
Arguments:
- query (str): A mention identified in a text.
+ query (str): A toponym identified in a text.
row (Series): A pandas Series representing a row in the dataset
with a "mentions" column, corresponding to a mention in the
knowledge base.
Returns:
- float:
- The match score indicating the degree of containment,
+ The match score indicating the degree of containment,
ranging from ``0.0`` to ``1.0`` (perfect match).
Example:
- >>> ranker = Ranker(...)
- >>> query = 'apple'
- >>> row = pd.Series({'mentions': 'Delicious apple'})
- >>> match_score = ranker.check_if_contained(query, row)
- >>> print(match_score)
- 0.3333333333333333
+ ```python
+ ranker = PartialMatchRanker(...)
+ query = 'apple'
+ row = pd.Series({'mentions': 'Delicious apple'})
+ match_score = ranker.matching_score(query, row)
+ print(match_score)
+ > 0.3333333333333333
+ ```
"""
# Fix strings
s1 = query.lower()
@@ -408,117 +392,230 @@ def check_if_contained(self, query: str, row: pd.Series) -> float:
if s2 in s1:
return len(row["mentions"]) / len(query)
- def partial_match(self, queries: List[str], damlev: bool) -> Tuple[dict, dict]:
- """
- Perform partial matching for a list of given mentions (``queries``).
- Arguments:
- queries (list): A list of mentions (strings) identified in a text
- to match.
- damlev (bool): A flag indicating whether to use the
- Damerau-Levenshtein distance for matching (True) or
- containment-based matching (False).
+class LevenshteinRanker(PartialMatchRanker):
+ """
+ A ranking method based on partial string matching via the Levenshtein distance.
- Returns:
- Tuple[dict, dict]: A tuple containing two dictionaries:
+    This class extends PartialMatchRanker, so perfect matches are sought
+    before attempting a partial match.
- #. The first dictionary maps each mention to its candidate
- list, where the candidate list is a dictionary with the
- mention variations as keys and their match scores as values.
+ Example:
+ ```python
+ # Create a Ranker object:
+ ranker = LevenshteinRanker(resources_path="/path/to/resources/")
+ # Load resources
+ ranker.load()
+ # Perform candidate selection
+ queries = ['London', 'Paraguay']
+ results = [ranker.matches(query) for query in queries]
+ # Print the results
+ print("Candidate Selection Results:")
+ for matches in results:
+ print(matches)
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "levenshtein"
- #. The second dictionary stores the already collected
- candidates for each mention. It is an updated version of the
- Ranker's ``already_collected_cands`` attribute.
+ def matching_score(self, query: str, row: pd.Series) -> float:
+ """
+ Calculates the partial string matching score as the Damerau-Levenshtein
+ distance between a toponym and a row in the dataset.
- Example:
- >>> ranker = Ranker(...)
- >>> queries = ['apple', 'banana', 'orange']
- >>> candidates, already_collected = ranker.partial_match(queries, damlev=False)
- >>> print(candidates)
- {'apple': {'apple': 1.0}, 'banana': {'bananas': 0.5, 'banana split': 0.75}, 'orange': {'orange': 1.0}}
- >>> print(already_collected)
- {'apple': {'apple': 1.0}, 'banana': {'bananas': 0.5, 'banana split': 0.75}, 'orange': {'orange': 1.0}}
+ Arguments:
+ query (str): A toponym identified in a text.
+ row (Series): A pandas Series representing a row in the dataset
+ with a "mentions" column, corresponding to an alternate name
+ of an entity in the knowledge base.
+
+ Returns:
+ The similarity score between the query and the row, ranging
+ from ``0.0`` to ``1.0``.
Note:
- This method performs partial matching for each mention in the given
- list. If a mention has already been matched perfectly, it skips the
- partial matching process for that mention. For the remaining
- mentions, it calculates the match score based on the specified
- partial matching method: Levenshtein distance or containment.
+ This method computes the Damerau-Levenshtein distance between the
+ lowercase versions of a query and the "mentions" column value in
+ the given row. The distance is then normalized to a similarity score
+ by subtracting it from ``1.0``.
+ Example:
+ ```python
+ ranker = LevenshteinRanker(...)
+ query = 'apple'
+ row = pd.Series({'mentions': 'orange'})
+ similarity = ranker.matching_score(query, row)
+ print(similarity)
+ > 0.1666666865348816
+ ```
"""
+ return 1.0 - normalized_damerau_levenshtein_distance(
+ query.lower(), row["mentions"].lower()
+ )
- candidates, self.already_collected_cands = self.perfect_match(queries)
+class DeezyMatchRanker(PerfectMatchRanker):
+ """
+ A ranking method using DeezyMatch (a deep neural network approach to
+ fuzzy string matching).
- # the rest go through
- remainers = [x for x, y in candidates.items() if len(y) == 0]
+ This class extends PerfectMatchRanker because perfect matches are sought
+ before attempting a fuzzy string match.
- for query in remainers:
- mention_df = pd.DataFrame({"mentions": self.mentions_to_wikidata.keys()})
+ Arguments:
+ resources_path (str): Relative path to the resources directory
+ (containing Wikidata resources).
+ mentions_to_wikidata (dict, optional): An empty dictionary which
+ will store the mapping between mentions and Wikidata IDs,
+ which will be loaded through the Ranker's
+ [load method][t_res.geoparser.ranking.Ranker.load].
+ wikidata_to_mentions (dict, optional): An empty dictionary which
+ will store the mapping between Wikidata IDs and mentions,
+            which will be loaded through the Ranker's
+ [load method][t_res.geoparser.ranking.Ranker.load].
+ strvar_parameters (dict, optional): Dictionary of string variation
+ parameters required to create a DeezyMatch training dataset.
+ For the default settings, see Notes below.
+ deezy_parameters (dict, optional): Dictionary of DeezyMatch parameters
+ for model training. For the default settings, see Notes below.
- if damlev:
- mention_df["score"] = mention_df.parallel_apply(
- lambda row: self.damlev_dist(query, row), axis=1
- )
- else:
- mention_df["score"] = mention_df.parallel_apply(
- lambda row: self.check_if_contained(query, row), axis=1
- )
+ Example:
+ ```python
+ ranker = DeezyMatchRanker(resources_path="/path/to/resources/")
+ ranker.load()
+ queries = ['London', 'Shefrield']
+ results = [ranker.matches(query) for query in queries]
+ # Print the results
+ print("Candidate Selection Results:")
+ for matches in results:
+ print(matches)
+ ```
- mention_df = mention_df.dropna()
+ Note:
+ - The default settings for ``strvar_parameters``:
+ ```python
+ strvar_parameters: Optional[dict] = {
+ # Parameters to create the string pair dataset:
+ "ocr_threshold": 60,
+ "top_threshold": 85,
+ "min_len": 5,
+ "max_len": 15,
+ "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()),
+ "w2v_ocr_model": "w2v_*_news",
+ "overwrite_dataset": False,
+ }
+ ```
- # currently hardcoded cutoff
- top_scores = sorted(
- list(set(list(mention_df["score"].unique()))), reverse=True
- )[:1]
- mention_df = mention_df[mention_df["score"].isin(top_scores)]
- mention_df = mention_df.set_index("mentions").to_dict()["score"]
+ - The default settings for ``deezy_parameters``:
+ ```python
+ deezy_parameters: Optional[dict] = {
+ "dm_path": os.path.join(resources_path, "deezymatch/"),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ "ranking_metric": "faiss",
+ "selection_threshold": 50,
+ "num_candidates": 1,
+ "verbose": False,
+ "overwrite_training": False,
+ "do_test": False,
+ }
+ ```
+ """
+ # Override the method_name class attribute.
+ method_name: str = "deezymatch"
+
+ # Override the constructor to include DeezyMatch model parameters.
+ def __init__(
+ self,
+ resources_path: str,
+ mentions_to_wikidata: Optional[dict] = dict(),
+ wikidata_to_mentions: Optional[dict] = dict(),
+ strvar_parameters: Optional[dict] = None,
+ deezy_parameters: Optional[dict] = None,
+ ):
+ super().__init__(resources_path, mentions_to_wikidata, wikidata_to_mentions)
- candidates[query] = mention_df
+ # set paths based on resources path
+ if strvar_parameters is None:
+ strvar_parameters = {
+ # Parameters to create the string pair dataset:
+ "ocr_threshold": 60,
+ "top_threshold": 85,
+ "min_len": 5,
+ "max_len": 15,
+ "w2v_ocr_path": os.path.join(resources_path, "models/w2v/"),
+ "w2v_ocr_model": "w2v_*_news",
+ "overwrite_dataset": False,
+ }
- self.already_collected_cands[query] = mention_df
+ # Default DeezyMatch parameters:
+ deezy_params = {
+ # Paths and filenames of DeezyMatch models and data:
+ "dm_path": os.path.join(resources_path, "deezymatch/"),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ # Ranking measures:
+ "ranking_metric": "faiss",
+ "selection_threshold": 50,
+ "num_candidates": 1,
+ "search_size": 3,
+ "verbose": False,
+ # DeezyMatch training:
+ "overwrite_training": False,
+ "do_test": False,
+ }
+ if deezy_parameters is not None:
+ if not set(deezy_parameters) <= set(deezy_params):
+ raise ValueError(f"Invalid DeezyMatch parameters: {set(deezy_parameters).difference(set(deezy_params))}.")
+ # Update the default parameters with any given parameters.
+ deezy_params.update(deezy_parameters)
- return candidates, self.already_collected_cands
+ self.strvar_parameters = strvar_parameters
+ self.deezy_parameters = deezy_params
- def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:
+ def __str__(self) -> str:
"""
- Perform DeezyMatch (a deep neural network approach to fuzzy string
- matching) on-the-fly for a list of given mentions (``queries``).
-
- Arguments:
- queries (list): A list of mentions (strings) identified in a text
- to match.
+ Returns a string representation of the Ranker object, including the
+ method name and DeezyMatch training parameters.
Returns:
- Tuple[dict, dict]: A tuple containing two dictionaries:
+ A string representation of the Ranker object.
+ """
+ s = super().__str__()
+ s += " * DeezyMatch details:\n"
+ s += f" * Model: {self.deezy_parameters['dm_model']}\n"
+ s += f" * Ranking metric: {self.deezy_parameters['ranking_metric']}\n"
+ s += f" * Selection threshold: {self.deezy_parameters['selection_threshold']}\n"
+ s += f" * Num candidates: {self.deezy_parameters['num_candidates']}\n"
+ s += f" * Overwrite training: {self.deezy_parameters['overwrite_training']}\n"
+ s += f" * Overwrite dataset: {self.strvar_parameters['overwrite_dataset']}\n"
+ s += f" * Test mode: {self.deezy_parameters['do_test']}\n"
+ return s
+
+ # Override the base class implementation to optionally train the model.
+ def load(self, train: bool=True):
+ super().load()
+ if train or self.deezy_parameters["overwrite_training"]:
+ self.train()
- #. The first dictionary maps each mention to its candidate
- list, where the candidate list is a dictionary with the
- mention variations as keys and their match scores as values.
+ def matches(self, queries: List[str]) -> List[List[StringMatch]]:
+ """
+ Performs DeezyMatch ranking on-the-fly for given toponym queries.
- #. The second dictionary stores the already collected
- candidates for each mention. It is an updated version of the
- Ranker's ``already_collected_cands`` attribute.
+ Arguments:
+ queries (List[str]): A list of toponyms to be matched.
- Example:
- >>> ranker = Ranker(...)
- >>> ranker.load_resources()
- >>> queries = ['London', 'Shefrield']
- >>> candidates, already_collected = ranker.deezy_on_the_fly(queries)
- >>> print(candidates)
- {'London': {'London': 1.0}, 'Shefrield': {'Sheffield': 0.03382000000000005}}
- >>> print(already_collected)
- {'London': {'London': 1.0}, 'Shefrield': {'Sheffield': 0.03382000000000005}}
+ Returns:
+ A list of lists of StringMatch instances containing potential
+ matches for each of the given toponyms.
Note:
- This method performs DeezyMatch on-the-fly for each mention in a
- given list of mentions identified in a text. If a query has
- already been matched perfectly, it skips the fuzzy matching
- process for that query. For the remaining queries,
- it uses the DeezyMatch model to generate candidates and ranks them
- based on the specified ranking metric and selection threshold,
- provided when initialising the :py:meth:`~geoparser.ranking.Ranker`
- object.
+ This method performs DeezyMatch on-the-fly for the given toponyms.
+ If a perfect match exists, DeezyMatch matching is skipped.
+ Otherwise, it uses the DeezyMatch model to generate candidates and
+ ranks them based on the specified ranking metric and selection
+ threshold, provided when initialising the ranker.
"""
dm_path = self.deezy_parameters["dm_path"]
@@ -526,37 +623,60 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:
dm_model = self.deezy_parameters["dm_model"]
dm_output = self.deezy_parameters["dm_output"]
- # first we fill in the perfect matches and already collected queries
- cands_dict, self.already_collected_cands = self.perfect_match(queries)
+ # First attempt a perfect string match.
+ r = range(len(queries))
+
+ perfect_matches = dict()
+ for i in r:
+ matches = super().match_query(queries[i])
+ if matches:
+ perfect_matches[i] = matches
+
+ # If perfect string matches are found for all queries, return them.
+ if set(perfect_matches.keys()) == set(r):
+ return list(perfect_matches.values())
+
+ # Seek fuzzy string matches for those queries not perfectly matched.
+ candidate_scenario = os.path.join(
+ dm_path, "combined", dm_cands + "_" + dm_model
+ )
+ pretrained_model_path = os.path.join(
+ f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".model"
+ )
+ pretrained_vocab_path = os.path.join(
+ f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".vocab"
+ )
- # the rest go through
- remainers = [x for x, y in cands_dict.items() if len(y) == 0]
+ unmatched_indices = list(set(r).difference(set(perfect_matches.keys())))
+ unmatched_indices.sort()
+ unmatched_queries = {i: queries[i] for i in unmatched_indices}
+
+ deezy_result = candidate_ranker(
+ candidate_scenario=candidate_scenario,
+ query=list(unmatched_queries.values()),
+ ranking_metric=self.deezy_parameters["ranking_metric"],
+ selection_threshold=self.deezy_parameters["selection_threshold"],
+ num_candidates=self.deezy_parameters["num_candidates"],
+ search_size=self.deezy_parameters["search_size"],
+ verbose=self.deezy_parameters["verbose"],
+ output_path=os.path.join(dm_path, "ranking", dm_output),
+ pretrained_model_path=pretrained_model_path,
+ pretrained_vocab_path=pretrained_vocab_path,
+ )
- if remainers:
- candidate_scenario = os.path.join(
- dm_path, "combined", dm_cands + "_" + dm_model
- )
- pretrained_model_path = os.path.join(
- f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".model"
- )
- pretrained_vocab_path = os.path.join(
- f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".vocab"
- )
+ if len(deezy_result.index) != len(unmatched_queries):
+ raise Exception(f"DeezyMatch result contains {len(deezy_result.index)} rows. Expected {len(unmatched_queries)}.")
- candidates = candidate_ranker(
- candidate_scenario=candidate_scenario,
- query=remainers,
- ranking_metric=self.deezy_parameters["ranking_metric"],
- selection_threshold=self.deezy_parameters["selection_threshold"],
- num_candidates=self.deezy_parameters["num_candidates"],
- search_size=self.deezy_parameters["num_candidates"],
- verbose=self.deezy_parameters["verbose"],
- output_path=os.path.join(dm_path, "ranking", dm_output),
- pretrained_model_path=pretrained_model_path,
- pretrained_vocab_path=pretrained_vocab_path,
- )
+ # Map deezy results into the range r.
+ rows = {unmatched_indices[i]: deezy_result.iloc[i] for i in range(len(unmatched_indices))}
+
+ all_matches = list()
+ for i in r:
+ if i in perfect_matches.keys():
+ all_matches.append(perfect_matches[i])
+ else:
+ row = rows[i]
- for _, row in candidates.iterrows():
# Reverse cosine distance to cosine similarity:
returned_cands = dict()
if self.deezy_parameters["ranking_metric"] == "faiss":
@@ -573,145 +693,37 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:
returned_cands = row["cosine_dist"]
returned_cands = {k: 1 - returned_cands[k] for k in returned_cands}
- cands_dict[row["query"]] = returned_cands
-
- self.already_collected_cands[row["query"]] = returned_cands
-
- return cands_dict, self.already_collected_cands
-
- def run(self, queries: List[str]) -> Tuple[dict, dict]:
- """
- Run the appropriate ranking method based on the specified method.
-
- Arguments:
- queries (list): A list of mentions (strings) identified in a text
- to match.
-
- Returns:
- Tuple[dict, dict]:
- A tuple containing two dictionaries. The resulting dictionaries
- will vary depending on the method set in the Ranker object.
- See Notes below for further information.
-
- Example:
- >>> myranker = Ranker(method="perfectmatch", ...)
- >>> ranker.mentions_to_wikidata = myranker.load_resources()
- >>> queries = ['London', 'Barcelona', 'Bologna']
- >>> candidates, already_collected = myranker.run(queries)
- >>> print(candidates)
- {'London': {'London': 1.0}, 'Barcelona': {'Barcelona': 1.0}, 'Bologna': {'Bologna': 1.0}}
- >>> print(already_collected)
- {'London': {'London': 1.0}, 'Barcelona': {'Barcelona': 1.0}, 'Bologna': {'Bologna': 1.0}}
-
- Note:
- This method executes the appropriate ranking method based on the
- ``method`` parameter, selected when initialising the
- :py:meth:`~geoparser.ranking.Ranker` object.
-
- It delegates the execution to the corresponding method:
+ matches = [StringMatch(k, v) for (k, v) in returned_cands.items()]
+ all_matches.append(matches)
- * :py:meth:`~geoparser.ranking.Ranker.perfect_match`
- * :py:meth:`~geoparser.ranking.Ranker.partial_match`
- * :py:meth:`~geoparser.ranking.Ranker.levenshtein`
- * :py:meth:`~geoparser.ranking.Ranker.deezy_on_the_fly`
+ if len(all_matches) != len(queries):
+ raise ValueError(f'Found {len(all_matches)} lists of matches for {len(queries)} queries')
+
+ return all_matches
- See the documentation of those methods for more details about
- their processing if the provided mentions (``queries``).
+ def train(self):
"""
- if self.method == "perfectmatch":
- return self.perfect_match(queries)
- if self.method == "partialmatch":
- return self.partial_match(queries, damlev=False)
- if self.method == "levenshtein":
- return self.partial_match(queries, damlev=True)
- if self.method == "deezymatch":
- return self.deezy_on_the_fly(queries)
- raise SyntaxError(f"Unknown method: {self.method}")
-
- def find_candidates(self, mentions: List[dict]) -> Tuple[dict, dict]:
+ Trains a DeezyMatch model. The training will be skipped if the model
+ already exists and the ``overwrite_training`` key in the
+ ``deezy_parameters`` passed when initialising the
+ [Ranker][t_res.geoparser.ranking.Ranker] object is set to ``False``. The
+ training will be run on test mode if the ``do_test`` key in the
+ ``deezy_parameters`` passed when initialising the
+ [Ranker][t_res.geoparser.ranking.Ranker] object is set to ``True``.
"""
- Find candidates for the given mentions using the selected ranking
- method.
-
- Arguments:
- mentions (list): A list of predicted mentions as dictionaries.
- Returns:
- Tuple[dict, dict]: A tuple containing two dictionaries:
-
- #. The first dictionary maps each original mention to a
- sub-dictionary, where the sub-dictionary maps the mention
- variations to a sub-sub-dictionary with two keys: ``"Score"``
- (the string matching similarity score) and ``"Candidates"``
- (a dictionary containing the Wikidata candidates, where the
- key is the Wikidata ID and value is the the relative mention-
- to-wikidata frequency).
- #. The second dictionary stores the already collected candidates
- for each query.
-
- The variation is found by the candidate ranker in the knowledge
- base, and for each variation, the candidate ranking score and
- the candidates from Wikidata are provided. E.g. for mention
- "Guadaloupe" in sentence "sn83030483-1790-03-31-a-i0004_1", the
- candidates will show as follows:
-
- .. code-block:: json
-
- {
- "Guadaloupe": {
- "Score": 1.0,
- "Candidates": {
- "Q17012": 0.003935458480913026,
- "Q3153836": 0.07407407407407407
- }
- }
- }
+ Path(self.deezy_parameters["dm_path"]).mkdir(parents=True, exist_ok=True)
+ if self.deezy_parameters["do_test"]:
+ if self.deezy_parameters["dm_model"][-5:] != "_test":
+ self.deezy_parameters["dm_model"] += "_test"
+ if self.deezy_parameters["dm_cands"][-5:] != "_test":
+ self.deezy_parameters["dm_cands"] += "_test"
+ deezy_processing.train_deezy_model(
+ self.deezy_parameters, self.strvar_parameters, self.wikidata_to_mentions
+ )
+ deezy_processing.generate_candidates(
+ self.deezy_parameters, self.mentions_to_wikidata
+ )
- Note:
- This method takes a list of mentions and finds candidates for each
- mention using the selected ranking method. It first extracts the
- queries from the mentions and then calls the appropriate method
- based on the ranking method chosen when initialising the
- :py:meth:`~geoparser.ranking.Ranker` object.
-
- The method returns a dictionary that maps each original mention to
- a sub-dictionary containing the mention variations as keys and
- their corresponding Wikidata match scores as values.
-
- Additionally, it updates the already collected candidates
- dictionary (the Ranker object's ``already_collected_cands``
- attribute).
- """
- # Extract the mention
- queries = list(set([mention["mention"] for mention in mentions]))
-
- # Pass the mentions to :py:meth:`geoparser.ranking.Ranker.run`
- cands, self.already_collected_cands = self.run(queries)
-
- # Get Wikidata candidates
- wk_cands = dict()
- for original_mention in cands:
- wk_cands[original_mention] = dict()
- for variation in cands[original_mention]:
- # If the candidates of the variation of the original mention
- # have already been stored, reuse them:
- stored_value = self.already_collected_cands[original_mention][variation]
- if type(stored_value) == dict:
- wk_cands[original_mention][variation] = stored_value
- # If the candidates of the variation of the original mention
- # have not yet been found, find them:
- else:
- match_score = cands[original_mention][variation]
- # Find Wikidata ID and relv.
- found_cands = self.mentions_to_wikidata.get(variation, dict())
- if found_cands and not variation in wk_cands[original_mention]:
- wk_cands[original_mention][variation] = {
- "Score": match_score,
- "Candidates": found_cands,
- }
- self.already_collected_cands[original_mention][variation] = {
- "Score": match_score,
- "Candidates": found_cands,
- }
-
- return wk_cands, self.already_collected_cands
+ # This dictionary is not used anymore:
+ self.wikidata_to_mentions = dict()
diff --git a/t_res/utils/REL/entity_disambiguation.py b/t_res/utils/REL/entity_disambiguation.py
index dd3c0dfa..780de528 100644
--- a/t_res/utils/REL/entity_disambiguation.py
+++ b/t_res/utils/REL/entity_disambiguation.py
@@ -6,7 +6,7 @@
import time
from pathlib import Path
from string import punctuation
-from typing import Any, Dict
+from typing import Any, Dict, List, Tuple
import numpy as np
import torch
@@ -31,32 +31,28 @@ class EntityDisambiguation:
and uses the trained model to predict the most likely entity for each
mention.
- This class uses a deep learning architecture, specifically the
- :py:class:`~utils.REL.mulrel_ranker.MulRelRanker` model, for entity
- disambiguation.
+ This class uses a deep learning architecture, specifically the
+ [MulRelRanker][t_res.utils.REL.mulrel_ranker.MulRelRanker] model, for
+ entity disambiguation.
- .. note::
+ Note: Credit:
+ This class and its methods are adapted from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- **Credit:**
+ ```
+ Reference:
- This class and its methods are adapted from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
-
- ::
-
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
-
+ }
+ ```
"""
def __init__(self, db_embs, user_config, reset_embeddings=False):
@@ -67,7 +63,7 @@ def __init__(self, db_embs, user_config, reset_embeddings=False):
self.config = self.__get_config(user_config)
# Use CPU if cuda is not available:
- self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ self.device = self.config["device"]
self.prerank_model = None
self.model = None
self.reset_embeddings = reset_embeddings
@@ -121,17 +117,18 @@ def __init__(self, db_embs, user_config, reset_embeddings=False):
raise Exception("You cannot train a model and reset the embeddings.")
self.model = MulRelRanker(self.config, self.device).to(self.device)
- def __get_config(self, user_config):
+ def __get_config(self, user_config) -> dict:
"""
User configuration that may overwrite default settings.
Returns:
- dict: The configuration used for entity disambiguation.
+ The configuration used for entity disambiguation.
"""
default_config: Dict[str, Any] = {
"mode": user_config["mode"],
"model_path": user_config["model_path"],
+ "device": user_config["device"],
"prerank_ctx_window": 50,
"keep_p_e_m": 4,
"keep_ctx_ent": 3,
@@ -171,9 +168,6 @@ def __load_embeddings(self):
and entities (``snd``, ``entity``, and ``word``). It also adds the
unknown token to the vocabulary and retrieves the corresponding embedding
from the database.
-
- Returns:
- None
"""
self.__batch_embs = {}
@@ -195,9 +189,6 @@ def __load_embeddings(self):
def train(self, org_train_dataset, org_dev_dataset):
"""
Trains the entity disambiguation model.
-
- Returns:
- None.
"""
train_dataset = self.get_data_items(org_train_dataset, "train", predict=False)
@@ -359,10 +350,13 @@ def train(self, org_train_dataset, org_dev_dataset):
break
self.best_performance = {"f1": best_f1, "p": best_p, "r": best_r}
- def __create_dataset_LR(self, dataset, predictions):
+ def __create_dataset_LR(self, dataset, predictions) -> tuple:
"""
Creates a dataset for logistic regression, to estimate posterior
probabilities of the linked entities.
+
+ Returns:
+ A tuple of numpy arrays.
"""
X = []
y = []
@@ -392,9 +386,6 @@ def train_LR(self, train_json, dev_json, model_path_lr):
Function that applies LR to get confidence scores for the
disambiguated entities. Recall should be high, because if
it is low than we would have ignored a corrrect entity.
-
- Returns:
- None
"""
print(os.path.join(model_path_lr, "lr_model.pkl"))
@@ -416,26 +407,26 @@ def train_LR(self, train_json, dev_json, model_path_lr):
with open(path, "wb") as handle:
pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
- def predict(self, data):
+ def predict(self, data) -> dict:
"""
Performs entity disambiguation on the given data. It does not require
ground truth entities to be present.
- Returns: Predictions and time taken for the ED step.
+ Returns:
+ Predictions and time taken for the ED step.
"""
data = self.get_data_items(data, "raw", predict=True)
predictions, timing = self.__predict(data, include_timing=True, eval_raw=True)
return predictions
- def normalize_scores(self, scores):
+ def normalize_scores(self, scores) -> List[float]:
"""
Normalizes a list of scores between 0 and 1 by rescaling them and
computing their ratio over their sum.
Returns:
- List[float]:
- A list of normalized scores where each score is the ratio of
+ A list of normalized scores where each score is the ratio of
the rescaled score over their sum.
"""
min_score = min(scores)
@@ -455,7 +446,7 @@ def normalize_scores(self, scores):
return normalized_scores
- def __compute_cross_cand_confidence(self, scores):
+ def __compute_cross_cand_confidence(self, scores) -> List[List[float]]:
"""
This function takes a series of numpy arrays of scores and returns
a list of lists of confidence scores.
@@ -464,12 +455,12 @@ def __compute_cross_cand_confidence(self, scores):
scores (numpy.ndarray): A numpy array of scores.
Returns:
- List[List[float]]: A list of lists of confidence scores.
+ A list of lists of confidence scores.
"""
normalised_scores = [self.normalize_scores(score) for score in scores]
return normalised_scores
- def __compute_confidence(self, scores, preds):
+ def __compute_confidence(self, scores, preds) -> List[float]:
"""
Computes confidence scores for the given entity disambiguation outputs
using logistic regression.
@@ -480,9 +471,7 @@ def __compute_confidence(self, scores, preds):
scores.
Returns:
- List[float]:
- A list of confidence scores for each entity disambiguation
- output.
+ A list of confidence scores for each entity disambiguation output.
"""
X = np.array([[score[pred]] for score, pred in zip(scores, preds)])
if self.model_lr:
@@ -492,11 +481,12 @@ def __compute_confidence(self, scores, preds):
confidence_scores = [0.0 for _ in scores]
return confidence_scores
- def __predict(self, data, include_timing=False, eval_raw=False):
+ def __predict(self, data, include_timing=False, eval_raw=False) -> dict:
"""
Uses the trained model to make predictions of individual batches (i.e. documents).
- Returns: Predictions and time taken for the ED step
+ Returns:
+ Predictions and time taken for the ED step
"""
predictions = {items[0]["doc_name"]: [] for items in data}
self.model.eval()
@@ -677,12 +667,13 @@ def __predict(self, data, include_timing=False, eval_raw=False):
else:
return predictions
- def prerank(self, dataset, dname, predict=False):
+ def prerank(self, dataset, dname, predict=False) -> list:
"""
Responsible for preranking the set of possible candidates using both
context and p(e|m) scores.
- Returns: Dataset with, by default, max 3 + 4 candidates per mention.
+ Returns:
+ Dataset with, by default, max 3 + 4 candidates per mention.
"""
new_dataset = []
has_gold = 0
@@ -804,9 +795,6 @@ def __update_embeddings(self, emb_name, embs):
"""
Responsible for updating the dictionaries with their respective word,
entity and snd embeddings.
-
- Returns:
- None
"""
embs = embs.to(self.device)
@@ -839,9 +827,6 @@ def __update_embeddings(self, emb_name, embs):
def __embed_words(self, words_filt, name):
"""
Responsible for retrieving embeddings using the given sqlite3 database.
-
- Returns:
- None.
"""
embs = rel_utils.get_db_emb(self.db_embs, words_filt, name)
@@ -853,12 +838,12 @@ def __embed_words(self, words_filt, name):
self.embeddings["{}_voca".format(name)].add_to_vocab(c)
self.__batch_embs[name].append(torch.tensor(e))
- def get_data_items(self, dataset, dname, predict=False):
+ def get_data_items(self, dataset, dname, predict=False) -> list:
"""
Responsible for formatting the dataset. Triggers the preranking function.
Returns:
- Preranking function.
+ List returned by the `prerank` method.
"""
data = []
@@ -1070,14 +1055,13 @@ def get_data_items(self, dataset, dname, predict=False):
return self.prerank(data, dname, predict)
- def __eval(self, testset, system_pred):
+ def __eval(self, testset, system_pred) -> Tuple[float, float, float, int]:
"""
Responsible for evaluating data points, which is solely used for the
local entity disambiguation step.
Returns:
- Tuple[float, float, float, int]:
- A tuple containing the F1 score, recall, precision, and the
+ A tuple containing the F1 score, recall, precision, and the
number of mentions for which there is no valid candidate.
"""
gold = []
@@ -1105,21 +1089,19 @@ def __eval(self, testset, system_pred):
def __save(self, path):
"""
Responsible for storing the trained model during optimisation.
-
- Returns:
- None.
"""
torch.save(self.model.state_dict(), "{}.state_dict".format(path))
with open("{}.config".format(path), "w") as f:
json.dump(self.config, f)
- def __load(self, path):
+ def __load(self, path) -> MulRelRanker:
"""
Responsible for loading a trained model and its respective config. Note
that this config cannot be overwritten. If required, this behavior may
be modified in future releases.
- Returns: The loaded trained model.
+ Returns:
+ The loaded trained model.
"""
if os.path.exists("{}.config".format(path)):
with open("{}.config".format(path), "r") as f:
diff --git a/t_res/utils/REL/mulrel_ranker.py b/t_res/utils/REL/mulrel_ranker.py
index 675e11bb..1144b7d1 100644
--- a/t_res/utils/REL/mulrel_ranker.py
+++ b/t_res/utils/REL/mulrel_ranker.py
@@ -2,36 +2,32 @@
import torch
import torch.nn.functional as F
from torch.autograd import Variable
-
+from typing import Any
class PreRank(torch.nn.Module):
"""
PreRank class is used for preranking entities for a given mention
by multiplying entity vectors with word vectors.
- .. note::
-
- **Credit:**
-
+ Note: Credit:
This class and its methods are taken (minimally
- adapted when necessary) from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
+ adapted when necessary) from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- ::
+ ```
+ Reference:
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
-
+ }
+ ```
"""
def __init__(self, config, embeddings=None):
@@ -41,11 +37,12 @@ def __init__(self, config, embeddings=None):
super(PreRank, self).__init__()
self.config = config
- def forward(self, token_ids, token_offsets, entity_ids, embeddings):
+ def forward(self, token_ids, token_offsets, entity_ids, embeddings) -> torch.Tensor:
"""
Multiplies local context words with entity vectors for a given mention.
- Returns: entity scores.
+ Returns:
+ Entity scores.
"""
sent_vecs = embeddings["word_embeddings_bag"](
@@ -69,50 +66,45 @@ class MulRelRanker(torch.nn.Module):
"""
The MulRelRanker class implements a neural network model for entity disambiguation.
- .. note::
-
- **Credit:**
-
+ Note: Credit:
This class and its methods are taken (minimally
- adapted when necessary) from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
+ adapted when necessary) from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
This is based on the ``mulrel-nel`` approach developed by Le and
Titov (2018), whose original code is available in the
- `mulrel-nel: Multi-relational Named Entity Linking
- `_ Github repository, and
- on Ganea and Hofmann (2017).
-
- ::
+ [mulrel-nel: Multi-relational Named Entity Linking](https://github.com/lephong/mulrel-nel)
+ Github repository, and on Ganea and Hofmann (2017).
- References:
+ ```
+ References:
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
+ }
- @inproceedings{ganea2017deep,
+ @inproceedings{ganea2017deep,
title={Deep Joint Entity Disambiguation with Local Neural Attention},
author={Ganea, Octavian-Eugen and Hofmann, Thomas},
booktitle={Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
pages={2619--2629},
year={2017}
- }
+ }
- @inproceedings{le2018improving,
+ @inproceedings{le2018improving,
title={Improving Entity Linking by Modeling Latent Relations between Mentions},
author={Le, Phong and Titov, Ivan},
booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={1595--1604},
year={2018}
- }
-
+ }
+ ```
"""
def __init__(self, config, device):
@@ -179,11 +171,12 @@ def __local_ent_scores(
entity_mask,
embeddings,
p_e_m=None,
- ):
+ ) -> Any:
"""
Computes local entity scores.
- Returns: entity scores.
+ Returns:
+ Entity scores.
"""
batchsize, n_words = token_ids.size()
@@ -251,14 +244,14 @@ def forward(
p_e_m,
embeddings,
gold=None,
- ):
+ ) -> tuple:
"""
Responsible for the forward pass of the entity disambiguation model
and produces a ranking of candidates for a given set of mentions:
- * ctx_layer refers to function f. See Figure 3 in Le and Titov (2018).
- * ent_scores refers to function q.
- * score_combine refers to function g.
+ - ctx_layer refers to function f. See Figure 3 in Le and Titov (2018).
+ - ent_scores refers to function q.
+ - score_combine refers to function g.
Returns:
Ranking of entities per mention.
@@ -445,9 +438,6 @@ def forward(
def regularize(self, max_norm=1):
"""
Regularizes model parameters.
-
- Returns:
- None
"""
l1_w_norm = self.score_combine_linear_1.weight.norm()
l1_b_norm = self.score_combine_linear_1.bias.norm()
@@ -471,11 +461,12 @@ def regularize(self, max_norm=1):
self.score_combine_linear_2.bias.data * max_norm / l2_b_norm.data
)
- def loss(self, scores, true_pos, lamb=1e-7):
+ def loss(self, scores, true_pos, lamb=1e-7) -> torch.Tensor:
"""
Computes given ranking loss (Equation 7) and adds a regularization term.
- Returns: loss of given batch.
+ Returns:
+ Loss of given batch.
"""
loss = F.multi_margin_loss(scores, true_pos, margin=self.config["margin"])
if self.config["use_local_only"]:
diff --git a/t_res/utils/REL/utils.py b/t_res/utils/REL/utils.py
index 39a4797d..d94c942d 100644
--- a/t_res/utils/REL/utils.py
+++ b/t_res/utils/REL/utils.py
@@ -7,41 +7,38 @@ def flatten_list_of_lists(
list_of_lists: List[List[Any]],
) -> Tuple[List[Any], List[int]]:
"""
- Flatten a list of lists for input to torch.nn.EmbeddingBag.
+ Flatten a list of lists for input to `torch.nn.EmbeddingBag`.
Args:
list_of_lists (List[List[Any]]): A list of lists to be flattened.
Returns:
- tuple: A tuple containing the flattened list and the offsets.
+ A tuple containing the flattened list and the offsets.
Example:
- >>> list_of_lists = [[1, 2, 3], [4, 5], [6]]
- >>> print(flatten_list_of_lists(list_of_lists))
- ([1, 2, 3, 4, 5, 6], array([0, 3, 5]))
+ ```
+ flatten_list_of_lists([[1, 2, 3], [4, 5], [6]])
+ > ([1, 2, 3, 4, 5, 6], array([0, 3, 5]))
+ ```
- .. note::
+ Note: Credit:
+ This function is taken from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- **Credit:**
+ ```
+ Reference:
- This function is taken from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
-
- ::
-
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
-
+ }
+ ```
"""
list_of_lists = [[]] + list_of_lists
offsets = np.cumsum([len(x) for x in list_of_lists])[:-1]
@@ -62,35 +59,32 @@ def make_equal_len(
Defaults to ``True``.
Returns:
- tuple: A tuple containing the lists of equal length and the mask.
+ A tuple containing the lists of equal length and the mask.
Example:
- >>> lists = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
- >>> print(make_equal_len(lists))
- ([[1, 2, 3, 0], [4, 5, 0, 0], [6, 7, 8, 9]], [[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
-
- .. note::
-
- **Credit:**
+ ```
+ make_equal_len([[1, 2, 3], [4, 5], [6, 7, 8, 9]])
+ > ([[1, 2, 3, 0], [4, 5, 0, 0], [6, 7, 8, 9]], [[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
+ ```
- This function is taken from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
+ Note: Credit:
+ This function is taken from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- ::
+ ```
+ Reference:
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
-
+ }
+ ```
"""
lens = [len(l) for l in lists]
max_len = max(1, max(lens))
@@ -112,33 +106,32 @@ def is_important_word(s: str) -> bool:
s (str): The word to be checked.
Returns:
- bool: True if the word is important, False otherwise.
+ True if the word is important, False otherwise.
Example:
- >>> print(is_important_word("apple"))
- True
-
- .. note::
-
- **Credit:**
-
- This function is adapted from the `REL: Radboud Entity
- Linker `_ Github repository:
- Copyright (c) 2020 Johannes Michael van Hulst. See the `permission
- notice `_.
+ ```
+ is_important_word("apple")
+ > True
+ ```
- ::
+ Note: Credit:
+ This function is taken from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE).
- Reference:
+ ```
+ Reference:
- @inproceedings{vanHulst:2020:REL,
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
+ }
+ ```
"""
try:
if len(s) <= 3 or s.lower() in STOPWORDS:
@@ -149,6 +142,7 @@ def is_important_word(s: str) -> bool:
return True
+"""A set of common stopwords used for word filtering, obtained from the [REL: Radboud Entity Linker](https://github.com/informagi/REL/) Github repository."""
STOPWORDS = {
"a",
"about",
@@ -484,4 +478,3 @@ def is_important_word(s: str) -> bool:
"best",
"using",
}
-"""A set of common stopwords used for word filtering, obtained from the `REL: Radboud Entity Linker `_ Github repository."""
diff --git a/t_res/utils/REL/vocabulary.py b/t_res/utils/REL/vocabulary.py
index 008eba2d..c586b548 100644
--- a/t_res/utils/REL/vocabulary.py
+++ b/t_res/utils/REL/vocabulary.py
@@ -18,30 +18,26 @@ class Vocabulary:
"""
A class representing a vocabulary object used for storing references to embeddings.
- .. note::
-
- **Credit:**
-
- The code for this class and its methods is taken from the `REL: Radboud Entity
- Linker `_ Github repository: Copyright (c)
- 2020 Johannes Michael van Hulst. See the `permission notice
- `_. See `the original script
- `_ for more
- information.
-
- ::
-
- Reference:
-
- @inproceedings{vanHulst:2020:REL,
+ Note: Credit:
+ This function is taken from the [REL: Radboud Entity
+ Linker](https://github.com/informagi/REL/) Github repository:
+ Copyright (c) 2020 Johannes Michael van Hulst. See the [permission
+ notice](https://github.com/informagi/REL/blob/main/LICENSE). See the [original
+ script](https://github.com/informagi/REL/blob/main/src/REL/vocabulary.py) for
+ more information.
+
+ ```
+ Reference:
+
+ @inproceedings{vanHulst:2020:REL,
author = {van Hulst, Johannes M. and Hasibi, Faegheh and Dercksen, Koen and Balog, Krisztian and de Vries, Arjen P.},
title = {REL: An Entity Linker Standing on the Shoulders of Giants},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '20},
year = {2020},
publisher = {ACM}
- }
-
+ }
+ ```
"""
unk_token = UNK_TOKEN
@@ -70,7 +66,7 @@ def normalize(
with ``'0'`` during normalization. Defaults to ``False``.
Returns:
- str: The normalized token.
+ The normalized token.
"""
if token in [Vocabulary.unk_token, "", ""]:
return token
@@ -85,15 +81,12 @@ def normalize(
else:
return token
- def add_to_vocab(self, token: str) -> None:
+ def add_to_vocab(self, token: str):
"""
Add the given token to the vocabulary.
Arguments:
token (str): The token to be added to the vocabulary.
-
- Returns:
- None.
"""
new_id = len(self.id2word)
self.id2word.append(token)
@@ -105,7 +98,7 @@ def size(self) -> int:
Get the size of the vocabulary.
Returns:
- int: The number of words in the vocabulary.
+ The number of words in the vocabulary.
"""
return len(self.id2word)
@@ -117,8 +110,8 @@ def get_id(self, token: str) -> int:
token (str): The token for which to retrieve the ID.
Returns:
- int: The ID of the token in the vocabulary, or the ID of the
- unknown token if the token is not found.
+ The ID of the token in the vocabulary, or the ID of the
+ unknown token if the token is not found.
"""
tok = Vocabulary.normalize(token)
return self.word2id.get(tok, self.unk_id)
diff --git a/t_res/utils/batch_job.py b/t_res/utils/batch_job.py
new file mode 100644
index 00000000..74c99b36
--- /dev/null
+++ b/t_res/utils/batch_job.py
@@ -0,0 +1,506 @@
+import os
+import sys
+import yaml
+import json
+import argparse
+import logging
+import pickle
+import importlib.metadata
+from math import ceil
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Tuple
+from tqdm import tqdm
+
+from sentence_splitter import SentenceSplitter
+from datasets import Dataset
+from transformers.pipelines.pt_utils import KeyDataset
+import pandas as pd
+import sqlite3
+
+from t_res.geoparser import ner, ranking, linking, pipeline
+from t_res.utils.dataclasses import Candidates, SentenceCandidates
+
+RECOGNISER_KEY = 'recogniser'
+RANKER_KEY = 'ranker'
+LINKER_KEY = 'linker'
+BATCH_SIZE_KEY = 'batch_size'
+LOG_LEVEL_KEY = 'log_level'
+
+def run():
+ parser = argparse.ArgumentParser(description='Run a T-Res batch job.')
+ parser.add_argument('config_file', type=str, help='Path to the YAML batch job config file.')
+ parser.add_argument('input_file', type=str, help='Path to the input CSV data file.')
+ parser.add_argument('resources_path', type=str, help='Path to the resources directory.')
+ parser.add_argument('results_path', type=str, help='Path to the results directory.')
+ help = '''[Optional] Path to the place of publication CSV data file. \
+ Must include columns named "Wikidata ID" and "Location"'''
+ parser.add_argument('place_of_pub_file', type=str, help=help, nargs='?')
+
+ args = parser.parse_args()
+
+ with open(args.config_file) as stream:
+ try:
+ config = yaml.safe_load(stream)
+ except yaml.YAMLError as err:
+ print(f"Error parsing YAML config file: {err}")
+ sys.exit()
+
+ validate_config(config)
+ tqdm.pandas()
+
+ if not os.path.exists(args.resources_path):
+ raise ValueError(f"Resources path does not exist: {args.resources_path}")
+ if not os.path.isfile(args.input_file):
+ raise ValueError(f"Missing input data file: {args.input_file}")
+ Path(args.results_path).mkdir(parents=True, exist_ok=True)
+
+ batch_job = BatchJob.new(
+ batch_size=config[BATCH_SIZE_KEY],
+ config=config,
+ input_file=args.input_file,
+ resources_path=args.resources_path,
+ results_path=args.results_path,
+ place_of_pub_file=args.place_of_pub_file,
+ )
+ batch_job.load()
+ batch_job.run()
+
+def validate_config(config: dict):
+ keys = {RECOGNISER_KEY, RANKER_KEY, LINKER_KEY, BATCH_SIZE_KEY}
+ missing_keys = keys.difference(config.keys())
+ if missing_keys:
+ raise ValueError(f"Missing config key(s): {missing_keys}")
+ if LOG_LEVEL_KEY in config.keys():
+ if not config[LOG_LEVEL_KEY] in {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}:
+ raise ValueError(f'Invalid log_level config parameter: {config[LOG_LEVEL_KEY]}')
+
+# TODO: add error handling around run_batch so the whole job does not fail on a single error.
+class BatchJob:
+ """
+ A wrapper for the Pipeline class for efficient & convenient processing
+ of large datasets.
+ """
+
+ text_colname = 'text'
+ nlp_colname = 'NLP'
+
+ predictions_csv = 'predictions.csv'
+ predictions_pickle = 'predictions.pkl'
+ predictions_colname = 'predictions'
+
+ place_of_pub_wqid_key = 'place_of_pub_wqid'
+ place_of_pub_key = 'place_of_pub'
+
+ def __init__(
+ self,
+ config: dict,
+ input_file: str,
+ resources_path: str,
+ results_path: str,
+ place_of_pub_file: Optional[str]=None,
+ ):
+
+ # Set the default logging level.
+ if not LOG_LEVEL_KEY in config.keys():
+ config[LOG_LEVEL_KEY] = 'INFO'
+ self.config = config
+ self.config_str = json.dumps(config, indent=4)
+ self.resources_path = resources_path
+ self.results_path = results_path
+ self.input_file = input_file
+ self.place_of_pub_file = place_of_pub_file
+
+ try:
+ self.batch_size = int(config[BATCH_SIZE_KEY])
+        except (ValueError, TypeError):
+            raise ValueError('Batch size must be an integer. Use 0 for unlimited batch size.')
+
+ self.batches_processed = 0
+
+ def new(batch_size: int, **kwargs) -> 'BatchJob':
+ """
+ Static constructor.
+
+ Args:
+ batch_size (int): A non-negative integer. The size of each batch.
+ kwargs (dict): A dictionary of keyword arguments matching the
+ arguments to the BatchJob __init__ constructor.
+
+ Returns:
+ A BatchJob (subclass) instance.
+ """
+ if batch_size == 0:
+ return UnlimitedBatchJob(**kwargs)
+ if batch_size == 1:
+ return SingletonBatchJob(**kwargs)
+ if batch_size > 1:
+ return LimitedBatchJob(**kwargs)
+ raise ValueError(f'Invalid batch_size: {batch_size}')
+
+ def load(self):
+
+ # Construct the T-Res pipeline.
+ self.construct_pipeline()
+
+ # Read input data & drop rows with empty text.
+ self.input_data = pd.read_csv(self.input_file)
+ if not self.text_colname in self.input_data.columns:
+ raise ValueError(f'Input data must contain a column named "{self.text_colname}"')
+        self.input_data = self.input_data.dropna(subset=[self.text_colname])
+
+ # Read place of publication information into a dictionary.
+ self.place_of_pub_data = dict()
+ self.missing_place_of_pub_data = list()
+ if self.place_of_pub_file:
+
+ for i, row in pd.read_csv(self.place_of_pub_file).iterrows():
+ self.place_of_pub_data[row[self.nlp_colname]] = {
+ self.place_of_pub_wqid_key: row['Wikidata ID'],
+                self.place_of_pub_key: row['Location']
+ }
+
+ # Handle the case where x["NLP"] is not found in place_of_pub_data.
+ def place_of_pub_for_nlp(nlp: str):
+ if nlp in self.place_of_pub_data.keys():
+ return self.place_of_pub_data[nlp]
+ if not nlp in self.missing_place_of_pub_data:
+ self.missing_place_of_pub_data.append(nlp)
+ return {
+ self.place_of_pub_wqid_key: "",
+ self.place_of_pub_key: ""
+ }
+
+ self.place_of_pub_series = self.input_data.apply(
+ lambda x: place_of_pub_for_nlp(x[self.nlp_colname]),
+ axis=1,
+ )
+ else:
+ self.place_of_pub_data = None
+
+ def construct_pipeline(self):
+
+ recogniser = ner.Recogniser.new(
+ **self.config[RECOGNISER_KEY])
+ ranker = ranking.Ranker.new(
+ resources_path=self.resources_path,
+ **self.config[RANKER_KEY])
+
+ # Fill in linking parameters in the case of a REL Linker.
+ if self.config[LINKER_KEY]['method_name'] == 'reldisamb':
+ self.config[LINKER_KEY]['ranker'] = ranker
+ if 'rel_params' not in self.config[LINKER_KEY].keys():
+ print("No `rel_params` configuration parameter found for REL linking. Using defaults.")
+ self.config[LINKER_KEY]['rel_params'] = dict()
+ rel_params = self.config[LINKER_KEY]['rel_params']
+ rel_params['do_test'] = False
+ rel_params['model_path'] = os.path.join(self.resources_path, "models/disambiguation/")
+ db_database_path = os.path.join(self.resources_path, "rel_db/embeddings_database.db")
+ with sqlite3.connect(db_database_path) as conn:
+ rel_params['db_embeddings'] = conn.cursor()
+
+ linker = linking.Linker.new(
+ resources_path=self.resources_path,
+ **self.config[LINKER_KEY])
+
+ self.pipe = pipeline.Pipeline(
+ recogniser=recogniser,
+ ranker=ranker,
+ linker=linker,
+ )
+ # self.logger.info('Constructed T-Res pipeline')
+
+ def initialise_logging(self):
+
+ logger = logging.getLogger(__name__)
+ self.log_file = os.path.join(self.run_path, f'{self.run_title()}.log')
+ logging.basicConfig(
+ filename=self.log_file,
+ encoding='utf-8',
+ format='%(asctime)s %(levelname)s %(message)s',
+ datefmt='%m/%d/%Y %H:%M:%S',
+ )
+ logger.setLevel(self.config[LOG_LEVEL_KEY])
+ self.logger = logger
+
+ print(">>>> Running T-Res batch job >>>>")
+ self.logger.info(f'Starting T-Res batch job...')
+ self.logger.info(f'T-Res version: {importlib.metadata.version("t_res")}')
+ self.logger.info(f'Input data file: {self.input_file}')
+ if self.place_of_pub_file:
+ self.logger.info(f'Place of publication data file: {self.place_of_pub_file}')
+ if self.missing_place_of_pub_data:
+ self.logger.warning(f'Missing place of publication data for the \
+ following NLPs:\n{self.missing_place_of_pub_data}')
+ else:
+ self.logger.info('Place of publication data found for all NLPs in the input data')
+ self.logger.info(f'Results will be written to: {self.results_path}')
+ self.logger.info(f'Resources will be read from: {self.resources_path}')
+ self.logger.info(f'Config:\n{self.config_str}')
+
+ self.logger.info(f'Recogniser device: {self.pipe.recogniser.device}')
+ if isinstance(self.pipe.linker, linking.RelDisambLinker):
+ self.logger.info(f'REL Linker device: {self.pipe.linker.entity_disambiguation_model.device}')
+
+ def timestamp(self) -> str:
+ return self.start_time.strftime('%Y-%m-%d_%H-%M-%S')
+
+ def run_title(self) -> str:
+ return f't-res_batch_{self.timestamp()}'
+
+ def run(self):
+
+ # Store the start time of this run.
+ self.start_time = datetime.now()
+
+ # Create a subdirectory for this run.
+ self.run_path = os.path.join(self.results_path, self.run_title())
+ os.mkdir(self.run_path)
+ self.initialise_logging()
+
+ predictions = self.run_batches()
+
+ # Store the start time of this run.
+ self.end_time = datetime.now()
+
+ # Save the predictions.
+ with open(os.path.join(self.run_path, self.predictions_pickle), 'wb') as f:
+ pickle.dump(predictions, f)
+
+ self.save_results(predictions)
+
+ # TODO: tidy up (remove intermediate files, unless configured to keep them).
+ self.logger.info('Batch job finished successfully.')
+ self.logger.info(f'Execution time: {self.execution_time()}')
+ print(f'>>>> T-Res batch job finished ({self.execution_time()}) <<<<')
+
+ def run_batches(self) -> pd.Series:
+
+ predictions_list = list()
+ while(self.next_batch_range()):
+ # Split input into batches of size batch_size.
+ r = self.next_batch_range()
+ next_batch = self.input_data.iloc[r[0]:r[1]]
+ self.logger.info(f'Running batch {self.batches_processed + 1}. Items {r[0]}-{r[1]}')
+ print(f'Batch {self.batches_processed + 1} of {self.count_batches()}:')
+ predictions_list.append(self.run_batch(next_batch))
+
+ return pd.concat(predictions_list)
+
+ def run_batch(self, batch) -> pd.Series:
+
+ mentions_series = self.run_batch_ner(batch)
+ candidates_series = self.run_batch_ranking(mentions_series)
+ predictions_series = self.run_batch_linking(candidates_series)
+ self.batches_processed += 1
+ return predictions_series
+
+ def run_batch_ner(self, batch) -> pd.Series:
+
+ print('NER...')
+ tick = datetime.now()
+
+ splitter = SentenceSplitter(language='en', non_breaking_prefix_file=None)
+ def run_ner(row):
+ # Handle the case of empty text.
+ text = str(row[self.text_colname])
+ if len(text) <= 1:
+ return list()
+ # Create a HuggingFace Dataset instance from the list of sentences (to leverage GPU).
+ sentences = splitter.split(text)
+ dataset = Dataset.from_pandas(pd.DataFrame({'text': sentences}))
+ # Call the recogniser pipeline on the dataset.
+ ner_predictions = self.pipe.recogniser.pipe(KeyDataset(dataset, 'text'))
+ # Return a list of non-empty SentenceMentions instances.
+ sms = [self.pipe.recogniser.post_process(p, s) for p, s in zip(ner_predictions, sentences)]
+ return [sm for sm in sms if not sm.is_empty()]
+
+ result = batch.progress_apply(run_ner, axis=1)
+
+ tock = datetime.now()
+ self.logger.info(f'NER execution time: {tock - tick}')
+ return result
+
+ def run_batch_ranking(self, mentions_series) -> pd.Series:
+
+ print('Candidate selection...')
+ tick = datetime.now()
+
+ # Convert to a data frame to access the row index via the `name` field.
+ result = pd.DataFrame(mentions_series).progress_apply(
+ lambda x: self.pipe.run_candidate_selection(
+ x[0],
+ place_of_pub_wqid=self.place_of_pub_wqid(x.name),
+ place_of_pub=self.place_of_pub(x.name),
+ ),
+ axis=1,
+ )
+
+ tock = datetime.now()
+ self.logger.info(f'Candidate Selection execution time: {tock - tick}')
+ return result
+
+ def run_batch_linking(self, candidate_series):
+
+ print('Disambiguation...')
+ tick = datetime.now()
+ result = candidate_series.progress_apply(
+ lambda x: self.pipe.run_disambiguation(x),
+ )
+ tock = datetime.now()
+ self.logger.info(f'Disambiguation execution time: {tock - tick}')
+ return result
+
+ def place_of_pub_wqid(self, row_index: int) -> str:
+ """
+ Gets the place of publication Wikidata ID for a given row of the input data file.
+ Returns an empty string if no place of publication information is available.
+
+ Args:
+ row_index (int): The row index in the input data file.
+
+ Returns:
+ str: The place of publication Wikidata ID, if available, otherwise an empty string.
+ """
+ if self.place_of_pub_file:
+ return self.place_of_pub_series[row_index][self.place_of_pub_wqid_key]
+ return ""
+
+ def place_of_pub(self, row_index: int) -> str:
+ """
+ Gets the place of publication for a given row of the input data file.
+ Returns an empty string if no place of publication information is available.
+
+ Args:
+ row_index (int): The row index in the input data file.
+
+ Returns:
+ str: The place of publication, if available, otherwise an empty string.
+ """
+ if self.place_of_pub_file:
+ return self.place_of_pub_series[row_index][self.place_of_pub_key]
+ return ""
+
+ def save_results(self, predictions: pd.Series):
+ """
+ Write the results to a CSV file.
+
+ Args:
+ predictions (Series): A pandas Series containing an instance of
+ the `Predictions` dataclass for each row in the input data.
+ """
+ def summarise(p):
+ if not p:
+ return list()
+ return p.summary_dict()
+
+ # Write an extra column alongside the CSV input data.
+ predictions_column = predictions.apply(summarise)
+ results = pd.concat([self.input_data, predictions_column.rename(self.predictions_colname)], axis=1)
+
+ results.to_csv(self.results_file(), index=False)
+ self.logger.info(f'Results written to {self.results_file()}')
+
+ # Write a new CSV file containing one row per toponym prediction.
+ predictions_list = predictions_column.tolist()
+
+        # Flatten the per-row prediction lists into a single list of dicts.
+ # Include the index of the corresponding row of the input CSV file.
+ flat_predictions_list = [dict(p, **{'input_row_index': i}) for i, predictions
+ in enumerate(predictions_list) for p in predictions]
+ predictions_df = pd.DataFrame(flat_predictions_list)
+
+ predictions_file = os.path.join(self.run_path, self.predictions_csv)
+ predictions_df.to_csv(predictions_file, index=False)
+ self.logger.info(f'Predictions written to {predictions_file}')
+
+ def results_file(self) -> str:
+ _, input_filename = os.path.split(self.input_file)
+ prefix, _ = os.path.splitext(input_filename)
+ suffix = '_' + self.config[RECOGNISER_KEY]['method_name']
+ suffix += '-' + self.config[RANKER_KEY]['method_name']
+ suffix += '-' + self.config[LINKER_KEY]['method_name']
+ if self.config[LINKER_KEY]['method_name'] == 'reldisamb':
+ if self.config[LINKER_KEY]['rel_params']['with_publication']:
+ if self.config[LINKER_KEY]['rel_params']['predict_place_of_publication']:
+ suffix += '-predictpub'
+ else:
+ suffix += '-withpub'
+ else:
+ suffix += '-nopub'
+ if self.config[LINKER_KEY]['rel_params']['combined_score']:
+ suffix += '-combined'
+ if self.config[LINKER_KEY]['rel_params']['without_microtoponyms']:
+ suffix += '-nomicro'
+ else:
+ suffix += '-withmicro'
+ return os.path.join(self.run_path, prefix + suffix + ".csv")
+
+ def next_batch_range(self) -> Optional[Tuple[int, int]]:
+ """
+ Computes the range of indices of items in the next batch.
+ Item indices start counting from zero and the range is inclusive
+ of the lower end of the range and exclusive of the upper end.
+
+ For instance, if batch range (0, 10) includes items with indices
+ 0 to 9.
+
+ Returns:
+ The range of item indices in the next batch.
+ """
+
+ # Count items from zero.
+ next_item = self.batches_processed * self.batch_size
+ final_item = len(self.input_data.index) - 1
+ if next_item > final_item:
+ return None
+ return next_item, min(next_item + self.batch_size, final_item + 1)
+
+ def count_batches(self) -> int:
+ return ceil(len(self.input_data.index) / self.batch_size)
+
+ def execution_time(self):
+ return self.end_time - self.start_time
+
+class LimitedBatchJob(BatchJob):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ if not self.batch_size > 0:
+ raise ValueError(f'Invalid batch size: {self.batch_size}')
+
+class UnlimitedBatchJob(BatchJob):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ if self.batch_size != 0:
+ raise ValueError(f'Invalid batch size: {self.batch_size}')
+
+ # Override the `load` method to set the batch size equal to the input data size.
+ def load(self):
+ super().load()
+ self.batch_size = len(self.input_data.index)
+
+class SingletonBatchJob(BatchJob):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ if self.batch_size != 1:
+ raise ValueError(f'Invalid batch size: {self.batch_size}')
+
+ # Override the `run_batches` method to run the pipeline end-to-end.
+ def run_batches(self) -> pd.Series:
+
+ def run(row):
+ if self.config[LOG_LEVEL_KEY] == 'DEBUG':
+ self.logger.debug(f'Running pipeline on text:\n{row[self.text_colname]}')
+ self.logger.debug(f'Place of publication ID:{self.place_of_pub_wqid(row.name)}')
+ self.logger.debug(f'Place of publication:\n{self.place_of_pub(row.name)}')
+ return self.pipe.run(
+ row[self.text_colname],
+ place_of_pub_wqid=self.place_of_pub_wqid(row.name),
+ place_of_pub=self.place_of_pub(row.name),
+ )
+
+ print('Running end-to-end pipeline...')
+ return self.input_data.progress_apply(run, axis=1)
diff --git a/t_res/utils/dataclasses.py b/t_res/utils/dataclasses.py
new file mode 100644
index 00000000..b9f26ad4
--- /dev/null
+++ b/t_res/utils/dataclasses.py
@@ -0,0 +1,1018 @@
+"""
+The `t_res.utils.dataclasses` module defines all data structures used within the T-Res pipeline,
+implemented as Python dataclasses.
+"""
+
+from typing import List, Dict, Tuple, Optional
+from pydantic.dataclasses import dataclass as pdataclass
+from dataclasses import field
+
+from sentence_splitter import SentenceSplitter
+
+################################
+# Dataclasses for Recogniser
+################################
+
+@pdataclass(order=True, frozen=True)
+class Mention:
+ """Dataclass representing a toponym mention in text.
+
+ Attributes:
+ mention (str): The toponym mention.
+ start_offset (int): The token offset inside the text marking the start of the mention.
+ end_offset (int): The token offset inside the text marking the end of the mention.
+ start_char (int): The character offset inside the text marking the start of the mention.
+ ner_score (float): The NER confidence score.
+ ner_label (float): The NER label of the mention.
+ entity_link (str): The consolidated entity link of the mention ('O' for predicted mentions).
+ """
+ sort_index: int = field(init=False)
+ mention: str
+ start_offset: int
+ end_offset: int
+ start_char: int
+ ner_score: float
+ ner_label: str
+ entity_link: str
+
+ def __post_init__(self):
+ object.__setattr__(self, 'sort_index', self.start_char)
+
+ def __str__(self, pad_mention: int=0, pad_label: int=0):
+ s = f"{self.mention.ljust(pad_mention)} {self.ner_label.ljust(pad_label)}"
+ s += f" chars: {self.start_char}-{self.end_char()}"
+ s += f" confidence: {self.ner_score}"
+ return s
+
+ def from_dict(data: dict) -> 'Mention':
+ """Constructs a `Mention` instance from a dictionary."""
+ if 'sort_index' in data.keys():
+ del data['sort_index']
+ return Mention(**data)
+
+ def end_char(self) -> int:
+ """Returns the character offset inside the text marking the end of the mention."""
+ return self.start_char + len(self.mention)
+
+ def is_microtoponym(self) -> bool:
+ """Returns `True` if the `ner_label` is not `LOC`, indicating a microtoponym."""
+ return self.ner_label != "LOC"
+
+# Helper class for backwards compatibility with training functions in rel_utils.py
+@pdataclass(frozen=True)
+class TrainingMention(Mention):
+ """Helper class providing backwards compatibility with training functions in `rel_utils.py`.
+
+ Attributes:
+ gold (str): The Wikidata ID of the known ("gold standard") toponym, or 'NIL' if not known.
+ """
+ gold: str
+
+ def from_dict(data: dict) -> 'TrainingMention':
+ """Constructs a `TrainingMention` instance from a dictionary."""
+ if isinstance(data['gold'], list) and len(data['gold']) != 1:
+ raise ValueError(f"Multiple gold standard toponymn IDs: {data['gold']}")
+ if 'tag' in data.keys() and 'ner_label' not in data.keys():
+ data['ner_label'] = data['tag']
+ return TrainingMention(
+ mention=data['mention'],
+ start_offset=-1,
+ end_offset=-1,
+ start_char=data['pos'],
+ ner_score=-1.0,
+ ner_label=data['ner_label'],
+ entity_link='',
+ gold=data['gold'][0] if isinstance(data['gold'], list) else data['gold'],
+ )
+
+@pdataclass(frozen=True)
+class Sentence:
+ """Dataclass representing a sentence.
+
+ Attributes:
+ sentence (str): The sentence.
+ """
+ sentence: str
+
+ def __len__(self):
+ return len(self.sentence)
+
+@pdataclass(frozen=True)
+class SentenceContext(Sentence):
+ """Dataclass representing a sentence with (optional) context.
+
+ Attributes:
+ preceding_sentence (Optional[str]): The preceding sentence (context).
+ following_sentence (Optional[str]): The following sentence (context).
+ sent_idx (Optional[int]): The sentence index (within a block of text). Defaults to None.
+ """
+ preceding_sentence: Optional[str]
+ following_sentence: Optional[str]
+ sent_idx: Optional[int]=None
+
+ def from_text(text: str, language: str="en", non_breaking_prefix_file: str=None) -> List['SentenceContext']:
+ """Constructs a list of `SentenceContext` instances from a block of text."""
+ splitter = SentenceSplitter(language=language, non_breaking_prefix_file=non_breaking_prefix_file)
+ sentences = splitter.split(text)
+ return [SentenceContext(s, sentences[i - 1] if i > 0 else None,
+ sentences[i + 1] if i < len(sentences) - 1 else None)
+ for i, s in enumerate(sentences)]
+
+ def from_sentence(sentence: str) -> 'SentenceContext':
+ """Constructs a `SentenceContext` instance from a string."""
+ return SentenceContext(sentence, None, None)
+
+ # Helper method for the Predictions as_dict method.
+ def context_as_list(self) -> List[str]:
+ """Converts this instance to a list of strings."""
+ preceding = self.preceding_sentence if self.preceding_sentence is not None else ''
+ following = self.following_sentence if self.following_sentence is not None else ''
+ return [preceding, following]
+
+ # For API deserialisation.
+ def from_dict(data: dict) -> Sentence:
+ """Constructs a `SentenceContext` instance from a dictionary."""
+ ps = data['preceding_sentence'] if 'preceding_sentence' in data.keys() else None
+ fs = data['following_sentence'] if 'following_sentence' in data.keys() else None
+ sent_idx = data['sent_idx'] if 'sent_idx' in data.keys() else None
+ if ps or fs or sent_idx:
+ return SentenceContext(data['sentence'], ps, fs, sent_idx)
+ return Sentence(data['sentence'])
+
+# Recogniser::run method output type.
+@pdataclass(frozen=True)
+class SentenceMentions:
+    """Dataclass representing toponym mentions in a sentence.
+
+    Attributes:
+        sentence (Sentence): The sentence.
+        mentions (List[Mention]): A list of toponym mentions, ordered by character offset within the sentence.
+    """
+    sentence: Sentence
+    mentions: List[Mention]
+
+    def __post_init__(self):
+        # Validate that no mention extends past the end of the sentence.
+        # NOTE(review): relies on Sentence defining __len__ — confirm.
+        if self.is_empty():
+            return
+        if max([m.end_char() for m in self.mentions]) > len(self.sentence):
+            raise ValueError("Max end char exceeds sentence length.")
+
+    def __str__(self):
+        # Pretty-print one line per mention, padded so columns align.
+        s = f"Toponym mentions for sentence: '{self.sentence.sentence}'"
+        if self.is_empty():
+            s += "\n None"
+            return s
+        pad_mention = max([len(m.mention) for m in self.mentions])
+        pad_label = max([len(m.ner_label) for m in self.mentions])
+        for m in self.mentions:
+            s += f"\n {m.__str__(pad_mention, pad_label)}"
+        return s
+
+    def is_empty(self) -> bool:
+        """Returns `True` if the list of toponym mentions is empty."""
+        return len(self.mentions) == 0
+
+    def len(self) -> int:
+        """Returns the length of the list of toponym mentions."""
+        return len(self.mentions)
+
+    def exclude_microtoponyms(self) -> 'SentenceMentions':
+        """Returns this `SentenceMentions` instance omitting any microtoponym mentions."""
+        mentions = list(filter(lambda m: not m.is_microtoponym(), self.mentions))
+        return SentenceMentions(self.sentence, mentions)
+
+    # Helper method for backwards compatibility with training functions in `rel_utils.py`.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_list(data: List[Dict]) -> 'SentenceMentions':
+        """Constructs a `SentenceMentions` instance from a list of dictionaries.
+
+        Helper method for backwards compatibility with training functions in `rel_utils.py`.
+        """
+        # The data are assumed to be in the format returned by the
+        # `prepare_initial_data` method in `rel_utils.py`.
+        # NOTE(review): raises IndexError on an empty list (data[0]) — confirm
+        # callers never pass [].
+        mentions = [TrainingMention.from_dict(d) for d in data]
+        # Check that all sentences in the list are identical.
+        if {d['sentence'] for d in data} != {data[0]['sentence']}:
+            raise ValueError("Inconsistent sentences.")
+        d = data[0]
+        context = SentenceContext(d['sentence'], d['context'][0], d['context'][1], d['sent_idx'])
+        return SentenceMentions(context, mentions)
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'SentenceMentions':
+        """Constructs a `SentenceMentions` instance from a dictionary."""
+        return SentenceMentions(
+            sentence=SentenceContext.from_dict(data['sentence']),
+            mentions=[Mention.from_dict(d) for d in data['mentions']],
+        )
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_json(data: List[Dict]) -> List['SentenceMentions']:
+        """Constructs a list of `SentenceMentions` instances from a list of dictionaries."""
+        return [SentenceMentions.from_dict(d) for d in data]
+
+
+################################
+# Dataclasses for Ranker
+################################
+
+@pdataclass(order=True, frozen=True)
+class StringMatch:
+ """Dataclass representing a potential toponym string match.
+
+ Attributes:
+ variation (str): The toponym spelling variation.
+ string_similarity (float): String matching similarly score.
+ """
+ sort_index: float = field(init=False)
+ variation: str
+ string_similarity: float
+
+ def __post_init__(self):
+ object.__setattr__(self, 'sort_index', self.string_similarity)
+
+ # For API deserialisation.
+ def from_dict(data: dict) -> 'StringMatch':
+ """Constructs a `StringMatch` instance from a dictionary."""
+ if 'sort_index' in data.keys():
+ del data['sort_index']
+ if 'wqid_links' in data.keys():
+ return StringMatchLinks(**data)
+ return StringMatch(**data)
+
+@pdataclass(order=True, frozen=True)
+class StringMatchLinks(StringMatch):
+    """Dataclass representing a potential toponym string match
+    with potential Wikidata ID links.
+
+    Attributes:
+        wqid_links (List[str]): List of potential Wikidata ID links.
+    """
+    wqid_links: List[str]
+
+    def as_string_match(self) -> StringMatch:
+        """Converts this `StringMatchLinks` instance into a `StringMatch` instance
+        by dropping the Wikidata ID links."""
+        return StringMatch(self.variation, self.string_similarity)
+
+# Ranker::run method output type.
+@pdataclass(frozen=True)
+class CandidateMatches:
+    """Dataclass representing candidate matches for a toponym.
+
+    Attributes:
+        mention (Mention): The toponym mention in the text.
+        ranking_method (str): The string matching method used.
+        matches (List[StringMatchLinks]): A list of potential toponym matches, each with potential Wikidata links.
+    """
+    mention: Mention
+    ranking_method: str
+    matches: List[StringMatchLinks]
+
+    def __post_init__(self):
+        # Check that the variations are unique in self.matches.
+        variations = [match.variation for match in self.matches]
+        if len(variations) != len(set(variations)):
+            raise ValueError("StringMatch variations must be unique.")
+        # Order matches by decreasing string similarity.
+        # (object.__setattr__ bypasses the frozen-dataclass guard.)
+        object.__setattr__(self, 'matches', sorted(self.matches, reverse=True))
+
+    def is_empty(self) -> bool:
+        """Returns `True` if the list of toponym matches is empty."""
+        return len(self.matches) == 0
+
+    def get(self, variation: str) -> Optional[StringMatchLinks]:
+        """Returns the StringMatch instance with the given spelling variation
+        or None if no such match exists."""
+        for m in self.matches:
+            if m.variation == variation:
+                return m
+        return None
+
+
+################################
+# Dataclasses for Linker
+################################
+
+# Base dataclass.
+@pdataclass(frozen=True)
+class WikidataLink:
+    """Dataclass representing a potential toponym link in Wikidata.
+
+    Attributes:
+        wqid (str): The Wikidata ID.
+        wkdt_class (Optional[str]): The Wikidata class of this Wikidata entry (if available).
+        coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the link in Wikidata.
+    """
+    wqid: str
+    wkdt_class: Optional[str]
+    coords: Optional[Tuple[float, float]]
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'WikidataLink':
+        """Constructs a `WikidataLink` instance (or the appropriate subclass)
+        from a dictionary, dispatching on which keys are present."""
+        if 'freq' in data.keys():
+            # 'freq' plus 'normalized_score' identifies the reldisamb method.
+            if 'normalized_score' in data.keys():
+                return RelDisambLink(**data)
+            return MostPopularLink(**data)
+        return ByDistanceLink(**data)
+
+@pdataclass(frozen=True)
+class MostPopularLink(WikidataLink):
+    """Dataclass representing a string match and potential links in
+    Wikidata under the `mostpopular` linking method.
+
+    Attributes:
+        freq (int): The mention-to-wikidata link frequency.
+    """
+    freq: int
+
+    def __post_init__(self):
+        # NOTE(review): isinstance(..., int) also accepts bool (a subclass
+        # of int) — confirm whether that should be rejected.
+        if not isinstance(self.freq, int):
+            raise ValueError("freq must be an integer.")
+
+@pdataclass(frozen=True)
+class ByDistanceLink(WikidataLink):
+ """Dataclass representing a string match and potential links in
+ Wikidata under the `bydistance` linking method.
+
+ Attributes:
+ place_of_pub_coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the place of publication.
+ geodist (Optional[float]): The geodesic distance between the wqid and the origin wqid.
+ normalized_score (float): The normalized score from resource `mentions_to_wikidata_normalized.json`.
+ """
+ place_of_pub_coords: Optional[Tuple[float, float]]
+ geodist: Optional[float]
+ normalized_score: float
+
+ def __post_init__(self):
+ if not isinstance(self.normalized_score, float):
+ raise ValueError("normalized_score must be an float.")
+
+@pdataclass(frozen=True)
+class RelDisambLink(MostPopularLink):
+ """Dataclass representing a string match and potential links in
+ Wikidata under the `reldisamb` linking method.
+
+ Attributes:
+ normalized_score (float): The normalized score from resource `mentions_to_wikidata_normalized.json`.
+ """
+ normalized_score: float
+
+ def __post_init__(self):
+ super().__post_init__()
+ if not isinstance(self.normalized_score, float):
+ raise ValueError("normalized_score must be an float.")
+
+@pdataclass(order=True, frozen=True)
+class CandidateLinks:
+    """Dataclass representing a collection of potential links in Wikidata for a given string match.
+
+    Attributes:
+        string_match (StringMatch): A StringMatch instance.
+        wikidata_links (List[WikidataLink]): A list of candidate WikidataLink instances.
+    """
+    # Derived field used by order=True so instances sort by string similarity.
+    sort_index: float = field(init=False)
+    string_match: StringMatch
+    wikidata_links: List[WikidataLink]
+
+    def __post_init__(self):
+        # object.__setattr__ bypasses the frozen-dataclass guard.
+        object.__setattr__(self, 'sort_index', self.string_match.string_similarity)
+
+    # One liner.
+    # NOTE(review): non-standard __str__ signature (extra parameter) — it is
+    # invoked explicitly as link.__str__(pad) by callers in this module.
+    def __str__(self, pad_variation: int=0) -> str:
+        s = f"{self.string_match.variation.ljust(pad_variation)}"
+        s += f" [{'{:.3f}'.format(self.string_match.string_similarity)}]"
+        s += f": {self.links_str()}"
+        return s
+
+    def links_str(self) -> str:
+        """Returns a string representation of the list of Wikidata links (for pretty-printing).
+
+        Shows at most the first three links, with an ellipsis if more exist."""
+        if self.is_empty():
+            return "None"
+        s = ', '.join(link.wqid for link in self.wikidata_links[:3])
+        if len(self.wikidata_links) > 3:
+            s += ", ..."
+        return s
+
+    def is_empty(self) -> bool:
+        """Returns `True` if the list of Wikidata links is empty."""
+        return len(self.wikidata_links) == 0
+
+    def attach_scores(self, scores: Dict[str, float]) -> 'PredictedLinks':
+        """Transforms this CandidateLinks instance into a PredictedLinks instance
+        by attaching disambiguation scores."""
+        # Check that there is one score for each link.
+        if scores.keys() != {link.wqid for link in self.wikidata_links}:
+            raise ValueError("Incompatible disambiguation scores.")
+        return PredictedLinks(self.string_match, self.wikidata_links, scores)
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'CandidateLinks':
+        """Constructs a `CandidateLinks` instance from a dictionary.
+
+        Returns a `PredictedLinks` instance when disambiguation scores are
+        present in the dictionary."""
+        if 'disambiguation_scores' in data.keys():
+            return PredictedLinks(
+                string_match=StringMatch.from_dict(data['string_match']),
+                wikidata_links=[WikidataLink.from_dict(d) for d in data['wikidata_links']],
+                disambiguation_scores=data['disambiguation_scores'],
+            )
+        return CandidateLinks(
+            string_match=StringMatch.from_dict(data['string_match']),
+            wikidata_links=[WikidataLink.from_dict(d) for d in data['wikidata_links']],
+        )
+
+# Extend CandidateLinks to include disambigution scores. Note that we use
+# inheritance, rather than composition, for compatibility with the `links`
+# field in the Candidates dataclass.
+@pdataclass(order=True, frozen=True)
+class PredictedLinks(CandidateLinks):
+    """Dataclass representing a collection of potential links in Wikidata with scores for each.
+
+    Attributes:
+        disambiguation_scores (Dict[str, float]): A disambiguation score for each potential link in Wikidata.
+    """
+    disambiguation_scores: Dict[str, float]
+
+    def links_str(self) -> str:
+        """(Override) Returns a string representation of the list of Wikidata links (for pretty-printing)."""
+        if self.is_empty():
+            return "None"
+        l = [f"{s} ({v})" for s, v in self.cross_cand_scores().items()]
+        s = ', '.join(l[:3])
+        if len(self.wikidata_links) > 3:
+            s += ", ..."
+        return s
+
+    def best_disambiguation_score(self) -> Optional[float]:
+        """Returns the greatest disambiguation score, or None if there are no links."""
+        if self.is_empty():
+            return None
+        return max(self.disambiguation_scores.values())
+
+    # TODO: use min(self.wikidata_links, key=lambda link: link....) if poss.
+    def best_wikidata_link(self) -> Optional[WikidataLink]:
+        """Returns the Wikidata link with the greatest disambiguation score.
+
+        Falls through to an implicit None if no link matches the best wqid."""
+        if self.is_empty():
+            return None
+        for link in self.wikidata_links:
+            if link.wqid == self.best_wqid():
+                return link
+
+    def best_wqid(self) -> Optional[str]:
+        """Returns the Wikidata ID of the link with the greatest disambiguation score,
+        or None if there are no links."""
+        if self.is_empty():
+            return None
+        scores = self.disambiguation_scores
+        return max(scores, key=lambda key: scores[key])
+
+    # NOTE(review): the parameter name `len` shadows the builtin within this method.
+    def cross_cand_scores(self, len=7) -> dict:
+        """Returns the top `len` (default 7) Wikidata links in order of their disambiguation score
+        (providing backwards compatibility with T-Res pipeline output in previous versions)."""
+        scores = {k: round(v, 3) for (k, v) in self.disambiguation_scores.items()}
+        return dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:len])
+
+    # Helper method for the Predictions as_dict method.
+    def scores_as_list(self) -> list:
+        """Returns the disambiguation scores as a list of [wqid, rounded_score]
+        pairs, sorted by descending score then wqid.
+
+        Helper method for the Predictions as_dict method."""
+        ret = [[k, round(v, 3)] for k, v in self.disambiguation_scores.items()]
+        return sorted(ret, key=lambda x: (x[1], x[0]), reverse=True)
+
+# Linker::run method output type.
+@pdataclass(order=True, frozen=True)
+class MentionCandidates:
+    """Dataclass representing candidate string matches for a toponym,
+    each with candidate Wikidata links.
+
+    Attributes:
+        mention (Mention): The toponym mention in the text.
+        ranking_method (str): The string matching method used.
+        linking_method (str): The linking method used.
+        links (List[CandidateLinks]): A list of CandidateLinks instances, ordered by decreasing string similarity.
+        place_of_pub_wqid (Optional[str]): Place of publication Wikidata ID.
+        place_of_pub (Optional[str]): Place of publication.
+    """
+    # Derived field used by order=True so instances sort by mention offset.
+    sort_index: float = field(init=False)
+    mention: Mention
+    ranking_method: str
+    linking_method: str
+    links: List[CandidateLinks]
+    place_of_pub_wqid: Optional[str]
+    place_of_pub: Optional[str]
+
+    def __post_init__(self):
+        # object.__setattr__ bypasses the frozen-dataclass guard.
+        object.__setattr__(self, 'sort_index', self.mention.start_char)
+        # Check that the variations are unique in self.links.
+        variations = [m.string_match.variation for m in self.links]
+        if len(variations) != len(set(variations)):
+            raise ValueError("StringMatch variations must be unique.")
+        # Order links by decreasing string similarity.
+        object.__setattr__(self, 'links', sorted(self.links, reverse=True))
+        # Wikidata IDs always begin with "Q".
+        if self.place_of_pub_wqid:
+            if self.place_of_pub_wqid[0] != "Q":
+                raise ValueError(f"Invalid Wikidata ID: {self.place_of_pub_wqid}")
+
+    def __str__(self) -> str:
+        # Pretty-print one line per non-empty CandidateLinks, padded to align.
+        s = f"Candidates for toponym mention: '{self.mention.mention}':"
+        if self.is_empty():
+            s += "\n None"
+            return s
+        pad_variation = max([len(l.string_match.variation) for l in self.links])
+        for link in self.links:
+            if link.is_empty():
+                continue
+            s += f"\n {link.__str__(pad_variation)}"
+        return s
+
+    def is_empty(self) -> bool:
+        """Returns `True` if the list of `CandidateLinks` is empty *or* the
+        `CandidateLinks` instance with the best string match is empty."""
+        return len(self.links) == 0 or self.links[0].is_empty()
+
+    def get(self, variation: str) -> Optional[CandidateLinks]:
+        """Returns the CandidateLinks instance with the given spelling variation,
+        or None if no such match exists."""
+        for m in self.links:
+            if m.string_match.variation == variation:
+                return m
+        return None
+
+    def best_match(self) -> Optional[CandidateLinks]:
+        """Returns the CandidateLinks instance whose StringMatch has the highest string similarity,
+        or None if no such match exists."""
+        if self.is_empty():
+            return None
+        # The list of CandidateLinks instances is ordered by decreasing string similarity.
+        return self.links[0]
+
+    def best_string_match(self) -> Optional[StringMatch]:
+        """Returns the StringMatch instance with the highest string similarity.
+        or None if no such match exists."""
+        if self.is_empty():
+            return None
+        return self.best_match().string_match
+
+    def best_wikidata_link(self) -> Optional[WikidataLink]:
+        """Returns the Wikidata link with the highest disambiguation score, associated with
+        the best string match candidate, or None if no such match exists.
+
+        Raises:
+            ValueError: if the best match has no disambiguation scores attached.
+        """
+        # Get the candidate with highest string similarity.
+        best_match = self.best_match()
+        if not best_match or best_match.is_empty():
+            return None
+        if not isinstance(best_match, PredictedLinks):
+            raise ValueError(f"Expected PredictedLinks instance. Got {type(best_match)}")
+        return best_match.best_wikidata_link()
+
+    def best_wqid(self) -> Optional[str]:
+        """Returns the Wikidata ID of the best Wikidata Link, or None if no best link exists."""
+        best_wikidata_link = self.best_wikidata_link()
+        if not best_wikidata_link:
+            return None
+        return best_wikidata_link.wqid
+
+    def best_coords(self) -> Optional[Tuple[float, float]]:
+        """Returns the lat-long coordinates of the best Wikidata Link, or None if no best link exists."""
+        best_wikidata_link = self.best_wikidata_link()
+        if not best_wikidata_link:
+            return None
+        return best_wikidata_link.coords
+
+    def best_disambiguation_score(self) -> Optional[float]:
+        """Returns the disambiguation score of the best match, or None if no such match exists.
+
+        Unlike best_wikidata_link, this returns None (rather than raising)
+        when no scores are attached."""
+        best_match = self.best_match()
+        if not best_match or best_match.is_empty():
+            return None
+        if not isinstance(best_match, PredictedLinks):
+            return None
+        return best_match.best_disambiguation_score()
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'MentionCandidates':
+        """Constructs a `MentionCandidates` instance from a dictionary.
+
+        Missing or empty place-of-publication fields are normalised to None."""
+        place_of_pub_wqid=data['place_of_pub_wqid'] if 'place_of_pub_wqid' in data.keys() and len(data['place_of_pub_wqid']) > 0 else None
+        place_of_pub=data['place_of_pub'] if 'place_of_pub' in data.keys() and len(data['place_of_pub']) > 0 else None
+        return MentionCandidates(
+            mention=Mention.from_dict(data['mention']),
+            ranking_method=data['ranking_method'],
+            linking_method=data['linking_method'],
+            links=[CandidateLinks.from_dict(d) for d in data['links']],
+            place_of_pub_wqid=place_of_pub_wqid,
+            place_of_pub=place_of_pub,
+        )
+
+################################
+# Dataclasses for Pipeline
+################################
+
+@pdataclass(frozen=True)
+class SentenceCandidates:
+    """Dataclass representing candidate matches for all toponym mentions in a sentence.
+
+    Attributes:
+        sentence (Sentence): The sentence.
+        candidates (List[MentionCandidates]): List of candidates for each toponym mention in the sentence.
+    """
+    sentence: Sentence
+    candidates: List[MentionCandidates]
+
+    def __post_init__(self):
+        # Validate that no candidate mention extends past the sentence end.
+        if self.is_empty():
+            return
+        if max([cs.mention.end_char() for cs in self.candidates]) > len(self.sentence):
+            raise ValueError("Inconsistent candidate mentions. Max end char exceeds sentence length.")
+
+    def is_empty(self, ignore_empty_candidates: bool=True) -> bool:
+        """Returns `True` if the list of `MentionCandidates` is empty.
+        If `ignore_empty_candidates` is `True`, only non-empty candidates are considered."""
+        if ignore_empty_candidates:
+            return len(self.candidates) == 0 or all([c.is_empty() for c in self.candidates])
+        return len(self.candidates) == 0
+
+    def remove_microtoponyms(self) -> Optional['SentenceCandidates']:
+        """Removes any `MentionCandidates` instances in the `candidates` list that
+        refer to a microtoponym mention.
+
+        NOTE(review): mutates the candidates list in place despite frozen=True
+        (the list object itself is mutable), and returns self only when there
+        is nothing to remove (implicitly None otherwise) — callers should not
+        rely on the return value; confirm whether this asymmetry is intended.
+        """
+        indices = [i for i, c in enumerate(self.candidates) if c.mention.is_microtoponym()]
+        if not indices:
+            return self
+        # Delete from the end so earlier indices remain valid.
+        indices.sort(reverse=True)
+        for i in indices:
+            del self.candidates[i]
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'SentenceCandidates':
+        """Constructs a `SentenceCandidates` instance from a dictionary."""
+        return SentenceCandidates(
+            sentence=SentenceContext.from_dict(data['sentence']),
+            candidates=[MentionCandidates.from_dict(d) for d in data['candidates']]
+        )
+
+# Pipeline::run_candidate_selection method output type.
+@pdataclass(frozen=True)
+class Candidates:
+    """Dataclass representing candidate matches for all toponym mentions
+    in a block of text.
+
+    Attributes:
+        sentence_candidates (List[SentenceCandidates]): List of sentence candidates for each sentence in the text.
+    """
+    sentence_candidates: List[SentenceCandidates]
+
+    def __post_init__(self):
+        if self.is_empty():
+            return
+        # Check that all place of publication data is consistent.
+        if {self.place_of_pub_wqid()} != {c.place_of_pub_wqid for c in self.candidates()}:
+            raise ValueError("Inconsistent place of publication Wikidata IDs.")
+        if {self.place_of_pub()} != {c.place_of_pub for c in self.candidates()}:
+            raise ValueError("Inconsistent place of publication data.")
+
+    def __str__(self):
+        # Abbreviate the text to its first and last three words.
+        split = self.text().split(' ')
+        s = f"{type(self).__name__} for text: '{' '.join(split[:3])}...{' '.join(split[-3:])}':"
+        if self.is_empty():
+            s += "\n None"
+            return s
+        mention_candidates = self.candidates(ignore_empty_candidates = False)
+        # Padding width for the best-match variation column (0 if no match).
+        def len_variation(c: MentionCandidates) -> int:
+            if c.best_match():
+                return len(c.best_match().string_match.variation)
+            return 0
+        pad_mention = max([len(c.mention.mention) for c in mention_candidates])
+        pad_variation = max([len_variation(c) for c in mention_candidates])
+        for c in mention_candidates:
+            s += f"\n {self.candidates_str(c, pad_mention, pad_variation)}"
+        return s
+
+    def candidates_str(self, candidates: MentionCandidates, pad_mention: int=0, pad_variation: int=0) -> str:
+        """Returns a string representation of a `MentionCandidates` instance (for pretty-printing)."""
+        s = f"{candidates.mention.mention.ljust(pad_mention)} => "
+        if candidates.best_match():
+            s += f"{candidates.best_match().__str__(pad_variation)}"
+        else:
+            s += f"None"
+        return s
+
+    def candidates(self, ignore_empty_candidates: bool=True) -> List[MentionCandidates]:
+        """Returns all `MentionCandidates` as a list. If `ignore_empty_candidates` is `True`,
+        only non-empty candidates are considered."""
+        if ignore_empty_candidates:
+            return [c for sc in self.sentence_candidates for c in sc.candidates if not c.is_empty()]
+        return [c for sc in self.sentence_candidates for c in sc.candidates]
+
+    def sentences(self, ignore_empty_candidates: bool=True) -> List[str]:
+        """Returns the sentence corresponding to each `MentionCandidates` instance that
+        is returned by the `candidates` method."""
+        if ignore_empty_candidates:
+            # One sentence per non-empty candidate, so the result zips with
+            # candidates(); the (sentence, c)[0] trick keeps c in scope for
+            # the filter while yielding only the sentence string.
+            return [(sc.sentence.sentence, c)[0] for sc in self.sentence_candidates
+                    for c in sc.candidates if not c.is_empty()]
+        return [sc.sentence.sentence for sc in self.sentence_candidates]
+
+    def is_empty(self, ignore_empty_candidates: bool=True) -> bool:
+        """Returns `True` if the list of `SentenceCandidates` instances is empty.
+        If `ignore_empty_candidates` is `True`, only non-empty candidates are considered."""
+        return len(self.candidates(ignore_empty_candidates)) == 0
+
+    def text(self) -> str:
+        """Returns the complete text."""
+        return " ".join([scs.sentence.sentence for scs in self.sentence_candidates])
+
+    # TODO: unit test needed.
+    def sentence_contexts(self) -> List[SentenceContext]:
+        """Returns a list of `SentenceContext` instances, pairing each sentence
+        with its predecessor and successor (None at the text boundaries)."""
+        scs = self.sentence_candidates
+        return [SentenceContext(sc.sentence.sentence,
+                                scs[i - 1].sentence.sentence if i > 0 else None,
+                                scs[i + 1].sentence.sentence if i < len(scs) - 1 else None)
+                for i, sc in enumerate(scs)]
+
+    def place_of_pub_wqid(self) -> Optional[str]:
+        """Returns the place of publication Wikidata ID, if available.
+
+        Consistency across candidates is enforced in __post_init__, so the
+        first candidate's value is representative."""
+        if self.is_empty(ignore_empty_candidates=False):
+            return None
+        return self.candidates(ignore_empty_candidates=False)[0].place_of_pub_wqid
+
+    def place_of_pub(self) -> Optional[str]:
+        """Returns the place of publication, if available."""
+        if self.is_empty(ignore_empty_candidates=False):
+            return None
+        return self.candidates(ignore_empty_candidates=False)[0].place_of_pub
+
+    # For API deserialisation.
+    # NOTE(review): implicit staticmethod — call on the class, not an instance.
+    def from_dict(data: dict) -> 'Candidates':
+        """Constructs a `Candidates` instance from a dictionary.
+
+        Returns a `Predictions` instance when any deserialised link carries
+        disambiguation scores."""
+        sentence_candidates = [SentenceCandidates.from_dict(d) for d in data['sentence_candidates']]
+        is_predicted_links = [isinstance(links, PredictedLinks)
+                              for scs in sentence_candidates
+                              for mc in scs.candidates
+                              for links in mc.links]
+        if any(is_predicted_links):
+            return Predictions(sentence_candidates)
+        return Candidates(sentence_candidates)
+
+# Pipeline::run_disambiguation method output type.
+@pdataclass(frozen=True)
+class Predictions(Candidates):
+    """Dataclass representing toponym predictions in text."""
+
+    def __post_init__(self):
+        super().__post_init__()
+        # Every non-empty candidate's links must carry disambiguation scores.
+        for c in self.candidates():
+            if not all([isinstance(links, PredictedLinks) for links in c.links]):
+                raise ValueError("Candidate links must be scored.")
+
+    def best_wqids(self) -> List[Optional[str]]:
+        """Returns a list of predicted Wikidata IDs (one per toponym mention)."""
+        return [c.best_wqid() for c in self.candidates()]
+
+    def best_coords(self) -> List[Optional[Tuple[float, float]]]:
+        """Returns a list of predicted lat-long coordinates (one per toponym mention)."""
+        return [c.best_coords() for c in self.candidates()]
+
+    def best_disambiguation_scores(self) -> List[Optional[float]]:
+        """Returns a list of greatest disambiguation scores (one per toponym mention)."""
+        return [c.best_disambiguation_score() for c in self.candidates()]
+
+    def apply_rel_disambiguation(
+            self,
+            rel_predictions: dict,
+            with_publication: bool) -> 'RelPredictions':
+        """Incorporates predictions generated by the REL disambiguation method and
+        returns an instance of the `RelPredictions` subclass.
+
+        NOTE(review): mutates the caller's `rel_predictions` dict when
+        `with_publication` is True (drops the final "linking" entry) — confirm
+        callers do not reuse the dict afterwards.
+        """
+
+        if not rel_predictions:
+            return RelPredictions(self.sentence_candidates, list())
+
+        # If with_publication is True, drop the "artificial" final toponym mention.
+        if with_publication and not self.is_empty(ignore_empty_candidates=False):
+            del rel_predictions["linking"][-1]
+
+        # Incorporate the REL model predictions.
+        rel_scores = [RelScores(
+            mention=d["mention"],
+            scores={wqid: score for wqid, score in zip(d["candidates"], d["scores"])},
+            confidence=d["conf_ed"]) for d in rel_predictions["linking"]]
+
+        return RelPredictions(self.sentence_candidates, rel_scores)
+
+    def place_of_pub_mention(self) -> dict:
+        """Returns a dictionary containing a toponym mention for the place of publication.
+
+        Helper method for backward compatibility with training functions in `rel_utils.py`.
+
+        Raises:
+            ValueError: if place of publication info is missing.
+        """
+        place_of_pub = self.place_of_pub()
+        place_of_pub_wqid = self.place_of_pub_wqid()
+        if not place_of_pub or not place_of_pub_wqid:
+            raise ValueError("Missing place of publication info.")
+        prefix = "This article is published in "
+        place_of_pub_sentence = f"{prefix}{place_of_pub}."
+        # NOTE: this dict is slightly inconsistent versus the mention_dicts
+        # constructed in the as_dict method:
+        # - "ner_score" and "conf_md" are missing
+        # - "tag" is instead named "ner_label"
+        # These inconsistencies are preserved from an earlier version and perhaps
+        # should be fixed in future. Note that this format *is* consistent with
+        # the keys in the TrainingPredictions.as_list() method.
+        return {
+            "mention": place_of_pub,
+            "sent_idx": 0,
+            "sentence": place_of_pub_sentence,
+            "gold": [place_of_pub_wqid],
+            "ngram": place_of_pub,
+            "context": ["", ""],
+            "pos": len(prefix),
+            "end_pos": len(prefix) + len(place_of_pub),
+            "candidates": [[place_of_pub_wqid, 1.0]],
+            "place": place_of_pub,
+            "place_wqid": place_of_pub_wqid,
+            "ner_label": "LOC",
+        }
+
+    # Converts to a dictionary for backwards compatibility with entity_disambiguation.py
+    # (similar to the deprecated `format_prediction` method in pipeline.py)
+    def as_dict(self, with_publication: bool) -> dict:
+        """Converts to a dictionary for backwards compatibility with `entity_disambiguation.py`."""
+        d = dict()
+        d["linking"] = []
+        contexts = self.sentence_contexts()
+        for i, sc in enumerate(self.sentence_candidates):
+            for c in sc.candidates:
+                if c.is_empty():
+                    candidates = []
+                else:
+                    candidates = c.best_match().scores_as_list()
+                mention_dict = {
+                    "mention": c.mention.mention,
+                    "context": contexts[i].context_as_list(),
+                    "candidates": candidates,
+                    "gold": ["NONE"],
+                    "ner_score": c.mention.ner_score,
+                    "pos": c.mention.start_char,
+                    "sent_idx": i,
+                    "end_pos": c.mention.end_char(),
+                    "ngram": c.mention.mention,
+                    "conf_md": c.mention.ner_score,
+                    "tag": c.mention.ner_label,
+                    "sentence": sc.sentence.sentence,
+                    "place": c.place_of_pub,
+                    "place_wqid": c.place_of_pub_wqid,
+                    # TODO: Do we need to include `string_match_candidates`? It's not used
+                    # in `entity_disambiguation.py` and entails repetition of the wikidata links:
+                    # "string_match_candidates": [link.string_match for link in self.links],
+                }
+                d["linking"].append(mention_dict)
+
+        # Append a mention for the place of publication, unless this instance
+        # has no sentence candidates, in which case it lacks place of publication info.
+        # (NB: Replaces add_publication from rel_utils.py):
+        if with_publication and not self.is_empty(ignore_empty_candidates=False):
+            d["linking"].append(self.place_of_pub_mention())
+
+        return d
+
+    def summary_dict(self) -> List[dict]:
+        """Returns a summary prediction for each toponym mention as a list of dictionaries."""
+        l = list()
+        # candidates() and sentences() yield parallel lists (same filter).
+        for c, s in zip(self.candidates(ignore_empty_candidates=True),
+                        self.sentences(ignore_empty_candidates=True)):
+            disambiguation_score = c.best_disambiguation_score()
+            if disambiguation_score:
+                disambiguation_score = round(disambiguation_score, 3)
+            d = {
+                'mention': c.mention.mention,
+                'sentence': s,
+                'start_char': c.mention.start_char,
+                'end_char': c.mention.end_char(),
+                'ner_label': c.mention.ner_label,
+                'ner_score': c.mention.ner_score,
+                'prediction': c.best_wqid(),
+                'predicted_coordinates': c.best_coords(),
+                'toponym_match': c.best_string_match().variation,
+                'string_similarity': c.best_string_match().string_similarity,
+                'disambiguation_score': disambiguation_score,
+            }
+            l.append(d)
+        return l
+
+@pdataclass(frozen=True)
+class TrainingPredictions(Predictions):
+    """Dataclass representing toponym predictions for training a REL model."""
+
+    # NOTE(review): this override only delegates to the parent and could be
+    # removed without changing behaviour.
+    def __post_init__(self):
+        super().__post_init__()
+
+    # Similar to the as_dict method in Predictions, but now for backward
+    # compatibility with the `prepare_rel_trainset` function in `rel_utils.py`.
+    def as_list(self, with_publication: bool) -> List[dict]:
+        """Converts to a list of dictionaries.
+
+        Helper method for backwards compatibility with training functions in `rel_utils.py`.
+
+        Raises:
+            ValueError: if any sentence lacks context or any mention is not a
+                TrainingMention (both are required for the training format).
+        """
+        l = list()
+        for sc in self.sentence_candidates:
+            if not isinstance(sc.sentence, SentenceContext):
+                raise ValueError(f"Expected SentenceContext instance. Got: {type(sc)}")
+            for c in sc.candidates:
+                if not isinstance(c.mention, TrainingMention):
+                    raise ValueError(f"Expected TrainingMention instance. Got: {type(c)}")
+                if c.is_empty():
+                    candidates = []
+                else:
+                    candidates = c.best_match().scores_as_list()
+                mention_dict = {
+                    "mention": c.mention.mention,
+                    "sent_idx": sc.sentence.sent_idx,
+                    "sentence": sc.sentence.sentence,
+                    "ngram": c.mention.mention,
+                    "context": sc.sentence.context_as_list(),
+                    "pos": c.mention.start_char,
+                    "end_pos": c.mention.end_char(),
+                    "place": c.place_of_pub,
+                    "place_wqid": c.place_of_pub_wqid,
+                    "candidates": candidates,
+                    "ner_label": c.mention.ner_label,
+                    "gold": [c.mention.gold] if c.mention.gold != 'NIL' else 'NIL',
+                }
+                l.append(mention_dict)
+
+        # Append a mention for the place of publication, unless this instance
+        # has no sentence candidates, in which case it lacks place of publication info.
+        # (NB: Replaces add_publication from rel_utils.py):
+        if with_publication and not self.is_empty(ignore_empty_candidates=False):
+            l.append(self.place_of_pub_mention())
+        return l
+
+@pdataclass(frozen=True)
+class RelScores:
+    """Dataclass representing scores produced by the REL entity disambiguation model.
+
+    Attributes:
+        mention (str): The toponym mention.
+        scores (Dict[str, float]): REL entity disambiguation scores, keyed by Wikidata ID.
+        confidence (float): REL entity disambiguation confidence score.
+    """
+    mention: str
+    scores: Dict[str, float]
+    confidence: float
+
+@pdataclass(frozen=True)
+class CombinedScores(RelScores):
+    """Dataclass representing combined scores produced by combining REL scores with
+    proximity and popularity measures.
+
+    Attributes:
+        rel_scores (Dict[str, float]): The original (uncombined) REL entity
+            disambiguation scores, keyed by Wikidata ID. The inherited `scores`
+            field holds the combined scores.
+    """
+    rel_scores: Dict[str, float]
+
+@pdataclass(frozen=True)
+class RelPredictions(Predictions):
+    """Dataclass representing toponym predictions in text produced by REL entity disambiguation.
+
+    Attributes:
+        rel_scores (List[RelScores]): A list of REL entity disambiguation scores.
+    """
+    rel_scores: List[RelScores]
+
+    # NOTE(review): unlike the sibling subclasses, this override does not call
+    # super().__post_init__(), so the parent's scored-links validation is
+    # skipped — confirm this is intentional (the REL scores are attached here
+    # rather than on the links themselves).
+    def __post_init__(self):
+        # Require exactly one RelScores per candidate (empty or not).
+        count_candidates = len(super().candidates(ignore_empty_candidates=False))
+        if len(self.rel_scores) != count_candidates:
+            raise ValueError(f"""Expected one RelScores instance per linked toponym mention.
+                Got {len(self.rel_scores)} instances and {count_candidates} mentions.""")
+
+    # Override the candidates method to return REL linking predictions.
+    def candidates(self, ignore_empty_candidates: bool=True) -> List[MentionCandidates]:
+        """(Override) Returns all `MentionCandidates` as a list, with REL disambiguation
+        scores determining the predicted Wikidata links. If `ignore_empty_candidates` is `True`,
+        only non-empty candidates are considered."""
+
+        # Construct equivalent Candidate instances but with the REL scores in the PredictedLinks.
+        ret = list()
+        for c, rs in zip(super().candidates(ignore_empty_candidates=False), self.rel_scores):
+
+            # Check that the mention in the RelScores instance matches that in the candidate.
+            if rs.mention != c.mention.mention:
+                raise ValueError(f"Inconsistent toponym mentions in RelScores ({rs.mention}) and candidate ({c.mention.mention})")
+
+            if c.is_empty():
+                if not ignore_empty_candidates:
+                    ret.append(c)
+                continue
+
+            predicted_links = c.best_match()
+
+            # Get the list of WikidataLink instances for which REL scores are available.
+            wikidata_links = [wl for wl in predicted_links.wikidata_links if wl.wqid in rs.scores.keys()]
+            links = [PredictedLinks(predicted_links.string_match, wikidata_links, rs.scores)]
+
+            ret.append(MentionCandidates(
+                c.mention,
+                c.ranking_method,
+                c.linking_method,
+                links,
+                c.place_of_pub_wqid,
+                c.place_of_pub))
+        return ret
+
+    def interim_candidates(self, ignore_empty_candidates: bool=True) -> List[MentionCandidates]:
+        """Returns the list of `MentionCandidates` instances with their interim disambiguation
+        scores, that is, the scores obtained before applying the REL disambiguation method."""
+        return super().candidates(ignore_empty_candidates)
diff --git a/t_res/utils/deezy_processing.py b/t_res/utils/deezy_processing.py
index 0da3d10a..ca7f7d57 100644
--- a/t_res/utils/deezy_processing.py
+++ b/t_res/utils/deezy_processing.py
@@ -1,3 +1,8 @@
+"""
+The `t_res.utils.deezy_processing` module contains utility functions associated
+with training and applying the DeezyMatch model for fuzzy string matching.
+"""
+
import glob
import itertools
import os
@@ -33,18 +38,16 @@ def obtain_matches(
sims (list): The list of 100 nearest neighbors from the OCR word2vec
model.
fuzz_ratio_threshold (float): The threshold used for
- `thefuzz.fuzz.ratio `.
+ [`thefuzz.fuzz.ratio`](https://github.com/seatgeek/thefuzz#simple-ratio).
If the nearest neighbor word is an existing English word and the
- string similarity is below ``fuzz_ratio_threshold``, it is considered
- a negative match, i.e. not an OCR variation. Defaults to ``70``.
+ string similarity is below `fuzz_ratio_threshold`, it is considered
+ a negative match, i.e. not an OCR variation.
Returns:
- Tuple[List[str], List[str]]: A tuple that contains two lists:
+ A tuple containing two lists:
- #. The first list consists of *positive* matches for the input
- word.
- #. The second list consists of *negative* matches, a list of
- negative matches for the input word.
+ 1. The first list consists of *positive* matches for the input word.
+ 1. The second list consists of *negative* matches for the input word.
"""
negative = []
positive = [word]
@@ -96,7 +99,7 @@ def obtain_matches(
def create_training_set(
deezy_parameters: dict, strvar_parameters: dict, wikidata_to_mentions: dict
-) -> None:
+):
"""
Create a training set for DeezyMatch consisting of positive and negative
string matches.
@@ -115,14 +118,10 @@ def create_training_set(
parameters required to create a DeezyMatch training dataset.
wikidata_to_mentions (dict): Mapping between Wikidata IDs and mentions.
- Returns:
- None.
-
Note:
This function creates a new file with the string pairs dataset called
- ``w2v_ocr_pairs.txt`` inside the folder path defined as ``dm_path`` in
- the DeezyMatch parameters passed in setting up the ranker passed to
- this function as ``myranker``.
+ `w2v_ocr_pairs.txt` inside the folder path defined as `dm_path` in
+ the DeezyMatch parameters passed to this function.
"""
# Path to the output string pairs dataset:
@@ -253,26 +252,22 @@ def create_training_set(
fw.write(pm)
-def train_deezy_model(deezy_parameters: dict, strvar_parameters: dict, wikidata_to_mentions: dict) -> None:
+def train_deezy_model(deezy_parameters: dict, strvar_parameters: dict, wikidata_to_mentions: dict):
"""
- Train a DeezyMatch model using the provided ``myranker`` parameters and
- input files.
+ Train a DeezyMatch model using the provided parameters and input files.
This function trains a DeezyMatch model based on the specified parameters
- in the myranker object and the required input files. If the
- ``overwrite_training`` parameter is set to True or the model does not
+ in the ranker object and the required input files. If the
+ `overwrite_training` parameter is set to True or the model does not
exist, the function will train a new DeezyMatch model.
Arguments:
deezy_parameters (dict): Dictionary of DeezyMatch parameters
for model training.
- Returns:
- None
-
Note:
- This function returns a DeezyMatch model, stored in the location
- specified in the DeezyMatch ``input_dfm.yaml`` file.
+ This function writes a DeezyMatch model to disk, in the location
+ specified in the DeezyMatch `input_dfm.yaml` file.
"""
# Read the filepaths:
@@ -313,13 +308,13 @@ def train_deezy_model(deezy_parameters: dict, strvar_parameters: dict, wikidata_
print("The DeezyMatch model is already trained!")
-def generate_candidates(deezy_parameters: dict, mentions_to_wikidata: dict) -> None:
+def generate_candidates(deezy_parameters: dict, mentions_to_wikidata: dict):
"""
Obtain Wikidata candidates (Wikipedia mentions to Wikidata entities) and
generate their corresponding vectors.
This function retrieves Wikidata candidates based on the mentions stored
- in the ``myranker`` object and generates their corresponding vectors using
+ in the ``ranker`` object and generates their corresponding vectors using
the DeezyMatch model. It writes the candidates to a file and generates
embeddings with the DeezyMatch model.
@@ -328,15 +323,11 @@ def generate_candidates(deezy_parameters: dict, mentions_to_wikidata: dict) -> N
for model training.
mentions_to_wikidata (dict): Mapping between mentions and Wikidata IDs.
- Returns:
- None.
-
Note:
The function saves the candidates to a file and generates embeddings
using the DeezyMatch model. The resulting vectors are stored in the
output directories specified in the DeezyMatch parameters passed to
- the ranker passed to this function in the ``myranker`` keyword
- argument.
+ this function.
"""
deezymatch_outputs_path = deezy_parameters["dm_path"]
candidates = deezy_parameters["dm_cands"]
diff --git a/t_res/utils/get_data.py b/t_res/utils/get_data.py
index 169a9be0..3d7d89fc 100644
--- a/t_res/utils/get_data.py
+++ b/t_res/utils/get_data.py
@@ -1,3 +1,10 @@
+"""
+The `t_res.utils.get_data` module contains utility functions for downloading
+benchmark datasets from the [British Library
+repository](https://bl.iro.bl.uk/concern/datasets/f3686eb9-4227-45cb-9acb-0453d35e6a03)
+and from the [HIPE repository](https://impresso.github.io/CLEF-HIPE-2020/datasets.html).
+"""
+
import os
import zipfile
from pathlib import Path
@@ -5,15 +12,12 @@
import wget
-def download_lwm_data(news_path: str) -> None:
+def download_lwm_data(news_path: str):
"""
Download the LwM dataset from the BL repository and unzip it.
Arguments:
- news_path (str): The path where the dataset will be downloaded.
-
- Returns:
- None.
+ news_path (str): The directory path where the dataset will be stored.
"""
url = (
"https://bl.iro.bl.uk/downloads/0192d762-7277-46d0-8363-1636079e7afd?locale=en"
@@ -31,15 +35,12 @@ def download_lwm_data(news_path: str) -> None:
zip_ref.extractall(news_path)
-def download_hipe_data(hipe_path: str) -> None:
+def download_hipe_data(hipe_path: str):
"""
Download the HIPE dataset from the HIPE repository and unzip it.
Arguments:
- hipe_path (str): The path where the dataset will be downloaded.
-
- Returns:
- None.
+ hipe_path (str): The directory path where the dataset will be stored.
"""
dev_url = "https://raw.githubusercontent.com/hipe-eval/HIPE-2022-data/main/data/v2.1/hipe2020/en/HIPE-2022-v2.1-hipe2020-dev-en.tsv"
test_url = "https://raw.githubusercontent.com/hipe-eval/HIPE-2022-data/main/data/v2.1/hipe2020/en/HIPE-2022-v2.1-hipe2020-test-en.tsv"
diff --git a/t_res/utils/ner.py b/t_res/utils/ner_utils.py
similarity index 96%
rename from t_res/utils/ner.py
rename to t_res/utils/ner_utils.py
index 84bbf361..1a2a83b3 100644
--- a/t_res/utils/ner.py
+++ b/t_res/utils/ner_utils.py
@@ -1,14 +1,19 @@
+"""
+The `t_res.utils.ner_utils` module contains utility functions associated
+with named entity recognition (NER) model training and inference.
+"""
+
from collections import namedtuple
from typing import List, Literal, NamedTuple, Tuple, Union
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, tokenization_utils_base
def training_tokenize_and_align_labels(
examples: dict,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
label_encoding_dict: dict,
-):
+) -> tokenization_utils_base.BatchEncoding:
"""
Tokenize and align labels during training.
@@ -25,16 +30,15 @@ def training_tokenize_and_align_labels(
tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): A
transformers tokenizer object, which is the tokenizer of the base
model.
- label_encoding_dict (Dict): A dictionary mapping NER labels to label
- IDs, from ``label2id`` in
- :py:meth:`~geoparser.recogniser.Recogniser.train`.
+ label_encoding_dict (Dict): A dictionary mapping NER labels to label IDs,
+ from `label2id` in the [CustomRecogniser][t_res.geoparser.ner.CustomRecogniser]
+ `train` method.
Returns:
- transformers.tokenization_utils_base.BatchEncoding:
- The tokenized inputs with aligned labels.
-
+ The tokenized inputs with aligned labels as a `transformers.tokenization_utils_base.BatchEncoding` instance.
+
Credit:
- This function is adapted from `HuggingFace `_.
+ This function is adapted from [HuggingFace](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner.py).
"""
label_all_tokens = True
tokenized_inputs = tokenizer(
@@ -194,8 +198,8 @@ def aggregate_mentions(
Arguments:
predictions (List[List]): A list of token predictions, where each
token prediction is represented as a list of values. For details
- on each of those tuples, see
- :py:meth:`~utils.ner.collect_named_entities`.
+ on each of those tuples, see the NER Utils function
+ [collect_named_entities][t_res.utils.ner_utils.collect_named_entities].
setting (Literal["pred", "gold"]): The setting for aggregation:
- If set to ``"pred"``, the function aggregates predicted mentions.
diff --git a/t_res/utils/preprocess_data.py b/t_res/utils/preprocess_data.py
index d8f5c785..da69ac27 100644
--- a/t_res/utils/preprocess_data.py
+++ b/t_res/utils/preprocess_data.py
@@ -1,6 +1,7 @@
"""
-This script reads the original data sources and formats them for our
-experiments.
+The `t_res.utils.preprocess_data` module contains utility functions to
+read input datasets from their original sources and format them for
+use in the T-Res pipeline and in benchmarking experiments.
"""
import glob
@@ -26,16 +27,24 @@ def turn_wikipedia2wikidata(
wikipedia_path (str): The path to your wikipedia directory.
Returns:
- Optional[str]:
- The corresponding Wikidata ID if available, or None if not.
-
- Example:
- >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Colosseum", "../resources")
- 'Q10285'
- >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Ancient_Egypt", "../resources")
- 'Q11768'
- >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Invalid_Location", "../resources")
- Warning: invalid_location is not in wikipedia2wikidata, the wkdt_qid will be None.
+ The corresponding Wikidata ID if available, otherwise None.
+
+ Example: Examples:
+ **Wikipedia title: Colosseum**
+ ```
+ turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Colosseum", "../resources")
+ > 'Q10285'
+ ```
+ **Wikipedia title: Ancient_Egypt**
+ ```
+ turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Ancient_Egypt", "../resources")
+ > 'Q11768'
+ ```
+ **Wikipedia title: Invalid_Location**
+ ```
+ turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Invalid_Location", "../resources")
+ > Warning: invalid_location is not in wikipedia2wikidata, the wkdt_qid will be None.
+ ```
"""
if not wikipedia_title == "NIL" and not wikipedia_title == "*":
wikipedia_title = wikipedia_title.split("/wiki/")[-1]
@@ -69,9 +78,8 @@ def reconstruct_sentences(dTokens: dict) -> dict:
information and annotations.
Returns:
- dict:
- A dictionary mapping sentence IDs to their corresponding
- reconstructed sentences and character start positions.
+ A dictionary mapping sentence IDs to their corresponding reconstructed sentences
+ and character start positions.
Note:
This function takes into account white spaces to ensure character
@@ -167,14 +175,13 @@ def process_lwm_for_ner(tsv_topres_path: str):
the annotated TSV files.
Returns:
- pandas.DataFrame:
- A DataFrame containing the processed LwM data for NER training,
+ A DataFrame containing the processed LwM data for NER training,
with the following columns:
-
- - **id**: The unique identifier of each sentence (``_
- ``).
- - **ner_tags**: A list of NER tags assigned to each token in the
- sentence.
+
+ - **id**: The unique identifier of each sentence
+ (`_`).
+ - **ner_tags**: A list of NER tags assigned to each token in
+ the sentence.
- **tokens**: A list of tokens in the sentence.
Note:
@@ -247,7 +254,8 @@ def process_lwm_for_linking(
gazetteer_ids (list): The set of Wikidata IDs in the gazetteer.
Returns:
- pandas.DataFrame: A DataFrame with the following columns:
+ A DataFrame with the following columns:
+
- ``article_id``: The identifier of the article.
- ``sentences``: A list of dictionaries containing the sentence
position and text.
@@ -396,30 +404,14 @@ def aggregate_hipe_entities(entity: dict, lEntities: List[dict]) -> List[dict]:
lEntities (list): The list of entities to be updated.
Returns:
- List[dict]
- The updated list of entities after aggregating the current entity.
+ The updated list of entities after aggregating the current entity.
Example:
- >>> entity = {
- "ne_type": "I-LOC",
- "word": "York",
- "wkdt_qid": "Q60",
- "start": 12,
- "end": 15,
- "meto_type": "city",
- }
- >>> lEntities = [
- {
- "ne_type": "B-LOC",
- "word": "New",
- "wkdt_qid": "Q60",
- "start": 8,
- "end": 10,
- "meto_type": "city",
- }
- ]
- >>> updated_entities = aggregate_hipe_entities(entity, lEntities)
- >>> print(updated_entities)
+ ```
+ entity = {"ne_type": "I-LOC", "word": "York", "wkdt_qid": "Q60", "start": 12, "end": 15, "meto_type": "city"}
+ lEntities = [{"ne_type": "B-LOC", "word": "New", "wkdt_qid": "Q60", "start": 8, "end": 10, "meto_type": "city"}]
+ updated_entities = aggregate_hipe_entities(entity, lEntities)
+ print(updated_entities)
[
{
"ne_type": "B-LOC",
@@ -430,6 +422,7 @@ def aggregate_hipe_entities(entity: dict, lEntities: List[dict]) -> List[dict]:
"meto_type": "city",
}
]
+ ```
Note:
The function takes an entity and a list of entities and aggregates
@@ -475,7 +468,8 @@ def process_hipe_for_linking(hipe_path: str, gazetteer_ids: List[str]) -> pd.Dat
gazetteer_ids (List[str]): The set of Wikidata IDs in the gazetteer.
Returns:
- pandas.DataFrame: A DataFrame with the following columns:
+ A DataFrame with the following columns:
+
- ``article_id``: The identifier of the article.
- ``sentences``: A list of dictionaries containing the sentence
position and text.
@@ -717,22 +711,23 @@ def process_tsv(filepath: str) -> Tuple[dict, dict]:
filepath (str): The path to the TSV file.
Returns:
- tuple: A tuple containing two dictionaries:
- #. **dMTokens**: A dictionary of tokens with positional
- information and multi-token annotations. The keys in dTokens
- are tuples of two elements (the sentence number in the document,
- and the character position).
-
- #. **dTokens**: A dictionary of tokens with positional information,
+ A tuple containing two dictionaries:
+
+ 1. **dMTokens**: A dictionary of tokens with positional
+ information and multi-token annotations. The keys in dTokens
+ are tuples of two elements (the sentence number in the document,
+ and the character position).
+
+ 1. **dTokens**: A dictionary of tokens with positional information,
Wikipedia ID, label, and BIO annotations. The values of dTokens
are tuples of six elements:
- #. the actual token,
- #. the wikipedia url,
- #. the toponym class,
- #. the sentence number in the document,
- #. the character position of a token in the document, and
- #. the character end position of a token in the document.
+ 1. the actual token,
+ 1. the wikipedia url,
+ 1. the toponym class,
+ 1. the sentence number in the document,
+ 1. the character position of a token in the document, and
+ 1. the character end position of a token in the document.
Note:
This function assumes a specific format and structure of the TSV file.
diff --git a/t_res/utils/process_data.py b/t_res/utils/process_data.py
index c8da8cd5..fdd23690 100644
--- a/t_res/utils/process_data.py
+++ b/t_res/utils/process_data.py
@@ -1,3 +1,7 @@
+"""
+The `t_res.utils.process_data` module contains utility functions for data processing.
+"""
+
import json
import os
import sys
@@ -8,15 +12,13 @@
import pandas as pd
from tqdm import tqdm
-from . import ner
-
-if TYPE_CHECKING:
- from ..geoparser import recogniser
+from . import ner_utils
+from ..geoparser import ner
def eval_with_exception(str2parse: str, in_case: Optional[Any] = "") -> Any:
"""
- Evaluate a string expression using :py:func:`ast.literal_eval`. If
+ Evaluate a string expression using `ast.literal_eval`. If
the evaluation succeeds, the result is returned. If a ``ValueError``
occurs during evaluation, the provided ``in_case`` value is returned
instead.
@@ -24,18 +26,19 @@ def eval_with_exception(str2parse: str, in_case: Optional[Any] = "") -> Any:
Arguments:
str2parse (str): The string expression to be evaluated.
in_case (Any, optional): The value to return in case of a
- ``ValueError``. Defaults to ``""``.
+ ``ValueError``.
Returns:
- Any:
- The evaluated result if successful, or the ``in_case`` value if an
+ The evaluated result if successful, or the ``in_case`` value if an
error occurs.
- Example:
- >>> eval_with_exception("[1, 2, 3]")
- [1, 2, 3]
- >>> process_data.eval_with_exception(None, [])
- []
+ Example: Examples:
+ ```
+ eval_with_exception("[1, 2, 3]")
+ > [1, 2, 3]
+ eval_with_exception(None, [])
+ > []
+ ```
"""
try:
return literal_eval(str2parse)
@@ -50,19 +53,19 @@ def prepare_sents(df: pd.DataFrame) -> Tuple[dict, dict, dict]:
Returns:
Tuple[dict, dict, dict]: A tuple consisting of three dictionaries:
- #. ``dSentences``: A dictionary in which we keep, for each article/
+ 1. ``dSentences``: A dictionary in which we keep, for each article/
sentence (expressed as e.g. ``"10732214_1"``, where
``"10732214"`` is the article_id and ``"1"`` is the order of
the sentence in the article), the full original unprocessed
sentence.
- #. ``dAnnotated``: A dictionary in which we keep, for each article/
+ 1. ``dAnnotated``: A dictionary in which we keep, for each article/
sentence, an inner dictionary mapping the position of an
annotated named entity (i.e. its start and end character, as a
tuple, as the key) and another tuple as its value, which
consists of: the type of named entity (such as ``LOC`` or
``BUILDING``, the mention, and its annotated link), all
extracted from the gold standard.
- #. ``dMetadata``: A dictionary in which we keep, for each article/
+ 1. ``dMetadata``: A dictionary in which we keep, for each article/
sentence, its metadata: ``place`` (of publication), ``year``,
``ocr_quality_mean``, ``ocr_quality_sd``, ``publication_title``,
``publication_code``, and ``place_wqid`` (Wikidata ID of the
@@ -153,6 +156,7 @@ def align_gold(predictions: List[dict], annotations: dict) -> List[dict]:
``"O"``).
- ``link`` (str): The predicted entity link (initially set to
``"O"``).
+
annotations (dict): A dictionary where the keys are tuples
representing the start and end positions of gold standard
detections in a sentence, and the values are tuples containing the
@@ -161,8 +165,7 @@ def align_gold(predictions: List[dict], annotations: dict) -> List[dict]:
``"Q335322"``).
Returns:
- List[dict]:
- A list of dictionaries representing the aligned gold standard
+ A list of dictionaries representing the aligned gold standard
labels. Each dictionary contains the same keys as the predictions:
- ``start`` (int): The start position of the aligned token.
@@ -202,13 +205,11 @@ def postprocess_predictions(
Postprocess predictions to be used later in the pipeline.
Arguments:
- predictions (list): the output of the
- :py:meth:`geoparser.recogniser.Recogniser.ner_predict` method,
+ predictions (list): the output of the
+ [Recogniser][t_res.geoparser.ner.Recogniser] `ner_predict` method,
where, given a sentence, a list of dictionaries is returned, where
each dictionary corresponds to a recognised token, e.g.:
-
- .. code-block:: json
-
+ ```json
{
"entity": "O",
"score": 0.99975187,
@@ -216,22 +217,23 @@ def postprocess_predictions(
"start": 0,
"end": 4
}
+ ```
- gold_positions (list): the output of the
- :py:func:`utils.process_data.align_gold` function, which
+ gold_positions (list): the output of the
+ [align_gold][t_res.utils.process_data.align_gold] function, which
aligns the gold standard text to the tokenisation performed by the
named entity recogniser, to enable assessing the performance of
the NER and linking steps.
Returns:
- dict: A dictionary with three key-value pairs:
-
- #. ``sentence_preds`` is mapped to the list of lists
- representation of ``predictions``,
- #. ``sentence_trues`` is mapped to the list of lists
- representation of 'gold_positions', and
- #. ``sentence_skys`` is the same as ``sentence_trues``, but with
- empty link.
+ A dictionary with three key-value pairs:
+
+ 1. ``sentence_preds`` is mapped to the list of lists
+ representation of ``predictions``,
+ 1. ``sentence_trues`` is mapped to the list of lists
+ representation of 'gold_positions', and
+ 1. ``sentence_skys`` is the same as ``sentence_trues``, but with
+ empty link.
"""
postprocessed_sentence = dict()
sentence_preds = [
@@ -253,10 +255,9 @@ def postprocess_predictions(
return postprocessed_sentence
-# TODO/typing: set ``myner: recogniser.Recogniser`` here, but creates problem with Sphinx currently
def ner_and_process(
- dSentences: dict, dAnnotated: dict, myner
-) -> Tuple[dict, dict, dict, dict, dict]:
+ dSentences: dict, dAnnotated: dict, recogniser: ner.Recogniser
+) -> Tuple[dict, dict, dict, dict, dict, dict]:
"""
Perform named entity recognition in the LwM way, and postprocess the
output to prepare it for the experiments.
@@ -272,95 +273,93 @@ def ner_and_process(
key) and another tuple as its value, which consists of: the type
of named entity (such as ``LOC`` or ``BUILDING``, the mention, and
its annotated link), all extracted from the gold standard.
- myner (recogniser.Recogniser): a Recogniser object, for NER.
+ recogniser (ner.Recogniser): a Recogniser object, for NER.
Returns:
- Tuple[dict, dict, dict, dict, dict]:
- A tuple consisting of five dictionaries:
-
- #. **dPreds**: A dictionary where the NER predictions are stored,
- where the key is the sentence_id (i.e. ``article_id + "_" +
- sentence_pos``) and the value is a list of lists, where each
- element corresponds to one token in a sentence, for example:
-
- .. code-block:: json
-
- ["From", "O", "O", 0, 4, 0.999826967716217]
-
- ...where the the elements by their position are:
-
- #. the token,
- #. the NER tag,
- #. the link to wikidata, set to ``"O"`` for now because we haven't
- performed linking yet,
- #. the starting character of the token,
- #. the end character of the token, and
- #. the NER prediction score.
-
- This dictionary is stored as a JSON file in the ``outputs/data``
- folder, with the suffix ``_ner_predictions.json``.
-
- #. **dTrues**: A dictionary where the gold standard named entities
- are stored, which has the same format as **dPreds** above, but
- with the manually annotated data instead of the predictions.
-
- This dictionary is stored as a JSON file in the ``outputs/data``
- folder, with the suffix ``_gold_standard.json``.
-
- #. **dSkys**: A dictionary where the skyline will be stored, for
- the linking experiments. At this point, it will be the same as
- **dPreds**, without the NER prediction score. During linking, it
- will be filled with the gold standard entities when these have
- been retrieved using candidates.
-
- This dictionary is stored as a JSON file in the ``outputs/data``
- folder, with the suffix ``_ner_skyline.json``.
-
- #. **gold_tokenization**: A dictionary where the gold standard
- entities are stored, and keys represent ``sentence_id`` (i.e.
- ``article_id + "_" + sentence_pos``) and the values are lists of
- dictionaries, each looking like this:
-
- .. code-block:: json
-
- {
- "entity": "B-LOC",
- "score": 1.0,
- "word": "Unitec",
- "start": 193,
- "end": 199,
- "link": "B-Q30"
- }
-
- This dictionary is stored as a JSON file in the ``outputs/data``
- folder, with the suffix ``_gold_positions.json``.
-
- #. **dMentionsPred**: A dictionary of detected mentions but not
- yet linked mentions, for example:
-
- .. code-block:: json
-
- {
- "sn83030483-1790-03-03-a-i0001_9": [
- {
- "mention": "Unitec ? States",
- "start_offset": 38,
- "end_offset": 40,
- "start_char": 193,
- "end_char": 206,
- "ner_score": 0.79,
- "ner_label": "LOC",
- "entity_link": "O"
- }
- ],
- }
-
- This dictionary is stored as a JSON file in the ``outputs/data``
- folder, with the suffix ``_pred_mentions.json``.
-
- #. **dMentionsGold**: A dictionary consisting of gold standard
- mentions, analogous to the dictionary of detected mentions, but
- with the gold standard ``ner_label`` and ``entity_link``.
+ A tuple consisting of six dictionaries:
+
+ 1. **dPreds**: A dictionary where the NER predictions are stored,
+ where the key is the sentence_id (i.e. ``article_id + "_" +
+ sentence_pos``) and the value is a list of lists, where each
+ element corresponds to one token in a sentence, for example:
+
+ ```json
+ ["From", "O", "O", 0, 4, 0.999826967716217]
+ ```
+ ...where the elements by their position are:
+
+ 1. the token,
+ 1. the NER tag,
+ 1. the link to wikidata, set to ``"O"`` for now because we haven't
+ performed linking yet,
+ 1. the starting character of the token,
+ 1. the end character of the token, and
+ 1. the NER prediction score.
+
+ This dictionary is stored as a JSON file in the ``outputs/data``
+ folder, with the suffix ``_ner_predictions.json``.
+
+ 1. **dTrues**: A dictionary where the gold standard named entities
+ are stored, which has the same format as **dPreds** above, but
+ with the manually annotated data instead of the predictions.
+
+ This dictionary is stored as a JSON file in the ``outputs/data``
+ folder, with the suffix ``_gold_standard.json``.
+
+ 1. **dSkys**: A dictionary where the skyline will be stored, for
+ the linking experiments. At this point, it will be the same as
+ **dPreds**, without the NER prediction score. During linking, it
+ will be filled with the gold standard entities when these have
+ been retrieved using candidates.
+
+ This dictionary is stored as a JSON file in the ``outputs/data``
+ folder, with the suffix ``_ner_skyline.json``.
+
+ 1. **gold_tokenization**: A dictionary where the gold standard
+ entities are stored, and keys represent ``sentence_id`` (i.e.
+ ``article_id + "_" + sentence_pos``) and the values are lists of
+ dictionaries, each looking like this:
+
+ ```json
+ {
+ "entity": "B-LOC",
+ "score": 1.0,
+ "word": "Unitec",
+ "start": 193,
+ "end": 199,
+ "link": "B-Q30"
+ }
+ ```
+
+ This dictionary is stored as a JSON file in the ``outputs/data``
+ folder, with the suffix ``_gold_positions.json``.
+
+ 1. **dMentionsPred**: A dictionary of detected but not
+ yet linked mentions, for example:
+
+ ```json
+ {
+ "sn83030483-1790-03-03-a-i0001_9": [
+ {
+ "mention": "Unitec ? States",
+ "start_offset": 38,
+ "end_offset": 40,
+ "start_char": 193,
+ "end_char": 206,
+ "ner_score": 0.79,
+ "ner_label": "LOC",
+ "entity_link": "O"
+ }
+ ],
+ }
+ ```
+
+ This dictionary is stored as a JSON file in the ``outputs/data``
+ folder, with the suffix ``_pred_mentions.json``.
+
+ 1. **dMentionsGold**: A dictionary consisting of gold standard
+ mentions, analogous to the dictionary of detected mentions, but
+ with the gold standard ``ner_label`` and ``entity_link``.
"""
gold_tokenization = dict()
dPreds = dict()
@@ -371,17 +370,17 @@ def ner_and_process(
for sent_id in tqdm(list(dSentences.keys())):
sent = dSentences[sent_id]
annotations = dAnnotated[sent_id]
- predictions = myner.ner_predict(sent)
+ predictions = recogniser.ner_predict(sent)
gold_positions = align_gold(predictions, annotations)
sentence_postprocessing = postprocess_predictions(predictions, gold_positions)
dPreds[sent_id] = sentence_postprocessing["sentence_preds"]
dTrues[sent_id] = sentence_postprocessing["sentence_trues"]
dSkys[sent_id] = sentence_postprocessing["sentence_skys"]
gold_tokenization[sent_id] = gold_positions
- dMentionsPred[sent_id] = ner.aggregate_mentions(
+ dMentionsPred[sent_id] = ner_utils.aggregate_mentions(
sentence_postprocessing["sentence_preds"], "pred"
)
- dMentionsGold[sent_id] = ner.aggregate_mentions(
+ dMentionsGold[sent_id] = ner_utils.aggregate_mentions(
sentence_postprocessing["sentence_trues"], "gold"
)
@@ -410,7 +409,12 @@ def update_with_linking(ner_predictions: dict, link_predictions: pd.Series) -> d
link_predictions[lp]["token_start"], link_predictions[lp]["token_end"] + 1
):
position_ner = resulting_preds[x][1][:2]
- resulting_preds[x][2] = position_ner + link_predictions[lp]["pred_wqid"]
+ # TODO: improve handling of empty Wikidata predictions:
+ # resulting_preds[x][2] = position_ner + link_predictions[lp]["pred_wqid"]
+ if link_predictions[lp]["pred_wqid"]:
+ resulting_preds[x][2] = position_ner + link_predictions[lp]["pred_wqid"]
+ else:
+ resulting_preds[x][2] = position_ner
return resulting_preds
@@ -524,7 +528,7 @@ def store_for_scorer(
scenario_name: str,
dresults: dict,
articles_test: List[str],
-) -> None:
+):
"""
Stores the results in the required format for evaluation using the CLEF-HIPE scorer.
@@ -535,15 +539,12 @@ def store_for_scorer(
articles_test (list): A list of sentences that are part of the split used
for evaluating the performance in the provided experiment.
- Returns:
- None.
-
Note:
The function also creates a TSV file with the results in the Conll
format required by the scorer.
- For more information about the CLEF-HIPE scorer, see
- https://github.com/impresso/CLEF-HIPE-2020-scorer.
+ For more information, see the
+ [CLEF-HIPE scorer project](https://github.com/impresso/CLEF-HIPE-2020-scorer).
"""
# Bundle 2 associated tasks: NERC-coarse and NEL
with open(
diff --git a/t_res/utils/process_wikipedia.py b/t_res/utils/process_wikipedia.py
index 91dd4477..303289d2 100644
--- a/t_res/utils/process_wikipedia.py
+++ b/t_res/utils/process_wikipedia.py
@@ -1,3 +1,8 @@
+"""
+The `t_res.utils.process_wikipedia` module contains utility functions for handling
+Wikipedia links and page titles.
+"""
+
import sqlite3
import urllib.parse
from typing import Optional
@@ -7,25 +12,31 @@ def make_wikilinks_consistent(url: str) -> str:
"""
Make the wiki links consistent by performing the following operations:
- #. Convert the URL to lowercase.
- #. Unquote the URL to decode any percent-encoded characters.
- #. Replace underscores with spaces if they exist in the unquoted URL.
- #. Remove any fragment identifier (text after the '#' symbol) if present.
- #. Quote the modified URL to encode any special characters.
+ 1. Convert the URL to lowercase.
+ 1. Unquote the URL to decode any percent-encoded characters.
+ 1. Replace underscores with spaces if they exist in the unquoted URL.
+ 1. Remove any fragment identifier (text after the '#' symbol) if present.
+ 1. Quote the modified URL to encode any special characters.
Arguments:
url (str): The URL to make consistent.
Returns:
- str: The modified and quoted URL.
-
- Example:
- >>> make_wikilinks_consistent("Python_(programming_language)#Overview")
- 'python%20%28programming%20language%29'
- >>> make_wikilinks_consistent("Data_science")
- 'data%20science'
- >>> make_wikilinks_consistent("San_Francisco")
- 'san%20francisco'
+ The modified and quoted URL.
+
+ Example: Examples:
+ ```
+ make_wikilinks_consistent("Python_(programming_language)#Overview")
+ > 'python%20%28programming%20language%29'
+ ```
+ ```
+ make_wikilinks_consistent("Data_science")
+ > 'data%20science'
+ ```
+ ```
+ make_wikilinks_consistent("San_Francisco")
+ > 'san%20francisco'
+ ```
"""
url = url.lower()
unquote = urllib.parse.unquote(url)
@@ -42,24 +53,28 @@ def make_wikipedia2wikidata_consisent(entity: str) -> str:
Make the Wikipedia entity consistent with Wikidata by performing the
following operations:
- #. Make the wiki links consistent using the 'make_wikilinks_consistent'
- function.
- #. Unquote the modified and quoted URL to decode any percent-encoded
- characters.
- #. Replace spaces with underscores in the unquoted URL.
+ 1. Make the wiki links consistent using the 'make_wikilinks_consistent'
+ function.
+ 1. Unquote the modified and quoted URL to decode any percent-encoded
+ characters.
+ 1. Replace spaces with underscores in the unquoted URL.
Arguments:
entity (str): The Wikipedia entity to make consistent.
Returns:
- str: The modified Wikipedia entity consistent with the
- wikipedia2wikidata mapper.
-
- Example:
- >>> make_wikipedia2wikidata_consistent("New York City")
- 'new_york_city'
- >>> make_wikipedia2wikidata_consistent("Data science")
- 'data_science'
+ The modified Wikipedia entity consistent with the wikipedia2wikidata
+ mapper.
+
+ Example: Examples:
+ ```
+ make_wikipedia2wikidata_consistent("New York City")
+ > 'new_york_city'
+ ```
+ ```
+ make_wikipedia2wikidata_consistent("Data science")
+ > 'data_science'
+ ```
"""
quoted_entity = make_wikilinks_consistent(entity)
underscored = urllib.parse.unquote(quoted_entity).replace(" ", "_")
@@ -72,20 +87,19 @@ def title_to_id(
"""
Given a Wikipedia page title, returns the corresponding Wikidata ID.
The page title is the last part of a Wikipedia url **unescaped** and spaces
- replaced by underscores , e.g. for `https://en.wikipedia.org/wiki/Fermat%27s_Last_Theorem`,
+ replaced by underscores, e.g. for <https://en.wikipedia.org/wiki/Fermat%27s_Last_Theorem>,
the title would be `Fermat's_Last_Theorem`.
Arguments:
- path_to_db: The path to the wikidata2wikipedia db
- page_title: The page title of the Wikipedia entry, e.g. ``Manatee``.
+ path_to_db (str): The path to the wikidata2wikipedia db
+ page_title (str): The page title of the Wikipedia entry, e.g. ``Manatee``.
Returns:
- str, optional:
- If a mapping could be found for ``wiki_page_title``, then returns
- the mapping, otherwise None.
+ If a mapping could be found for ``wiki_page_title``, then returns
+ the mapping, otherwise `None`.
Credit:
- This function is adapted from https://github.com/jcklie/wikimapper.
+ This function is adapted from <https://github.com/jcklie/wikimapper>.
"""
with sqlite3.connect(path_to_db) as conn:
diff --git a/t_res/utils/rel_e2e.py b/t_res/utils/rel_e2e.py
index 1d0beca4..450cdb37 100644
--- a/t_res/utils/rel_e2e.py
+++ b/t_res/utils/rel_e2e.py
@@ -1,3 +1,9 @@
+"""
+The `t_res.utils.rel_e2e` module contains utility functions for running
+end-to-end entity linking using the
+[Radboud Entity Linker](https://github.com/informagi/REL) (REL) model.
+"""
+
import json
import os
import sys
@@ -20,14 +26,14 @@ def rel_end_to_end(sent: str) -> dict:
sent (str): A sentence in plain text.
Returns:
- dict: The output from the REL end-to-end API for the input sentence.
+ The output from the REL end-to-end API for the input sentence.
"""
API_URL = "https://rel.cs.ru.nl/api"
el_result = requests.post(API_URL, json={"text": sent, "spans": []}).json()
return el_result
-def get_rel_from_api(dSentences: dict, rel_end2end_path: str) -> None:
+def get_rel_from_api(dSentences: dict, rel_end2end_path: str):
"""
Use the REL API to perform end-to-end entity linking.
@@ -37,9 +43,6 @@ def get_rel_from_api(dSentences: dict, rel_end2end_path: str) -> None:
sentence.
rel_end2end_path (str): The path of the file where the REL results
will be stored.
-
- Returns:
- None.
"""
# Dictionary to store REL predictions:
rel_preds = dict()
@@ -66,11 +69,11 @@ def match_wikipedia_to_wikidata(
Arguments:
wiki_title (str): A Wikipedia title in underscore-separated format.
- path_to_db (str): The path to your wikipedia database (e.g. "../resources/wikipedia/index_enwiki-latest.db").
+ path_to_db (str): The path to your wikipedia database (e.g.
+ "../resources/wikipedia/index_enwiki-latest.db").
Returns:
- str:
- The corresponding Wikidata QID for the entity, or ``"NIL"`` if not
+ The corresponding Wikidata QID for the entity, or ``"NIL"`` if not
found.
"""
wqid = process_wikipedia.title_to_id(
@@ -83,7 +86,7 @@ def match_wikipedia_to_wikidata(
return wqid
-def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids):
+def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids) -> tuple:
"""
Find the corresponding string and prediction information returned by REL
for a specific gold standard token position in a sentence.
@@ -98,10 +101,11 @@ def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids):
gazetteer_ids (set): A set of entity IDs in the knowledge base.
Returns:
- tuple: A tuple with three elements:
- #. The entity type.
- #. The entity link.
- #. The entity type of the previous token.
+ A tuple with three elements:
+
+ 1. The entity type.
+ 1. The entity link.
+ 1. The entity type of the previous token.
"""
for ent in pred_ents:
wqid = match_wikipedia_to_wikidata(ent[3])
@@ -131,7 +135,7 @@ def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids):
return "O", "O", ""
-def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids):
+def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids) -> dict:
"""
Retokenize the REL output for each sentence to match the gold standard
tokenization.
@@ -145,8 +149,7 @@ def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids):
wikigaz_ids (set): A set of Wikidata IDs of entities in the gazetteer.
Returns:
- dict:
- A dictionary that maps a sentence ID to the REL predictions,
+ A dictionary that maps a sentence ID to the REL predictions,
retokenized as in the gold standard.
"""
dREL = dict()
@@ -168,7 +171,7 @@ def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids):
def store_rel(
experiment: experiment.Experiment, dREL: dict, approach: str, how_split: str
-) -> None:
+):
"""
Store the REL results for a specific experiment, approach, and split, in
the format required by the HIPE scorer.
@@ -181,9 +184,6 @@ def store_rel(
how_split (str): The type of split for which to store the results
(e.g., ``originalsplit``, ``Ashton1860``).
- Returns:
- None.
-
Note:
This function saves a TSV file with the results in the Conll format
required by the scorer.
@@ -192,7 +192,7 @@ def store_rel(
scenario_name = (
approach
+ "_"
- + experiment.myner.model # The model name is needed due to tokenization
+ + experiment.recogniser.model # The model name is needed due to tokenization
+ "_"
+ how_split
)
@@ -212,12 +212,9 @@ def store_rel(
)
-def run_rel_experiments(self) -> None:
+def run_rel_experiments(self):
"""
Run the end-to-end REL experiments.
-
- Returns:
- None.
"""
# Continue only if flag is True:
if self.rel_experiments == False:
diff --git a/t_res/utils/rel_utils.py b/t_res/utils/rel_utils.py
index 73d6c3e9..6b9fdd21 100644
--- a/t_res/utils/rel_utils.py
+++ b/t_res/utils/rel_utils.py
@@ -1,3 +1,9 @@
+"""
+The `t_res.utils.rel_utils` module contains utility functions for training
+and running the [Radboud Entity Linker](https://github.com/informagi/REL)
+(REL) model.
+"""
+
import json
import os
import sqlite3
@@ -10,6 +16,7 @@
import pandas as pd
from ..geoparser import ranking
+from .dataclasses import *
RANDOM_SEED = 42
"""Constant representing the random seed used for generating pseudo-random
@@ -21,9 +28,9 @@
reproducibility in experiments or when consistent random behavior is
desired.
-..
- If this docstring is changed, also make sure to edit prepare_data.py,
- linking.py, entity_disambiguation.py.
+Note:
+ If this docstring is changed, also make sure to edit `prepare_data.py`,
+ `linking.py` and `entity_disambiguation.py`.
"""
np.random.seed(RANDOM_SEED)
@@ -48,8 +55,7 @@ def get_db_emb(
use Wikipedia2Vec entity embeddings.
Returns:
- List[Optional[np.ndarray]]:
- A list of arrays (or ``None``) representing the embeddings for the
+ A list of arrays (or ``None``) representing the embeddings for the
given mentions.
Note:
@@ -98,8 +104,7 @@ def eval_with_exception(str2parse: str, in_case: Optional[Any] = "") -> Any:
Default is ``""``.
Returns:
- Any
- The parsed value if successful, or the specified value in case of
+ The parsed value if successful, or the specified value in case of
an error.
"""
try:
@@ -117,15 +122,14 @@ def prepare_initial_data(df: pd.DataFrame) -> dict:
df: The dataframe containing the linking training data.
Returns:
- dict:
- A dictionary with article IDs as keys and a list of mention
+ A dictionary with article IDs as keys and a list of mention
dictionaries as values. Each mention dictionary contains
information about a mention, excluding the "gold" field and
candidates (at this point).
Note:
The DataFrame passed to this function can be generated by the
- ``experiments/prepare_data.py`` script.
+ ``experiments/prepare_data.py`` script.
"""
dict_mentions = dict()
for i, row in df.iterrows():
@@ -175,8 +179,8 @@ def prepare_initial_data(df: pd.DataFrame) -> dict:
return dict_mentions
-
-def rank_candidates(rel_json: dict, wk_cands: dict, mentions_to_wikidata: dict) -> dict:
+# Deprecated (this logic has been moved to the RelDisambLinker)
+def rank_candidates(rel_json: dict, wk_cands: dict) -> dict:
"""
Rank the candidates for each mention in the provided JSON data.
@@ -188,7 +192,7 @@ def rank_candidates(rel_json: dict, wk_cands: dict, mentions_to_wikidata: dict)
entities.
Returns:
- dict: A new JSON dictionary with ranked candidates for each mention.
+ A new JSON dictionary with ranked candidates for each mention.
"""
new_json = dict()
for article in rel_json:
@@ -197,21 +201,33 @@ def rank_candidates(rel_json: dict, wk_cands: dict, mentions_to_wikidata: dict)
cands = []
tmp_cands = []
max_cand_freq = 0
- ranker_cands = wk_cands.get(mention_dict["mention"], dict())
- for c in ranker_cands:
+
+ # TODO: get the ranking method from wk_cands.
+ default = MentionCandidates(mention_dict["mention"], "TODO", "reldisamb", list())
+ linker_cands = wk_cands.get(mention_dict["mention"], default)
+
+ if not isinstance(linker_cands, MentionCandidates):
+ raise ValueError(f"Expected MentionCandidates instance. Found: {type(linker_cands)}")
+
+ # NOTE: mentions_to_wikidata here is the absolute link frequency data.
+
+ for m in linker_cands.links:
# DeezyMatch confidence score (cosine similarity):
- cand_selection_score = ranker_cands[c]["Score"]
+ cand_selection_score = m.string_match.string_similarity
# For each Wikidata candidate:
- for qc in ranker_cands[c]["Candidates"]:
+ for wikidata_link in m.wikidata_links:
+ wqid = wikidata_link.wqid
# Mention-to-wikidata absolute relevance:
- qcrlv_score = mentions_to_wikidata[c][qc]
+ # Here we assume that wikidata_link is a RelDisambLink instance:
+ qcrlv_score = wikidata_link.freq
if qcrlv_score > max_cand_freq:
max_cand_freq = qcrlv_score
- qcm2w_score = ranker_cands[c]["Candidates"][qc]
+ qcm2w_score = wikidata_link.normalized_score
# Average of CS conf score and mention2wiki norm relv:
if cand_selection_score:
qcm2w_score = (qcm2w_score + cand_selection_score) / 2
- tmp_cands.append((qc, qcrlv_score, qcm2w_score))
+ tmp_cands.append((wqid, qcrlv_score, qcm2w_score))
+
# Append candidate and normalized score weighted by candidate selection conf:
for cand in tmp_cands:
qc_id = cand[0]
@@ -245,7 +261,7 @@ def add_publication(
to an empty string.
Returns:
- dict: A new JSON dictionary with the added publication information.
+ A new JSON dictionary with the added publication information.
"""
new_json = rel_json.copy()
for article in rel_json:
@@ -277,8 +293,8 @@ def add_publication(
def prepare_rel_trainset(
df: pd.DataFrame,
rel_params,
- mentions_to_wikidata,
- myranker: ranking.Ranker,
+ ranker: ranking.Ranker,
+ linker,
dsplit: str,
) -> dict:
"""
@@ -286,7 +302,7 @@ def prepare_rel_trainset(
This function takes as input a pandas DataFrame (`df`) containing the
dataset generated in the ``experiments/prepare_data.py`` script, along
- with a Linking object (``mylinker``) and a Ranking object (``myranker``).
+ with a Linking object (``linker``) and a Ranking object (``ranker``).
It prepares the data in the format required to train and test a REL
disambiguation model, using the candidates from the ranker.
@@ -297,53 +313,37 @@ def prepare_rel_trainset(
entity disambiguation using the ``reldisamb`` approach.
mentions_to_wikidata (dict): Dictionary mapping mentions to Wikidata
entities, with counts.
- myranker (geoparser.ranking.Ranker): The Ranking object.
+ ranker (geoparser.ranking.Ranker): The Ranking object.
dsplit (str): The split identifier for the data (e.g., ``"train"``,
``"test"``).
Returns:
- dict: The prepared data in the format of a JSON dictionary.
+ The prepared data in the format of a JSON dictionary.
Note:
This function stores the formatted dataset as a JSON file.
"""
rel_json = prepare_initial_data(df)
+ sentence_mentions = {k: SentenceMentions.from_list(rel_json[k]) for k in rel_json.keys()}
- # Get unique mentions, to run them through the ranker:
- all_mentions = []
- for article in rel_json:
- if rel_params["without_microtoponyms"]:
- all_mentions += [
- y["mention"] for y in rel_json[article] if y["ner_label"] == "LOC"
- ]
- else:
- all_mentions += [y["mention"] for y in rel_json[article]]
- all_mentions = list(set(all_mentions))
- # Format the mentions are required by the ranker:
- all_mentions = [{"mention": mention} for mention in all_mentions]
- # Use the ranker to find candidates:
- wk_cands, other = myranker.find_candidates(all_mentions)
-
- # Rank the candidates:
- rel_json = rank_candidates(
- rel_json,
- wk_cands,
- mentions_to_wikidata,
- )
- # If "publ" is taken into account for the disambiguation, add the place
- # of publication as an additional already disambiguated entity per row:
- if rel_params["with_publication"] == True:
- rel_json = add_publication(
- rel_json,
- rel_params["default_publname"],
- rel_params["default_publwqid"],
- )
+ sentence_candidates = list()
+ for k in rel_json.keys():
+ matches = ranker.run(sentence_mentions[k].mentions)
+ candidates = [linker.run(m, rel_json[k][0]["place_wqid"], rel_json[k][0]["place"]) for m in matches]
+ sentence_candidates.append(SentenceCandidates(sentence_mentions[k].sentence, candidates))
+
+ # Get interim predictions (i.e. without applying the REL model).
+ predictions = [linker.disambiguate([scs], apply_rel=False) for scs in sentence_candidates]
+ training_predictions = [TrainingPredictions(p.sentence_candidates) for p in predictions]
+
+ trainset = {k: p.as_list(rel_params["with_publication"])
+ for k, p in zip(rel_json.keys(), training_predictions)}
## TO DO
with open(
os.path.join(rel_params["data_path"], "rel_{}.json").format(dsplit),
"w",
) as f:
- json.dump(rel_json, f)
+ json.dump(trainset, f)
- return rel_json
+ return trainset
diff --git a/tests/app_fixtures.py b/tests/app_fixtures.py
new file mode 100644
index 00000000..aa6735d2
--- /dev/null
+++ b/tests/app_fixtures.py
@@ -0,0 +1,3 @@
+# T-Res app test fixtures
+
+dict_candidates = {'sentence_candidates': [{'sentence': {'sentence': 'Harvey, from London;Thomas and Elizabeth, Barnett.'}, 'candidates': [{'sort_index': 13.0, 'mention': {'sort_index': 13, 'mention': 'London', 'start_offset': 3, 'end_offset': 3, 'start_char': 13, 'ner_score': 0.997, 'ner_label': 'LOC', 'entity_link': 'O'}, 'ranking_method': 'deezymatch', 'linking_method': 'reldisamb', 'links': [{'sort_index': 1.0, 'string_match': {'sort_index': 1.0, 'variation': 'London', 'string_similarity': 1.0}, 'wikidata_links': [{'wqid': 'Q179876', 'wkdt_class': 'Q3024240', 'freq': 2, 'normalized_score': 0.0007538635506973238}, {'wqid': 'Q8680', 'wkdt_class': 'Q3024240', 'freq': 2, 'normalized_score': 0.00022158209616662973}, {'wqid': 'Q6900329', 'wkdt_class': 'Q17912683', 'freq': 10, 'normalized_score': 0.004210526315789474}, {'wqid': 'Q145', 'wkdt_class': 'Q3624078', 'freq': 2, 'normalized_score': 2.249491052649338e-05}, {'wqid': 'Q309388', 'wkdt_class': 'Q17431399', 'freq': 2, 'normalized_score': 0.0010178117048346056}, {'wqid': 'Q6373', 'wkdt_class': 'Q17431399', 'freq': 2, 'normalized_score': 0.0003213883978788366}, {'wqid': 'Q734547', 'wkdt_class': 'Q7631958', 'freq': 1, 'normalized_score': 0.0008688097306689834}, {'wqid': 'Q1394500', 'wkdt_class': 'Q7631958', 'freq': 2, 'normalized_score': 0.0016051364365971107}, {'wqid': 'Q23800', 'wkdt_class': 'Q3957', 'freq': 1, 'normalized_score': 0.0007272727272727272}, {'wqid': 'Q84', 'wkdt_class': 'Q515', 'freq': 76938, 'normalized_score': 0.9761847364080442}, {'wqid': 'Q338466', 'wkdt_class': None, 'freq': 9, 'normalized_score': 0.21428571428571427}, {'wqid': 'Q87910176', 'wkdt_class': 'Q3241045', 'freq': 1, 'normalized_score': 0.034482758620689655}, {'wqid': 'Q927198', 'wkdt_class': 'Q486972', 'freq': 7, 'normalized_score': 0.02845528455284553}, {'wqid': 'Q123885', 'wkdt_class': 'Q45400320', 'freq': 4, 'normalized_score': 0.0007478033277248083}, {'wqid': 'Q8577', 'wkdt_class': 'Q159821', 'freq': 46, 'normalized_score': 
0.0036853068418522673}, {'wqid': 'Q30822', 'wkdt_class': 'Q47345468', 'freq': 2, 'normalized_score': 0.01639344262295082}, {'wqid': 'Q736742', 'wkdt_class': 'Q3957', 'freq': 1, 'normalized_score': 0.0008787346221441124}, {'wqid': 'Q8111', 'wkdt_class': 'Q159821', 'freq': 3, 'normalized_score': 0.0012396694214876032}, {'wqid': 'Q2914560', 'wkdt_class': 'Q1474414', 'freq': 1, 'normalized_score': 0.0033444816053511705}, {'wqid': 'Q194209', 'wkdt_class': 'Q26132862', 'freq': 1, 'normalized_score': 0.015873015873015872}, {'wqid': 'Q2734390', 'wkdt_class': 'Q7631958', 'freq': 2, 'normalized_score': 0.004291845493562232}, {'wqid': 'Q460735', 'wkdt_class': 'Q1133961', 'freq': 4, 'normalized_score': 0.0032626427406199023}, {'wqid': 'Q1359589', 'wkdt_class': None, 'freq': 28, 'normalized_score': 0.00586756077116513}, {'wqid': 'Q8691', 'wkdt_class': 'Q94993988', 'freq': 38, 'normalized_score': 0.020127118644067795}, {'wqid': 'Q1584160', 'wkdt_class': 'Q2385804', 'freq': 1, 'normalized_score': 0.0012135922330097086}, {'wqid': 'Q269902', 'wkdt_class': 'Q26132862', 'freq': 1, 'normalized_score': 0.03125}, {'wqid': 'Q674773', 'wkdt_class': 'Q45400320', 'freq': 1, 'normalized_score': 0.0014124293785310734}, {'wqid': 'Q5364202', 'wkdt_class': 'Q483110', 'freq': 2, 'normalized_score': 0.001932367149758454}, {'wqid': 'Q16003969', 'wkdt_class': 'Q2385804', 'freq': 2, 'normalized_score': 0.0009385265133740028}, {'wqid': 'Q55018', 'wkdt_class': 'Q24354', 'freq': 4, 'normalized_score': 0.001195457262402869}, {'wqid': 'Q270263', 'wkdt_class': 'Q43501', 'freq': 2, 'normalized_score': 0.004796163069544364}, {'wqid': 'Q220198', 'wkdt_class': 'Q45400320', 'freq': 1, 'normalized_score': 0.0015313935681470138}, {'wqid': 'Q2733342', 'wkdt_class': 'Q736917', 'freq': 1, 'normalized_score': 0.0037735849056603774}, {'wqid': 'Q733210', 'wkdt_class': 'Q26132862', 'freq': 1, 'normalized_score': 0.010869565217391304}, {'wqid': 'Q278054', 'wkdt_class': 'Q3146899', 'freq': 5, 'normalized_score': 0.15625}, 
{'wqid': 'Q1137312', 'wkdt_class': 'Q67376938', 'freq': 68, 'normalized_score': 0.12710280373831775}, {'wqid': 'Q1988417', 'wkdt_class': 'Q123705', 'freq': 4, 'normalized_score': 0.04597701149425287}, {'wqid': 'Q245247', 'wkdt_class': 'Q62078547', 'freq': 1, 'normalized_score': 0.00022609088853719196}, {'wqid': 'Q2354215', 'wkdt_class': 'Q82794', 'freq': 14, 'normalized_score': 0.014388489208633094}, {'wqid': 'Q835031', 'wkdt_class': 'Q3917681', 'freq': 1, 'normalized_score': 0.02857142857142857}, {'wqid': 'Q270920', 'wkdt_class': 'Q207694', 'freq': 1, 'normalized_score': 0.00018583906337112061}, {'wqid': 'Q12252353', 'wkdt_class': 'Q41176', 'freq': 2, 'normalized_score': 0.019801980198019802}, {'wqid': 'Q170027', 'wkdt_class': 'Q3918', 'freq': 143, 'normalized_score': 0.020146520146520144}, {'wqid': 'Q800751', 'wkdt_class': 'Q55488', 'freq': 22, 'normalized_score': 0.029451137884872823}, {'wqid': 'Q214788', 'wkdt_class': 'Q55488', 'freq': 18, 'normalized_score': 0.020642201834862386}, {'wqid': 'Q92561', 'wkdt_class': 'Q515', 'freq': 811, 'normalized_score': 0.3156870377578824}, {'wqid': 'Q38602881', 'wkdt_class': 'Q41176', 'freq': 1, 'normalized_score': 0.2}, {'wqid': 'Q1488404', 'wkdt_class': 'Q811979', 'freq': 3, 'normalized_score': 0.030303030303030304}, {'wqid': 'Q14946379', 'wkdt_class': 'Q18917976', 'freq': 47, 'normalized_score': 0.11809045226130653}, {'wqid': 'Q23311', 'wkdt_class': 'Q515', 'freq': 149, 'normalized_score': 0.03187847667950364}, {'wqid': 'Q279459', 'wkdt_class': 'Q494829', 'freq': 11, 'normalized_score': 0.10679611650485436}, {'wqid': 'Q795691', 'wkdt_class': 'Q55485', 'freq': 6, 'normalized_score': 0.007308160779537149}, {'wqid': 'Q2477346', 'wkdt_class': 'Q486972', 'freq': 13, 'normalized_score': 0.8125}, {'wqid': 'Q3061911', 'wkdt_class': 'Q1093829', 'freq': 70, 'normalized_score': 0.48951048951048953}, {'wqid': 'Q6670323', 'wkdt_class': 'Q19953632', 'freq': 1, 'normalized_score': 0.037037037037037035}, {'wqid': 'Q171240', 'wkdt_class': 
'Q11691', 'freq': 35, 'normalized_score': 0.013202565069784986}, {'wqid': 'Q1545354', 'wkdt_class': 'Q44782', 'freq': 56, 'normalized_score': 0.2545454545454545}, {'wqid': 'Q219867', 'wkdt_class': 'Q55488', 'freq': 24, 'normalized_score': 0.03301237964236589}, {'wqid': 'Q578794', 'wkdt_class': 'Q18608583', 'freq': 23, 'normalized_score': 0.03571428571428571}, {'wqid': 'Q1415441', 'wkdt_class': 'Q1248784', 'freq': 1, 'normalized_score': 0.005813953488372093}, {'wqid': 'Q6669759', 'wkdt_class': 'Q123705', 'freq': 9, 'normalized_score': 1.0}, {'wqid': 'Q985210', 'wkdt_class': 'Q55488', 'freq': 6, 'normalized_score': 0.008915304606240713}, {'wqid': 'Q795678', 'wkdt_class': 'Q55488', 'freq': 1, 'normalized_score': 0.017543859649122806}, {'wqid': 'Q7242790', 'wkdt_class': 'Q132241', 'freq': 3, 'normalized_score': 0.030303030303030304}, {'wqid': 'Q216185', 'wkdt_class': 'Q4989906', 'freq': 1, 'normalized_score': 0.0015082956259426848}, {'wqid': 'Q720102', 'wkdt_class': 'Q55488', 'freq': 16, 'normalized_score': 0.02069857697283312}, {'wqid': 'Q23306', 'wkdt_class': 'Q180673', 'freq': 89, 'normalized_score': 0.043078412391093904}, {'wqid': 'Q42182', 'wkdt_class': 'Q570116', 'freq': 1, 'normalized_score': 0.00041459369817578774}, {'wqid': 'Q1449564', 'wkdt_class': 'Q55488', 'freq': 6, 'normalized_score': 0.75}, {'wqid': 'Q14710970', 'wkdt_class': 'Q17343829', 'freq': 9, 'normalized_score': 0.6923076923076923}, {'wqid': 'Q1001456', 'wkdt_class': 'Q1093829', 'freq': 60, 'normalized_score': 0.5084745762711864}, {'wqid': 'Q503516', 'wkdt_class': 'Q13410447', 'freq': 1, 'normalized_score': 0.009345794392523364}, {'wqid': 'Q8982', 'wkdt_class': 'Q1248784', 'freq': 1, 'normalized_score': 0.004016064257028112}, {'wqid': 'Q8712', 'wkdt_class': 'Q94993988', 'freq': 2, 'normalized_score': 0.006329113924050633}, {'wqid': 'Q649419', 'wkdt_class': 'Q55488', 'freq': 6, 'normalized_score': 0.020134228187919462}, {'wqid': 'Q20657974', 'wkdt_class': 'Q17343829', 'freq': 4, 'normalized_score': 
1.0}, {'wqid': 'Q565521', 'wkdt_class': 'Q1802963', 'freq': 1, 'normalized_score': 0.007042253521126761}, {'wqid': 'Q2716505', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.0049261083743842365}, {'wqid': 'Q123738', 'wkdt_class': 'Q22698', 'freq': 2, 'normalized_score': 0.0014545454545454545}, {'wqid': 'Q8703', 'wkdt_class': 'Q94993988', 'freq': 4, 'normalized_score': 0.00425531914893617}, {'wqid': 'Q15179170', 'wkdt_class': 'Q5367899', 'freq': 1, 'normalized_score': 0.05}, {'wqid': 'Q10818', 'wkdt_class': 'Q217327', 'freq': 11, 'normalized_score': 0.01785714285714286}, {'wqid': 'Q15242653', 'wkdt_class': 'Q33506', 'freq': 1, 'normalized_score': 0.023809523809523808}, {'wqid': 'Q20075', 'wkdt_class': 'Q5503', 'freq': 13, 'normalized_score': 0.004782928623988226}, {'wqid': 'Q6669738', 'wkdt_class': 'Q17343829', 'freq': 8, 'normalized_score': 0.6666666666666666}, {'wqid': 'Q756819', 'wkdt_class': 'Q79007', 'freq': 1, 'normalized_score': 0.001183431952662722}, {'wqid': 'Q130206', 'wkdt_class': 'Q537127', 'freq': 4, 'normalized_score': 0.0064516129032258064}, {'wqid': 'Q729177', 'wkdt_class': 'Q570116', 'freq': 2, 'normalized_score': 0.046511627906976744}, {'wqid': 'Q1399178', 'wkdt_class': 'Q32815', 'freq': 1, 'normalized_score': 0.027777777777777776}, {'wqid': 'Q5645763', 'wkdt_class': 'Q494829', 'freq': 1, 'normalized_score': 0.058823529411764705}, {'wqid': 'Q801124', 'wkdt_class': 'Q55485', 'freq': 8, 'normalized_score': 0.011396011396011397}, {'wqid': 'Q16707732', 'wkdt_class': 'Q41176', 'freq': 1, 'normalized_score': 0.022222222222222223}, {'wqid': 'Q4834838', 'wkdt_class': 'Q14350', 'freq': 1, 'normalized_score': 0.0026109660574412533}, {'wqid': 'Q21014314', 'wkdt_class': 'Q79007', 'freq': 1, 'normalized_score': 0.05555555555555555}, {'wqid': 'Q17509255', 'wkdt_class': 'Q79007', 'freq': 1, 'normalized_score': 0.041666666666666664}, {'wqid': 'Q800753', 'wkdt_class': 'Q55488', 'freq': 1, 'normalized_score': 0.006024096385542169}, {'wqid': 'Q6671078', 
'wkdt_class': 'Q938381', 'freq': 7, 'normalized_score': 0.12068965517241378}, {'wqid': 'Q1782648', 'wkdt_class': 'Q34442', 'freq': 1, 'normalized_score': 0.007246376811594203}, {'wqid': 'Q62378', 'wkdt_class': 'Q33506', 'freq': 1, 'normalized_score': 0.0003386386725364037}, {'wqid': 'Q186309', 'wkdt_class': 'Q667018', 'freq': 1, 'normalized_score': 0.0026595744680851063}, {'wqid': 'Q148349', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.0009182736455463728}, {'wqid': 'Q212883', 'wkdt_class': 'Q26132862', 'freq': 1, 'normalized_score': 0.03571428571428571}, {'wqid': 'Q195436', 'wkdt_class': 'Q207694', 'freq': 1, 'normalized_score': 0.0012453300124533001}, {'wqid': 'Q5038252', 'wkdt_class': 'Q486972', 'freq': 1, 'normalized_score': 0.14285714285714285}, {'wqid': 'Q743535', 'wkdt_class': 'Q2755753', 'freq': 3, 'normalized_score': 0.001595744680851064}, {'wqid': 'Q83609', 'wkdt_class': 'Q3957', 'freq': 1, 'normalized_score': 0.002688172043010753}, {'wqid': 'Q801135', 'wkdt_class': 'Q55488', 'freq': 1, 'normalized_score': 0.018518518518518517}, {'wqid': 'Q79348', 'wkdt_class': 'Q1093829', 'freq': 14, 'normalized_score': 0.7777777777777777}, {'wqid': 'Q193196', 'wkdt_class': 'Q4671277', 'freq': 3, 'normalized_score': 0.000476114902396445}, {'wqid': 'Q4801470', 'wkdt_class': 'Q2418495', 'freq': 1, 'normalized_score': 0.005988023952095809}, {'wqid': 'Q23298', 'wkdt_class': 'Q180673', 'freq': 3, 'normalized_score': 0.00029925187032418956}, {'wqid': 'Q1431914', 'wkdt_class': 'Q644371', 'freq': 1, 'normalized_score': 0.003205128205128205}, {'wqid': 'Q1323689', 'wkdt_class': 'Q220505', 'freq': 32, 'normalized_score': 0.034782608695652174}, {'wqid': 'Q7594521', 'wkdt_class': 'Q1088552', 'freq': 1, 'normalized_score': 0.1}, {'wqid': 'Q26888', 'wkdt_class': 'Q7897276', 'freq': 2, 'normalized_score': 0.004514672686230248}, {'wqid': 'Q8709', 'wkdt_class': 'Q94993988', 'freq': 2, 'normalized_score': 0.003663003663003663}, {'wqid': 'Q801125', 'wkdt_class': 'Q55488', 
'freq': 1, 'normalized_score': 0.0029239766081871343}, {'wqid': 'Q2422792', 'wkdt_class': 'Q1907114', 'freq': 1, 'normalized_score': 0.008264462809917356}, {'wqid': 'Q205679', 'wkdt_class': 'Q7897276', 'freq': 1, 'normalized_score': 0.0009652509652509653}, {'wqid': 'Q1666958', 'wkdt_class': 'Q59861107', 'freq': 1, 'normalized_score': 0.017857142857142856}, {'wqid': 'Q5011830', 'wkdt_class': 'Q14350', 'freq': 1, 'normalized_score': 0.07692307692307693}, {'wqid': 'Q772421', 'wkdt_class': 'Q494230', 'freq': 1, 'normalized_score': 0.005917159763313609}, {'wqid': 'Q2602736', 'wkdt_class': 'Q785020', 'freq': 2, 'normalized_score': 0.028985507246376812}, {'wqid': 'Q1241240', 'wkdt_class': 'Q41176', 'freq': 3, 'normalized_score': 0.011278195488721804}, {'wqid': 'Q5209252', 'wkdt_class': 'Q41176', 'freq': 3, 'normalized_score': 0.21428571428571427}, {'wqid': 'Q1481050', 'wkdt_class': 'Q38723', 'freq': 1, 'normalized_score': 0.0020920502092050207}, {'wqid': 'Q6670230', 'wkdt_class': 'Q64578911', 'freq': 1, 'normalized_score': 0.005952380952380952}, {'wqid': 'Q2018322', 'wkdt_class': 'Q79007', 'freq': 1, 'normalized_score': 0.014925373134328358}, {'wqid': 'Q4592132', 'wkdt_class': 'Q24354', 'freq': 1, 'normalized_score': 0.003278688524590164}, {'wqid': 'Q7012708', 'wkdt_class': 'Q11483816', 'freq': 1, 'normalized_score': 0.03225806451612903}, {'wqid': 'Q146436', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.011111111111111112}, {'wqid': 'Q220144', 'wkdt_class': 'Q45400320', 'freq': 2, 'normalized_score': 0.0013351134846461949}, {'wqid': 'Q1869055', 'wkdt_class': 'Q11483816', 'freq': 1, 'normalized_score': 0.07692307692307693}, {'wqid': 'Q208152', 'wkdt_class': 'Q7897276', 'freq': 2, 'normalized_score': 0.002770083102493075}, {'wqid': 'Q730706', 'wkdt_class': 'Q7897276', 'freq': 1, 'normalized_score': 0.0020876826722338203}, {'wqid': 'Q6669736', 'wkdt_class': 'Q17343829', 'freq': 5, 'normalized_score': 0.7142857142857142}, {'wqid': 'Q128468', 'wkdt_class': 
'Q1049757', 'freq': 1, 'normalized_score': 0.0002538715410002539}, {'wqid': 'Q6342081', 'wkdt_class': 'Q1616075', 'freq': 1, 'normalized_score': 0.03125}, {'wqid': 'Q23891196', 'wkdt_class': 'Q4671277', 'freq': 5, 'normalized_score': 0.5555555555555556}, {'wqid': 'Q213560', 'wkdt_class': 'Q7897276', 'freq': 1, 'normalized_score': 0.0024096385542168677}, {'wqid': 'Q15378797', 'wkdt_class': 'Q3917681', 'freq': 1, 'normalized_score': 0.0625}, {'wqid': 'Q60578265', 'wkdt_class': 'Q27990982', 'freq': 6, 'normalized_score': 0.1935483870967742}, {'wqid': 'Q378991', 'wkdt_class': 'Q34442', 'freq': 1, 'normalized_score': 0.0036101083032490976}, {'wqid': 'Q3274670', 'wkdt_class': 'Q1248784', 'freq': 1, 'normalized_score': 0.008}, {'wqid': 'Q129761', 'wkdt_class': 'Q220505', 'freq': 1, 'normalized_score': 0.017543859649122806}, {'wqid': 'Q169101', 'wkdt_class': 'Q1901835', 'freq': 1, 'normalized_score': 0.0012195121951219512}, {'wqid': 'Q5370437', 'wkdt_class': 'Q17350442', 'freq': 1, 'normalized_score': 0.045454545454545456}, {'wqid': 'Q284091', 'wkdt_class': 'Q105390172', 'freq': 1, 'normalized_score': 0.0058823529411764705}, {'wqid': 'Q189960', 'wkdt_class': 'Q2755753', 'freq': 2, 'normalized_score': 0.0008900756564307966}, {'wqid': 'Q768935', 'wkdt_class': 'Q41176', 'freq': 1, 'normalized_score': 0.02631578947368421}, {'wqid': 'Q6670725', 'wkdt_class': 'Q7372078', 'freq': 2, 'normalized_score': 0.02}, {'wqid': 'Q6670747', 'wkdt_class': 'Q494230', 'freq': 1, 'normalized_score': 0.005747126436781609}, {'wqid': 'Q1861679', 'wkdt_class': 'Q3917681', 'freq': 1, 'normalized_score': 0.3333333333333333}, {'wqid': 'Q202059', 'wkdt_class': 'Q7897276', 'freq': 1, 'normalized_score': 0.002369668246445498}, {'wqid': 'Q7308066', 'wkdt_class': 'Q902104', 'freq': 1, 'normalized_score': 0.011904761904761904}, {'wqid': 'Q106119', 'wkdt_class': 'Q1248784', 'freq': 1, 'normalized_score': 0.0036496350364963502}, {'wqid': 'Q5602825', 'wkdt_class': 'Q494829', 'freq': 1, 'normalized_score': 
0.09090909090909091}, {'wqid': 'Q6670816', 'wkdt_class': None, 'freq': 1, 'normalized_score': 0.125}, {'wqid': 'Q1786933', 'wkdt_class': 'Q2087181', 'freq': 1, 'normalized_score': 0.005291005291005291}, {'wqid': 'Q174570', 'wkdt_class': 'Q38723', 'freq': 1, 'normalized_score': 0.00018709073900841907}, {'wqid': 'Q6669869', 'wkdt_class': 'Q11483816', 'freq': 1, 'normalized_score': 0.038461538461538464}, {'wqid': 'Q318043', 'wkdt_class': 'Q1154710', 'freq': 1, 'normalized_score': 0.0022727272727272726}, {'wqid': 'Q124234', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.0036101083032490976}, {'wqid': 'Q28136122', 'wkdt_class': None, 'freq': 1, 'normalized_score': 0.5}, {'wqid': 'Q30119692', 'wkdt_class': 'Q750215', 'freq': 3, 'normalized_score': 0.037037037037037035}, {'wqid': 'Q683076', 'wkdt_class': 'Q41253', 'freq': 1, 'normalized_score': 0.0037735849056603774}, {'wqid': 'Q6670546', 'wkdt_class': 'Q77115', 'freq': 1, 'normalized_score': 0.08333333333333333}, {'wqid': 'Q801128', 'wkdt_class': 'Q55488', 'freq': 1, 'normalized_score': 0.0031645569620253164}, {'wqid': 'Q1093950', 'wkdt_class': 'Q543654', 'freq': 1, 'normalized_score': 0.011904761904761904}, {'wqid': 'Q867663', 'wkdt_class': 'Q1497364', 'freq': 1, 'normalized_score': 0.04}, {'wqid': 'Q4120330', 'wkdt_class': 'Q494230', 'freq': 1, 'normalized_score': 0.003745318352059925}, {'wqid': 'Q14996965', 'wkdt_class': 'Q41176', 'freq': 1, 'normalized_score': 0.02}, {'wqid': 'Q211', 'wkdt_class': 'Q3624078', 'freq': 1, 'normalized_score': 0.00010536297545042672}, {'wqid': 'Q4093', 'wkdt_class': 'Q515', 'freq': 1, 'normalized_score': 6.692096633875393e-05}, {'wqid': 'Q786032', 'wkdt_class': 'Q15640053', 'freq': 1, 'normalized_score': 0.003952569169960474}, {'wqid': 'Q1372700', 'wkdt_class': 'Q172754', 'freq': 1, 'normalized_score': 0.004291845493562232}, {'wqid': 'Q7594071', 'wkdt_class': 'Q1060829', 'freq': 1, 'normalized_score': 0.024390243902439025}, {'wqid': 'Q1333411', 'wkdt_class': 'Q4989906', 
'freq': 1, 'normalized_score': 0.018518518518518517}, {'wqid': 'Q7374421', 'wkdt_class': 'Q16917', 'freq': 2, 'normalized_score': 0.004618937644341801}, {'wqid': 'Q6670807', 'wkdt_class': 'Q17343829', 'freq': 1, 'normalized_score': 0.2}, {'wqid': 'Q1126189', 'wkdt_class': 'Q811979', 'freq': 1, 'normalized_score': 0.0033222591362126247}, {'wqid': 'Q23000866', 'wkdt_class': 'Q3270419', 'freq': 1, 'normalized_score': 0.125}, {'wqid': 'Q29001838', 'wkdt_class': 'Q6813020', 'freq': 1, 'normalized_score': 0.014492753623188406}, {'wqid': 'Q2361392', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.002967359050445104}, {'wqid': 'Q16733625', 'wkdt_class': 'Q18414273', 'freq': 1, 'normalized_score': 0.047619047619047616}, {'wqid': 'Q1748936', 'wkdt_class': 'Q847017', 'freq': 1, 'normalized_score': 0.0038910505836575876}, {'wqid': 'Q1429312', 'wkdt_class': 'Q16970', 'freq': 1, 'normalized_score': 0.004098360655737705}, {'wqid': 'Q6696010', 'wkdt_class': 'Q15210668', 'freq': 1, 'normalized_score': 0.016129032258064516}, {'wqid': 'Q3323858', 'wkdt_class': 'Q2755753', 'freq': 1, 'normalized_score': 0.007352941176470588}, {'wqid': 'Q16843032', 'wkdt_class': 'Q18414273', 'freq': 1, 'normalized_score': 0.03571428571428571}, {'wqid': 'Q5633181', 'wkdt_class': 'Q11446', 'freq': 2, 'normalized_score': 0.6666666666666666}, {'wqid': 'Q16991744', 'wkdt_class': 'Q3917681', 'freq': 1, 'normalized_score': 0.25}, {'wqid': 'Q28163686', 'wkdt_class': 'Q62447', 'freq': 2, 'normalized_score': 0.6666666666666666}, {'wqid': 'Q1749569', 'wkdt_class': 'Q2940297', 'freq': 1, 'normalized_score': 0.14285714285714285}]}], 'place_of_pub_wqid': 'Q203349', 'place_of_pub': 'Poole, Dorset', 'with_publication': True}]}]}
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index d692742f..9c11e6c3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,13 +1,39 @@
+import pytest
from typing import Any, List
from typing_extensions import Final
-NO_SKIP_OPTION: Final[str] = "--no-skip"
+INCLUDE_ALL_OPTION: Final[str] = "--include-all"
+INCLUDE_RESOURCES_OPTION: Final[str] = "--include-resources"
+INCLUDE_TRAIN_OPTION: Final[str] = "--include-train"
+INCLUDE_APP_OPTION: Final[str] = "--include-app"
def pytest_addoption(parser):
- parser.addoption(NO_SKIP_OPTION, action="store_true", default=False, help="also run skipped tests")
+ parser.addoption(INCLUDE_ALL_OPTION, action="store_true", default=False, help="run all tests")
+ parser.addoption(INCLUDE_RESOURCES_OPTION, action="store_true", default=False, help="include tests dependent on large resources")
+ parser.addoption(INCLUDE_TRAIN_OPTION, action="store_true", default=False, help="include model training tests")
+ parser.addoption(INCLUDE_APP_OPTION, action="store_true", default=False, help="include HTTP API tests")
-def pytest_collection_modifyitems(config,
- items: List[Any]):
- if config.getoption(NO_SKIP_OPTION):
- for test in items:
- test.own_markers = [marker for marker in test.own_markers if marker.name not in ('skip', 'skipif')]
\ No newline at end of file
+def pytest_collection_modifyitems(
+ config,
+ items: List[Any],
+ ):
+ if config.getoption(INCLUDE_ALL_OPTION):
+ return
+
+ if not config.getoption(INCLUDE_RESOURCES_OPTION):
+ skipper = pytest.mark.skip(reason="Skip unless --include-resources or --include-all is given")
+ for item in items:
+ if "resources" in item.keywords:
+ item.add_marker(skipper)
+
+ if not config.getoption(INCLUDE_TRAIN_OPTION):
+        skipper = pytest.mark.skip(reason="Skip unless --include-train or --include-all is given")
+ for item in items:
+ if "train" in item.keywords:
+ item.add_marker(skipper)
+
+ if not config.getoption(INCLUDE_APP_OPTION):
+        skipper = pytest.mark.skip(reason="Skip unless --include-app or --include-all is given")
+ for item in items:
+ if "app" in item.keywords:
+ item.add_marker(skipper)
diff --git a/tests/sample_files/batch_jobs/1880-1900-LwM-HMD-subsample50.csv b/tests/sample_files/batch_jobs/1880-1900-LwM-HMD-subsample50.csv
new file mode 100644
index 00000000..da36ab91
--- /dev/null
+++ b/tests/sample_files/batch_jobs/1880-1900-LwM-HMD-subsample50.csv
@@ -0,0 +1,50 @@
+NLP,issue,art_num,title,collection,full_date,year,month,day,location,word_count,ocrquality,text,decade
+3406,1208,art0039,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1432,0.9245,"137 IV. aciuz. CHAPTER A few years before the date on which this ""narrative opens there had come to Burslem from nother part of the Midlands a young medical gentleman, fresh from his professional studies, and from the cultured teaching of Rugby public 'school. He had gradually become known in the town, and being allied with one of the stablishcd medical gentlemen of the neighbourhood his winning manners and professional ability soon enabled him to make his way. Perhaps the most prominent family of the day in this part of the Potteries was that of Mr. John Wood, who builtthe mansion at Brownhills, and adjoining to it a pottery, at which he was so prosperous that the ample fortune possessed by him was increased, and he became a man of great means. Happy in their father's !prosperity his children grew up around him. Air. Wood was one of the best of masters of old 'days to his workpeople, and when the festival 'days of the district came round they enjoyed 'themselves on his grounds, and no name became more respected than that of John Wood, of the Brownhills. His favouite daughter was Maria, The heroine of our story. Her gentle mannel:s and beauty were known to all. She was the 'companion of her father's walks, and she read to him on an evening when the cares of the day *ere over. Dr. Oliver, through the illness of Mrs. Wood, was called in, as the rising medical man of the town, to administer to her-, and. his professional attendance being frucoessful an intimacy sprung up which resulted infrequent visits to the hospitable mansion. Attracted by the beauty and graces of Miss Wood these visits beceme still more frequent, and his gentlemanly manners and intelligent discourse l'::1;ade him a weloome visitor. Tile attentions of Dr. 
Oliver to Maria were at first favoured by her father, but a change took place, and the father commanded that these attentions mast cease. The high-spirited young man resented this sudden change and angry words passed between them; lover and the father, the former being ordered' to leave the Hall, and forbidden ever to eager sLgain. Drownhills Hall, late in the last century, was earrounded by pleasant sylvan scenerye Beyond the Chatterloy Vale frowned the forest Cof Bradwell, and beneath the Hall itself van a meet valley, down which the brook tumbled near the road that then connected the growing communities of Tunstall and Burslem, but the 'ingenuity of man had contrived to make the stream do useful work as it flowed along. Near the wayside inn •it ran through a mUI poot, and from that it rushed down to make the miller's wheel go round. Standing near the fence andhedgerow that guarded this pool stood a. tall, handsome young man, and a parnglacly of gentle manners and refined comely uppearanee. There was something about them phich told—as the same characteristics have Wveays told—that they were lovers. Yet as the Shadows of the summer evening were falling it could be seen that there was little or no joy of leerre's early dreams about them. . Their con-I.versation had been earneet„end a flush was. llpron the face of the ladva—etlechanically she pineked a spray from a 'hawthorn bush, which stood near her, and threw it in the mill pool. Her lover did the same. The green spray which name from the hand of the lady was caught and held fast by the rushes in the pool; but the iotber, feeling the influence of the current, rushed iotrward and was lost in the eddy of water dashing towards the mill wheel. 'Maria,"" said the young man, ""if you were patted from me, my own life, like the .spray of 'hawthorn, would be lost in a whirlpool of ruin. Why should they come between us and our love'? 
have not much wealth, but as a member of an honourable profession, and with you as my wife, career is before me that you, I know, would 'feel proud of. Say that we shall not be parted, for I dread the future. I know that I cannot live without you, but I shall go down and be ocsit I' Wrought upon by his strong emotion, the young doctor—for this was Dr. Oliver, the 'unfortunate hero of our story—bowed his head, turd though he struggled with his grief, yet one half-rest/rained sob escaped him. ""Oh, Milward,"" said Maria, ""do not talk like ftlAt, all may yet come to pass as we wish. I will be true to you, I will plead with my father again; will go down on my knees and pray him not to . `separate us, but I cannot disobey him, Milward, foe I know that he loves me with his whole heart, and that heart would break if his favourite danghter were to disobey him. I know that your trial is great, but surely you will admit that mine is even greater, for how can I, with the love that I bear for my father and you, give up either 'and be happy. Let us be patient, and if kept `apart for a time I will be faithful to you. Nilward, alone,""and she placed her hand tenslerly upon her loyer's shoulders. Dr. Oliver looked up, took her hands in his (own, and looking into her eyes, his own dimmed Ivitl.l the tears of a man's deep feeling, he said, in husky tones_ ""some men whdae passion lies upon the -Lirface, find it so easy to be patient because disappointment ruffles them but little • but life to me is a living death, through the dread that . iall will not come right as you .hope it wilL I pknow thatmany would fold their disappointment sand grief, and put it as a napkin aside, would rush to give medicine to the puling child, and Nvonld smile sympathy over the imaginary aches land pains of some old dotard, but my grief and 'pain go with me, and the thought that we may lose each other for ever is with me by night and day. 
My brain at times seems on fire, and ghosts of thou[rhts enter my troubled mind that rnelce 6ne dread the future."" The distressed girl at his side seemed as thongh she could make no reply to these passionate and heart-rending words, and her troubled looks seemed to strike him with a new thought, for continuing, he said— ""I will, however, strive to follow your advice eiitd be patient. I know it must griteve you to iear me talk as I have done. I ought to be )atient, having yeur faithful love to think upon, pUt 3t is this dread of losing you that haunts me d:ay and night."" ""Milward, trust in me. I dare not cross my father; but will I take him in somegentle mood of his, when he etrokes my hair, and calls me his, peteand I will, with teaes in nay eyes, ask him uoTto crush the flower that he loves, and then may catch his gentle humour, Milward, and. he may yet, for my sake, forget and bury his GF'jiidices against you? .n ""heaven grant your hopes may be realised, rortSver SinCe he ordered me away from your house in anger I -have been afraid that his '?Felii,lices would bury me. I could net live; iv;thout you, and to be parted from you for ever ri.:Ald be my sentence of death. With you lifts uld be like or* long, summer time, but WithPu.t,you my da7 of life wetkki early oleos iüd.kad starza2l. Saytug' els TifilWard'Olivtze-agalrefOolinli I Cs own the two. hands o raria, Wood, saltj gazing beseeohingly Into he eyes,.,l9- etslalmei passionatelyi,„ saver raef laHa, ave. your Miserable rimer from tileruin. which threatens him if !deprived of you...""^ ""'lwill, Milwaxd; do- all' tltat T cap to. soften the, heart of -my, father, for-without you my' Own heart would.break, and surely he will_pity luet,-and =vale day smile-upon our loveZ, _ . 
CTTAPTER IL At the date when our story opens there migh have been seen riding towards the rapidly growing community of Burslem, then emerging from the village to the dignity of a town, a spare, keen-visaged man, whose long grey locks clustered on the top of his riding coat. This was one of those riders who are said to arrive at their destination before the _ beast which",1880
+3406,1208,art0031,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,766,0.9678,"CREWE. The charge of forgery against William Barr ow labourer, of Audley-street, Church Coppenhall, with intent to defraud Joseph Faulkner, of Waldron'-lane, Church Coppenhall, of £l4, is adjourned until tie 18th inst. THE PARNELL DEFENCE FUND.—The members of ' the Crewe Liberal Club, and the members and friends of the Crewe branch of the National League have each subscribed £5 towards this fund, the receipt of which has been acknowledged by Mr. Briggs, of the Liverpool Reform Club, who is the treasurer of the fund. TUE LATE MR. STEPHEN REAY,—who had been connected with the. London and North-Western Railway for 40 years, and for the past 22 years has filled the position of secretary—took place on Monday. Amongst those present were Sir Rich,lrd Moon, chaihnan ; Mr. Bickersteth and Mr. Cawkwell, deputy chairman ; several directors, Mr. Findlay, manager ; Mr, Harley, assistant secretary; Mr. Neck, superin- • tendont, and other officials of the company, Lord Wolverton, Colonel Hamilton, and representatives of the Great Western, Great Northern, South Western, ane other railway companies. HoPE FOR Tun PIGTAILS.—In the course of a long and interesting address on mission work in China, at the Methodist New Connexion Church on Sunday evening, the Rev. J. W. Townsend (Ashton) spoke of the antiquity of the nation. Whilst the mighty empires of Greece and Rome had faded away centuries ago China held its own still; in fact, it was only just commencing to develop itself. Its mineral resources were almost illimitable, and vast coalfields were yet to be worked, whilst gold and silver, and other minerals, were to be found. in abundance. Mr. Townsend. said, that whilst England would, in space of time, be worked out, and its natural resources exhausted, the home of the celestials would become the country of the future. 
DIORAMA OF SCOTL4ND.-01 all the many attractions and entertainments that have been crowding on the Crewe people during the last fortnight, no .one has been more worthy of support than Birrell's great Diorama of Scotland and Scottish Concert. The opening night was on Thursday, when there was a fair attendance. Want of apace prevents us giving the detailed account which the entertainment deserves. The chief attraction is Miss Griselda Wess ""the famous Scottish Nightingale."" Her Scotch songs were rapturously applauded, and encored. Mr. Birrell's performance on the violin also highly reeommended itself to the favour of •the audience,. Miss Margaret Birrell, the juvenile and graceful highland dancer, is also a feature in these entertainments. Miss Jeinima Wise gave two highland dances, and Professor Roselle, the Caledonian wizard and ventriloquial entertainer, contributed their part, along with other artistes to a very varied entertainment. The diorama consists of representations of some of the most beautiful scenes in Scotland, and so those who patronise this exceptionally interesting entertainment are gratified by what they see as well as by that which they hear, and are at the same time instructed, and amused. We ,:heartily recommend our readers to patronise the entertainment. LITEZARY ENTZItTAINIIZNT.--011 Wednesday night, in connection with the Union-street Literary Guild, Professor Valiance, Master of Elocution, Glasgow University, gave one of his ""Grind Literary Evenings' in the Baptist School-room, Brown-street, to a small, though highly-approeiative, audience. The vicepresident, Mr. Pedley, occupied the chair, in the unavoidable absence of the ttev. S. Cooper. 
The professor recited the following pieces with all tte skill and power of an accomplished elocutionist :—"" Horatius defending the bridge"" (Macaulay), ""Courtship of Godfrey Grubb"" (Macrae), ""The editor's guests"" (tiarleton), ""The road ta heaven"" (a story of a London waif, by Sims), ""Jemmy Butler and the owl"" (Cowan), "" pne more"" (an old skipper's yarn, by Overton),"" Cooley, his boy, and his dos"" (Max Adeler) ""Nottman ""(a story of the rail, by Anderson), ""The Rapids of Niagra "" (Gough), .• Caddie Doon"" (Anderson), How I puzzled the Interviewer"" (Mark Twain), ""The women of Mumble's Head (Clement Scott). The audience loudly applauded the reciter during the evening. Mr. C. Jones proposed, and Ur. E Tonkineon seconded a hearty vote of thanks to Mr. Valiance for his services. Mr. Valiance, in acknowledging the compliment, asked the mewing to join with him in thanking the chairman; and this having been replied to, he then gave, in a very graphic manner, in which the elocutionist's best powers were brought into play, ""The fireman's wedding."" The quality and tone of the entertainment, together with the satisfaction given to all who were present, lead to the hope ,that a return visit from Mr. Valiance will take place at an early date.",1880
+3406,1208,art0034,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,148,0.968,"MEMS. ABOUT THE COUNTY COUNCIL. With the exception of Chester, Birkenhead, and Stockport, which have been made county boroughs, Cheshire has already been divided into fifty-seven electoral divisions, 41trincham, being one of them. Fourteen of these are comprised in the boroughs, viz., Congleton, Crewe, Hyde, Macclesfield, and Stalybridge. Each division will return one member, and the members thus elected will then elect nineteen aldermen, either from v. ithiu or without their own body, thus making a county council consisting of seventy-six members. Where they will meet is uncertain, and will be in their own discretion. Chester, Crewe, Macclesfield, and Knutsford have been prominently mentioned It may be of interest here to state that the average distance to be travelled by .'ach of the 57 members will be, in the event of '..he3ter being selected, 28 miles, Macclesfield 24, Crewe 22, Enutsford 21., Gnd Altrincham 20 miles;",1880
+3406,1208,art0049,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,755,0.9483,"[airs. TIIE HAIS-YEAELY ESTIMATES AND RATES. The Clerk submitted the half-yearly estimated expenditure from September, 1888, to March 25, 1889. The principal items were :—L475 for highway repairs, £146 water supply, £lOO lighting expenses, £33 public library, £364 instalments and interest on various loans, and £l4O miscellaneous expenses, making a total of £1,425. Towards this amount the income from water rates was estimated at £3OO, market tolls £l6O, credit from former rates £l6O, leaving a balance of £BO5 to be raised. The rateable value of the property assessable for the above rate was £19,720. and the amount of rate necessary for such purpo3es is. in the X. The clerk, in answer to a que'Stion whether a rate of lid, in the pound would not suffice, said that a shilling rate would barely suffice to meet the expenditure. There were several items in the estimate which contributed to swell the expenditure, such as the interest and principal on the loan for re-paving the streets, and Free Library, &c. On the proposition of Mr. Cope. seconded by Mr. IChesters, the estimates were accepted. A. CREWE MAN MEETS WITH AN ACCIDENT A YEAR AGO, AND DIES TWELVE MONTHS AFTER IT. So far back as the 31st of December, lest year, Ambrose Price, an old man, who lived at 34, Market Terrace, Crewe, was injured seriously at London-road Station, Manchester, and on Friday last he died at his residence. Mr. H. C. Yates, coroner, and a jury sat at the Swan Hotel, on Monday'afternoon, to determine the cause of death. Mr. Rowbotham was foreman of the jury. The wife of the deceased was first called— Catherine Price. She said that her husband had been a joiner in the. employ of the L. ck. N.-W. 
Railway, and on the 31st of December last year he met with an accident at London Road Station, Mancheater, where he was working, which resulted in his arm having to be amputated. Ho was taken to the Manchester Infirmary, and lay there six weeks. There his wife often visited him. Afterwards he was taken to the Cheadle Convmlescent Home, where lee remained five weeks. Then he came home, and was attended until his death by Dr. Atkinson. He had not been =,ble to do any work since the accident, although Dr. Atkinson had told her that he would be able to do work in a few week's time. He told his wife that he had been KNOCKED DOWN BY A WAGGON, which bad reeled him round. For the last , few ,weeks he had not eaten anything. He was injured internally as well as being injured at the arm. He never blamed anyone for the accident, but said it was dark when it happened. William Leech, a goods Tiorter at London Road Station, saw the accident happen to the deceased. About nine o'clock on the morning, he (Leech) with another man, was drawing a loaded waggon out of the arch into the goods yard. The waggon had attained a good speed. wl,.en Price stepped right in, front of it. He was Imoelred down, the axle of the waggon caught his legs, ancl drew them over his face. The front wheel passed over deceased's left arm. Witness went to deceased-s resistance, and he Was afterwards removed &.o Manchester Infirmary. He was able to speak, but did not say how the accident occurred. The deceased's arm was hanging when he was pioked up. When he was knocked down he was in the act of crossing the metals to go to hi; work. nu COULD nAVE SN TEL TRU= COMMG if he had looked. It -,vas a rather dull morning to the hest of witne?s's recollection. The deceased was aware that shunting operations had to take place on that line. 
If witness had seen the deceased before he stcpp-,d on the lice it would have been impossible to step the wagon.—By the Coroner: Witness did not know whether the deceased was deaf or not, but it was quite possible that he would Dot hear the wagon coining if he was at all deaf.—A Juror stated that the deceas,d ros deaf. Dr. Atkinson, surgeon, said about sevan months ago the deceasefi wss sent to him from the Royal Infirmary at -24:Int:I:enter. Ile was turneil out of the inf.rmary r.s fit to some out, bct he wae not quits well, About three weeks ego he rrianifestoci signs of abstraction of the bowels, and",1880
+3406,1208,art0014,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,489,0.9419,"GOING TO OPPOSE ANY PUBLICAN to whatever party he might belong, a public.. opinion would have been created, and it would then have been easier to have prevented any mischief. As it was, no amount of opposition that they could bring to bear would do any good to the temperance cause, and, in the case of Mr. Welch, would simply end in a Tory being sent, and that Tory would be sure to be against Sunday closing. So they would simply oppose one man to let in another no better. Therefore, it would be the most foolhardy and suicidal policy to take any such action. He hoped that the lesson would be learned to act promptly in the future, and not be behind hand. He could not see that anTrgood would come, BIM ONLY BVIL, from passing suah a resolution. Mr. Davies said that all the eatdidates had not keen selected—therefore they wee not too late in that . _ . . _ _ _ respect. Again, Mr. \Veleli had been selected to reprecent the Liberal party, and to sonic out as a Liberal. They were considering whether it was advisable for them, as' a temperance party, to take action, and that was altogether a different matter. (Hear, hoar.) .11r. Siddell thought they should assert themselves as a party, irrespective of either side of polities. (Hear, hear.) Mr. Mann's said they had nothing to do with either Liberal or Consetutive party. Mr. Mellor: I beg, in the interest of order, to say that I never said you had. 1 opposed the resolution, first because it is too late— • The Chairman: Far whet? Mr Mellor: For effective purposes. What I wean is this: If you had started earlier you might have prevented a publican coming ours at all— Mr. Mann: I don't sae how we are to blame seeing that we did not know TER LIBERAL PARTY WAS BACKING UP A PUBLICAN to represent us—who will only misrepresent US lii every aspet-t of the ease. Mr. AL•llor : That is not a true statement. 
Mr Mann: If it cones to the test of a vote in the town. Mr. Mellor will see that temperance principles have more support than he gives them credit for. Mr. Cooper thought if all the candidates had not been selected; the temperance party WAS quite in time to ran a candidate of its own. Mr. Thomson thought that as thero were so many temperance men among the Liberals they might hare let them know of what action they intended to take. He thought they needed their help to icarry their eandidate against an opposing one. Mr. Ainsworth *aid the temperance patty had power of its own in Crowd. if they could only colleat it to make it felt. (He.t,r, li:mr.) For his, part, he made the temperance question the primary consider*, tion in any vote he gave. They had",1880
+3406,1208,art0024,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1016,0.9348,"BIGAMY AND ATTEMPTED SUICIDE. A man of gentlemanly appf.arance, described in the calendar es well educated, who gave the name of Arthur Gordon Lennox, pleaded guilty to bigamy, aid to an indictment for attempted suicide, but denied a charge of false pretences, which was not proceeded with.—Mr. H. Lloyd, for the prosecution, said that the prisoner, who was 32 yearr of age, married on September 5, 1886, a Miss Sellick, at St, James Church, Westminster, giving his name as Arthur Gordon Lennox. He lived with his wife for tome thee, and she gave birth to a child. Shortly after that it appeared that he had no means whatever, and Mrs. Lennox was compelled to return to her relations. Lennox then travelled, and was next heard of at Clifton, near Bristol, where he made the acqaaintaneo of Miss Emma Ford. A corresdondenee between hire and Miss Ford was kept up for some time, and en April 16th last he was married to her under the name of Arthur Algernon Lennox. BIS TIRST WLFB WAS TIM? ALMS. Miss Ford knew nothing about the accused, and as Burned him to be single. She, too, had te leave him because he had no means, though he showed every kindness to her. Subsequently he turned :np at Gengleton, where he is alleged to have obtained food and lodgings by false pretences. He was arrested, and whilst in custody he attempted to take his life.—Mr. Marsha,il, who had been retained for the defense, pleaded in extenuation of his client's conduct that he had been singularly unfortunate in regard to employment. Eighteen 'months after his first marriage his wife left him, because he could get no work. He made several attempts by correspondence and journeys to induce her to return to him, but she positively refused to live with him any more. It was under these ciraumstsnces that he formed the acquaintance of Miss Ford. 
with whom he subsequently went through the form of marriage. He had ever since been most kind to her, and his conduct, etherwise, had been ATOILLL, TIMPBRATZI, AND STBADT. As to the charge of suicide, Mr. Marshall laid emphasis on the distress of mind which the defendant must have suffered from being in custody on such a charge. —Mass Ford, a well-dressed young lady, then entered the box, and bore testimony to the unvarying kindness she had experienced at the prisoner's hands, and said his general conduct had been most exemplary. The Judge sentenced Lennox to a fortnight's hard labour for the attempted suicide. The other chugs, he said, was a far more serious one. The young lady had been led to believe thatihe was single, and as was induced to go through the form of marriage with him. It was a wanton act, the seriousness of which mut have been present to the mind of a man well educated. The sentence would be that he be kept to hard labour for nine calendar months, tha shorter term to run concurrently. TUESDAY. HOU-EBREAKING AT BIRKENHEAD. Joseph Byrne, 22, labourer, and John Cahill, 20, labourer, were indicted for breaking and entering the dwelling-house of Isaac Coulthard, at Birkenhead, and stealing various articles. Edward Byrne, 60, tailor, was charged with receiving the property knowing it to have been stolen. Joseph Byrne and Cahill pleaded guilty, and having been previously sonvicted; were each sentenced to 12 months' imprisonment. Edward Byrne denied the charge, but was found guilty. He had been repeatedly convicted for theft, and had served three terms of penal servitude, and he was now sentenced to penal servitude again for seven years. A CHILD DROWNED THROITGH A MOTHER'S DRUNKENNESS. Elizabeth Booth, a middle-agsd woman, surrendered to bail on an indictment charging her with killing and slaying her daughter, Ada Booth, fifteen naJuths old, on 9th Ootober, at Bowdon. Prisoner pleaded not guilty. Mr. 
Marshall, who prosecuted, stated that she prisoner went to Altrincharu, taking with her her son, a bey of five years, and the child Ada, who was at, her breast. She visited several public-houses, and bt came the worse for liquor. She had to cross the river I3ollin by means of a bridge, and there was a thick mist at the time. bhe was seen going in the direction of the bridge, staggering about, and taking the children along With her as well as she could. At seven o'clock on the following morning she went to the ;police-office along with her husband, and told a circumstantial story to the effect that as she got near the river SUE IT TWO ROUGU-LOOKING NEN, who asked her for money. She said she had none, and offered them her basket containing some previdiens. This they refused, and the bigger of the two men snatched her baby from her and she saw him drown it in the river before her eyes. The police immediately went to to the rioer and searched, but could nut find the body. While there the pri.oner, who was sitcompanied by her little boy, amplified her etor) by gi,iug several fresh details. The police sergeant turned to the little boy and asked, "" Where is your little sigter r and he replied, ""Mother dropped it Out of ;her arms into the river."" ""Oh, no,"" said the prisoner, ""that is not true; those men collared me; I did not do it myself."" The police then detaitted the prisoner, and the following day she told the polieeumui that the story visa Mut O;ersi • A PURE C01403""1.11 GI • t • - - and then she stated that on returning _bora, 'he got lost in the fields. She heard the water running, sad wen:; forward to find the river. Then she fell Into the river. She said she stuck to it as long as she eanid until she found that she war- going under, and she couli not get lier,hreath. She scrambled up to he",1880
+3406,1208,art0003,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,4,0.9825,PRICUS IS RESPECTFULLY SOLICITED. ,1880
+3406,1208,art0002,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1211,0.9426,"WJAR ONLII 1 1 -Tcb lA. D D 0 C 17. WONDKII FUL BOOTS. ;,S,TTTIGEI T REET p 1-1. cREWE. NONE CIIEIFIE33 I NONII BErry,l IV: E. MA_IVL WHOLESALE Gminr.AL FURNISHING AN 13TTILDI3R'S IRONMONGER, 19 & 21, lITGEt STREZT, C ' "" No one who is willing to Allow freedom of action in electeral matters will attempt to deny the Temperance party in Crewe the right to make its influence felt in the coming elections I for the County Councils: On the other hand, I the Temperance party' will do most justice to I itself, and the causeit espouses, by being practical. There is no doubt that in Or. Hodgson Ithey have one: who will fairly represent the ; views of those who believe in Local Option. His recent action on the Sunday Closing question proves his soundness' upon these matters. They have also an opportunity, if they choose, to run a candidate in the East Ward, but their complaint, as made at the meeting on Monday night, is that the Liberal party have decided to support Mr. Welch for the South Ward, No one can deny that Mr. Welch had a right to offer himself, and it is due to that gentleman to say, when candidates were first being talked of for the South Ward, he signified his willingness to waive any claims that he might have to contest the ward, in favour of Ald, McNeill. The latter gentleman, however, was at first unwilling; because of personal and family reasons, to become a candidate; and then Mr. Welch intimated that he should seek election. Alderman McNeill was afterwards persuaded to waive his objections to seek to represent the ward in the County' Council, and then commenced the complieations. Councillor Welch was determined upon going to a contest. It was then seen by the Liberals that if two such men sought election, a Conservative would be certain of being returned. 
And so it was that, after some negotiations, the friends of Councillor Welch and the friends of All McNeill agreed to decide, by lot, which should be the candidate of the party, The result was favourable to Mr. Welch. Can the Liberals be blamed, therefore, for abiding by such an arrangement? and ought any temperance men of the party who were privy to it, be charged with ""compromising their principles ?"" It seems to us that the Rev. W. Mellor was unfairly treated at the meeting on Monday night, and he. had a perfect right to indignantly repudiate the imputations cast upon him. If the Temperance patty run a candidate in opposition to Mr. Welch, they can gain nothing; for if they succeeded in defeating Mr. Welch—which is improbable—they would only do so at.the cost of returning the Conservative candidate, who would -lee sure to be opposed. to their principles. We thoroughly believe in what the Temperance party seek to obtain, but too often, by their impracticable methods, they injure the only political party which has proved itself willing to advocate their principle of Local Option. The Rev. Mr. Potts talked somewhat glibly about ""compromising principles,"" and yet did not hesitate to avow himself a Liberal Unionist, although he knows the Liberal Unionists to-day are keeping in power a Uovernment that has done its utmost, during the last session, to give the publicans a vested interest in licenses. This attempt was 'defeated by the party -of which Mr. :McLaren is a member, and the representative for. Crewe has strongly contended against the robbery of the ratepayers in favour of the great brewers and the owners of licenses. Yet Mr. Potts tells us that he has never acted with the VJ iberals since Mr. McLaren became a member. What is this but sompromising his temperance principles! 
It sees as though some teetotalers consider it their privilege to belong .to the recreant section that supports a Tory government, and legislation for the publicans, and yet to blame the Liberals if they do not drive from their ranks all who differ from the temperance party on the licensing question. Ile:utter absurdity of such a position is not worthy of men laying claim to practical judgement. It is a mere parody of principle, ridiculous in its assumption, and contemptible in its overbearing spirit. The ""Crewe Star,"" will support local option when the time arrives for the battle to be fought, but we .shall not hesitate to point out the folly of men who want to eat their cake, and to keep it. Liberals, while favouring teneperance reform, have a right to consid'erthat it is better to obtain the return one of their party upon the • County Council who will net with them on all other questions than local option, rather than see a Conservative returned who would oppose them in everything, The Liberel party. through its; temperance legislation,as proposed by Mr. Pierce, arrayed against them the major portion of the publican interest throughout the country, andelargely through this the party that was alleged by I iisraeli to have "" harrassed all interests was defeated in 1874, arid a hindering and niischevious Tory regime of six years followed. We have no doubt the temperance Liberals will see the common sense of the positien in the town, and act accordingly. They eerie., iuly will he hidispoeed to accept Mr. Potts as their guide after the candid admission that he has not worked with the Liberals since Mr. McLaten, who is a 'locall out ionist, was elected for Crewe ; for it ie clearly open to suspicion that some of those iwilo ialk about temperance principles being the Alpha .and Omega of politics do not act up to this. 
when' other convictions—such as the denying of the right of self government to Ireland—begin to operate, and do not hesitate to commit the inconsistency of severing thenaselves from the party that is represented in Crewe by a member of Parliament ia favour of their temperance views. —:o: The Brookside Habitation of the Primrose League does not seetn to be in a T,ery healthy. condition just now. Mutterings are being heatd about the annual ball, which it, is evident is not to come off.th's year. Last year it was indefinitely postponed on account of the.. illuess of the secretary, and we are .inclined to suspect that their present. secretary (have th-y one ?) is not so energetic as was Mrs. Kennedy. it would be a great' pity, from their point Of view, if the death of the secretary . should' also me n the death of the •Habitation ; but if their cbji.-et is the spread of Conservative principles, then we have no hesitation in saying that not all the chatter of Mr. Chntte6on will ever be abie to infuse new lire. into its members. Rumour says that it is the intention of Major Kennedy to leave Brookside. If this is true, the prospects of the league are no:, of thebrightest. • -:0: Air. Cleveland's message to Congress goei far beyond the question of Free ...In country where the -Capitilist cuts a bigger figure, p.rhaps, than anywhere else .in the",1880
+3406,1208,art0043,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2631,0.9261,"CHAPTER XL: ,r It log the first cold breath of feu for his own 'life that William Palmer had felt; and, notwithatand_ _ they bestride, for he leaned forward in a fag his iron nerve, and the conviction which had long thoughtful attitude, and only occasionally 'sustained him that he had so carefully guarded himself awakened to the fact that he was on the Kin to haveeleiettlweitchauwsheiefohrmar* readstoefverneed edeeetmecetdiont,ethhee King's 89 stern highway when it was necessary to give the animated, his resolute and direct actlon, shook hLs requisite turn of the rein to keep himself and feeling of security, and made the poisoner fear that his his horse in the right track. The grave deeds would be unravelled and exposed, until Nothingth the e mmnoirheupassed stopped taw: tehnethesedoor two t 1e l passengers ahe t expression of the rider's face was now and then lightened, as pleasant thoughts seemed to be 4anis ; and, whilst Sevens alighted to enter the Inn, flashing through his mind. Palmer went into his own house to get that cup of He had climbed the winding and ascendinc, coffee which he had told Mr. Stevens he required. old road, which at that time connected with and It ewn taerendo tt hl oen tlhoweverhet s ,h be er ft oar se thhee etrioms see d thea e rat; :der, 1 its sinuous course the scattered houses at Hanley his arrival, Mr. Stevens had left the Talbotwin ,searahl Green with the larger and older potting corn- of the solicitor, and Palmer learned from his friends munity of Burslem: The stranger looked in the hotel that he had made inquiries for a Rugeley towards the latter place as he reigned up his .21azymeerrnamhedhGadardeint ehro. hoped aTthois weras vedxestciousvnewts to horse at Cobridge, before directing it through father ;owe:ploy hyis friend, JerPrY sSuma all. clo 8 8 opLane, past the Cobridge schoolhouse, When Mr. 
Stevens returned from a fruitless search then recently erected. after the lawyer, Palmer went to see him in his sitting ""There is one of the first vineyards of the l'ioiom. hAftertia few trivial remarks had passed between Lord that it was my privilege to plant, ""it is a very unpleasant thing for me, Mr. Stevens, when years ago I commenced to carry the about these bills. What in the world shkal I do? It messages of my Master to the neglected and will bring ruin upon me if the £4,000, for which your stejlotnhiweaksirtersf ohntsitbelet,elalre edeemlyrdpedelfmroemr degraded people of my country. Brother mt that sin c e Stephens tells me in his letter that the wilder- I first saw you Ighavo heard re:ther a differe,nt account ness has become a garden. I well remember of r. Cook's affairs,"" said Stevens coldly. how these poor potters once mocked, yet Palmer knew more than ever from this remark, wondered, at the earnest deliverance of the story how keen WAS the investigation this prying old MOM of the Cross. Still, I never felt in the peril that wa""s imbakwini ; I hope it will be settled pleasantly, at I have passed through at the other end of this country. Truly the Lord's arms have been around me, and such manifestations of His providence and care over me show that my lifework is not completed. When I reach the hospitable roof of Brother Stevens' I will set what I have passed through within the last few days, that the servants of God may knew in future years how He watched over His people, and turned the ravening wolves into lambs."" Having thus soliloquized, the rider said, ""Go on, Trusty,"" at the same time gently touching and giving the reins ta the horse., ""We will reach our resting place before the darkness comes upon us."" The horseman now galloped briskly along, past the pottery of Mr. Warburton, at the bottom of what is now called Elder-road, through Hot-lane, or Hut-lane, and up into Burslem. 
The house he sought was at the top of the Star Croft, and overlooked the grey old church tower that for centuries had stood in the vale below. His arrival was expected. At the sound of Trusty's hoofs the front door, which was approached by four stone steps, was opened, and there was seen at the door an'old man, whose. appearance was very quaint indeed. He had a tarn coat buttoned from the chin to below the ' waist. A pair of large horn spectacles covered the bottom part of his forehead, and his thin grey locks struggled from under the skull cap that he wore. Behind him stood a young girl, who now advanced quickly down the steps; caught hold of Trusty's reins, and said in a musical and joyous voice, Oh, we are so glad to see you, sir. Grand father has been talking for months of your coming,"" and then she called out loudly, ""Tim, Tim,"" when from the backyard of the house there emerged one of the most curious specimens of the ostler tribe that eyes ever rested upon. In the meantime the visitor bad guided his horse towards the stone steps, and taking advantage of their assistance he dismounted at the open door. Tim led away the horse, and It was a picture to see those three standing at the open door in the grey of the approaching evening. Their forms were thrown into prominence by the bright fire that burned in the room. any rate."" •"" It will only be settled, sir, in the Court of Chancery,"" replied Stevens. ""Oh—indeed,"" remarked Palmer abruptly, in a lsne tone, and with a pause between the two words. ""Who could it be that had told Stevens of Cook"" affairs,"" he ruminated,during the uncomfortable silence which followed. And then he ventured the quastinn, ""What friends, Mr. Stevens, had Mr. Cook in. London ? "" ""Several,"" ""I mean, who did he usually stay with when he went there ? "" Mr. Stevens gave him no satisfactonry answer, foe he rightly divined that the drift of these questions was to ascertain who had told him of Cook's financial affairs. 
The next day, although it was Sunday, Palmer persistently intruded upon Mr. Stevens, and he advised him not to take a solicitor with him to Hednesfond. ""Why not ?"" came the abrupt query. ""I should rather you would not."" ""Sir, I shall use my own judgment upon that"", ""Of conase you will."" ""Of course I shall,"" said Stevens, curtly. After the evening service at the church, which lettls had attended, Dr. Palmer again went into Mrs Steven's sitting room. The old man was writing, and Palmer approached him, holding a piece of paper his 'hand, as if he wished to give it to him. I*. Stevens still went on with his writing, and although he saw what was Palmer's object, he took no notice) of the paper, but said, . ""Pray, Kr. Palmer, who is Mr. Smith, ofithis place ?"" "" Smith—Smith--Smith ?"" said Palmer, repeating the name several times, in a voice that seemed to indicate that he did not know that there was snob a. person in Rugeley. ""A Mr. Smith, who sat up with my son one Digllb, or slept in the same room."" ""Oh, yes, of course. He is a solicitor in the town."" ""I asked you the question, sir, because as my son's pocket book is missing I shall want so know who was with him during his illness. Is he a solicitor ix practice here?"" ""Yes, he is."" Mr. Stevens then went on with his writing, and, after a sligiit pause said, ""Pray, Mr. Palmer, did you attend my son in a medical capacity ? "" ""Oh dear, no ! "" ""I asked you that question, sir, because I am determined to have his body examined, and if you ilaad attended him medically, I suppose the gentlemen I shall call in would think it proper that you should be present."" Paleaser who sat behind this cold, quiet, and systematic stepfather of him who lay dead in the nanses room of the hotel, knitted his brow, and east a malicious look upon his accuser. ""Can you tell me who is to perform the examinetion 7"" ho asked, in a hard voice. ""1 cannot—l shall not know myself until the morning. 
But I thought it right to tell you of it. I shall have the examination made for my own tatisfactionv and whether you are present or not is a matter eal indifference to me, i the gentlemen employed think IA right you should e timid."" "" ..So it is to me William Palmer left the room, crossed the road and entered his own house. He at once sent one of his servants with a message to Mr. Newton, the assistant to Mr. Salt, surgeon, requesting him to ""call upon Dr. Palmer."" Newton, who happened to be within his apartments when Palmer's messenger arrived, soon responded to the call. He found Palmer in his Mohan, reading, by the firosidoi After inquiring as to the young doctor's health, and requesting him to ttOns glass of brandy, which Newton accepted, Palmer said, "" Oh, Newton, what would be a dose of stryohn?ne sufficient to hill a dog 7"" ""A grain l"" ""Good, and—would it be found in the stomach after death? What would be the appearance of the stomach ?"" ""There would be no inflammation, and I do not think that the strychnine would be found 1"" Palmer gazed into the fire. and muttered quietly to himself, ""It's all right,"" and snapped his fingers. The perpose for which he had sent for Newton had been served, and, as if to cover the object of thl interview, Palmer continued the conversation upon general matters. When Newton bad left, Palmer wrote a Uttar to Pratt, the money lender, once more cautioaing him not to reveal anything as to Cook's finaneial affairs. Palmer had not questioned Newton whether traces di strychnine could be foand after the death of an animal to which it had been administered, because he was unacquainted with the effects of V;at poison, insa• much as abundant evidence was afterwards adduced that he had made a special study of poisons in gesserai, and more especially of this deadly extract.. But Newton's assurance increased his confidence that, strong sa the suspicion might be against him, conviction would be impossible. 
Accordingly, the next morning—when Dr. Haslet:lJ. a physician of Stafford; Dr. Monkton, a liugley physician; and Mr. Devonshire, an assistant to the latter, with Dr. Bamford and Mr. Newton, were willel together, to conduet the examination—with all that calm assurance which, in a guilty man, springs from the belief that he is safe from detection—Palmer prepared to go to the meeting of doctors. Watching from a window in his house, he saw Dr. Harland as lie eanie from the station, and quickly joined him in the street. After the usual greeting, 110, said, The old man grasped with both his own the extended hand of the stranger, and as he bowed his head he said in trembling tones, "" Hea-ven comes with you, sir. I feel that my Father is good to let me grasp your hand once- more."" The other hand of the visitor was placed, as if in blessing, upon the head of the fair girl, whose eyes were bright with a welcome she did not utter. The deft hands and ready domestic ways of Hannah Stephens soon prepared the refreshing evening Meal for the weary traveller ! and as the tired look wore from his face -the beholder could see that it was the countenance of unusual pcwer, yet there was a mildness of expression that must win the love of any who looked on such a face. ""Your letter that I received this morning ! told me of the perils that you had gone through ; said old Mr. Stephens, ""While you had bon preaching in South Staffordshire. If not asking too much, sir, it would greatly interest me and my grand-daughter to bear the story. 
""Perils."" said the traveller, ""that have been •undergone in a good cause are not such sad remembrances that the heart would fain keep to itself, and as I believe that God has dalivered ,me from many dangers, and turned the hearts of angry men whose hands were upliftqd to strike me, I cannot but look upon what I have passed through during the last few days as an evidenoe God's mercy and love towards me."" ""Please do tell us all about it, sir,"" said Hannah. (To be continues.) EilNitiiiELl2',,t-:S C; r,:N ,i3U 1 V DER. It is announced that the Government has at last Pett!ed in a white, almost smokeless pov.-der for use in ani:dharms. The Irepsrtsece of this staLena«nt is evident, in view of tkle fact that until the powder had been decided npon it was inipc*Kible to ascertain actiirato:y the length of the ectrtridge, a: ii, consequently, the pronortiena of the convietg weapon. 'Veer. is, therefore, no longer any obstaclo to the manufacture of tho new mavazine rifles, the prochiction a which will ntake rspid progreva after the new year. It is said that the powder practically gives out a very small report, not much greater than that of an air-gnit. ""I am glad you have come to make a _post riertent examination. Someone might have been sent whom I did not know."" "" What is this case 1"" queried Dr. Harland. "" hear that there is a suspicion of poisoning."" ""Oh, no; I think not. He had an epileptic It on Tuesday night, and you will find an old discs.. in the heart and the head."" The two men proceeded to Dr. Bamford's, and while there Dr. Harland remarked that, although he had been summoned to the post mortent, he had .nnfertnnately come without his instruments. ""Oh. I will lend you mine with pleasure,"" said Palmer, gushingly. Dr. Bamford, however, offered the Poen of his ia• Etrumonts to the Stafford physician, aid this was accepted. In the course of the sonversation, Palmer rentatised, when the'subjteet of the alleged poisoning of Cock;firei, being discussed, . 
, A, queer old man seems to smrpeat me, I do :not know whet he s'acous, fey poor .Cook wOttP bp mo valuable to re anis than Asa, because hearpAtiNtelal rasnoueitillliofa 11130111111 akillami kis 416.41...... 'Obis",1880
+3406,1208,art0032,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,146,0.9512,"SANDBACH. USING OBSCENE LANGUAGE.—At the Petty Sessions on Wednesday George Major, labourer, Elworth, was summoned for being drunk and using obscene language at Elworth on the 17th of last month. P. C. Moss proved the case, and he was fined 5s. and costs. VACCINATION.—A paper with this title was read before the Wesleyan Improvement Society on Monday night by Mr. A. W. Lea. Mr. Bonas was in the chair, and there was a good attendance. Mr. Lea's paper was listened to very attentively. His arguments and statistics were very warmly contested by Mr. Stevens, of Elworth, who moved an amendment to Mr. Lea's motion, which was to the effect that ""vaccination was both beneficial and necessary."" Mr. Stevens' amendment was carried by ten to six. A vote of thanks was given to Mr. Lea on the motion of Mr. Stevens, seconded by Mr. P. Billyard.",1880
+3406,1208,art0011,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,586,0.9566,"ThJ NANTWICH, SANDBACH AND THE CIEEIVE STAR. SATURD Y, DECEMBER 8, 1888. Although the first Public Libraries' Act was passed in 1850, up to last year only 116 towns had availed themnlves of these useful institutions, Now Natitwich must be addt•d to the number of enlightened communities that have &cided to give wings to knowledge, and increased freedom of mind, In these days the phrase "" land-hunger "" has become familiar, for various causes it is difficult to satisfy on the part even of the inhabitants of our rural districts, but book-hunger,"" by means of Free Libraries, can be more easily satisfied. Considering how much good books cm do to brighten and elevate the bum-drum life of the people, by making them heirs to all the accumulated tkought of humanity's great minds, it is astonishing that more communities in this country have not established free libraries. A greater space than is measured by the years separates this generation limn previous generations as -to the extent cf education and knowledge possessed by the people, but far from sufficient facilities are given for continuing the education received in schools, and no better means of such continuation are afforded than public libraries; accessible without fee or irksome cpuditions of entrance to all classes of the people. There are already five free libraries in Cheshire, Nantwieh now makes the sixth, and in a short time another will be opened at Winsford. Ihe founding of Northwich Free Library was due to the .munificence of Mr. Brunner, and the same-gentleman has given a great impetus to themovement which has led to the e6tablishtnent of the Free Library at Nautwich. He was therefore fittingly selected t open that institution 011 Thursday. 
The speeches &livered were worthy of the occasion, and the public interest manifested shows that all classes in Nantwich are gratified that they now possess an institution which twill do much to help in the solid and intellectual growth of the town. It will be a centre of light and of sweetness, it will not only feed, but create, a taste for reading, and will aid materially in refining and elevating the people of the town. A very foolish letter, full of railing and rancour appeared in the issue of a local con temporary on Wednesday, from one signing himself a "" Liberal Ra,epayer."" In spite of the bitternes of its tone, it is an amusing epistle in which the wrier succeeds in proving that he deserves to be described by some of the epithets he so freely uses against the Nantwich Radicals who voted last wt!ek for Mr. Willett being their candidate for the County Council. According to this Liberal Ratepayer"" they are ""grotesque and irrational"" ""gnatlings"" and ""blockheads who have made the ""biggest blockhead their , idol."" Mr. Willett can afford to treat with contempt this splenetic and illmannered attack. We agree with the anonymous letter writer that ""the blockheads of the world are a large and powerful body "" ; atid there is no greater blockhead than the man who is ever rsaoly to sweepingly denounce and stigmatise others, because they act from crifferent mairos, or come to other conclusions than ; those that commend thernselves to his own , effort at judgment. He is one of the "".very day popes"" who !‘ thinks • himself infallible,""' and proceeds to ""curse, at large,"" . and: anathematise all who act and think independ— ently of himself.'",1880
+3406,1208,art0001,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,45,0.9162,", ..-__ MILL STREET, CREWE,. CRADDOCK'S STOCK OF BOOTS AND HO ES KINDLY FAVOUR US WITH A CALL. W. E. IVIAWLE, HAS A ,LARGE ASSOTaTMENT OF SPECIAL DESIGNS IN r:LEGTRO PLATED GOODS TABLE AND POCKET CUTI:2R LAMPS, COAL VASES, HEATING STOVES, COMPARISON OF QUALITY AND",1880
+3406,1208,art0015,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1017,0.9299,"IHIRE IGNORED BY THE LIBERAL COIIMITTDE- he did not say intentionally—and had he been there he should have moved the selection of a total abstainer—although, he sapposad it would have been difficult te have found such a man in Crewe. If they were consistent total abstainers, they should try to de Something to enforce their principles, and to vote accordingly. (Hear, hear.) The Chairman: It is not too late at all. There Is plenty of time for us to act effectively, and the arguments advanced by Mr. Mellor to sheer that we are too late do not hold wc.ter at all. There have been intimations given previous to the meeting of the Liberal excecutive that it was the duty of temperance people tn oppose the nomination of publicans, for at the Sunday Closing meting, I took the opportunity to make the announcement that we should look after our interests. ---------Mr. Mellor : Then practical steps should have been taken before to-night, The Chairman (continuing): I think the friends on the oxceautive who are responsible for this nominslion SHOULD NOT HAVE CO MPROATISED THEIR PRINCIPLES to the extent of the nomination of a pablican. Mr. Mello; (warmly) : I wish to firmly protest against that. I have not compromised my principles. You are compromising yours from the statement that you made in the Town Hall. If you talk about me compromising my principles, I object to it. The Chairman: I am not talking about Mr. Mellor personally : I speak of those temperance members ore the exceontive— Mr. Mellor: lam one of them. You had better be candid, and say it then. The chairman: Well, I wlil apologise. Mr. Mellor : You had better. Why, you are on the Liberal executive yourself. The cLairrnan : lam a nominal member of that committee, and I have ADHERED TO IT FROM OLD LI\DED.A.L SYMPATHIES. But I have not attended a single meeting sine:ells. 
McLaren became member for Crewe, and as an outcome of this reference, I will take care when the morroW CODISS to sever the nominal connection. I have no wish to sail under false colours _ _ _ Mr. Mellor : Da you withthaw the charge that I compromised my principles, because if you du not, thel I am obliged to make the same charge against yor. You are a member, as I am, and your staying awq adds to your guilt, if there be any guilt in it at all. The chairman : But I hardly see it in that light. 11,. Malor : But I do. The chairman ,I am not ashamed to say—l marts well say it—that lam a Liberal Unionist, and gat explains why I haven't been. Mr. Mellor : Will the meeting say whether it approves of the chairman's charge that I have comyoinised my principles, after the explanation I lave offered, and after I have shown, clearly and disthrtly, that nobody could have done more than I did, I think IT IS A MOST UNGENEROUS CHARGE, not to say unjust. If it be sanctioned by the meting, I will submit to it, and make an apology. The chairman : Well, I will withdraw it fromtbis meeting, as it seems to wound Mr. Mellor's feelngs. Still, I maintain that the temperance members e the executive compromised their principles. A resolution was then immediately passed, the tbo gentlemen present should form an election commttee, with power to add to their number. Mr. Mellor: I want to know where it leadsb. I am perfectly willing to help in this cause, but OS know what it means. The chairman (curtly) : It will load to whsever action the committee think fit. We can get no frther than that. • Mr. Aluffworth : I should like Mr. Mellor,ii he could, to be on that committee. _ Mr. Mellor: It is a most painful thing to mesa be stigmatised in such a way as to make it almost iipossible for Wle to join this committee. 
I have doe as mush work for the temperance party—and so ha my wtfe—siLee I came into this town as any minist:, and as mush as most laymen, and I THINK IT OITTRAOROUS 110 W to be charged wita having compromised w principles, and then to be asked to join a commiee—to compromise my principles again, I suppose. lannot, as a man of common honour, consent to serve n such a committee until I have the charge that hurl over my head coinpletsly withdrawn, I shall no expose in) oelf to another such charge on the samehseeless ground. If that charge be absolutely withdrvn, then I will act. Mr. Ainsworth : It was withdrawn. Mr. Mellor: The chairman said he withthe it from this meeting. What did that mean? -Mr. Wallace: There were other prominit temperance loaders present as well as yourself, aid be did not apply that observation to you. Th chairman knows perfectly well th..it there are Mr. Bley— Mr. Davies: I think that the chairman as. • RIGHT TO klls OWN PIaVATE ONION. I move that this meeting understands tlt Mr Potts has withdrawn that remark—(hear, hear and velem, He has done ""). Mr. Mellon: Then what was the rotuifig of the remark that he had withdrawn it from t) in -*sing The chairman: That is all that thf meeting has to do with it. Mr. Pedley, Dr. node, and other temperance men cannot go heart and rill with aa and still support Mr. Welch. Mr. Ainsworth: I don't think the chmals* intended be personal. Mr. Mellon: I was there, and I mid not suppose that he would make an exception in I Cade. Again, I pretest against it en behalf of °the, Mr..A.ins worth: I think that theieroPeranee did not think they were eompromiein their prinsiples, but at the same time, I wonder hoithey, as Uwe!, ance men, could have voted as they d. Mr. Potts then dismissed the leeting, wiilt the opinion that they had "" every. plOot ,Q 1 Acing gust work."" •",1880
+3406,1208,art0028,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,429,0.9548,"BVIiNIXO TECHNICAL CLAS62I9. If they were satisfied with their Free Library, and if they desired to go on still fufther in the good work that they had that day begun, he earnestly hoped that they woald spend,/ another penny of the rates to provide these free evening classes for instraetion in special braise:les of trade, in which mechanical skill was required, or iu drawing. (Cheers.) Mr. InfcLa.ren then pointed oat that it was their own Library, for which they were sot indebted to charity, and they should feel proud of it as being one of the best institutions it was possible te have in their midst. (Cheers.) He firmly believed that healthy, first-class novels were, in themr way, as desirable and as useful as science or any other class ef literature, and that it was necessary for all to have rcereatiox sof the mind. (Loud cheers.) He rejoiced that that town had had the energy, and the courage, and the self-sacrifice to devote a portion of its rates towards fouading an institation which woald do so much good to every inhabitant of the town, and that reflected so much credit upon the public spirit and energy, which had been put forward in bringing the scheme is such a satisfactory eonclusiea. (Lend cheers.) Very interesting addresses were also delivered by Mr. C. S. Roundel'. Mr. Chatterton—whose speech was a thoughtful and appropriate one—and Colonel Cotton, M.P, Mr. Chatterton specially spoke of the influence that reading had on the reader's life, in helping to mould his character. The lessons of life, too, were more acceptable through being enforced by reading rather than by bitter experience. His advice was to read all classes of literature, and not one exclusively. 
"" Whit "" he coild imagine some purist say,"" read the bad,the vicious,and even the prufame."" He answered, "" Read everything,"" '__ If they were reading anything really vicious, the small, still, voice of conscience would be sufficient to warn them ot the danger, and the antidote to the poison would bo forthcoming if their reading had been sufficiently comprehensive and extensive. Above all, however, ho advised them to stedy Shakespeare and the Bible. They should carefully and thoughtfully road; look up, or they would look down. Nothing in nature was stationary, and if they didn't aspire thoy would grovel. (Cheers.) The artistes who contributed to the programme were, Mrs. Shaw, Fraulein 'Tichy, Miss Carew, and the Misses Dutton, Master Bowyer, M. A. Withinshaw, and Mr. Head. The evening passed away pleasantly, and the concert was a groat success.",1880
+3406,1208,art0012,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,406,0.9603,"DECIDED TLIAT TUE MEETING WAS A PUBLIC ONE, and so our reporter, who was the only one present, remained. Mr. Siddell then proceeded to propose the following resolution :—"" That, in the opinion of this meeting, it is eminently desirable that the friends of temperance in the various constituencies should use their best exertions to secure the electi,,n, upon the comity couucils, of gentlemen who would employ their official influence for the discouragement of the liquor traffic, and for resisting the claims for compensation where public-house property is concerned."" Mr. Mellor asked if, in voting for that resolution one would be committed to any that were following, and the chairman said the resolutions were distinet. 1 he one before the meeting was then seconded aid carried. A resolution was then read which commenced, ""That, as there are a • SEVERAL GENTLEMEN WHO A.RA DIRECTLY CONNECTED WITH THE LIQUOR. TRAFFIC"" • who were to stand as candidates, that meeting pledged itself to resist their election. Mr. Davies asked if it was known that there were Bev 418.1 gentlemen who were likely to beeome such candidates He thought—and others present similarly expressed themselves—that they should be careful how they worded their resolution. (Hear, hear.) Mr. Mann stated that they were curtain ef one— Mr. Welch—and Mr. Lumb and Dr. Atkinson were rumoured to be likely candidates. The former, of oourse, was directly interested ía the liquor traffic and so was the latter, as being owner of public) hoist) property. Mr. Ainsworth said it was objectionable to speak of things that hadn't come to pass: they might get pulled up in the papers about it. It was well known to whom the resolution referred, and 60 far as THEY COULD AVOID PEASONILITISS, he wished them to do so. (Hear, hear.) Mr. 
Mann said that possibly they might even pre vent the certain one (Mr. Welch) from coming our, if he was met with active opposition from them. (Hear, hear.) Ultimately, after some farther discussion, the following resolution was proposed by Mr. Ainsworth, and seconded by Mr. Davies: ""That, in the event of candidates being selected who are directly or indirectly interested in the liquor traffic, it becomes the duty of all friends of temperance to oppose and defeat their election, and this meeting pledges itself to use every legitimate effort for the purpose."" Mr. Mellor said he agreed with the resolution in the abstract, but",1880
+3406,1208,art0047,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,31,0.9665,"NANTWICH LOCAL BOARD. The fortnightly meeting of this Board was held at the Town Hall, on Friday evening. Present—Mr. J. Walley (chairman), Messrs. Joseph Willett, W. Lea, J. Cope, P. Chesters. ",1880
+3406,1208,art0023,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,93,0.9305,"THE NANTWICH PERJURY CASE. All the witnesses for the prosecution in the above case, ten in number, were in readiness to be called, but the Grand Jury, after hearing Mr. Lisle and Mr. Herbert Gentry, found a true bill against the defendant, who will be required to surrender to his bail on Friday, (to-day) which day has been specially fixed for the trial. Mr. Marshall and Mr. Lloyd are the prosecuting counsel (instructed by Mr. Brooke as public prosecutor), and Mr. Swetenham has been retained for the defendant (instructed by Mr. Martin). ",1880
+3406,1208,art0008,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1,1.0,WANTED. ,1880
+3406,1208,art0007,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1280,0.946,"CHOICE CHRISTMAS FRUITS AND ITALIAN WAR E HOUSE GOODS The provision department includes Finest Branded Wiltshire Smoked Bacon, Hams, both Chops and Irish Rolled 'Bacon, Stilton, Gorgonzola, and Cheshire Cheese, also Finest Danish Kiel Butter. THOMAS GLOVER PROPRIETOR. SOLE AGENT FOE GILBEY'S WINES AND spims. SIS,pLE BOTTLES AT WUOLEA.LE PRICES. DISSOLUTION OF,PARTNERSHIP RICIIBIOND AND "" PAWNBROKERS, JEWELLERS, AND CLOTHI.ERS, 158, MILL STREET, CREWE. MR. LEE be to inform the public of Crewe an neighbourhood that he hus taken over the above Business, on his own account, and hopes, by strict attention to business and low'charges, to merit the same patrouage that has been given to the late firm. In order to cope with the greater demands for room Mr. Lee will conduct the business, in future, at 79, Mill street, and corner of Station-street, in the large shop formerly occupied by Mr. C. Ryder, and lately purchased by Richmond and Lee 5.1 r. Lee further desires to say that, as ho is wishful to entirely clear out the accumulated stock of Forfeited Pledges, he will do so r.t an enormous sacrifice. The stock consists of first-class gold and silver jewellery, gold and silver watches, diamond, wedding, and other rings, a large quantity of clothing, dresses dres , 3pieces, blankets, carpets, etc., itc. NO REASONABLE OFFER REFUSED. Note the Address : 79, MILL STREET. CREWE. THE LAUNDRESS'S TRUE FRIEND! ARROP'S INEN G LAZE • (LEGISTERED GIVES A BEAUTIFUL GLOSS TO STARCHED LINEN. . _ Mnslin curtains, ciresses,—Joilars, cuffs, and all fine linen fabrics retain their gloss, stiffness, and snowy -whiteness. much longer when a little of Harrop's Linen Glaze ic added to the starch. • Imparts a beautiful tinish to the fancy prints so generally worn, fixing and giving a greater brilliancy to the •celours. 
On trial will prove its great superiority over all other preparations. It prevents the iron sticking to the and will not injure the most delicate article. Extract from ""The European Mail,"" Nov. 2, Mg. ""As Auxtukay To STARCH—In bringing to the attention of our readers the Linen Glaze prepared aed sup. plied solely by Mr. W. H. Ramp, we do so in the belief that it is thoroughly serviceable oompolnd. Of its eilicacy in glazing, stiffening, and whitening all kinds of starched linen, muslin, &c., there can be no question."" Sold in Packets, id. 3d. 6d., and is. each, an large boxes for hotels, foundries, ct s-, ss. each. PREPARED ONLY BY W• H • HARROP CHEMIST, ORZWE. 11By Order of the Executrix of the late Mr. Noah johnson.l VALUABLE FREEHOLD DWELLING HOUSES, BEDFORD STREET, CREWE. MR. T. E. GIBSON will offer for sale by PUBLIC AUCTION at the ROBIN HOOD INN, Nantwich-road CREWE, on WEDNESDAY, the 12th day of December, 1888, at Seven o'clock in the evening, prompt, subject to conditions, TIIIIITY-ZIVE LOBBIED flousEs, singly or in lots to suit purchasers, as may be agreed upon at the time of sale. The above houses aro Ms. 10 to 50 (even numbers), and Nos. 19 to 45 (odd numbers) in Bedford-street, off Gresty-road, in the township of Shavington, but adjoining the Borough of Crewe. They are close to the Cattle Market, and within three minutes walk of the Railway Station and Steam Sheds. They are large, roomy houses, particularly well built, and are in good repair and well-tenanted. Each house is let at a gross, annual rent of £l3 17s 4d (5s 4d per week), with the exception of Nos. 23 and 39, which are each let at £l4 6s (5s 6d per week), and No. 33, which is let at £l4 14s 8d (5s 8d) per week. 
Bedford-street will shortly be cut through into the Nantwich-road, and so become an important thoroughfare, while an improved system of i drainage or this and adjoining property a being arranged, the cost of which will be met by a small annual rate over the whole of the township of Shaving. ton. _ _ - An exceptional opportunity is therefore offered either to the working man wishing to become his own landlord, or to any person desiring a lucrative investment for their money. A very large proportion of the purchase money may remain on mortgage. For further particulars and to view, apply to Mr. D. Johnson, No. 1, Wood-street, Crewe, the .Auctioneer, High-street, Crewe, or to MR. C. H. PEDLEY, Solicitor, Weltminster Buildings, Mill-street, Crewe. THE WEEKLY STAR, SATURDA)"", DECEMBER 8, 1888. A ""Liberal Ratepayer : says : ELECTION ADDRESS. PUBLIC NOTICES. -:0: ""Well may Edmund Burke exclaim, ""Woe to that country that considers a low education, a mean contracted view of thing., a sordid and mercenary cecupation as a preferabie title to rule."" and then he concludes his letter by declaring that, ""The elector who votes merely for political ends and disregards the question of fitness degrades his privilege, and proves himself unworthy of his right to vote—"" What business' had this vituperative Ratepayer to imply that Mr. Willett takes ""a mean contracted views of things,"" or that he is one who has had "" a low ,education,"" or that his occupation is more sordid and mercenary than that of the legal gentleman who lost the show of hands at the meeting last week 7 The men who have risen from the ranks by their in dustry, and force of character are in reality nature's aristocrats, who have proved their fitness to come to the top by the acts and deeds of their lives. Mr. 
Willett is one of these; and he has been educated in the school of practical life in a manner that will 'enable him to bring a shrewd judgement to bear upon the affairs of the county when he is elected on the council as the representative of Nantwich. Besides, is a self made. man more mercerfar than a member of the legal craft? We do not for a moment insivue,te that Mr. Willett's competitor is one of the lawyers who cares I over much for the loaves and fishes; but a Liberal ratepayer knows that some lawyers have betrayed mercenary motives"" in many of their public actions. There can scarcely be a more ""mercenary occupation"" than that of the law. The Cheshire council will have plenty Of landlords, certainly a sprinkling of lawyers, we trust some farmers, but not many of the selfmade men in touch with the •artizans of the towns; and this "" biggest blockhead"" as this ill-natured and discourteous letter writer calls Mr. Willett, will contribute an element of thought and a way of looking at questions which are necessary for the representation of all .sections of the community ; ind that not because- he is. a. mercenary fool, but because lie will Sympathise more with the workingmen than any 'lawyer that will be sent to the -Council. The Radicals of Nantwich have not disregarded the question of"" fitness,"" Neither in the choice of their candidate, nor because they have supported him on political grounds. "" Political ends "" mean, in the case- of -Mr. Willett, that he will be found voting for the cause of the poor against the rich,- he will not he afraid of displeasing the landlords who will, at- first, monopolise the seats on the 'council. He will be more independent on that council than any lawyer can afford to -be, and than most farmers will dare to be ; and therefore, in spite of this irate ratepayer; the - Nantwich Liberals have acted wiseley in selecting him to fight thqir battle and represent their views:",1880
+3406,1208,art0010,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1817,0.9469,"PASSING EVENTS. (From a London Correspondent.) Ii.WITETT ABOUT SITAKIH—MR. _ CHAMBERLAIN'S, HONEYMOON—THE BEZAKDOWN OF A Ca USG Giclabre—LpßD TA:NNYSON—AmIJSEMENT IN THE CommoNFILL-Mn. IRVING AS A BUSiNVES Mug— BIJEY CIVIL.SERVANTS—TAVO INTERESXING ENGAGEMEN'II3. • Titn position of the garrison at Suakim is most critical, and a conversation I had with someone who has recently returned from the town, shelved me beyond doubt that we shall be very fortunate if we escape a grave disaster. The forts are held by a mixed force of Egyptians and Soudanese, who have been proved on 134 many occasions infinitely inferior in fighting qualities to the Arabs; while the dervishes are cirad.ually gathering a large number of fanatics around the port.. The garrison has been fur sonic time cut off from the wells, and are dependent for fresh water on condensers, and if these fail they will have to force their way out of the town. Officers who have served in the former Egyptian campaigns tell me the outlook is vog ominous, and I quite bulieve them. Mr. Chamberlain is not coming hack Angland just yet. He and his bride have determined to tratel some time in Italy, and have selected tire ItiYiera its a suitable place in which to spend the latter part. of their honeymoon. The sunshine of Italy is certainly more in keeping with the romantic loves of this young couple than the dark and gloomy atmosphere of England. it! • The abrupt way in which the Walpole breach of promise case broke down is much discussed here, and there is a good deal of dissatisfaction because the matter was not decided oil its merits. Miss Wiedemann was very foolish to allow her temper to get the better of her in the way she did, as the questions she refused to answer were most pertinent to the case. 
But she is eviduntly an excitable woman, and a foreigner, and it is quite possible that :she did not realise the effect of her obstinacy. It Is very unfortunate, too, for Mr. Walpole that he had not an opportunity of telling his story. Facts are at present too strong against him for anyone to believe in his innocence, and nothing willalter people's opinion unless he can come forward and tell a story which throws 3 new light on the case. He is therefore practically condemned, and it may be wrongfuily, so that a new trial is quite as much to be desired by him as by the complainant. It is reported that Lord Tennyson has a new volume of poems ready. I hope the story is untrue. When the Laureate published ""Locksley Hall : Sixty Years Alter "" it was only too plainly apparent that he was no longer able to do himself justice. At the age of 79, a poet may well leave his reputation to the care of his country. I am told, however, that Lord Tennyson has a craving for composition which age cunnot quench. The House on occasions is very like an assembly of schoolboys. Mr. Sydney Buxton Presented a bill for the protection of sand grouse. On such occasions the proper etiquette is for the member to advance to the Bar, and there wait un.til the Speaker summons him ; he then should make three profound obeisances and hand in his bill. But Mr. Buxton forgot these little details, and walked straight up to the Speaker, bill in hand, like an importunate tradesman. Then a terrible uproar arose. With one accord the House began to "" boo "" and shout in a terrific manner. Mr. Buxton stood aghast for a moment. He evidently could not make out what enormity he had committed. The longer he stood the louder the "" boos"" became, until, at last, he could stand it no more, and scuttled back to his seat like a frightened rabbit amid roars of laughter. His friends explained his mistake, • and he again advanced, but this time in due) form. 
The House would not let him off, however, and at each obeisance a delighted. ""boo "" shook the roof. This is how our legislators amuse themselves. Mr. Irving, in his speech at Birmingham, made a point which appealed to the business then of which his audience was largely composed. Ife said that he supposed that the highest example of the combination of the business faculty with the greatest mental gifts was furnished by Shaksspere himself, who, ""if he lived now, would sbew that the poet's eye in fine frenzy rolling need not overlook the most judicious investments."" This was a very happy remark, considerinc, the 4locality in which it was delivered. considering people still labour under the delusion that men. of genius are Incapable of managing their own business afhirs, but there is no greater delusion. Mr. brim ii 'an inatanoo in noint.. lie is parlaakit the only actor who has any touch of real genius, and yet he has made his theatre a small gold mine. I believe the average annual takings at the Lyceum nearly double those of any °their theatre in London. The Merchandise Marks Act has thrown; u might Have been expected, an immense amount of extra labour on the Custom House authorities. Up to the 31st of March upwards of 5,200 papers were registered in reference to the Merchandise Marks Act alone. The larger proportion of these papers involve special reports from the stations and outposts at which the goods to:which they relate are landed. The result has been that the qieSf engaged upon registry, and the gontitemon to whom has been specially assigned the duty of dealing with this new branch of work, have been employed for several hours beyond the usual official day, and many of them have devoted a considerable portion of their time at home to mastering the provisions of the Act, and to making every effort to prevent the work from getting into arrear and the mercantile community from being unduly inconvenienced. 
• Some of these gentlemen, I understand, are urging that their salaries should be increased or their hours reduced to their nor ma] length. No time has been lost in announcing the engagement between Lady Idena Nevin and Ms. Brassey, the eldest son of Lord Brassey. They only ""plighted their troth,"" as Mr. Gt. P. P.. James would say, at the beginning of lest week, and the engagement already appears in the morning papers. Lady Idena is the daughter of Lord Abergavenny, one of the ""ancient nobility "" in whom Lord Beaconsfield so delighted. He is a red-hot Tory and Unionist. Lord Brassey is an equally strong Radical and Home Ruler, so that it is to be feared that the two fathers-in-law will not hit it off very well together. By tho way, another interesting engagement will ho shortly announced between Miss Arnold, daughter of Matthew Arnold, and Mr. Virodehou so, second son of Lord Kimberley. Miss Arnold isaclever.bright-looking girl, with dark hair and cyes, and a peculiarly sunny manner. She has a great look of her lato father. The fund for providing an annuity for Mrs. Arnold is now closed, and,:l hear, has reached the very considerable sum of £B,OOO. It is not one penny too much, but that so large an amount should have been contributed :thews the respect inwhidh Mr. Arnold was held. CREWE TEETOTALERS AND THE COUNTY COUNCIL THEY WANT A CANDIDATE OF THEIR OWN. THE LIBERALS ARE BLAMED FOR THEIR SELECTION.. On Monday evening last, a meeting was held in the Co-operative Hall, Crewe, ""to consider what steps should be taken by the Tetriperance Party in Crewe, in relation to candidates en the County ceunoil."" The meeting was convened by a circular signed by the Rev. A. W. Potts and Mr. D. Mann, and about twenty vspouded to the call. Mr. Potts presided, and amongst those present were the Revs. W. Mellor. S. Cooper, and W. Davies, and Mr. J. Ainsworth. The chairman opened the ineeti,g by explaining the duties of the county councils. 
Their work very largely had t do with publisaus' licenses, and so the meeting would see that it was a very importanl thing for the publicans to have their men on the council, and doubly important that they (the Temper‘nce party) should do their best to prevent them getting there. If they (mild net have candidates of thcir own, then by all means let them have osnkiidates who were nufv.voutra.de to the liquor trait°, and favourable to temperance legislation. They were perfectly on neutral ground—irrespective, in their action, of party politics. They had not been spoken to by any party, and therefore they hxd a right to take a stand of their owe. The speaker then referred to the CANDIDATURU OF MR. WELCH, and said that they should form a committee to do their best to prevent one of the strongest publieans in the town becoming a member of the County Council. (Cheers.) Mr. Mellor: Is this a public meeting? The chairman: Oh, yes.—Mr. Mellor: To be reported? The chiirinan : Yes, I see no ri.a.on against it It is a public meeting—at least it was intended to be one.— Mr. Mellor: 1 think it is rather unusual for a preliminary meeting to be reported, I think it is a vory ill-advised thing to have these proceedings reported until we know what is onr decision, and when we are unsettled as to our policy. If you think yo 4 will have the meeting reported, well and good. but I DON'T THINK IT IS wiss • until we know our policy.—The chairman: -We pro pose to take the opinion of the meeting by resoluLion as to our policy.-11.r. Ainsworth : I think it wuuld have be-n better to have allowed the meeting to have become a sort of convention„—The chairman: I don's think there is anything to conceal.—Mr. Mann : I think the mare we can ventilate the qiestion in the press, or in any way, the better it is. The people will not know about it unless we make it public.—Mr. 
Cooper : As we have NOTIIHIG TO LOOSE BY PUBLISHING Oft SMITING, I propose that we proceed with the besiness, and allow the proceedings to be reported.—Mr. Davies seconded this.—Mr. Mellor, in proposing an amendment, said that the meeting was decidedly a preliminary one, and it was not within his experience to have a reporter present at a preliminary meeting, before they had deliberated to the smallest extent. They lid not know what they were going to do, and they ran a considerable risk of exposing their own inefficiency and weakness.—The chairman: Mr. Mann and myself certainly desire that this meeting shall be carefully reported. Upon a vote being taken, it was",1880
+3406,1208,art0025,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2494,0.9612,"THE WEEKLY STAB, SATURDAY, DECEMBER 1 888. bank, and rested there, as she could not find her way. ""The reason,"" she continued, ""I did not tell the truth about it at first was because I was frightened, and I didn't want my husband to know. All the drink I had was four glasses of beer."" Dr. Luckmann, examining the body, found that it had died from drowning. A lady- from Bowdon asked that the prisoner might be leniently dealt with, stating that both the prisoner and her husband had taken the pledge since the affair. His Lordship said this was a most painful case. There was not the slightest doubt that the child's death was caused by her misconduct, and that misconduct was drunkonness. He hoped this terrible event would be a lesson to herduring the rest of her life. He should deal as leniently with her as he possibly could, because, now that she was sober, he could not help thinking it was almost sufficient punishment for a mother to reflest•that by her drunkenness she had sacrificed the life of her child. She must be imprisoned for six weeks. CARRYING OUT HIS THREAT. • Peter Goulding, 29, labourer, W4S indicted for having at Tarp ,rley, on the 30th October, feloniously set fire to a stack of straw the property of Martin Goulding. It appeared that on the day indicated prisoner had some words with his father, and he was. heard to utter the threat "" I will set fire' to you. to-night."" The same night a stack belonging to his father was dig./ covered to be on fire. Prisoner wag seen near the spot at the time, and he had matches in his pocket. For the defence it was argued that the prisoner was weak in intellect, and that since he came out .of the lunatic asylum some time ago his conduct had been very eccentric. The jury found him guilty, but recommended him to mercy on account of the weak state of his mind. 
His lordship agreeing with the verdict passed sentence of three months' imprisonment, and advised the prisoner's relatives to have his me nta condition ascertained. OPENING OF THE FREE LIBRARY, NANT WICH. On Thursday afternoon the Nantwich Free Library was opened by Mr. J. G. Brunner, MX., when there was a large attendance. The Rector presided, and after addresses from him, Mr. Lisle, and Mr. liarlock, who gave a financial statement, Mr, Brunner delivered an interesting address. Mr. Brnnner said that of late there had been quite an epidemic of libraries, and this had been partly brought about through the local and national patriotism evoked -by Her Majesty's jubilee. Ile fully believed that this had done good work, and they would see the good for themselves. They would take a pride in the good Work exhibited there, in Nantwich, and he very cordially and thankfully supported the decision which had been come to, to so wisely expend the moner amongst themselves. (Hear, thear.) But the neighbouring town of Winsford, he remarked, would only be a week behind hand, and it would soon be placed in the same position as Nantvrich. He had that day received a letter from a neighbouring clergyman asking for help for a free library, and for a recreation ground. He would like it to be known publicly—le thought that it was net widely known--that at the instance of Sir John Lubbock—than whom no greater friend;of education, and especially that in the form of free libraries, existed—an act !especially framed for the benefit of village libraries 'passed. It provided that there should be no need for the local authority to buy a building, but that they might rent one. Tha bill possessed other advantages which now made it easy for TIM SMALLEST VILLAGIM TO lIAVR A FREE LIBRARY of its own. (Cheers.) They would next year be hard at work again upon the extension and improvomout of the local Government bill for the country. 
His honourable and gallant friend—who was present to-day —the member for Wirrall,—hadi assisted him in his. efforts to improve the act which had been passed during the past session, and he looked forward with pleasure and with hope to having ibis agsistance in the year to come. (Hear, hear.) One of idle faults— though perhaps his honourable and gallant friend would not acknowledge it—was that the villages of the country were net placed in the same position of advantage as the towns. • (Hear, hear.) Yet he cordially hoped—his was no party matter, for in connection with the Local Government Board Bill they had worked together as if the party element in polities had disappeared—to have the help of both sides of the House in his effort to carry out his ardent and earnest wish that 'there should be in every village in the country one body to whom gifts could be made for tho village. (Cheers.) He believed that in a great many instances—and not a few in Cheshire— GIFTS WOULD HAVE .BEEN MADE TO COUNTRY TOWNS if there had been a properly constituted body to whom they could be given. (Hear, hear.) Those present lived in a country town, and it had ',seen the fashion among a good many writers to make fun of country life. Sidney Smith had given as his reason for not living in the country that "" there were not enough people in it to make a brilliant circle."" Well, he wanted to point out that they now had in their possession that which would most contribute to what was alleged te be lacking in country towns in Sir Sidney Smith's days. (Loud cheers.) Emerson, the American writer, and a, great lover of books, bad said"" Consider yois have in the smallest library a company of THE WISEST AND TUB WITTIEST OP lIEN bat could be picked out of all civilised tountries ix a thousand years, for this company have set to their best use the results of their learning and wisdom. 
The men themselves were hated, solitary, embarrassed and fenced by ettiquette,but the thoughts which theydid not uncover to their bosom friend are hero written in transparent words, to us, the strangers of another age."" (Loud cheers.) He had little time to spend with them, for he had a tugging at the heart strings to take him away at the earliest moment. 'Unluckily —although, that day, the news was good—he had a boy lying stricken with fever at Cambridge, but he was happy to say that the news that clay was very good. (Hear, hear.) They were there that day, in the words of one of the foremost men in Europe, Prince Bismarck, in the interests of tolerant and practical christianity. (Hear, hear.) He had great pleasure in declaring the library open—(loud choers)—and he joyfully dedicated it to the happiness of the children, to the progress of youth, to the health and prosperity of manhood and womanhood, and to the comfort and the consolation of old age. (Loud cheers.) Mr. Flarlock then handed over the deeds of the institution to Mr. Brunner, who, in his turn handed them to Mr. Walley, chairman of the Local Board. Mr. Walley accepting them in a very gracefally termed speech, said he felt sure they had lacked a place where their young people, especially those who had a desire to cultivate their minds and to attain knowledge, could pursue their praiseworthy inclinations. (Hear, hear.) At last he thought they had accomplished that, and they should be proud, for it was something f.r them to accomplish. But, most certainly, they would not have accomplished it—for some years to come, at any rate—if it had not been for THEIR KIND FRIRITD, MR. flaunt.= (Loud cheers.) It was his timely help, as Mr. Lisle had well expressed it, that bad set the ball, rolling, and caused the erection of that building. 
He sinserely trusted that now that It was erected, and that the shelves were so well supplied witk books—that those who were entitled to enjoy the newly-created advantages would use them. and that well. By using them well, he meant that they should use them diligently and thoroughly. He wished to earnestly advise all young persons to store their minds with useful knowledge, for in after years they would reap its crowning advantages. (Cheers.) They perhaps had not a very large library—about 2,000 velantes,—but it was a fairly representative one. He earnestly begged of them to use it. The sensa of sight. by looking upon a beautiful picture, wigi gratified; tho sense of tee**, by some luscious sweetmeat, was pleased; and the sense of smell was ministered to by the fragrance of a summer flower; bet gratification for the mind mast also be attended to. (Cheers.) He trusted that they would all be pleased with the means for this necessity, in tie shape of that Free Library which had been provided to them that day by the generosity of those friends whom he saw around him. (Loud cheers.) Mr. Harlock then presented Mr. Brassier with a prettily. bound volume of the history of Nantwich, by Mr. Hall, upon the back of which book there was an acknowledgment of the recipient's ""LIBERAL ITIELP IN TIM TIME OF NEED."" Mr. Harloek also proposed a vote of thanksgto Mr. Brunner, which was seconded by Mr.llensloy, who expressed :the) hope :that the modern representatives of aneimat families that abounded in Namtwich would follow the worthy example of Mr. Brunner to help on the Free Library by leaving it a very (handsome endevvment. (Cheers.) Mr. 13ranner said ihe very gratefully received the acceptable present, of which he would be very proud. (Hear, hear.) Ho wished to say a few wards in correction of his friend, Mr. Lisle. He trusted that they would not remain content with the income they now had, but that they would feel it to be a duty to help to add to that income. 
He trusted that the public spirit that :ley had hitherto exhibited would not desert them, but that subscriptions would be earnestly sought alter every year, in order that they might not only maintain that building, but add to it and its contents, and improve it. (Httar. hear.) He was very thankful to them for their kindness that day, and with !all his heart he wished them well. (Loud cheers.) Mr. H. Tollemache, M.P., moved a vote ef thanks to the Free Library Committee. He thought it mast have had many difficulties of a social, architectaral, and financial eharacteute overcome, and i the real* of their work reflected was so excellently rendered whilst those who were eager to hear the views of competent critics on matters literary were also satisfied. The Rector (The Rev. F. G. Blackburne) presided, and his official capacity was no sinecure, for during the evening he had several times to exert his authority to prevent a noise from the rear of the hall, from those who were unable to get in—for the building was packed. As a "" dernier rossort,"" an overflow meeting was held downstairs, addressed by Mr. C. S. Roundell. but that gentleman, later in the evening, announced it to have been a dismal failure. The chairman, during the interval in the musical portion allotted for speaking stated that the actual receipts for the free library amounted to £1,085, promises £135, making a total paid and promised £1,120. Their liabilities came to £1,120 leaving A BALANCE OF ;MO to be spent in tie purchase of first-class books. Mr. McLaren, who was received with loud and prolonged applause, then rose to address the meeting. He commenced by hrowing out the excellent suggestion contained in Vie words ""If at any time the free library shoald be in want of funds it is pretty clear that it only needs to organise a concert such as this, and charge a by sum for admission, when it will raise any amount of norley that the wants of this town can demand.' 
Air, MeLam proceeded to say that whilst such as Mr. Brunner deserved great credit and thank, it was not those who gave the money, but those who did t boo work deserved tho greatest credit and praise--(hear, hear)--and he then proceeded to speak in high terms of THE EN/MGT OP THE RECTOR AND Mt. HARLOCR. (Cheers.) Mr. McLaren next congratulated the committee upon the wisdom of their choice in the matter °lithe librarian. Miss Jackson, who, was admirably qnalified for the work, and who would do her work both with love of the work, and with a desire to make the library in every way a snecess.--(cheers)--and he believed the committee had taken a step which subsequent events would justify. (cheers.) Nantwich was one of the smallest towns In England to have adopted the Free Libraries Act, and this was a matter of the highest credit to them, for small country towns were placed at a gteat disadvantage in the small amount which the penny rate yielded, and had it not been kir the generosity of frienas,it would have been manifestly impossible to have brought affairs to anything like their advanced state. Because of the disadvantage of which he had spoken, the committee had wise ly decided to BEND LARGILY UPON ANNUAL SUBSCRIPTIONS, and he was mistaken, taking that meetinf.(as the basis of hii opinion, if Nantwich people did not rally round the Frae Library, and contribute very largely to its fans. Mr. MeLaren went on to say that there had been a marked increase during the last two years in tie promotion of :Free Libraries, but still they were fewer than they could wish, aud one reason of thie was the large developement of Mechanics' Institutions throughoet the country, for wherever there was ' A GOOD IMIIIANICS' INSTITUTION, AS IN CREWE, a Free Library was there felt to be unnecessa,ry, far, by paying a small subscription, a member could obtain the use of the admirable libraries which they collected. 
Tkey, however, who had adopted the Free Libraries Act were in a much better position than towns whore Mechanics' Institutions alone existed, for, whilst the anneal sebscriptions to the latter institutions could not be less than 61. or Bs. A year, the payment of a penny rate, anxually,to sixpence, eightpence, or a shilling, could enjoy the very same unrestricted advantages. (Lend cheers.) To his mind, the town thatladopted the Free Libraries Act showedithat itlwas determined to spend its money in the best possible way, to yield the best and most lasting results. (Cheers.) During the past Session the Government had brought in a bill, whisk, gunfortunately, time prevented from beieg tarried, but he believed that it would become law next ynar, for allowing any town which had adopted the Free Librarie's Act to spend another penny on the rates ix support of",1880
+3406,1208,art0033,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,743,0.9414,"CREWE COUNTY POLICE COURT. MONDAY. Before Mr. T. L. Boote, Mr. C, S. Roundel', Major Kearsley, and Mr. G. E. Wiekstead. rUll MOULD-AGAINST TIII3 PUICIS Floury Davenport was ""kicking up a shinty,"" as the vulgar say, in the Black Horse Inn, 11-spital-street, Nentwich, on Saturday n:ght, when P. C. Watkinson catne ii. Davenport desisted in kicking up a shinty, and, instead began to kick shins,—which belonged to P. C. Watkinson. Of course, Davenport was drunk, and R. C. Watkinson took him a walk down Welshrow to the police station. Davenport had to pay 2s. 6d. for"" overestimating his capacity, aui indulging in unseemly behaviour,""—which is, we believe, the polite way of speaking of the degree of D. D.—, and, for his amusement in kicking the poor P C.'s shins, he had to pay 10s. and c sts, or, if he could not conjure up sufficient filthy lucre, 21 days holiday, all expenses paid, was the alternative. PLAYING BATTLEDORE AND SIITITTLECOCX WITH THE KITCHEN FENDEIt. Richard Hassan, an old man, of Wood-street, Naramidi, is not, we hope, in the habit of miscellaneously propelling fenders at ' folk, especially when they happen to be noel-fenders, as was the case when he threw such an article of kitchen furniture at his sister, Ann aßsford, an elderly person. On Friday last Richard returned home, sad to relate, drunk, and in the words of his sister, ""After I had prepared his sapper he threw the fender at me."" It certainly would have been less remarkable if he had done this before he had had his evening meal, and if he had been told that there was no ckance of him having any. But, to say grace after meal in such a manner is truly, cad happily, rare. His sister generously said that if he had not been drank she believad ho would not 'nave done such an outrageous thing, as it wtts the first time he had assaulted her. 
We hope it will be the Alpha cud Omega of his drunken freak, aud if anything should contri:.ute to this, the 10s. and costs, or the ""default ""of 14 days' hard labour ought to, 7011. BEING ON ENCLOSED PEEMISES, TILLY 4E3 SENT TO ENCLOSED PILEMISES. James Davenport has the bump of caution well developed. George Cornea hasn't. These two worthies were, late on Saturday night—about a quarter past the midnight hour—on a vary suspicions expedition. They were making a survey of Mr. Schofield's premises, at the back of Hospital-street. Then they opened the door-way of the building, and entered. ""Is anyone there ? "" Davenport asked Cornea. ,"" No,"" replied Comes, with more readiness than truth. Davenport didn't seem to be satisfied and so said, "" Let es be sure before we start; strike is match!"" Before. however, Cornea had time to strike a match, they were both struck dumb, by seeing the figures of I'. C.'s Edwards and Jackson,—adorned in the daily habiliments of the peaceful civilian, which means they were plain clothes, bobbies protent.—ernerge from the Cimmerian gloom that had hid their presence. P. C. Edwards took charge of Davenport, and Cornea was affectionately embraced by P. C. Jackson. In the possession of the men were found bags and sticks. Then the trimitphal march to the police station took placa. We may say, to enlighten our readers, that as several ""shrill clarions of theinorn ""—which means socks,—and likewise hens had been missing from Mr. S.-liofield's premises of late, the two P.C.'s had undertaken tho ruse- which was so successfully carried out. 13.ecioguislug the saying, as being a true one, that any excuse is better than mine, ingenious Mr. Davenport ple.nded teat some children had told them that some ono was making a noise in the barn, and they had coins to see what it was I "" Clever, very clever, but not quite clever enough,"" thought the magistra,tes. and se, during their six weeks' incarceration. Messrs. 
Davenport and Cornea will have time to reflect, both that ""being on enclosed premises for an unlawful purpose"" is dealt with lawfully, though to them not agreeably, and Daeauport will still further see that though cautious, he was not citations enough; or he would have begged to have been excused that evening altogether, and, moreover, Ise will perceive that the beaks were not quit. of,sueli green plainage as to be faseinated by such a plea as o—tt which the abseeco cf 'belter one caused him to offer.",1880
+3406,1208,art0030,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,4,0.9775,LOCAL AND DISTRICT NEWS. ,1880
+3406,1208,art0017,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2,1.0,AGRICULTURAL NOTES. ,1880
+3406,1208,art0009,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,5,0.946,"TO LET. FOR SALE, MONEY. ",1880
+3406,1208,art0048,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,472,0.983,"TIIE CIIEESE FAIRS, Mr. Gentry (inspector) said he had been in conversation with the market lessee, Mr. E. Peoley, and they thought it would be better to have the prize cheese in the Market Hall, rather than run the risk of exposing it to frost and rain in a tent outside. Mr. Chestere was of the same opinion. They had measured off a portion of the Market Hall, at the south end, which woald give 60 square yards more space than the potatoe market, in which the prize cheese had hitherto ' been exhibited. This could be separated from the rest .of the Market Hall by means of canvas aft. high stretched across the pillars, and the remainder of the , space would be available for the ordinary show of cheese, as well as the fish market and the potatoe market if required. Mr. Pooley said he did not think they could dispense with a tent for the ordinary cheese, as in addition to the prize cheese space would be wanted for the company who wished to attend the distribution of prizes. The matter having been discussed, it was decided to adhere to the Original resolution with regard to hiring a tent, and Mr. Pooley and Mr. Gentry were authorised to consult with Mr. Willis, secretary to the Cheshire Farmers' Association, and place the whole of the Market Hall at their service if required for the exhibition of the prize cheese and the distribution of prizes. A letter was read from Mr. George Willis, stating that he had laid the letter before the council of the association, as to what was considered requisite for promoting and carrying on efficiently the cheese fairs at Nantwich. He was authorised by the council to say that the nature and extent .of such accommodation as was most desirable was cry clearly stated by himself and a few of the members of the association when they met a deputation from the Local Board a few months ago. 
Less than shedding, lofty and well lighted, to cover the whole area behind the market they considered would not meet the requirements of the case, and the idea of putting planks across the ' butchers' stalls appeared to the council a very undignified suggestion to be put forth by the Local Board. They trusted that the Board would favourably con• cider the matter, and provide ample and respectalele accommodation for the Christmas Show, and monthly fairs for cheese, and in return they would be pleased to eontinue the fairs at Namtwich rather than move them elsewhere, as they Might be compelled to do. The Board instructed the clerk to inform Mr. Willis , that the Board would do what they c,,uldfto meet the wishes of the association before the next spring cheese",1880
+3406,1208,art0042,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,14,0.9464,A RACE TO RUIN I THE STORY OF WILLIAM PALMER. BY W. 0. COLLINSON.,1880
+3406,1208,art0013,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,281,0.936,"QS OBJECIIND TO IT because, in the first plaee, it eame too late, and next, becaase it would prove itself to be utterly ineffective for any good purpose, and if passed, woald simply end ie displaying the inefficiency of the temperance party in the town. He said he opposed it because it was too lute: if anybody had had any idea of making the temperance question the pivot upon which the struggle of the eleetione turned, he should have begun his work, at the least, six weekcearlier. The weakness ofathe temperance party—or a those who oemetimes took the lead—lay in the fact that they were too erratic and too uncertain, and so 17TTEELY WANTING IN SYSTEM. He spoke from knowledge of both sides of the question, and he intended to be perfectly straight in the matter (Hear, hear.) There was the candidature of Mr. Welch: that was at the root of the whole matter. Several temperance men were on the committee that _ _ hod to do with the choice of Mr. Welch, from among others. (. Shame,"" from the chairman) He believed that the chairman of that meeting was a member of the exceeutive, and he was sure that two or three other ministers were on the excecutive, and there was A CONSIDERABLE *UMBER OF TEMPERANCE MEN upon it. Mr. Mellor then explained how the selection of Mr. Welch, as Liberal candidate for the South Ward, came about. No one present at the executive meeting held the opirion that the county councils were to be elected solely on the drink question. If the. temperance party had met sooner, and had raised its voice to say that it was",1880
+3406,1208,art0029,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,687,0.9338,"AMATEUR DRAMATIC PERFORMANCES AT CREWE. On Tuesday and Wednesday evenings, two representations of Dion Boucicault's powerful though somewhat faded Irish drama,- ""The Shaughraun,"" was kindly given by the members of the Crewe Ama,teur Dramatic Soeiety. in the Crewe Town Hall. in aid of the widows and orphans of Court Linton Dale, No. 1556, Ancient Order of Foresters. The Society on this occasion was assisted by professional ladies. In consequence of the pressure upon the public purse in Crewe at the present time by entertainments, the assembly on each evening was not so large as on former occasions, and knowing the undertaking to be an expensive one,we were sorry to find so good and charitable an object did not receive better patronage. Mr. Councillor T. Latham played his old part, Captain Mendota, which suits him admirably, in his usual quiet and gentlemanly maener. At first a little nervous, and uncertain in action and speech, be became bolder as the play proceeded and acted the part with natural easiness. He was quite sucoeeful in the love scene with Claire Ffolliott. Mr G. Norton, who was entrusted with the thankless part ef Robert Ffolliott, was not a success, but allowance mast be made because of this being his debut in a. Principal character. His chief faults were timidity and indistinct utterance, through talking too fast. Mr. Norton evidently forgot that his audience was. wtshful to understand what he said. The character of Father Dolan was very creditably sustained by Mr.' H. Lawton. Although the conception of the character , was not completely that of the typical Irish priest, Mr. ' Lawton proved himself a skilful actor. He is undoubtedly an acquisition to the society. It was evident Mr. T, H. 
Badger had attempted too groat a task in the , heavy part of Cory Kinchela, the aquireen ; his delineation °tithe character was too ejaculatory and choppy His confrere, Harvey Miff (Mr. W. Ellis) was more successful and zealously worked up the character. Conn, the hero of the piece, was again undertaken by Mr. J. Astley, who revels in Irish plays, and we consider that upon this occasion he excelled any previous performance of the part. •He kept the audience alive throughout the piece. Sergeant Jones proved the means of Mr. J. Lawton making a favourable debut, and the: supernemary parts were well sustained. The peddaquier is to be congratulated upon the success attained by the make•ups, which were good. Of the ladies we compliment the Society for having at last been enabled to allot a leading part to a local fainateur lady, and Miss Ireland deserves praise for her impersonation of Arta O'Neill, which was a capital performance. Miss Marie °liming as Claire El olliott and Miss Louibe Scott as Moya, sustained their respective .parts in an artistic fashion and quite equal to their profession. Miss Lizzie Cooke is deserving of high commendation for her extraordinary, bniliant, and realistic rendering of Mrs. &Kelly and Bridget Madigau. Some portion of the scenery has been newly painted expressly for this production by Mr. Phil Lovell, of the Crewe Lyeemet Theatre Company, which were very skilfull. Mr. A Besaman's baud occupied the orchestra and discoursed some lively Iri-12 music in a satisfactory manner. The stage managers were Messrs. J.. Astley , and W. Allis ; the meeban cs.l arrangements were produced by Mr. H Maybnry, and did that gentleman infinite eredit. The old tower revolving scene was very one. The dresses an.i costumes were supplied by Messrs. Carr & Co., and the wigs, etc., by Messrs. Fyans, Liverpool. 
The pertermance upon the whole mat be adnsitted a success, the various parts being I erediaably sustains& And ,tho audience manifested iti apprediatien by freq-uon't heirty applause'. S. listed dimes vas hold after .thef pirfetaatice On Wed: ' sudsy, which was fairly patrontzecl. Mr. R.J. Mell4r, tite.Seeie.tary, nf .the Foresters. cfrteisted as M.C. We gloocitetyboo- Omit a good soul sem here been reinsad 1,, se. Sonsilten.deble an e;geett. . .",1880
+3406,1208,art0045,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,21,0.6995,"M A P.lt TAG!! . On tl/. sth Inst. at Cl.ricat crevve, William I:llsten td Annie Lowndes ; lwils of Crewe. ",1880
+3406,1208,art0050,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,201,0.8033,"HIS DIED EXIIAUFTED On Friday. Witness was net in a position to say whether the deceased would have had abstraction of the bowels if his arra had not been anaputatod. He would not pass any opinion as ~to whether it WAS ridla t be amputate the arra or wit, as tho operation was performed at the iusrmary. The deceased's arm had been going on fairly well, but he was a man of footsie health. He had met with a previous aceident, and he was .then attended at Manchester Royal Infirmary. He eonsidared Price had died from exhaustion in sonsqql;tenacji of tj,et loss a the limb. The jury returned a verdiet in accordance with the medical icitimoily that the deceased had died bora exhaustion in soneeinenee of the amputation otis NEURALGIA AND TOOTHACHE are speedily cured by READE'S NERVE PILLS, KAI'S EXTRACT OF LIX:EgD, FOR COUGHS, COLDS, &a. (Prom the original Recipe ei J. nun READrs LAVENDER la-rA11414 A lasting terfsinte. 247111 T MBDIDINBS AT WMOIAWLIP PRICI3S. J. E. READ (LLTII KAT), 421 Ithaa atIRBST, MYRA rzelatefrii4 -puhliabett by, WILLI:k3i Ow, at the Aim 61, Nantwieh-road, Crewe, hi the count/ of .Iki.l-•",1880
+3406,1208,art0035,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1611,0.8815,"CHURCH COPPENUALL DIVISION. An influential meeting was held in the Co•operative Hall, Church Copp-nlia 1, on Saturday, for the purpose of selecting canAidate to represent the Church Coppenhall Division on the entinty council. Mr. R. Mellor was voted to the chair, and among those present werc—Messrs B. Pedley, T. Smith, Walford, C. Jones, Joseph Owen, G. Jackson and several others. The chairman having explained the object of the meeting, laid stress upon the importance of having a gentleman as • their repmsentative who thoroughly understood the regoirem :Fats of the division in matters pertaining to county management. Several other gentienien having addressed the meeting, a resolution syts unanimously passed, on the proposition of Mr. Pedley, seconded by Mr. Joseph Jackson, adopting Mr. 1.% 11. Davies, of Eardswick Hall, as the candidato for the division. Mr. Davies, haviag been :prised of the resolution, expressed his willi :.t.:)e.4,-; to stan:l a, a candidate, provided the other po of ti;e division extended to him an equally cordial invi • atioa. Later in the aftarn 71a meeting of ratepayer + from the other part of Ow • was held in the 'Town Hall. A deputation si so present from the Coppenhall meeting. Thf.. ,;.I.'j..!t. of having a representative chosen from'amen,2- .••, agricultural class was discussed, and found gv lerai acceptance. The Coppenhall delegates (Mcssrs Mikr and Pedley) having reported the choice o te ratepayers at their meeting, it was proposed by Mr. Jones. of Woolsonwood, and seconded by Mr. tforliett, ,:f lenhnll, that Mr. Daviea, be invited to stand —carried. FACTS AN i) FANCIES. ""Do you evE,r nodding towa4 C:•• repli.-4 the »i10t.i.;.;, • ~: _ . 
dOxiCal 3.8 it mar bay no."" And ; lit I!,:ver deep into the dark .03 thr,-8 filrerS tt..; ca v ernous swallowing A clergyman being after his appointine: asked how 4ong when answered, "" ed to, in Jess than a year ,) [flit a stove in the church, ssor had been there, and years,"" he said--"" Well, he church during his titnel"" ""but we had fire you novel- had afire No sir,"" replied tlic in thi3 pufpit then !"" At the time of the late H. J'. Byron's disastrous manarrement of three theatres in Livcrpool, an _intimate London friend, who met him suddenly in the street, was much struck with his anxious look and altered appearance, and asked sympathetically, ""What's the matter, old fellow—liver ? "" ""Yea,"" said Byron, languidly, ""Liverpool."" Youth: ""I've got some poetry here I'd like if D have you look over."" Editor: ""Yes, sir. Have you got your licence with you?"" Youth: ""My licence?-"" Editor: ""Yes • your poetio licence."" Youth: ""N -n-no. I didn't know I had to have one."" Editor: ""We wver look over poetry without first seeing the poet's licence. Good-day."" ""Why can't they make these dummies more like? "" said a facetious fellow, halting with a friend in front of a clothing shop, and slapping a figure a vigorous blow on the cheek. The ""dummy "" (whp was hired by the day) turned suddenly, let fly his left, and the facetious man went down On the pavement as though struck by lightning. ""Well, QuisbY,"" said the funny man, ""now thatt you are married, I suppose you no longer have to sew on buttons as in days of yore."" Then a sad light shone in Quisby's eyes as he slowly replied: ""Young man, it is cruel of you to twit me thus. Now, I not only have to do my own inendinF, but my wifys too. She is a poet and is above such things."" ""Yes, gentleman,"" said the colonel, as he returned his glass to the counter, ""the true soldier is nevqr averse to discipline. 
No matter how objectionanle orders from a superior officer may be, they must te ,obeyed promptly and without question. The ,trqe soldier never—"" ""Pa,"" said the Colonel's little boy, opening the door, ""Ma says you're to Wine home right stray."" ""Gentlemen,"" said the Colonel, ""good clwy."" Travelled Lady (at hotel table) : ""I suppose you have sever been abroad."" Country Merchant: ""No, ma am."" ""I go over nearly every summer, but I can't become accustomed to the motion of the vessel. I get seasick every time; always; never fails; it is positively awful. Oh, I can't describe the feeling !you have never had it, of course."" ""NI!, ma'am, cent when hearing returned tourists talk about it at the table."" A man went to a neighbour.a Seotehman, and asked hi In to endorse a: n, murder to rai ot money :it a bank. Tue neighbour refused, saying, ""If I w4s to pit nay name tillt, ye wad get the siller fra the and when the time cam rodu' ye wadna be ready, and I wad hue to pay, sae ten y:in and I would quarrel; sae we Illatlll jnt as weel quarrel the one as 141103 the siller's in ivy ooueh."" Brown, sitting next to Jones at dinner. remarked that the spoons were solid silver, and worth 30,. each. Jones': "" Why did you out tell me soomi!.. I would have had one."" Brown : ""1 imve one in my boot."" Jonies : "" I can't take ono now, they are all :ooking."" After dinner Joie-, is aNked to hut excuses hive. self, and (der do a coniliring -trick instead, He tak,ii a it down the back uf his neck, wriggles hitnsi•,;. declare,s ;hat be will find it in Blown's bout, which rtcv.ptacle he pulls the A public meeting at Avignon, in which Naquet, St. Martin, and Laguerre spoke in favour of General Boulanger, was held in the largu courtyard of an inn. While Senator Nhquet—who is rather deaf—was bolding forth, a hen was about to lity an egg, and began to cackle vigorously by way of announcing the fact. 
The indignant orator, believing the noise proceeded from one of his heal ers,exelaimed : "" Those who interrupt me in such a fashion are the worst enemies of the republic !"" The statement was received with tremendous applause aud loud laugh4r. A coolness, growing out of the following conver4ation, has sprung up between Jones and Smith.: ""I had a splendid time last night,"" &aid Jones. ""I spent the evening at a little social gathering at the Goodman mansion."" "".Are the Go,,dnulAs nice people ?"" queried Smith. "" Well, I should sly so. They are very aristocratic. To g..t into their circle one must have either a gr,...at deal of money or a great deal of genius."" ""You don'b tell me so! And you say you were there? "" ""Yes."" ""You were invited, were you?"" ""Of courew"" ""And to be invited a man has to have plenty of moiety or a great deal of genius ? "" ""Precisely. "" Well, Jones, I am very glad to hear you have become rich 'all of a sudden. Lend me five pounds."" During one of the French wars, a general was lead, ing his regiment through a steep and narrow pals. He did his best to encomuge his inen under the danger, fatigue, and hunger, which sorely tried their courage and endurance. ""It is very easy Cr him to talk,"" growled one of the soldiers to those near him, ""he is mounted on a fine horse, while we must walk."" The officer caught the wan's remark, and immediately proposed that the grumbler s.laalld exchange places with him. The soldier immediately agreed; but he had scarcely mounted when a ball from the enemy posted on a neighbouring height struck and killed him. ""Yon see, my men,"" said the officer, as he again mounted, ""the most elevated place is sometimes the most idangerous."" At an afternoon performance of Bizet's opera, ""Carmen,"" at the Crystal Palace some time a4o, a very amusing incident occnrrad. There was considerable delay before the. curtain rose on the first act. The audience grew impetient. 
At last one of the actors appeared, and clannod their kind indulgence for a short time, as the gentleman who had to open the scene had not arrived. ""But,"" said Oa actor, ""I hold in my hand a telegram from him, which, with your kind permission, I will read to ygn. 'Stuck in a twin( 1. Impossible for me to get to thil Palace before twelve ruinut.'s past three.'"" Ttie audience roared for several minutes. One gentleman shoute4 out, at the top of his voice, ""Can you tell me, sir, where that favoured tunnel is situated?"" A volunteer cavalry reKinieztt was being r?Ab through the drill sxercise.s bt fore the inspectirer otHcer. When the lnig?e iff:I.VP the signal to stop, noticed that the line bully kept., ins/1y of the horses going two or thrLe yards beyond it. Thie inspector found loth with ti.-ns, hut the excuse was wade that the horses, not being so well trained &a army horses, coe:d not he sturp,l e,t proinnt;Y• Tkie commanding officer deti-rint,u-6 to find out whether horses or men h!anie,ond when a cavalry troop of regeiart, cuil 0 into the neigi,litturli._iod. be ordered the with their horst s. For a while ever.t thing , spietidy. NI; hen the ruen -were guin rniicv trot, sitc,.t- lily a halt wait sounded The ;•?,""' h. tognals and stoppe-ri wei,t h ug s•:e ike sacks of A botch i. I, .01 ::, i lall,l %• the •ove 1.•. 1.•, i,•!",1880
+3406,1208,art0022,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,135,0.8836,"THE OILED JUR] TO BITTEN A TRUE Dux in order that the matter might be further inquired into. He was sorry to tell the Graud Jury that there was no less than 16 eases for offences against women. Since the passing of the Criminal Law Imsailment Act in 1885 these prosecutions had increased in somber. 'lbis Act made a most important change in the law; it altered the age at which a you/1r person could consent to acts of immorality from 15 to 16, and the consequence of this change was that a large number of these cases were brought before the seurt which were no offences at all until the year 1885. These cases were anxious cases for judge and jury te try, as they depended upon the testimony of very young persons.",1880
+3406,1208,art0037,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2,0.66,"-,-,,,.-,..-...."" jr",1880
+3406,1208,art0038,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,13,0.7977,"rat, OLIVER'S CRIME., TAIE FOUNDED UPON A =AIL TiRAGEDY OF THE LAST CENTURY. ",1880
+3406,1208,art0051,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1,0.72,toAl ,1880
+3406,1208,art0036,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1,0.89,".........-,..... ",1880
+3406,1208,art0040,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1,0.46,"f."";L7",1880
+3406,1208,art0019,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2,0.665,"~ 1.,",1880
+3406,1208,art0005,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1088,0.8594,"THE PENSION FUND. I'm glad that the grievance concerning "" the pension,"" Is having the workmen's immediate attention ; Their aim is to shew that it never was wanted— The scheme in itself, nor the money they grouted. The labouring men, whose wages are low, Destitution's exterior plainly do shew. Their pants are oft patched, of vests some have none; • All this goes to show the low scale they are on. Crewe artizan's tempers, are generally level, Yet oftimes they wish this fond at the devil, If a ballot were taken this scheme to erase, The votes totalled up would the founders amaze. The crooked made straight a great wound it would heal; If the workmen of Crewe this scheme would repeal. Now this drastical scheme in my own opinion, 'Ought never to come in this railway dominion. Two shillings deducted from six days' hard toil, Makes the labourer's wives' calm temper to broil; • On Friday the women oft pull along face, And sometimes at tea they forget to say grace, . The grace you would hear, I'll venture to say, • Would soon be much better if they had their way. The nioney returned t,o the men, they would grant The Pension Scheme founder a thickish ash- plant. His hair they would:pluck I'm sure by the roo-ts— I would not for something be placed in his boots. What his ideas were when he started this plan, It puzzles each woman as well az asch man: The rules are quite 'clear—to mon that are sane,— Ton pay forty yews without any gain. When sixty-five Years old, and your hair ha 7, gcmo grey, You will wish that this scheme long had vanished sway. The cum you receive, it will just feed tt monse— It will scarce keep you out of tho people's big house. The place where the akilly, so thinit given-The place where earth's paupers find their last earthly heaven. ONE WIIO SUFFERS. Crowe, 6th December, 1888. TEE LAST SAD SCENE OF THE BETLEY TRAGEDY. 
• On Friday afternoon the curtain fell on the Betley tragedy, so far as the public—sad to say—is interested. Shortly. after two o'clock, "" four coffins—two of small size, end containing the remains of Harry aed Mabel Jervis, and two of larger size, in which lay the bodies of Sarah Ann and James Jervis—left the cottage that had so letely been the home of a happy family. It is not a matter of wonder that .quiet Betley-e-heppily so unused to such sad events—was in mourning. • • The little Staffordshire yillaere is so ;Belated to itself, and has so little connection with the busier outside woeld, that it seemed to nurse its grief, and net, in the proverbial nine days, recover from the sad wouderment in which it had been thrown. Consequently, signs of heartfelt end. peep sympathy and sorrow were to. be seen on every baud. And these signs lay not alone in the formal custom of-the letting down of blinds, and other such outward signs. hut the tear-stainod faces of many •villNers (showed that their sorrow was more than skin-. deep. Betley seemed to be wrapped in gloom, and its • very atmosphere seemed to be sorrow-leder-1. Many people followed the mournful procession to the church yard, and all seemed to appreciate the solemnity of the occasion; No 'hearse or conch was present in the ceremony ; the commendeble assistance of friends did not necessitate these, for the coffins Of the deceased children were rarrisi. by eight boys, and those of their parents by e'ght men. The coffins were of Engrish oak, with et monntinree. and were :covered with floral tribetete They hors the following inscriptions : James Jarvis, born May 33. 1855 ; died November 26, ""Sarah Ann Jervis, born jane 17. 1552 ; died November 25, 1853."" "" Earry Vincent jell-is, born September 4, 18S1 ; die November 26, 1838."" ""Mabel Jervis. born September 21, 1387 ; died November 26, 1383."" The chief mot/rearm were Nora, Edith, .Ereest, "" end Florenee*Jervire children of Sarah Ann end Tames Terns ;Mr. 
7..10nez end Iliss Jane Jones, brother and Ritter ef jervie ; Alfred Jervis-, brother of Jetties Jervis: Lfrs. Themae, sister of rtrrs. Jarvis ; Mr.;. sentie, nephew ; Miss Elizabeth certain ; Mr. ;Tarnee Eilems, Pr. John Elle.nte, and Edwerd Re tele!, une!es, Mr. Edward Itielphs,letn., Mr. Philip Stvandern, and Mr. F. N.Varhatn, cqueina. Many of jereis'a renew railway conpleyens were presses, ani.i'ttemeert them were the master of Befley Road Station, lie. Jlr.ohshav, arid Mr. G. Ir. Mountford, of Itindeley. Whea the proeeseion reached the chereli raf S. It was met by the Bev. R. Borighiey (ricer), tovi the Rev. G. Bailey (curate-in-charge of St. Sevronee ehnrch, edeley). These two clergymen t* then went Ihron the 'first portion of the sad ceremony, and, $1.1; its eon:lesion, tho procession refanned. and preeendsel to the greve. Hero the emotional natures oi the relatives end • hyetatedere' themeebes. 01143 RIAt27 a sob was heard coming; front those vibe, with POT erential end sorrowful mien. stood. a ronna ha rave. As the coffin of r,oris. J nes wee Leteg lowered into the grave, the arise ""Oh r1""""""1 14*I'lms."" stomped in piteous • tenes, tress he el-.lllreve. It seeeied as thong all .fietley wee preeent. for the church'. yard was fall of ?-ople, tied they Leese:lea to ha full of sympathising serreve .ritne CO et were being lowerod into the graYe, virnatlol n' pretty newer: were dropped open there. Then. at tat a list leek had been tuoben at all *het we:: nsibl to 1.60,41 the sad tragedy, the chitechrad erica niece reeerned its appearance of roonauful DilltirrrPraclm.—On ly*Ha satnrany tho pric‘a .t Ibi trlarket-..wor ros !elloirs :—Brater, is 31 to Is pc..r lb; rgga,. 7 fcw 1.; fonle,4 Gd to 5s ad pc.r corple ; 11.:e17e, 5F5 Oul ?3 bt 64 rr: oexpin: beef, 7a to 9d peT ;tLi. Tti K. 
94 ?er lb ; 001, r.l Sd ror lb: pork, 6;1 to 8d per 111; ros*>,PG, 6o cd pr 1\; 1101,30 r,-rotpoo, sta Per lb ; hpialer, la to Icl per lb ; pertro,la te 5a war lb ; turnips, li per ; oarroto, rts lb. file*""6lweets 2d to 4d emial.",1880
+3406,1208,art0046,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,304,0.8837,"clftrMitc aTioThrtspect-s-Mat.l Mara -15WaStfe he.* but Cook had:: no betting-book thatt woold be a uaoto anyone."" /reicher of the other two doctors seemed Co to sneer into conversation with Palmer, and 13 • Iry aspasially reticent. Noting this Palmer Die left ahem) and prooeeded to the Tati;ot Mute. lib' tne examination was to tare place. There ha /net with Newton, who was in the entranco hall of the hotel. Palmer said to him; 'lt will be a very dirty job. I ahall go and have a little brandy. Como aerm to my house, Newton, and have a drop.' Newton consented, and at Palmer's invitation drank two more glasses' full of neat brandy, and Palmer thank the same. ""You will find this fellow suffering from a diseased throat. He has been taking mercury for ids tongue I' °lndeed!"" said Newton. EYes, ho has taken a great deal of mercury."" • Palmer, watching through the window of hi 3 house. saw the other doctors enter the hotel, and he and lqewton hastened to join them. All the doctors at once proceeded to the room when the medical examination was to be made, upon which so much depended, as to the exposure of the Rugeley doctor's fearful crimes. The first thing that the doctors remarked. though no audible comment was made upon it at the time, was the unusual stiffness of the body. We do not propose to conduct our readers through the ghastly details of this examination, but several incidents happened during its performance. Palmer was present the whole of the time ; aria. seeing, that the intestines and the stomach presented a healthy appearance, he could not refrain from re marking to Dr. Bamford, in a loud whisper, thav viran't bsaur us (To be Continued.)",1880
+3406,1208,sect0001,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,2986,0.877,"NEW SENSLTIONLI4 AND PATHETIC LOCAL ST CRY 001IBIENCES IN THIF3 Wn2z's STAR Entitled, DR. OLIVER'S mIIE, A Tale Founded upon a Local Tragedy of the Last Century. FOR MANTLES ETS, lIVSTERg I‘,l* V CHESTER ;,c. 0 ROUSE, 52, lIIGH ST., CREWE. THE CREWE REMEDY, Cures Coughs, Colds, Bronchitis, Influenza, St4e„; Throat. Difficulty of Breathing, Iloarsen.ess, ctc. PRICE 18. lia., AND 2s. 9d. Pint porn.D. PREPARED DT A; P. NEWMAN' CHEMIST 43, NANTIVICH_RaiD, &: 6, VICTORIA STREET, CREWE. Who has purchased the Original Recipe, from Mr. • J. MCNEILL, formerly of Crewe. ASTON'S WINTER NOVELTIES. LADIES' AND GENTLEMEN'S GLOVES, HOSIERY, UNDERWEAR, TIES, COLLARS, ?cc. TUE FAMOUS PEICENIX.ISILK UMBRELLAS ALL GOODS AT FAIR PRICES. R. y. ASTON LATE GELDART.) 31, HIGH STREET, CREWE, TO PII:RCH.A.SERS OF PATENT MEDIC IN-ES. B 0 0 T H being a Large Buyer dire/-t • from the Makers, can sell cheaper than any house ha the district. Note a few out of many6d. Pear's Soap, 3id. each, or 3 for 10d. 4173. 6d. Bottles Warner's Safe Cure for 3s, 2ia is. Weave's :Food for,lnfants, Bd. Is. Dr. Ridge's Food, 9d, 18. Van Honten's Cocoa, 91d. is. bottles Greensill's Genuine Mona Boquet, 541 2s. 6d. St. Jacob's Oil for is. 2s. 6d. Seigel's Syrup for"" is. lOid, le. boxes Seidlitz Powders for 514 d. Atfd every article at 'equally low Prices, Tr! 1I)00TH, ILle CHESTER :BRIDGE, CREWE. - • SPECIAL NOTICE! GREAT SALE! • TURNER'S NORTHERN SALE ROO BIS MILL-.STREET, CREWE. • WIIOLESALE AND RETAIL, BARGAINS EVERY DAY. C'V.ROOATS, CLOTHING; DRAPERY, CARPETS CURTAINS Stocks by Tender, in Liquidation, £2,000 WORTH, Vhioh will be offered to the Public at Bargains. Never anything like it. MR. TURNER, AILLL STREET, CREWE, llas Great Pleasure in Announcing that his BIG SALE -ion Tins WINTER Is now on. 
Splendid :Stock of MENS, YOUTHS', AND BOYS' CLOTHING 500 OVER•COITS, Theltyal Navy Suit (lie boys' favourite), including Lanyard and Whistle. Tholoyal Tar, white cord trousers, duck-jackets, sage jackets, cord and mole trousers, reeler jackets, pilot trousers, mechanics' clothing, &c. Also a portion of GE). WALKER Ax D Co.'s STOCK, Consisting of Carpets,ioo pairs of curtains and curtain net 3, 5,000 yards 3lack Cashmeres, coloured ditto, Fancy pre-Goods, Meltonf, 13eiges, Brussols ana Ta?atry Carpets, Prints. Gs.'atea, White ad Grey calicoes, Ladies' Jackets, • &c. 280 Quiltasoo pairs of Sheets, 200 pairs of Blankets, &1 Piths Flannels. 80 pieces cottoA Shirting, 'llene, hands, Wool and cotton Shirtings, Hosiery, Velvds, 'inbrellas, Shawls, 'Waters, Handkerchiefs, Braces, itc. Tr Tier asks the Public to go to the Sale andee for themselves. Weekly Payments t'tkell• Clultickets taken.as cash. TURNERS SALE T.DOOIIS,' -WARD BROS COACH PROPRIETORS, HIG-HTOWT.N, CREWE. Modern HEARSE, with GLASS SIDE, CALUVED S/DE and CLora SIDE, MOURNING COACHES, WEDDING in OTHER CARRIAGES Pro-Nic OMNIBUSES, WAGONETTE, LANDAUS, and TRAPS for Hire. TERMS REASONABLE. GEORGE T FLACK. • MILL HOUSE, WILL STREET, CREWE, Begs to announce that he is Agent for the Ales and Porters supplied by the Uttosetor Brewery Company, Limited, and solicits a share of custom from the Trade and Families. PRICE LISTS ON APPLICATION. Liberal Discount to the Trade: 5 per cent off Private Orders. Families supplied with 9, 12,18, or 36 gallons at one day's notice. ORDERS ANIS NOW BEING BOOKED FOR C FIRISTMAS. FURS. • TIES. FUR CAPES, &c. Newest shapes in Capes—The Dorothy, Princess, Sylvia, &c., in real Beaver, Skunk, Mink, Seal, Stone Martin, Monfflon, &c. A large stock of Squirrel Ties, 'very cheap, large sizes from 9s. 6d.—See these. T. L. JOHNSON, Tun LONDON HOUSE, CarESTER CREWE. SATISFACTORY FURNITURE. 
FOR NEARLY TWENTY YEARS past we have supplied the most durable and reliable goods to thousands of residents in this town and surrounding districts. Many of who= have, unsolicited, expressed tlteir utmost satisfaction and approval of our mode of doing businesir, viz :—A really reliable article at lowest possible price.. In the future, as in the past, our utmost efforts will be directed to supplying Furniture and Bedding unequalled at the price. OrOULD, ,T • FDENITtE D AND BEDDING MAIMPACTITEUR, 40, High-street, and 79, Market-street, CREWS. IL WA_ L TJTIT OR WATCHMAKER AND JEWELLER, • 9 CHESTER 3iiDCE, CREWE. WATCHES CLOCXS- -AND JEWELLERY OF EVERY DESCRIPTION 20 TO 30 PER CENT. BELOW THE USUAL PRICES. CLOCKS. • ITheroom timepieces .. .. from 23 Od Lever clocks .. n 33 Od Cottage alarnms - „ 3s Gd Eight-day striking clocks „ 153 61 Long regulators • • .„ 25s Od Fteuch marble timepieces .. „ 183 Od WAJULANTBDC oOD TrMF,TE.r.PELS. WATCIIES. The marvel watch .. • • .. 53 &I Ladies' silver Geneva watches, from 18s Od Ladies' gold watches, keyless 30s Od Gentlemerue silver watches .. /8s Od Gentlemen's gold watches N 55s Od ' Gentlemen's silver lever watches 30,3 Od Good English silver lever do. ~ EN Od Vary superior finish do. • • « 843 Od WARILANTBD TO Snip CORRECT DIM. JEWELLERY. • '4adies' gold ear-rings.. from 2,3 od Ladies gold dress rings 2,1 6d Ladies wedding rings, 22 carat « Ss 64 Gents' gold hoops • . N 55 Gd Gold guards .. Or 30s Od Silver aborts .• 53 9d Also a good stook of gold and silver lockets, necklets, bracelets, charms, pencil eases, tootll picks, scarf pins, &a., sA equally low prices. rr.v.arno-rnituun Smartt, Fount, SALT; Cann?: dtc.. OPTICS. spectacks and eye glasses, in gold and rteul frames to snit all sights; reading glasses, telescopes, microscopes, opera and field gktsabe , of every description. AGIN. T FOI% TEES "" Eto wE ,"" "" ECLTPSISA"". AND OTHER SEWING VACEINES. WILLETT'S CHRISTMAS BAAR ' . 
OPEN FREE TO ALL; 11/EASE WALK THROUGH: We have great pleasure in inviting the public to inspect our Stock of Fancy Goods, specially selected for the Christmas Season. CHRISTMAS CARDS. 50,000 Xmas Cards, all new designs, from One Penny per Dozen. Splendid Value in cards at id, id, 10, 2d, 3d, 4d, 6d, and 8d Each. These prices represent half the published price. Envelopes given with all above cards. Our 3d. packet contains 12 good cards. 6d. It SI 25 excellent , „ is. 50 superb „ We have also hares containing nine large cards and nine envelopes for 61., and double the quantity for is. TOYS, GAMES, AND FANCY GOODS. The largest and best assortment ever offered in Crewe in Building Bricks, Dolls, Mechanical Figures Drums, Money Boxes, Skittles' Purses, Satchels Albums, Brackets, Plants and [ots, Moss, besides Thousands of !other articles which will be tastefully displayed. ' RIBBONS, LACES, GLOVES, to., &a. We have now a grand stock of Ribbons, bought at special prices, in plains, stripes, and checks, which we are offering from 2i'd. for full width. Neck Ribbon from id., all colours. Evening Shades of Silk Gloves from 10N. Feather Fans, all shades, 60., is. Old. (feathers both sides), is. llid. and 2s. tlid. DONT MISS THE FREE BAZAAR AND FANCY FAIR, 28 AND 81, VICTORIA STREET, CREWE. CHRISTMAS pRESENTS.. • . NEW YEAR'S GIFT S WANTED, a. Youth with some knowledge of shorthand.—Apply &Lin Office, Crewe. 10 LET a oornPr Shop and House, 92, West-street. —Apply, 88, Nantwich-road. TIO LET.—Twelve houses in Adelaide-street, Crewe, six rooms; rent, 5s 6d clear; also five houses in Allan-street, five rooms; rent, 4s Gd. Will be thoreughly cleaned and repaired for suitable tenants.—Apply at No. 33, Adelaide-street. Lure To al SoLe.—lnC hire-road (Underwooi-lane), Coppenkall. Price le; per yard. Free conveyance. £2O to £25 will purchase a plot with twelve yards frontage, and large enough for two houses, with large gariens. Apply, before 6th December, to J. 
J11213t210N 36, Nantwich-road, Ciewe lIISTABLISHED 1872. IJ.. capital h7se tolmammaanceilLavdienslroan3 large gad advancing same in snms from £5 to £5OO to Farmers, Tradesmen, cab and cart proprietors, householders. ani all persons in want ef eask accommodation, ON THEIR NOTE OF It.A.IID ALONE, and on most reasonable terms, in the stris.est privaey, at a few hours' notice, in any part of England- or Wales. Easy repaymcnts.—Apply personally. or by letter, to Mr. C. EATON, 2, Sidleson-read, Crowe, Cheshire—private house. Crewe Metual Allotment Building Society, Bank • Buildings, Market-street. rphis Wert 'Allotment will be by Ballot, on' December IL 3rd, at the Society's Office, at 8 p.m. Members joining at any time before the Next Allotment, and paying Five Weeks' Subscriptions, will be entitled to participate in the Ballot Shares, £lOO each. A. member may hold any number, from one to five, in one, Group. Subscriptions, 61. per Share per week. Entrance Fee, is. per Share. Weekly Subsaiiptions received every Friday, from 7 to 8.30 p.m. The Society has already allotted to the Members an aggregate sum of 12,800. Shares to the number of 625 have been taken up, constituting 225 groups. Shares issued daffy at the Society's Office, or at any of the Weekly Subscription Meetings. Advances made in full, without interest, on the Ballot and Sale principle. Borrowers can repay from 1 to 12i years. A. FLETCHER, Secretary. 109, West-street, Crewe. F. G. COPE, EfARLE STREET, Begs to thank the Inhabitants of Crewe and District for the supiort they have given him during the past ',Eight Years, and to inform them that he is • OPENING A BRANCH AT 77, VICTORIA STRErT, ON FRIDAY NOVEMBER 30th., With a Choice Stock of Hats and Caps, Shirts, Serge and Duck Jackets, Hosiery and Gloves, Umbrellas, and Household Drapery, F. G. C, will c0nt::.11,3 88 in the past, tl dupply GOODS OF RELIABLE QUALITY Al the lowest possible prices. NETS TIIE ADDRESS F G. 
C"" E 5, EARLE STREET, AND f 7, VICTORIA STREET. CBE-WE The only place in Crewe where you can obtain RELIABL l'.l HOME MADE F URNITURE, BEDDING, ctc., iS AT HOUSE'S COM,PLETE BOUSE FURNISHER , .(Wholesale and retail), HE AT EE ST_R_E_E T , CREWE 3513 n CENT. SAVED BY BUYING DIii.ECT FROM THE MANUFACTURER. By far the most extensive furnisher on the Easy Payment System in the -Provinces. Supply every requisite for the complete furnishing of cottages, hotels, or mansions 35 per cent, cheaper than the majlritT of those firms who sell for cash only. This lam able to do through'having a large capital at 'command and being the bonafide manufacturer of the principa goods I sell. NO SECURITIES REQUIRED, NO PUBLICITY REQUIRED i ALL GOODS DELIVERED FREE, BY MY I OWN VANS. , ' All goods are booked at the same price as ticketed in the windows and show rooms without any extra charge. GENER a., TERMS, Amount of purchase. 1' s. s. d 1 3 worth 5 depnit 1 6 weekly 5 ~ . •10 „ • 26 „ 10 ~ 20 „ 4 0 „ 20 40 ,i 70 „ ~ so ~ 70 ~ 100 „ 5 0 ~ 140 „ SO SI 100 ,4 10 0 f t 20 0 „ From £lOO and upwards a aiting cenvenience of - customers. MONTHLY OR QUARTERLY PAYMENTS ARRANGED. 2s. In the POUND DD-;COUNT allowed on all CASE{ PURCHASES. . The LARGEST and BEST SELBCTI3D STOCK s . in the COUNTY to choose from. , ... , . ...;. - .. . .-- --•- • ' • ', ~ ::'---.• -0.-- „I -5: ---7; --.7 `--. ''' ---,-* ' • . _ “RAWSON'S“ AT HOME AGAIN AT TIIE OAK _FARM HOTEL, E NIT E TOWN HALL, CREWE. TO-NIGHT (FRIDAY), AND TO•MORROW SATURDAY, December 7th, and Bth. Return vi6it oi BIRRELL'S NEW GREAT DIORAMA. OF SCOTLAND, AND GRAND SCOTTISH CONCERT,' by the Royal Caledonian Minstrels, Champion Dancers and ,Prize Reel players at the Norll3. Mies Griselda Wass, the Renowned Scottish Nightingale. and Mr. Thomas Bare'', the unrivalled Yecalist and Violinist, train Covent Garden Theatre, and Agricaltaral flail, Loudon. 
""Miss Griaelda Wass has just been presented with a splendid pony and earriag by tie proprietors at • the • Scottish Nights"" periodical, as being the handine est woman, in their Bawdy Competition, in the United Kingdom.'—Vide "" Glasgow ScottishiNights,"" August 15th. 1885. Admission, 36. 2s. 15. and sd. Each evening at 8. Grand Afternoon perfurm.uco on Saturday, at 3. Doors open half-an-hour earlier last day, Saturday, December Bth. -COMMUNOING AT ALTRINCHAM, MOXDAY, DEC. 10TH. J. N. GER MAN HAIRDRESSER Ax 3) TOBACCONIST. Has the Best and Largest Assortment of PIPES, POUCHES, WALKING STICKS, AND CANES. UMBRELLAS SKILFULLY AND PROMPTLY REPAIRED. 77, MARKET STREET, CREWE. 13-LSTERS. MANTLES. JACKETS. New designs are being delivered every few days at TEB LONDON Housu, Chester Bridge Ulstors in the New Russian and Cape shapes, Bs. lid,, 10s. 11d. special; 12s. lid., 142. 11d., 162.11 d. Grand value these; also best goods up to 305.; a nice lot in black— children's from 2s. T, L. JOHNSON) LONDON HOUSE, CHESTER BRIDGE, CREWE. IF YOU WANT YOUR PICTURES WELL AND CHEAPLY FRAMED GO TO JOSEPH powELL, 79, WEST STREET, CREWE: N.B.—Tho Trade stpplied with A.louldim.,,s, Mounts, ""arc. nine feet Lenghts from 41d. each ±IOOTBALL CUP TIES, USE ONLY J : P . GIIEGGS, SPECIAL INTERNATIONAL FOOTBALL (Entire Weight 12 oz.) DELIVERED FREE 7s. EACH. MATCH BALLS 6s. 3d, EACH. Unsolicited Testimonials may be seen by Application to the Manufacturer, 2, HILL STREET, CREWE. NB.—The above Ball is made specially for Cup Ties. FOB, MANTLES JACK ETS, ULSTERS, itcrc s • G• CTI-P4STER HOUSE, 52, HIGH eRENVE. EAST'S FOR GENUINE SMOKED WILTSHIRE BACON. 35, MILL sTREET, - CREWE; T"" DADde LADIES' TAILORS SS AND itIANTLE ;LIARERS, 79A, larri, LRT, 0REN17.24 LADIES! AND GENTLE7'.7.N'S GENERAL BOYS' SCHOOL, FANCY, Kik; D DR:S SU.U.II Special Suits mule to tilaor vr, the Shortest Notice. • Gtuts' ,:›uits, and all under Garments. HiSPECTION IS RE S 1-.ECTFI'LLY INVITED _ PARCELS SENT ON ArrRoVAL. 
TERMS—CA SU. SINGER S SEWING AIA.CHINES. Everywhere the most popular for Hr.N.mestic and manufacturing purposes. Over SnvEm MILLION of these reliable :alachines • now in use. raicE FRoTM £4 4 10 per Cent. discount for bash, or purchased on the Company's Easy System. Instruction Free. LOCAL BRANCHES: 27, HIGH STREET, CREWE AND 16, HIGH STRE T, NANTWICH. HEABS! HERBS! HERBS! --jo BOOTH • MEDICAL HERBALIST, MAR KET STREET, CREWE . B. has had TEN YEARS EXPERIENCE and PRACTICE in the Herbal Treatment of DISEASES, And having served an apprentice to a Chemist and Druggist, is COMPETENT TO SUDGE as to the merits of the HERBAL against the MINERAL TRULTICUXT. Advice Free. A week's medicine for One Shilling. TRY HERBS. TRY HERBS. Nature's Medicine. p. sWINT.ON'S, ,KNITTED 'HOSIERY AND GARMENTS Arai TIM CHEAPEST AN BEST. -HIGH STREET CREWE. FOR RELIABLE MOURNING TRY JOHNSON'S, Johnson's fast-dyed Black Cs.shmerce are the best, soft in finish, beautiful bloom of colour, and wear guaranteed. Prises— is. 4d., is. 6fd., 25.. and 2s. 6d. a yard. Crape and Crape Millinery all to match, T. L. JOHNSON, TIME LONDON HOUSE, CUESTHE MUD:GB; CREWE. Established 184-5. GEO. THOMSON, Family Wine and Spirit Merchant, BRUNSWICK HOTEL, NANTWICH ROAD, CREWE. Barclay, Perkins, and Co.'s Imperial Stout szid • London Porter. Burton, Salford, and Warrington Ales. Bass's Bitter Beer, and Guinness's Stout, supplied in half-pint and pint bottles. The choicest Wines and Spirits at the Lowest possible prices. PRICE LIST. AND SAXPLES SENT ON APPLICATION. F. SWINTOIN, Is now offering Spacial Value in SANITARY WOOL GARMBNTS Made from Nolx-SnitiNKABLE Woo A complete range of Ladies' end Children's Underclothing, Baby Linen, Pellises, Frocks, Pinafores, and Fancy Anrons. Berlin and Fancy Wools, Fingering and Allots. Knitting Yarns, and an IMMENSE STOCK or FANCY GOODS. HIGH STREET, CREWE. JOSEPH SIMN , ROBIN HOOD INN, NANTWICH ROAD, CREWE. ALES AND SPIRITS OF TED FINEST QUALITY. CEIOICN CIGARS. 
A FREE and EASY held every Friday and Saturday in Large Concert Room. Excellent accommodation for Football Clubs and Visitors. DANCING carried on as usual in Exchange Rooms, Mill-street and Francis-street. Special arrangements for select learners' classes. WORKINGMEN'S CLOTHING. CROOK ANI SON CHEAPEST CLOTHIERS IN CREWE. BOYS' SUITS from 2s. 63. LIENS' BLACK WORSTED SUITS from 19s. 6d, A!ENS' CORD TROUSERS, 4s. 6n. and ss. 11n CROOK AND SON, 22, HIGH STREET, CBIIWE. ENIRY-TH FAMILY GROCER, CIIGH STREET, CREWE, • • FINEST NEW FRUIT. aLINTLEY AND PATATISR'S XMAS SrECIALITIES CAKES AND OB,NAMENTAL BISCUITS. .‘ A:GIINT FOR lAIIIB ANL) WATT'S PURE FOREIGN AND BRITISH WINES. FOOTBALLS. FOOTBALLS. FOOTBASLL. CHESHIRE FOOTBALL MANUI'ACTORY. W. H. wooD,. 52, MARKET STREET, OiiEWE, AND • • HIGH . STREET, SANDBACH. ILANUFACTURER OF THE CELEBRATED ""COMPOUND,"" "" AND ""MATCH"" FOOTBALLS. Has the largest Stock of C f Footballs and Sliinguards irk h FOOTBALLS GUARANTEED 11.'...;`:71) SEWN (Only best English le,..ther The Compound Football was used latlt Season in the . °Hewing important niatches:7--West Broniwich A.lbion v. Derby Junction (semi-final tie ETti cup competition); West Bromwich Albion. V. St-:)ize for Staffordshire cup oompetition); Crewe v. Chester (f.nal tic Cheshire cep competition); the two semi-final ties for the. Cheshire cup, and many other tirst-e- ass Matches. , 44"" th, Poo STILL THY COME! STILL T 1.1 COME! ottlio • In their Thotuands weekly, for the bet!) foTtio SPLENDID SEISD, CURRANT AND SULTANA - atzl F CAKES, tti iea , for Sold by the he LONDON AND COUNTIES TEA. COMPANY srm. AT :ted STILL YOU MUS' COME! Ana try their SPLENDID NEW CIIRISTM AS FRUITS- I• r Currants, 4d, ;Ind 4 I Sultanas, 4th 4 Valencias, sth , Lemon Peol, 4d and ;A. eitran and Change Spi-:eF, .Bx. at Lovresi Possible Priee.i.' 240111 1118 ADDRB:F.S : • LONDON AND COUNTIES TEA. • P ANY, ; . 67, -.MA.RICI4E 5T1tE12.47, CREWIt ;",1880
+3406,1208,art0018,"Nantwich, Sandbach & Crewe Star",British Library Living with Machines Project,1888-12-08,1888,12,8,,1676,0.8944,"BY A PRACTICAL FARMER. THE DEAD YEAR. • So soon as December commences, all outdoor Operations seem to become checked, and as the year runs out. is course, NNork -becomes More and more confined to the neeeesary attendance upon outlying stock and the less important duties in and around the homestead. In fact, about Christmas. tide the fields wear a neglected and empty appearance, eloquent of the departure of another year. • The extremely mild autumn which continues even up to the date of writing, has tempted a number of farmers to persevere in wheat sowing far later than has been customary for many a year plait. They seem to hold the view that the plantiug or spring wheat should only be carried Out Iveen netcss-ity compels the stoppage of the drills in autumn-before all their work. has been done. The same temptation has protracted the storage! of the root crop far later into the winter than -usual, and now on many turnip farms the hardier loots are still being cleaned and pitted on the - land. The general mildness of the period of we weather through which •we have passed bag been very beneficial to the turnip crop, which in many localities is remarkably sound and heavy . yielding, It is unfortunately the case that at the present. moment many acres of turnips are being puled whilst theyatre still growing, and the • amount of green and succulent top which is being cut off would materially help to swell the yield of the roots, if only they could be allowed a little longer in the ground. But it is impossible that. this mild weather can last much longer, and so it is of the utmost importance to get all roots cleaned and covered up without ttny furth6 delay. 
The young wheals are already shelving a strong and healthily-coloured leaf,but the ground is often • Very dirty between their rows, and lam afraid will require sorne further attention before long. The pieces of rye Which I have come across are equally Promising. The great and heavy rainfall of this autuma has made a muddle of much of the ploughed land; and, in consequence, ploughing has had to be stopped for a thew. I was much interested to notice the really admirable manner in which a chilled plough, made by an English firm, after a wellknown Transatlantic model, did its work on a stiff and intractable clay stubble, Which was almost sodden by the rain. The Englieh Pattern plough had been stopped by the sticky state of the land and its little rival was set to work to finish the job, which it accomplished in the mnst satisfactory way possible, turning the land over, and laying it in straight, well sloped , sections, clean and regularly cut, as quickly and eakilv as po,6sible. The only thing that can be said 'against these ""American ""ploughs, as they continue to be called, although several patterns of them are made in England, is that their stilts are so short, that the ploughman has an insufficient command over his implement; in stony or irregular land this is very apparent, and it eeenas to require the greatest skill and strength to preserve an even and unbroken Perrone snEEP - • Are doing- very well indeed on the still Ventiful growth of grass of the permanent pasture% But upon the lesz; productive areas of ternporary and rotation pastures,which ',aye now elmeet ceased to grow for the winter, their keep illust be euppleinented by a certain quantity .01 roots. A very praiseworthy practice at this rime of the year, %;elien turnip tops are plentiful, to cart the tons from the fields where the turnips are lee:rig etored to the nearest pastures, and there to throw them u down in email heaps rege!arly twi a day. 
By placing the tops every 'nay in a, di ifen.nt spot, the sheep manure becomes Ilistrileute0
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+ assert len(geoparser.ranker.mentions_to_wikidata.keys())>0
- resolved = geoparser.run_text(
- "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.",
- )
- assert len(resolved) == 3
- assert resolved[0]["mention"] == "Shefiield"
- assert resolved[0]["prior_cand_score"] == dict()
- assert resolved[0]["cross_cand_score"]["Q42448"] == 0.903
- assert resolved[0]["string_match_score"]["Sheffield"][0] == 0.999
- assert resolved[0]["prediction"] == "Q42448"
- assert resolved[0]["ed_score"] == 0.903
- assert resolved[0]["ner_score"] == 1.0
+ text = "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though."
+ predictions = geoparser.run(text)
+
+ assert len(predictions.sentence_candidates) == 2
+ assert len(predictions.sentence_candidates[0].candidates) == 2
+ assert len(predictions.sentence_candidates[1].candidates) == 1
+ assert len(predictions.candidates()) == 3
+ assert predictions.candidates()[0].mention.mention == "Shefiield"
+ assert predictions.candidates()[0].best_match().string_match.variation == "Sheffield"
+ assert predictions.candidates()[0].best_string_match().string_similarity == 0.999494
+ assert predictions.candidates()[0].best_wqid() == "Q42448"
+ assert predictions.candidates()[0].best_match().cross_cand_scores()["Q42448"] == 0.903
+ assert predictions.candidates()[0].best_match().best_disambiguation_score() == pytest.approx(0.903, abs=1e-3)
+ assert predictions.candidates()[0].best_disambiguation_score() == pytest.approx(0.903, abs=1e-3)
+ assert predictions.candidates()[0].mention.ner_score == 1.0
+
+ # The predictions are Sheffield (Q42448), Leeds (Q39121) and London (Q84).
+ assert predictions.best_wqids() == ['Q42448', 'Q39121', 'Q84']
+ assert predictions.best_disambiguation_scores() == [
+ pytest.approx(0.903, abs=1e-3),
+ pytest.approx(0.913, abs=1e-3),
+ pytest.approx(0.972, abs=1e-3)]
- resolved = geoparser.run_sentence("")
- assert resolved == []
+ assert geoparser.run("").is_empty()
- resolved = geoparser.run_sentence(" ")
- assert resolved == []
+ assert geoparser.run(" ").is_empty()
# asserting behaviour with • character
- resolved = geoparser.run_text(
- " • - S G pOllO-P• FERRIS - • - , i ",
- )
- assert resolved == []
+ text = " • - S G pOllO-P• FERRIS - • - , i "
+ assert geoparser.run(text).is_empty()
-@pytest.mark.skip(reason="Needs large resources")
+@pytest.mark.resources(reason="Needs large resources")
def test_deezy_rel_wpubl_wmtops(tmp_path):
model_path = os.path.join(current_dir, "../resources/models/")
assert os.path.isdir(model_path) is True
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-fine",
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
pipe=None,
@@ -148,13 +174,11 @@ def test_deezy_rel_wpubl_wmtops(tmp_path):
},
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
- load_from_hub=False, # Bool: True if model is in HuggingFace hub
)
# --------------------------------------
# Instantiate the ranker:
- myranker = ranking.Ranker(
- method="deezymatch",
+ ranker = ranking.DeezyMatchRanker(
resources_path=os.path.join(current_dir, "../resources/"),
mentions_to_wikidata=dict(),
wikidata_to_mentions=dict(),
@@ -183,14 +207,13 @@ def test_deezy_rel_wpubl_wmtops(tmp_path):
"overwrite_training": False,
"do_test": False,
},
- already_collected_cands=dict(),
)
with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
cursor = conn.cursor()
- mylinker = linking.Linker(
- method="reldisamb",
+ linker = linking.RelDisambLinker(
resources_path=os.path.join(current_dir, "../resources/"),
+ ranker=ranker,
linking_resources=dict(),
rel_params={
"model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
@@ -198,6 +221,8 @@ def test_deezy_rel_wpubl_wmtops(tmp_path):
"training_split": "originalsplit",
"db_embeddings": cursor,
"with_publication": True,
+ "predict_place_of_publication": False,
+ "combined_score": False,
"without_microtoponyms": True,
"do_test": False,
"default_publname": "United Kingdom",
@@ -206,29 +231,65 @@ def test_deezy_rel_wpubl_wmtops(tmp_path):
overwrite_training=False,
)
- geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker)
-
- resolved = geoparser.run_text(
- "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.",
- place="Sheffield",
- place_wqid="Q42448",
- )
-
- assert len(resolved) == 3
- assert resolved[0]["mention"] == "Shefiield"
- assert resolved[0]["prior_cand_score"]["Q42448"] == pytest.approx(0.891, abs=1e-3)
- assert resolved[0]["cross_cand_score"]["Q42448"] == pytest.approx(0.766, abs=1e-3)
- assert resolved[0]["prediction"] == "Q42448"
- # assert resolved[0]["ed_score"] == 0.039 # TODO: reproduce this number.
- assert resolved[0]["ner_score"] == 1.0
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+
+ text = "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though."
+ predictions = geoparser.run(text, place_of_pub_wqid="Q42448", place_of_pub="Sheffield")
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.sentence_candidates) == 2
+ assert len(predictions.sentence_candidates[0].candidates) == 2
+ assert len(predictions.sentence_candidates[1].candidates) == 1
+ assert len(predictions.candidates()) == 3
+ assert predictions.candidates()[0].mention.mention == "Shefiield"
+ assert predictions.candidates()[0].best_match().string_match.variation == "Sheffield"
+ assert predictions.candidates()[0].best_match().string_match.string_similarity == 0.999494
+ assert predictions.candidates()[0].best_wqid() == "Q42448"
+ assert predictions.candidates()[0].best_disambiguation_score() == pytest.approx(0.766, abs=1e-3)
+
+ # Check the interim disambiguation score (i.e. before applying the REL model).
+ # The only difference is in the best_disambiguation_score result.
+ # Note: previously the term "prior_cand_score" was used to refer to the interim score.
+ assert len(predictions.interim_candidates()) == 3
+ assert predictions.interim_candidates()[0].mention.mention == "Shefiield"
+ assert predictions.interim_candidates()[0].best_match().string_match.variation == "Sheffield"
+ assert predictions.interim_candidates()[0].best_match().string_match.string_similarity == 0.999494
+ assert predictions.interim_candidates()[0].best_wqid() == "Q42448"
+ assert predictions.interim_candidates()[0].best_disambiguation_score() == pytest.approx(0.891, abs=1e-3)
+
+ # The predictions are Sheffield (Q42448), Leeds (Q39121) and London (Q84).
+ assert predictions.best_wqids() == ['Q42448', 'Q39121', 'Q84']
+ assert predictions.best_disambiguation_scores() == [
+ pytest.approx(0.766, abs=1e-3),
+ pytest.approx(0.755, abs=1e-3),
+ pytest.approx(0.734, abs=1e-3)]
+
+ # Compare with the interim predictions (produced before running the REL model).
+ interim_best_wqids = [c.best_wqid()
+ for scs in predictions.sentence_candidates
+ for c in scs.candidates]
+ interim_best_disambiguation_scores = [c.best_disambiguation_score()
+ for scs in predictions.sentence_candidates
+ for c in scs.candidates]
+
+ assert interim_best_wqids == ['Q42448', 'Q39121', 'Q84']
+ # Note that the interim disambiguation scores are higher, and are still available
+ # after applying the REL model, but the REL scores take precedence (see above).
+ assert interim_best_disambiguation_scores == [
+ pytest.approx(0.891, abs=1e-3),
+ pytest.approx(0.897, abs=1e-3),
+ pytest.approx(0.895, abs=1e-3)]
+
+ assert predictions.candidates()[0].best_match().cross_cand_scores()["Q42448"] == pytest.approx(0.766, abs=1e-3)
+ assert predictions.candidates()[0].mention.ner_score == 1.0
-@pytest.mark.skip(reason="Needs large resources")
-def test_perfect_rel_wpubl_wmtops(tmp_path):
+@pytest.mark.resources(reason="Needs large resources")
+def test_deezy_rel_wpubl(tmp_path):
model_path = os.path.join(current_dir, "../resources/models/")
assert os.path.isdir(model_path) is True
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-fine",
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
pipe=None,
@@ -236,19 +297,17 @@ def test_perfect_rel_wpubl_wmtops(tmp_path):
model_path=model_path,
training_args={
"batch_size": 8,
- "num_train_epochs": 1,
+ "num_train_epochs": 10,
"learning_rate": 0.00005,
"weight_decay": 0.0,
},
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
- load_from_hub=False, # Bool: True if model is in HuggingFace hub
)
# --------------------------------------
# Instantiate the ranker:
- myranker = ranking.Ranker(
- method="perfectmatch",
+ ranker = ranking.DeezyMatchRanker(
resources_path=os.path.join(current_dir, "../resources/"),
mentions_to_wikidata=dict(),
wikidata_to_mentions=dict(),
@@ -277,14 +336,13 @@ def test_perfect_rel_wpubl_wmtops(tmp_path):
"overwrite_training": False,
"do_test": False,
},
- already_collected_cands=dict(),
)
with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
cursor = conn.cursor()
- mylinker = linking.Linker(
- method="reldisamb",
+ linker = linking.RelDisambLinker(
resources_path=os.path.join(current_dir, "../resources/"),
+ ranker=ranker,
linking_resources=dict(),
rel_params={
"model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
@@ -292,7 +350,9 @@ def test_perfect_rel_wpubl_wmtops(tmp_path):
"training_split": "originalsplit",
"db_embeddings": cursor,
"with_publication": True,
- "without_microtoponyms": True,
+ "predict_place_of_publication": False,
+ "combined_score": False,
+ "without_microtoponyms": False,
"do_test": False,
"default_publname": "United Kingdom",
"default_publwqid": "Q145",
@@ -300,28 +360,297 @@ def test_perfect_rel_wpubl_wmtops(tmp_path):
overwrite_training=False,
)
- geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker)
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+ text = "The charming seaside town of Swanage is noted for its Town Hall whose distinctive façade was designed by Edward Jerman, a pupil of Sir Christopher Wren. Also the Grosvenor Hotel, with its clock tower originally erected at the south end of London Bridge as a memorial to the Duke of Wellington."
+
+ # Test with microtoponyms.
+ predictions = geoparser.run(text, place_of_pub_wqid="Q203349", place_of_pub="Poole, Dorset")
+ assert isinstance(predictions, RelPredictions)
+
+ # When the "without_microtoponyms" parameter set to False, there are four candidates:
+ assert len(predictions.candidates()) == 4
+ assert predictions.candidates()[0].mention.mention == "Swanage"
+ assert predictions.candidates()[1].mention.mention == "Town Hall"
+ assert predictions.candidates()[2].mention.mention == "Grosvenor Hotel"
+ assert predictions.candidates()[3].mention.mention == "London Bridge"
+
+ # Test without microtoponyms.
+ geoparser.linker.rel_params["without_microtoponyms"] = True
+ predictions = geoparser.run(text, place_of_pub_wqid="Q203349", place_of_pub="Poole, Dorset")
+ assert isinstance(predictions, RelPredictions)
+
+ # When the "without_microtoponyms" parameter set to True, only one candidate remains:
+ assert len(predictions.candidates()) == 1
+ assert predictions.candidates()[0].mention.mention == "Swanage"
+
+@pytest.mark.resources(reason="Needs large resources")
+def test_perfect_rel_wpubl_wmtops():
+ model_path = os.path.join(current_dir, "../resources/models/")
+ assert os.path.isdir(model_path) is True
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
+ train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
+ test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
+ pipe=None,
+ base_model="khosseini/bert_1760_1900", # Base model to fine-tune
+ model_path=model_path,
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 1,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False, # Set to True if you want to overwrite model if existing
+ do_test=False, # Set to True if you want to train on test mode
+ )
+
+ # --------------------------------------
+ # Instantiate the ranker:
+ ranker = ranking.PerfectMatchRanker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ mentions_to_wikidata=dict(),
+ wikidata_to_mentions=dict(),
+ )
- resolved = geoparser.run_text(
+ with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
+ cursor = conn.cursor()
+ linker = linking.RelDisambLinker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ ranker=ranker,
+ linking_resources=dict(),
+ rel_params={
+ "model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
+ "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"),
+ "training_split": "originalsplit",
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "without_microtoponyms": True,
+ "do_test": True,
+ "default_publname": "United Kingdom",
+ "default_publwqid": "Q145",
+ },
+ overwrite_training=False,
+ )
+
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+
+ predictions = geoparser.run(
"A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.",
- place="Sheffield",
- place_wqid="Q42448",
+ place_of_pub_wqid="Q42448",
+ place_of_pub="Sheffield",
+ )
+
+ assert isinstance(predictions, RelPredictions)
+
+ assert len(predictions.candidates(ignore_empty_candidates=True)) == 2
+
+ candidates = predictions.candidates(ignore_empty_candidates=False)
+ assert len(candidates) == 3
+ assert candidates[0].mention.mention == "Shefiield"
+ assert candidates[0].mention.ner_score == 1.0
+ assert candidates[0].best_match() is None
+ assert candidates[0].best_wqid() is None
+ assert candidates[0].best_disambiguation_score() is None
+ assert predictions.rel_scores[0].mention == "Shefiield"
+ assert predictions.rel_scores[0].confidence == 0.0
+
+ assert candidates[1].mention.mention == "Leeds"
+ assert candidates[1].mention.ner_score == 1.0
+ assert candidates[1].best_match() is not None
+ assert isinstance(candidates[1].best_match(), PredictedLinks)
+ assert candidates[1].best_match().best_disambiguation_score() == pytest.approx(0.419, abs=1e-3)
+ assert candidates[1].best_wqid() == "Q39121"
+ assert candidates[1].best_disambiguation_score() == pytest.approx(0.419, abs=1e-3)
+ assert predictions.rel_scores[1].mention == "Leeds"
+ assert predictions.rel_scores[1].confidence == pytest.approx(0.168, abs=1e-3)
+ assert predictions.rel_scores[1].scores["Q39121"] == pytest.approx(0.419, abs=1e-3)
+
+ assert candidates[2].mention.mention == "London"
+ assert candidates[2].mention.ner_score == 0.998
+ assert candidates[2].best_match() is not None
+ assert isinstance(candidates[1].best_match(), PredictedLinks)
+ assert candidates[2].best_match().best_disambiguation_score() == pytest.approx(0.573, abs=1e-3)
+ assert candidates[2].best_wqid() == "Q84"
+ assert candidates[2].best_disambiguation_score() == pytest.approx(0.573, abs=1e-3)
+ assert predictions.rel_scores[2].mention == "London"
+ assert predictions.rel_scores[2].confidence == pytest.approx(0.178, abs=1e-3)
+ assert predictions.rel_scores[2].scores["Q84"] == pytest.approx(0.573, abs=1e-3)
+
+@pytest.mark.resources(reason="Needs large resources")
+def test_perfect_rel_predict_place_of_pub():
+ model_path = os.path.join(current_dir, "../resources/models/")
+ assert os.path.isdir(model_path) is True
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
+ train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
+ test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
+ pipe=None,
+ base_model="khosseini/bert_1760_1900", # Base model to fine-tune
+ model_path=model_path,
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 1,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False, # Set to True if you want to overwrite model if existing
+ do_test=False, # Set to True if you want to train on test mode
+ )
+
+ # --------------------------------------
+ # Instantiate the ranker:
+ ranker = ranking.PerfectMatchRanker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ mentions_to_wikidata=dict(),
+ wikidata_to_mentions=dict(),
+ )
+
+ with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
+ cursor = conn.cursor()
+ linker = linking.RelDisambLinker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ ranker=ranker,
+ linking_resources=dict(),
+ rel_params={
+ "model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
+ "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"),
+ "training_split": "originalsplit",
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "predict_place_of_publication": False,
+ "combined_score": False,
+ "without_microtoponyms": True,
+ "do_test": True,
+ "default_publname": "United Kingdom",
+ "default_publwqid": "Q145",
+ },
+ overwrite_training=False,
+ )
+
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+
+ predictions = geoparser.run(
+ "A remarkable case of rattening has just occurred in the building trade at Stockton, but also in Leeds. Not in London though.",
+ place_of_pub_wqid="Q989418",
+ place_of_pub="Stockton-on-Tees, Cleveland, England",
+ )
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.candidates()) == 3
+
+ # With "predict_place_of_publication" set to False, the wrong Stockton is predicted:
+ assert predictions.candidates()[0].best_wqid() != "Q989418"
+
+ geoparser.linker.rel_params["predict_place_of_publication"] = True
+
+ predictions = geoparser.run(
+ "A remarkable case of rattening has just occurred in the building trade at Stockton, but also in Leeds. Not in London though.",
+ place_of_pub_wqid="Q989418",
+ place_of_pub="Stockton-on-Tees, Cleveland, England",
+ )
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.candidates()) == 3
+
+ # With "predict_place_of_publication" set to True, the correct Stockton is predicted
+ # because the place of publication is the favoured candidate:
+ assert predictions.candidates()[0].best_wqid() == "Q989418"
+
+@pytest.mark.resources(reason="Needs large resources")
+def test_perfect_rel_combined_score():
+ model_path = os.path.join(current_dir, "../resources/models/")
+ assert os.path.isdir(model_path) is True
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
+ train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
+ test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
+ pipe=None,
+ base_model="khosseini/bert_1760_1900", # Base model to fine-tune
+ model_path=model_path,
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 1,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False, # Set to True if you want to overwrite model if existing
+ do_test=False, # Set to True if you want to train on test mode
+ )
+
+ # --------------------------------------
+ # Instantiate the ranker:
+ ranker = ranking.PerfectMatchRanker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ mentions_to_wikidata=dict(),
+ wikidata_to_mentions=dict(),
+ )
+
+ with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
+ cursor = conn.cursor()
+ linker = linking.RelDisambLinker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ ranker=ranker,
+ linking_resources=dict(),
+ rel_params={
+ "model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
+ "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"),
+ "training_split": "originalsplit",
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "predict_place_of_publication": False,
+ "combined_score": False,
+ "without_microtoponyms": True,
+ "do_test": True,
+ "default_publname": "United Kingdom",
+ "default_publwqid": "Q145",
+ "reference_separation": ((49.956739, -8.17751), (60.87, 1.762973)),
+ },
+ overwrite_training=False,
+ )
+
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+
+ predictions = geoparser.run(
+ "A remarkable case of rattening has just occurred in the building trade at Stockton, but also in Leeds.",
+ place_of_pub_wqid="Q39121",
+ place_of_pub="Leeds, West Yorkshire, England",
)
- assert resolved[0]["mention"] == "Shefiield"
- assert resolved[0]["prior_cand_score"] == dict()
- assert resolved[0]["cross_cand_score"] == dict()
- assert resolved[0]["prediction"] == "NIL"
- assert resolved[0]["ed_score"] == 0.0
- assert resolved[0]["ner_score"] == 1.0
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.candidates()) == 2
+
+ # With "combined_score" set to False, the wrong Stockton is predicted:
+ assert predictions.candidates()[0].best_wqid() != "Q989418"
+ assert predictions.candidates()[0].best_wqid() == "Q49240"
+ assert predictions.candidates()[0].best_disambiguation_score() == pytest.approx(0.225, abs=1e-3)
-@pytest.mark.skip(reason="Needs large resources")
+ geoparser.linker.rel_params["combined_score"] = True
+
+ predictions = geoparser.run(
+ "A remarkable case of rattening has just occurred in the building trade at Stockton, but also in Leeds.",
+ place_of_pub_wqid="Q39121",
+ place_of_pub="Leeds, West Yorkshire, England",
+ )
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.candidates()) == 2
+
+ # With "combined_score" set to True, the correct Stockton is predicted
+ # because the disambiguation score for the previous best candidate
+ # is curtailed by the combined score:
+ assert predictions.candidates()[0].best_wqid() == "Q989418"
+ assert predictions.candidates()[0].best_disambiguation_score() == pytest.approx(0.21, abs=1e-3)
+
+@pytest.mark.resources(reason="Needs large resources")
def test_modular_deezy_rel(tmp_path):
model_path = os.path.join(current_dir, "../resources/models/")
assert os.path.isdir(model_path) is True
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-fine",
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
pipe=None,
@@ -335,13 +664,11 @@ def test_modular_deezy_rel(tmp_path):
},
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
- load_from_hub=False, # Bool: True if model is in HuggingFace hub
)
# --------------------------------------
# Instantiate the ranker:
- myranker = ranking.Ranker(
- method="deezymatch",
+ ranker = ranking.DeezyMatchRanker(
resources_path=os.path.join(current_dir, "../resources/"),
mentions_to_wikidata=dict(),
wikidata_to_mentions=dict(),
@@ -370,14 +697,13 @@ def test_modular_deezy_rel(tmp_path):
"overwrite_training": False,
"do_test": False,
},
- already_collected_cands=dict(),
)
with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
cursor = conn.cursor()
- mylinker = linking.Linker(
- method="reldisamb",
+ linker = linking.RelDisambLinker(
resources_path=os.path.join(current_dir,"../resources/"),
+ ranker=ranker,
linking_resources=dict(),
rel_params={
"model_path": os.path.join(current_dir,"../resources/models/disambiguation/"),
@@ -385,7 +711,9 @@ def test_modular_deezy_rel(tmp_path):
"training_split": "apply",
"db_embeddings": cursor,
"with_publication": True,
- "without_microtoponyms": True,
+ "predict_place_of_publication": False,
+ "combined_score": False,
+ "without_microtoponyms": False,
"do_test": False,
"default_publname": "United Kingdom",
"default_publwqid": "Q145",
@@ -393,34 +721,208 @@ def test_modular_deezy_rel(tmp_path):
overwrite_training=False,
)
- geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker)
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
- sentence = "STOCKTON AND MIDDLESBROUGH WATER IVARD. The monthly meeting of the Sr-id:toe and bladtiltwitrough Water Lkerd was held at the Corp.acit:o.i liniklinga, Middlesbrough, on Monday."
- wikidata_id = "Q989418"
- location = "Stockton-on-Tees, Cleveland, England"
+ text = "STOCKTON AND MIDDLESBROUGH WATER IVARD. The monthly meeting of the Sr-id:toe and bladtiltwitrough Water Lkerd was held at the Corp.acit:o.i liniklinga, Middlesbrough, on Monday."
+ place_of_pub_wqid = "Q989418"
+ place_of_pub = "Stockton-on-Tees, Cleveland, England"
- toponyms = geoparser.run_text_recognition(
- sentence,
- place_wqid=wikidata_id,
- place=location,
- )
+ sentence_mentions = geoparser.run_text_recognition(text)
+
+ assert isinstance(sentence_mentions, list)
+ # Two sentences:
+ assert len(sentence_mentions) == 2
+ # Two toponyms identified in the first sentence:
+ assert len(sentence_mentions[0].mentions) == 2
+ # Three toponyms identified in the second sentence:
+ assert len(sentence_mentions[1].mentions) == 3
+
+ cands = geoparser.run_candidate_selection(sentence_mentions, place_of_pub_wqid, place_of_pub)
+
+ assert isinstance(cands, Candidates)
+ # The double space between sentences is lost:
+ assert cands.text() == ' '.join(text.split())
+
+ assert len(cands.candidates()) == 5
+ for c in cands.candidates():
+ assert isinstance(c, MentionCandidates)
+
+ predictions = geoparser.run_disambiguation(cands)
+
+ assert isinstance(predictions, Predictions)
+ assert predictions.candidates()[0].best_wqid() == "Q989418"
+ assert predictions.candidates()[0].best_disambiguation_score() == pytest.approx(0.350, abs=1e-3)
+ assert predictions.candidates()[-1].best_wqid() == "Q171866"
+ assert predictions.candidates()[-1].best_disambiguation_score() == pytest.approx(0.615, abs=1e-3)
+
+ ### Test on another chunk of text.
+ text = """Palmer, labonrce aged Y., costautted usiiide by hanging himeelf at his residence in Whittle's-eard. (lathe's street. lifiddlesbromh.
+Re threatened to hap:: himself on Elsitarday morning. and al night was found to have earcied ont hit threat with is piece of rope in his bed-room.,An inuesL was held on the licdy ad the Cleveland Pay Hotel, clegel and-s tree t, iddLesbronet, on :'!oart.ly al taru °oil.
+Fos Tint TIeETII AND 'kill ,A few drops of the W 4" FlorIllue" 'dee a wee eolli-bir neh prxi a. a: &pteaa Lai •-h thomv.hly de.ruitas tike tooth from all --Wee. harden% the WM; prevenlw '..esto.the froth e pOtilay itdatmd I had Fri. to the parlour. , Folkestone.
+Beech-street, London."""
- assert isinstance(toponyms, list)
- assert len(toponyms) == 4
+ place_of_pub_wqid = "Q989418"
+ place_of_pub = "Stockton-on-Tees, Cleveland, England"
- cands = geoparser.run_candidate_selection(toponyms)
+ sentence_mentions = geoparser.run_text_recognition(text)
+ cands = geoparser.run_candidate_selection(sentence_mentions, place_of_pub_wqid, place_of_pub)
- assert isinstance(cands, dict)
- assert len(cands) == 4
+ predictions = geoparser.run_disambiguation(cands)
- disambiguation = geoparser.run_disambiguation(
- toponyms,
- cands,
- place_wqid=wikidata_id,
- place=location,
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.candidates(ignore_empty_candidates=False)) == 9
+ assert len(predictions.candidates(ignore_empty_candidates=True)) == 8
+
+ # Test without microtoponyms.
+ geoparser.linker.rel_params["without_microtoponyms"] = True
+
+ sentence_mentions = geoparser.run_text_recognition(text)
+ cands = geoparser.run_candidate_selection(sentence_mentions, place_of_pub_wqid, place_of_pub)
+ predictions = geoparser.run_disambiguation(cands)
+
+ assert isinstance(predictions, RelPredictions)
+
+ assert len(predictions.candidates(ignore_empty_candidates=False)) == 5
+ assert all([not c.mention.is_microtoponym() for c in predictions.candidates(ignore_empty_candidates=False)])
+ assert len(predictions.candidates(ignore_empty_candidates=True)) == 4
+
+@pytest.mark.resources(reason="Needs large resources")
+def test_combined_score(tmp_path):
+
+ model_path = os.path.join(current_dir, "../resources/models/")
+ assert os.path.isdir(model_path) is True
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
+ train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
+ test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
+ pipe=None,
+ base_model="khosseini/bert_1760_1900", # Base model to fine-tune
+ model_path=model_path,
+ training_args={
+ "batch_size": 8,
+ "num_train_epochs": 1,
+ "learning_rate": 0.00005,
+ "weight_decay": 0.0,
+ },
+ overwrite_training=False, # Set to True if you want to overwrite model if existing
+ do_test=False, # Set to True if you want to train on test mode
)
- assert isinstance(disambiguation,list)
+ # --------------------------------------
+ # Instantiate the ranker:
+ ranker = ranking.DeezyMatchRanker(
+ resources_path=os.path.join(current_dir, "../resources/"),
+ mentions_to_wikidata=dict(),
+ wikidata_to_mentions=dict(),
+ strvar_parameters={
+ # Parameters to create the string pair dataset:
+ "ocr_threshold": 60,
+ "top_threshold": 85,
+ "min_len": 5,
+ "max_len": 15,
+ "w2v_ocr_path": str(tmp_path),
+ "w2v_ocr_model": "w2v_1800s_news",
+ "overwrite_dataset": False,
+ },
+ deezy_parameters={
+ # Paths and filenames of DeezyMatch models and data:
+ "dm_path": os.path.join(current_dir, "../resources/deezymatch/"),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ # Ranking measures:
+ "ranking_metric": "faiss",
+ "selection_threshold": 50,
+ "num_candidates": 1,
+ "verbose": False,
+ # DeezyMatch training:
+ "overwrite_training": False,
+ "do_test": False,
+ },
+ )
+
+ with sqlite3.connect(os.path.join(current_dir, "../resources/rel_db/embeddings_database.db")) as conn:
+ cursor = conn.cursor()
+ linker = linking.RelDisambLinker(
+ resources_path=os.path.join(current_dir,"../resources/"),
+ ranker=ranker,
+ linking_resources=dict(),
+ rel_params={
+ "db_embeddings": cursor,
+ "with_publication": True,
+ "predict_place_of_publication": False,
+ "combined_score": True,
+ "without_microtoponyms": False,
+ },
+ overwrite_training=False,
+ )
+
+ geoparser = pipeline.Pipeline(recogniser=recogniser, ranker=ranker, linker=linker)
+
+ text = """There was very little to choose between the play of the two teams, and why the Penrith forwards did not bang the ball out of the scrummage during the quarter of an hour they had the Aspatria men penned within their "25," and their backs having the assistance of the wind to kick with, was a puzzler to me, and why the backs didn't kick more during the second half was another puzzler."""
+
+ place_of_pub = "Carlisle, Cumbria, England"
+ place_of_pub_wqid = "Q192896"
+
+ predictions = geoparser.run(text, place_of_pub_wqid, place_of_pub)
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.rel_scores) == 1
+ combined_scores = predictions.rel_scores[0]
+
+ # Check that Penrith, Australia is the REL prediction but Penrith, Cumbria
+ # is the prediction *after* applying the combined score.
+
+ # Penrith, Cumbria is Q798906, latlon (54.6648, -2.7548).
+ assert predictions.best_wqids()[0] == 'Q798906'
+ assert predictions.best_coords()[0] == (54.6648, -2.7548)
+
+ # Combined scores:
+ assert combined_scores.scores['Q798906'] == pytest.approx(0.26184, 1e-4)
+ assert combined_scores.scores['Q798906'] == max(combined_scores.scores.values())
+ # REL scores:
+ assert combined_scores.rel_scores['Q798906'] == pytest.approx(0.26195, 1e-4)
+ assert combined_scores.rel_scores['Q798906'] != max(combined_scores.scores.values())
+
+ # Penrith, Australia is Q385155, latlon (-33.751111, 150.694167).
+ assert combined_scores.scores['Q385155'] == pytest.approx(0.15684, 1e-4)
+ assert combined_scores.scores['Q385155'] != max(combined_scores.scores.values())
+
+ assert combined_scores.rel_scores['Q385155'] == pytest.approx(0.39417, 1e-4)
+ assert combined_scores.rel_scores['Q385155'] == max(combined_scores.rel_scores.values())
+
+ # print("combined scores:")
+ # for k, v in sorted(combined_scores.scores.items(), key=lambda item: item[1], reverse=True):
+ # print(f'{k}: {v}')
+ # print("REL scores:")
+ # for k, v in sorted(combined_scores.rel_scores.items(), key=lambda item: item[1], reverse=True):
+ # print(f'{k}: {v}')
+
+ # Re-run the same test but omitting place of publication info, so the default is used (UK).
+ predictions = geoparser.run(text)
+
+ assert isinstance(predictions, RelPredictions)
+ assert len(predictions.rel_scores) == 1
+ combined_scores = predictions.rel_scores[0]
+
+ # Check that Penrith, Australia is the REL prediction but Penrith, Cumbria
+ # is the prediction *after* applying the combined score.
+
+ # Penrith, Cumbria is Q798906, latlon (54.6648, -2.7548).
+ assert predictions.best_wqids()[0] == 'Q798906'
+ assert predictions.best_coords()[0] == (54.6648, -2.7548)
+
+ # Combined scores:
+ assert combined_scores.scores['Q798906'] == pytest.approx(0.29257, 1e-4)
+ assert combined_scores.scores['Q798906'] == max(combined_scores.scores.values())
+ # REL scores:
+ assert combined_scores.rel_scores['Q798906'] == pytest.approx(0.29295, 1e-4)
+ assert combined_scores.rel_scores['Q798906'] != max(combined_scores.scores.values())
+
+ # Penrith, Australia is Q385155, latlon (-33.751111, 150.694167).
+ assert combined_scores.scores['Q385155'] == pytest.approx(0.12602, 1e-4)
+ assert combined_scores.scores['Q385155'] != max(combined_scores.scores.values())
- assert disambiguation[0]["prediction"] == "Q989418"
- assert disambiguation[-1]["prediction"] == "Q171866"
+ assert combined_scores.rel_scores['Q385155'] == pytest.approx(0.316710, 1e-4)
+ assert combined_scores.rel_scores['Q385155'] == max(combined_scores.rel_scores.values())
diff --git a/tests/test_process_data.py b/tests/test_process_data.py
index 9d5fb683..b0566132 100644
--- a/tests/test_process_data.py
+++ b/tests/test_process_data.py
@@ -5,7 +5,7 @@
import pandas as pd
import pytest
-from t_res.geoparser import recogniser
+from t_res.geoparser import ner
from t_res.utils import process_data
current_dir = Path(__file__).parent.resolve()
@@ -64,9 +64,11 @@ def test_prepare_sents():
assert len([x for x, y in dMetadata.items() if len(y) == 0]) == 0
+@pytest.mark.train(reason="Trains an NER model")
def test_align_gold(tmp_path):
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-fine",
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
pipe=None,
@@ -80,11 +82,9 @@ def test_align_gold(tmp_path):
},
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
- load_from_hub=False, # Bool: True if model is in HuggingFace hub
)
- myner.train()
- myner.pipe = myner.create_pipeline()
+ recogniser.load()
dataset_df = pd.read_csv(
os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"),
@@ -97,7 +97,7 @@ def test_align_gold(tmp_path):
if "3580760_2" == sent_id:
sent = dSentences[sent_id]
annotations = dAnnotated[sent_id]
- predictions = myner.ner_predict(sent)
+ predictions = recogniser.ner_predict(sent)
gold_positions = process_data.align_gold(predictions, annotations)
I_elements = [
@@ -121,9 +121,11 @@ def test_align_gold(tmp_path):
assert len(empty_list) == 0
+@pytest.mark.train(reason="Trains an NER model")
def test_ner_and_process(tmp_path):
- myner = recogniser.Recogniser(
- model="blb_lwm-ner-fine",
+
+ recogniser = ner.CustomRecogniser(
+ model_name="blb_lwm-ner-fine",
train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"),
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"),
pipe=None,
@@ -137,11 +139,9 @@ def test_ner_and_process(tmp_path):
},
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
- load_from_hub=False, # Bool: True if model is in HuggingFace hub
)
- myner.train()
- myner.pipe = myner.create_pipeline()
+ recogniser.load()
dataset_df = pd.read_csv(
os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"),
@@ -157,7 +157,7 @@ def test_ner_and_process(tmp_path):
gold_tokenization,
dMentionsPred,
dMentionsGold,
- ) = process_data.ner_and_process(dSentences, dAnnotated, myner)
+ ) = process_data.ner_and_process(dSentences, dAnnotated, recogniser)
B_els = [
[z for z in range(len(y)) if "B-" in y[z]["entity"]]
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
index a2d9dfb9..30c9120b 100644
--- a/tests/test_ranking.py
+++ b/tests/test_ranking.py
@@ -3,116 +3,255 @@
import pytest
-from t_res.geoparser import ranking
+from t_res.geoparser.ranking import *
+from t_res.utils.dataclasses import *
current_dir = Path(__file__).parent.resolve()
+def test_init():
+
+ # Test the default parameters.
+ ranker = DeezyMatchRanker(
+ resources_path=os.path.join(current_dir,"../resources/")
+ )
+ assert not ranker.deezy_parameters['verbose']
+ assert not ranker.deezy_parameters['overwrite_training']
+ assert not ranker.deezy_parameters['do_test']
+
+ # Test that default parameters are overridden if passed explicitly.
+ ranker = DeezyMatchRanker(
+ resources_path=os.path.join(current_dir,"../resources/"),
+ deezy_parameters = {'verbose': True, 'do_test': True}
+ )
+ assert ranker.deezy_parameters['verbose']
+ assert not ranker.deezy_parameters['overwrite_training']
+ assert ranker.deezy_parameters['do_test']
+
+def test_new():
+ # Test Ranker construction via string parameters.
+
+ # If a required parameter is omitted, expect a TypeError.
+ kwargs = {
+ 'method_name': 'perfectmatch',
+ }
+ with pytest.raises(TypeError):
+ ranker = Ranker.new(**kwargs)
+
+ kwargs = {
+ 'method_name': 'perfectmatch',
+ 'resources_path': 'sample_files/resources/',
+ }
+ ranker = Ranker.new(**kwargs)
+ assert isinstance(ranker, PerfectMatchRanker)
+ assert ranker.method_name == 'perfectmatch'
+ assert ranker.mentions_to_wikidata == dict()
+
+ kwargs = {
+ 'method_name': 'levenshtein',
+ 'resources_path': 'sample_files/resources/',
+ }
+ ranker = Ranker.new(**kwargs)
+ assert isinstance(ranker, LevenshteinRanker)
+ assert ranker.method_name == 'levenshtein'
+
+ kwargs = {
+ 'method_name': 'deezymatch',
+ 'resources_path': 'sample_files/resources/',
+ }
+ ranker = Ranker.new(**kwargs)
+ assert isinstance(ranker, DeezyMatchRanker)
+ assert ranker.method_name == 'deezymatch'
+
+ # If the ranking method is invalid, expect a ValueError.
+ kwargs = {
+ 'method_name': 'nosuchmatch',
+ }
+ with pytest.raises(ValueError):
+ ranker = Ranker.new(**kwargs)
+
def test_ranking_perfect_match():
"""
Test that perfect_match returns only perfect matching cases
"""
- myranker = ranking.Ranker(
- method="perfectmatch",
- resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ ranker = PerfectMatchRanker(
+ resources_path=os.path.join(current_dir, "sample_files/resources/"),
)
+ assert ranker.method_name == "perfectmatch"
- myranker.mentions_to_wikidata = myranker.load_resources()
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.perfect_match(["London"])
- assert candidates["London"]["London"] == 1.0
+ ranker.load()
+ ranker.cache = {}
+
+ # Check the cache is empty.
+ assert len(ranker.cache) == 0
- candidates, already_collected_cands = myranker.perfect_match(["Lvndon"])
- assert candidates["Lvndon"] == {}
+ # Construct a dummy mention for the test.
+ mention = Mention("London", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
- candidates, already_collected_cands = myranker.perfect_match(["Paperopoli"])
- assert candidates["Paperopoli"] == {}
+ assert candidates.ranking_method == "perfectmatch"
+ assert candidates.mention.mention == "London"
+ assert candidates.get("London").variation == "London"
+ assert candidates.get("London").string_similarity == 1.0
+ # Check the cache has been updated.
+ assert len(ranker.cache) == 1
+ assert ranker.cache["London"] == candidates.matches
-def test_ranking_damlev():
- """
- Test that damlev returns correctly
- """
- myranker = ranking.Ranker(
- method="partialmatch",
- resources_path=os.path.join(current_dir,"sample_files/resources/"),
- )
+ mention = Mention("Lvndon", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
- score = myranker.damlev_dist("Lvndon", {"mentions": "London"})
- assert score == 0.8333333283662796
+ assert candidates.ranking_method == "perfectmatch"
+ assert candidates.mention.mention == "Lvndon"
+ assert candidates.is_empty()
- score = myranker.damlev_dist("uityity", {"mentions": "asdasd"})
- assert score == 0.0
+ # Check the cache has been updated.
+ assert len(ranker.cache) == 2
+ assert ranker.cache["Lvndon"] == candidates.matches
- with pytest.raises(TypeError):
- myranker.damlev_dist("Lvndon", "London")
+ # Construct a dummy mention for the test.
+ mention = Mention("Paperopoli", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+
+ assert candidates.ranking_method == "perfectmatch"
+ assert candidates.mention.mention == "Paperopoli"
+ assert candidates.is_empty()
+ # Check the cache has been updated.
+ assert len(ranker.cache) == 3
+ assert ranker.cache["Paperopoli"] == candidates.matches
-def test_ranking_check_if_contained():
+def test_ranking_matching_score():
"""
- Test that check_if_contained returns score only when there is an overlap
+    Test matching_score for both the overlap (partial match) and Levenshtein rankers
"""
- myranker = ranking.Ranker(
- method="partialmatch",
+ # Test the overlap matching score.
+ ranker = PartialMatchRanker(
resources_path=os.path.join(current_dir,"sample_files/resources/"),
)
+ assert ranker.method_name == "partialmatch"
- score_a = myranker.check_if_contained("New York", {"mentions": "New York City"})
- score_b = myranker.check_if_contained("New York City", {"mentions": "New York"})
+ score_a = ranker.matching_score("New York", {"mentions": "New York City"})
+ score_b = ranker.matching_score("New York City", {"mentions": "New York"})
assert score_a == score_b == 0.6153846153846154
with pytest.raises(TypeError):
- myranker.check_if_contained("Lvndon", "London")
+ ranker.matching_score("Lvndon", "London")
- score = myranker.check_if_contained("London", {"mentions": "New York"})
+ score = ranker.matching_score("London", {"mentions": "New York"})
assert score is None
+ # Test the Levenshtein distance matching score.
+ ranker = LevenshteinRanker(
+ resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ )
+ ranker.load()
+
+ score = ranker.matching_score("Lvndon", {"mentions": "London"})
+ assert score == 0.8333333283662796
+
+ score = ranker.matching_score("uityity", {"mentions": "asdasd"})
+ assert score == 0.0
+
+ with pytest.raises(TypeError):
+ ranker.matching_score("Lvndon", "London")
def test_ranking_partial_match():
"""
Test that partial match either returns results or {}
"""
- myranker = ranking.Ranker(
- method="partialmatch",
+ ranker = PartialMatchRanker(
resources_path=os.path.join(current_dir,"sample_files/resources/"),
)
+ assert ranker.method_name == "partialmatch"
+
+ ranker.load()
- myranker.mentions_to_wikidata = myranker.load_resources()
-
- # Test that perfect_match acts before partial match
- myranker.mentions_to_wikidata = {"London": "Q84"}
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.partial_match(["London"], damlev=False)
- assert candidates["London"]["London"] == 1.0
+ ranker.mentions_to_wikidata = {"London": {"Q84": 0.922}}
+ ranker.cache = {}
- # Test that damlev works
- myranker.already_collected_cands = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("London", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
- candidates, already_collected_cands = myranker.partial_match(["Lvndvn"], damlev=True)
- assert candidates["Lvndvn"]["London"] == 0.6666666567325592
+ assert candidates.ranking_method == "partialmatch"
+ assert candidates.mention.mention == "London"
+ assert candidates.get("London").variation == "London"
+ assert candidates.get("London").string_similarity == 1.0
# Test that overlap works properly
- myranker.mentions_to_wikidata = {"New York City": "Q60"}
- myranker.already_collected_cands = {}
+ ranker.mentions_to_wikidata = {"New York City": {"Q60": 0.884}}
+
+ ranker.cache = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("New York", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+
+ assert candidates.mention.mention == "New York"
+ assert candidates.get("New York City").variation == "New York City"
+ assert candidates.get("New York City").string_similarity == pytest.approx(0.615384, abs=10e-6)
+
+ ranker.cache = {}
+ mention = Mention("Lvndvn", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+
+ assert candidates.mention.mention == "Lvndvn"
+ assert candidates.is_empty()
- candidates, already_collected_cands = myranker.partial_match(["New York"], damlev=False)
- assert candidates["New York"]["New York City"] == 0.6153846153846154
- myranker.already_collected_cands = {}
+def test_ranking_levenshtein():
+ """
+    Test that the Levenshtein ranker either returns results or {}
+ """
+
+ ranker = LevenshteinRanker(
+ resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ )
+ assert ranker.method_name == "levenshtein"
+
+ ranker.load()
- candidates, already_collected_cands = myranker.partial_match(["Lvndvn"], damlev=False)
- assert candidates["Lvndvn"] == {}
+ ranker.mentions_to_wikidata = {"London": {"Q84": 0.922}}
+ ranker.cache = {}
- myranker.already_collected_cands = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("London", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.get("London").string_similarity == 1.0
- candidates, already_collected_cands = myranker.partial_match(["asdasd"], damlev=True)
- assert candidates["asdasd"] == {"New York City": 0.0}
+ ranker.cache = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("Lvndvn", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.get("London").string_similarity == pytest.approx(0.66666665, abs=10e-6)
-@pytest.mark.skip(reason="Needs deezy model")
+ # Test that overlap works properly
+ ranker.mentions_to_wikidata = {"New York City": {"Q60": 0.884}}
+
+ ranker.cache = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("New York", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "New York"
+ assert candidates.get("New York City").string_similarity == pytest.approx(0.615384615, abs=10e-6)
+
+ ranker.cache = {}
+ mention = Mention("Lvndvn", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "Lvndvn"
+ assert candidates.get("New York City").string_similarity == 0.0
+
+ ranker.cache = {}
+ mention = Mention("asdasd", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "asdasd"
+ assert candidates.get("New York City").string_similarity == 0.0
+
+
+@pytest.mark.resources(reason="Needs deezy model")
def test_ranking_deezy_on_the_fly(tmp_path):
- myranker = ranking.Ranker(
- method="deezymatch",
+ ranker = DeezyMatchRanker(
resources_path=os.path.join(current_dir,"../resources/"),
mentions_to_wikidata=dict(),
wikidata_to_mentions=dict(),
@@ -142,24 +281,93 @@ def test_ranking_deezy_on_the_fly(tmp_path):
"overwrite_training": False,
"do_test": False,
},
- already_collected_cands=dict(),
)
+ assert ranker.method_name == "deezymatch"
+
+ ranker.load()
+
+ # Construct a dummy mention for the test.
+ mention = Mention("London", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
# Test that perfect_match acts before deezy
- myranker.mentions_to_wikidata = myranker.load_resources()
- candidates, already_collected_cands = myranker.deezy_on_the_fly(["London"])
- assert candidates["London"]["London"] == 1.0
+ assert candidates.mention.mention == "London"
+ assert candidates.get("London").string_similarity == 1.0
# Test that deezy works
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.deezy_on_the_fly(["Ashton-cnderLyne"])
+ ranker.cache = {}
+ # Construct a dummy mention for the test.
+ mention = Mention("Ashton-cnderLyne", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "Ashton-cnderLyne"
+ assert candidates.ranking_method == "deezymatch"
+
+ assert len(candidates.matches) == 3
+ assert (0.0 < candidates.get("Ashton under Lyne").string_similarity < 1.0)
+ assert (0.0 < candidates.get("Ashton-under-Lyne").string_similarity < 1.0)
+ assert (0.0 < candidates.get("Aston-under-Lynne").string_similarity < 1.0)
+
+
+@pytest.mark.resources(reason="Needs deezy model")
+def test_ranking_deezy_on_the_fly_queries(tmp_path):
+ ranker = DeezyMatchRanker(
+ resources_path=os.path.join(current_dir,"../resources/"),
+ mentions_to_wikidata=dict(),
+ wikidata_to_mentions=dict(),
+ strvar_parameters={
+ # Parameters to create the string pair dataset:
+ "ocr_threshold": 60,
+ "top_threshold": 85,
+ "min_len": 5,
+ "max_len": 15,
+ "w2v_ocr_path": str(tmp_path),
+ "w2v_ocr_model": "w2v_1800s_news",
+ "overwrite_dataset": False,
+ },
+ deezy_parameters={
+ # Paths and filenames of DeezyMatch models and data:
+ "dm_path": os.path.join(current_dir, "../resources/deezymatch/"),
+ "dm_cands": "wkdtalts",
+ "dm_model": "w2v_ocr",
+ "dm_output": "deezymatch_on_the_fly",
+ # Ranking measures:
+ "ranking_metric": "cosine",
+ "selection_threshold": 0.9,
+ "num_candidates": 3,
+ "search_size": 3,
+ "verbose": False,
+ # DeezyMatch training:
+ "overwrite_training": False,
+ "do_test": False,
+ },
+ )
+ assert ranker.method_name == "deezymatch"
+
+ ranker.load()
+
+ queries = ['Crewe', 'Bamsley', 'Madchester', 'London']
+
+ # Construct dummy mention for the test.
+ mentions = [Mention(q, 0, 0, 0, 0.0, 'LOC', 'O') for q in queries]
+ candidates = ranker.run(mentions)
+
+ assert len(candidates) == 4
+
+ assert candidates[0].mention.mention == 'Crewe'
+ assert candidates[0].matches[0].variation == 'Crewe'
+
+ assert candidates[1].mention.mention == 'Bamsley'
+ assert candidates[1].matches[0].variation == 'Beamsley'
- assert (0.0 < candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] < 1.0)
+ assert candidates[2].mention.mention == 'Madchester'
+ assert candidates[2].matches[0].variation == 'Marmes Rock Shelter'
-@pytest.mark.skip(reason="Needs deezy model")
-def test_ranking_find_candidates(tmp_path):
- myranker = ranking.Ranker(
- method="deezymatch",
+ assert candidates[3].mention.mention == 'London'
+ assert candidates[3].matches[0].variation == 'London'
+
+@pytest.mark.resources(reason="Needs deezy model")
+def test_ranking_attach_wikidata(tmp_path):
+ ranker = DeezyMatchRanker(
resources_path=os.path.join(current_dir,"../resources/"),
mentions_to_wikidata=dict(),
wikidata_to_mentions=dict(),
@@ -189,60 +397,82 @@ def test_ranking_find_candidates(tmp_path):
"overwrite_training": False,
"do_test": True,
},
- already_collected_cands=dict(),
)
+ ranker.load(train=False)
- # Test that perfect_match acts before deezy
- myranker.mentions_to_wikidata = myranker.load_resources()
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "London"}])
- assert candidates["London"]["London"]["Score"] == 1.0
- assert "Q84" in candidates["London"]["London"]["Candidates"]
-
- # Test that deezy works
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}])
- assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0)
- assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"]
+ # Check the cache is empty.
+ assert len(ranker.cache) == 0
- # Test that Perfect Match works
- myranker.method = "perfectmatch"
+ # Construct a dummy mention for the test.
+ mention = Mention("London", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "London"
+ assert isinstance(candidates.get("London"), StringMatchLinks)
+ assert candidates.get("London").variation == "London"
# Test that perfect_match acts before deezy
- myranker.mentions_to_wikidata = myranker.load_resources()
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}])
- assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0
- assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"]
+ assert candidates.get("London").string_similarity == 1.0
+ assert len(candidates.get("London").wqid_links) == 194
+ assert "Q84" in candidates.get("London").wqid_links
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}])
- assert candidates["Sheftield"] == {}
+ # Check the cache has been updated.
+ assert len(ranker.cache) == 1
+ assert ranker.cache["London"] == candidates.matches
- # Test that check if contained works
- myranker.method = "partialmatch"
+ # Test that deezy works
+ # TODO: add a ranker.clear_cache() method.
+ ranker.cache = {}
+
+ # Construct a dummy mention for the test.
+ mention = Mention("Sheftield", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "Sheftield"
+ assert isinstance(candidates.get("Sheffield"), StringMatchLinks)
+ assert candidates.get("Sheffield").variation == "Sheffield"
+ assert (0.0 < candidates.get("Sheffield").string_similarity < 1.0)
+ assert len(candidates.get("Sheffield").wqid_links) == 50
+ assert "Q42448" in candidates.get("Sheffield").wqid_links
- # Test that perfect_match acts before partialmatch
- myranker.mentions_to_wikidata = myranker.load_resources()
+ # Test that Perfect Match works
+ ranker = PerfectMatchRanker(
+ resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ )
+ ranker.load()
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}])
- assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0
- assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"]
+ # Construct a dummy mention for the test.
+ mention = Mention("Sheffield", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.get("Sheffield").variation == "Sheffield"
+ assert candidates.get("Sheffield").string_similarity == 1.0
+ assert "Q42448" in candidates.get("Sheffield").wqid_links
- myranker.already_collected_cands = {}
+ ranker.cache = {}
+ mention = Mention("Sheftield", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.is_empty()
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}])
- assert "Sheffield" not in candidates["Sheftield"]
+ # Test that check if contained works
+ ranker = PartialMatchRanker(
+ resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ )
+ ranker.load()
# Test that levenshtein works
- myranker.method = "levenshtein"
-
- # Test that perfect_match acts before partialmatch
- myranker.mentions_to_wikidata = myranker.load_resources()
-
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}])
- assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0
- assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"]
-
- myranker.already_collected_cands = {}
- candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}])
- assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0)
- assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"]
\ No newline at end of file
+ ranker = LevenshteinRanker(
+ resources_path=os.path.join(current_dir,"sample_files/resources/"),
+ )
+ ranker.load()
+
+ # Construct a dummy mention for the test.
+ mention = Mention("Sheffield", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.get("Sheffield").variation == "Sheffield"
+ assert candidates.get("Sheffield").string_similarity == 1.0
+ assert "Q42448" in candidates.get("Sheffield").wqid_links
+
+ ranker.cache = {}
+ mention = Mention("Sheftield", 0, 0, 0, 0.0, 'LOC', 'O')
+ candidates = ranker.run([mention])[0]
+ assert candidates.mention.mention == "Sheftield"
+ assert (0.0 < candidates.get("Sheffield").string_similarity < 1.0)
+ assert "Q42448" in candidates.get("Sheffield").wqid_links
\ No newline at end of file
diff --git a/tests/test_wiki_functions.py b/tests/test_wiki_functions.py
index 0b677d66..40aba8db 100644
--- a/tests/test_wiki_functions.py
+++ b/tests/test_wiki_functions.py
@@ -17,7 +17,7 @@ def test_make_links_consistent():
assert (process_wikipedia.make_wikilinks_consistent(string_a) == string_a) is False
assert process_wikipedia.make_wikilinks_consistent(string_c) == "new%20york"
-@pytest.mark.skip(reason="Needs large db file")
+@pytest.mark.resources(reason="Needs large db file")
def test_wikidata2wikipedia():
db = "resources/wikipedia/index_enwiki-latest.db"
assert process_wikipedia.title_to_id("BOLOGNA", lower=True, path_to_db=db) is None