Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
**/_*
!**/__init__.py
valtiopy/cfg_list.json
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
cff-version: 1.2.0
message: To cite this repository, please use these metadata.
title: "valtiopy: access the Finnish Parliament Corpus"
version: v0.0.2
version: v0.0.3
authors:
- family-names: Borges
given-names: Robert
orcid: "https://orcid.org/0000-0002-7647-4048"
alias: BobBorges
date-released: 2025-02-25
date-released: 2025-05-20
identifiers:
- description: Repository basename
type: other
Expand Down
2 changes: 1 addition & 1 deletion docs/search.js

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion docs/valtiopy.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,15 @@ <h2>Submodules</h2>
<li><a href="valtiopy/args.html">args</a></li>
<li><a href="valtiopy/config.html">config</a></li>
<li><a href="valtiopy/curate.html">curate</a></li>
<li><a href="valtiopy/metadata.html">metadata</a></li>
<li><a href="valtiopy/plot.html">plot</a></li>
<li><a href="valtiopy/regex.html">regex</a></li>
<li><a href="valtiopy/sample.html">sample</a></li>
<li><a href="valtiopy/utils.html">utils</a></li>
</ul>


<footer>v0.0.2</footer>
<footer>v0.0.3</footer>

<a class="attribution" title="pdoc: Python API documentation generator" href="https://pdoc.dev" target="_blank">
built with <span class="visually-hidden">pdoc</span><img
Expand Down
2 changes: 1 addition & 1 deletion docs/valtiopy/args.html
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ <h2>API Documentation</h2>
</ul>


<footer>v0.0.2</footer>
<footer>v0.0.3</footer>

<a class="attribution" title="pdoc: Python API documentation generator" href="https://pdoc.dev" target="_blank">
built with <span class="visually-hidden">pdoc</span><img
Expand Down
619 changes: 336 additions & 283 deletions docs/valtiopy/config.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/valtiopy/curate.html
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ <h2>API Documentation</h2>
</ul>


<footer>v0.0.2</footer>
<footer>v0.0.3</footer>

<a class="attribution" title="pdoc: Python API documentation generator" href="https://pdoc.dev" target="_blank">
built with <span class="visually-hidden">pdoc</span><img
Expand Down
448 changes: 448 additions & 0 deletions docs/valtiopy/metadata.html

Large diffs are not rendered by default.

494 changes: 494 additions & 0 deletions docs/valtiopy/plot.html

Large diffs are not rendered by default.

313 changes: 313 additions & 0 deletions docs/valtiopy/regex.html

Large diffs are not rendered by default.

456 changes: 456 additions & 0 deletions docs/valtiopy/sample.html

Large diffs are not rendered by default.

394 changes: 318 additions & 76 deletions docs/valtiopy/utils.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "valtiopy"
version = "v0.0.2"
version = "v0.0.3"
description = "Work with the Valtiopaivat Corpus."
authors = ["bobborges"]
repository = "https://github.com/swerik-project/valtiopy"
Expand Down
16 changes: 10 additions & 6 deletions valtiopy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@ def __init__(self, **kwargs):
self.ValtiopaivatRecordsTEILocation = None
self.ValtiopaivatRecordsALTOLocation = None
self.ValtiopaivatRecordsPDFLocation = None
self.ValtiopaivatRecordsLOMap = None
self.ValtiopaivatHandlingarTEILocation = None
self.ValtiopaivatHandlingarALTOLocation = None
self.ValtiopaivatHandlingarPDFLocation = None
self.ValtiopaivatRegisterTEILocation = None
self.ValtiopaivatRegisterALTOLocation = None
self.ValtiopaivatRegisterPDFLocation = None
self.ValtiopaivatHandlingarLOMap = None
self.ValtiopaivatRegistersTEILocation = None
self.ValtiopaivatRegistersALTOLocation = None
self.ValtiopaivatRegistersPDFLocation = None
self.ValtiopaivatRegistersLOMap = None

for k,v in kwargs.items():
if hasattr(self, k):
setattr(self, k, v)
else:
warnings.warm(f"The property -- {k} -- from the config file is not a valid property. Ignoring")
warnings.warn(f"The property -- {k} -- from the config file is not a valid property. Ignoring")

def write(self):
"""
Expand All @@ -56,7 +59,7 @@ def update(self, **kwargs):



def track_existing_config(name = None, location = None):
def track_existing_config(name = None, location = None, overwrite_existing=False):
"""
Assign a name to an existing config file.

Expand All @@ -79,7 +82,8 @@ def track_existing_config(name = None, location = None):
cfg_list = {}

if name in cfg_list:
raise Exception(f"The name -- {name} -- already exists as a named config.")
if overwrite_existing == False:
raise Exception(f"The name -- {name} -- already exists as a named config. (pass overwrite_existing=True to override this error)")

cfg_list[name] = os.path.abspath(location)

Expand Down
64 changes: 64 additions & 0 deletions valtiopy/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from glob import glob
import pandas as pd
import warnings



def fetch_metadata_tables(metadata_path="valtiopaivat-persons/data"):
    """
    Read every CSV file directly under `metadata_path` into a DataFrame.

    Args

        metadata_path (str): directory to search for `*.csv` metadata tables

    Returns

        list of pd.DataFrame objects, one per CSV file, ordered by file path.
        The list is empty when no CSV file matches.
    """
    # glob() yields paths in arbitrary, filesystem-dependent order. Sort them so
    # the table order (and with it the result of any later merge) is reproducible.
    csv_paths = sorted(glob(f"{metadata_path}/*.csv"))
    return [pd.read_csv(csv_path) for csv_path in csv_paths]


def join_metadata_tables(tables, key="swerik_person_id"):
    """
    Join metadata tables on a key. Return all possible combinations of data on the matching key.

    The tables are merged left to right, once as an inner join and once as an
    outer join. Both results are concatenated (inner join first) and duplicate
    rows are dropped.

    Args

        tables (list): list of pd dataFrame objects
        key (str): common column to merge on

    Returns

        pandas df. When `tables` holds a single table, that table is returned
        unchanged and a warning is issued.

    Raises

        TypeError: if `tables` is not a list, or any element is not a pd.DataFrame
        ValueError: if `tables` is empty
    """
    # Validate with explicit exceptions: assert statements are stripped under
    # `python -O`, and the list check has to come before anything is indexed.
    if not isinstance(tables, list):
        raise TypeError("Expected a list")
    if not tables:
        raise ValueError("Expected at least one table")
    if not all(isinstance(table, pd.DataFrame) for table in tables):
        raise TypeError("expected a pandas dataframe")

    if len(tables) == 1:
        warnings.warn("I can't merge a table with itself. You get back what you put in.")
        return tables[0]

    # Fold the remaining tables into both running joins, left to right.
    inner_df = outer_df = tables[0]
    for table in tables[1:]:
        inner_df = pd.merge(inner_df, table, how='inner', on=key)
        outer_df = pd.merge(outer_df, table, how='outer', on=key)
    return pd.concat([inner_df, outer_df]).drop_duplicates()


def fetch_metadata(metadata_path="valtiopaivat-persons/data"):
    """
    Load all metadata tables found at `metadata_path`, merge them into one df
    and return it sorted by `swerik_person_id`.
    """
    tables = fetch_metadata_tables(metadata_path=metadata_path)
    merged = join_metadata_tables(tables)
    return merged.sort_values(by=["swerik_person_id"])
97 changes: 97 additions & 0 deletions valtiopy/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""
Plot stuff
"""
from cycler import cycler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def plot_speaker_mapping(df, out_path="test/result/speaker-mapping.png"):
    """
    Plot total vs matched speaker introductions per year, overall and per estate.

    Args

        df: pandas DataFrame with the columns `estate`, `year`, `total` and `matched`
            (presumably produced by test.known-speakers -- verify against the caller)
        out_path: where the plot gets written

    Return

        nothing, but writes a plot
    """
    estates = df["estate"].unique()
    # Lines are drawn in total/matched pairs: each pair shares a colour, the
    # first line is solid and thick, the second dashed and thinner.
    colors = list('kkbbggrrcc')
    default_cycler = (cycler(color=colors) +
                      cycler(linestyle=(['-', '--']*5)) +
                      cycler(linewidth=([2, 1.25]*5))
                      )
    # NOTE: this changes matplotlib's global rc settings; it has to happen
    # before the axes are created so that they pick the cycle up.
    plt.rc('axes', prop_cycle=default_cycler)
    f, ax = plt.subplots(figsize=(16,7))
    try:
        legend_text = []

        # Totals over all estates, summed per year.
        ddf = df.groupby("year")[["total", "matched"]].sum().reset_index()
        x = ddf['year'].tolist()
        for count in ["total", "matched"]:
            legend_text.append(f"ALL:{count}")
            Y = ddf[count].tolist()
            X, Y = zip(*sorted(zip(x, Y), key=lambda pair: pair[0]))
            plt.plot(X, Y)

        # One total/matched pair per estate, in the order the estates first appear.
        for estate in estates:
            dfv = df.loc[df["estate"] == estate]
            x = dfv['year'].tolist()
            for count in ["total", "matched"]:
                legend_text.append(f"{estate}:{count}")
                Y = dfv[count].tolist()
                X, Y = zip(*sorted(zip(x, Y), key=lambda pair: pair[0]))
                plt.plot(X, Y)

        plt.title('Coverage of matched speakers vs total speakers')
        # Legend entries are matched to the lines by plotting order.
        plt.legend(legend_text, loc ="upper right")
        ax.set_xlabel('Year')
        ax.tick_params(axis='x', labelrotation=90)
        plt.savefig(out_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails; otherwise every
        # call leaves another open figure behind in pyplot's global state.
        plt.close(f)



def plot_speaker_mapping_proportion(df, out_path="test/result/speaker-mapping-proportion.png"):
    """
    Plot the proportion of matched speaker introductions (matched / total) per
    year, overall and per estate.

    Args

        df: pandas DataFrame with the columns `estate`, `year`, `total` and `matched`
            (presumably produced by test.known-speakers -- verify against the caller)
        out_path: where the plot gets written

    Return

        nothing, but writes a plot
    """
    estates = df["estate"].unique()
    # First line (all estates together) is solid and thick, the following
    # lines are dashed and thinner.
    colors = list('kbgrc')
    default_cycler = (cycler(color=colors) +
                      cycler(linestyle=(['-', '--', '--', '--', '--'])) +
                      cycler(linewidth=([2, 1.25, 1.25, 1.25, 1.25]))
                      )
    # NOTE: this changes matplotlib's global rc settings; it has to happen
    # before the axes are created so that they pick the cycle up.
    plt.rc('axes', prop_cycle=default_cycler)
    f, ax = plt.subplots(figsize=(16,7))
    try:
        # Proportion over all estates, from the per-year sums.
        ddf = df.groupby("year")[["total", "matched"]].sum().reset_index()
        lab = ["total"]
        x = ddf['year'].tolist()
        y = (ddf["matched"] / ddf["total"]).tolist()
        X, Y = zip(*sorted(zip(x, y), key=lambda pair: pair[0]))
        plt.plot(X, Y)

        # One line per estate, in the order the estates first appear.
        for estate in estates:
            lab.append(estate)
            dfv = df.loc[df["estate"] == estate]
            x = dfv['year'].tolist()
            # Computed as a standalone Series: assigning a new column onto the
            # `.loc` slice would trigger pandas' SettingWithCopyWarning.
            y = (dfv["matched"] / dfv["total"]).tolist()
            X, Y = zip(*sorted(zip(x, y), key=lambda pair: pair[0]))
            plt.plot(X, Y)

        plt.title('Coverage of matched speakers vs total speakers')
        # Legend entries are matched to the lines by plotting order.
        plt.legend(lab, loc ="upper right")
        ax.set_xlabel('Year')
        ax.tick_params(axis='x', labelrotation=90)
        plt.savefig(out_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails; otherwise every
        # call leaves another open figure behind in pyplot's global state.
        plt.close(f)
23 changes: 23 additions & 0 deletions valtiopy/regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import regex as re

def compile_pattern(query):
    """
    Load a query from a file and compile it.

    The file is read line by line. Leading and trailing whitespace is stripped
    from every line and the lines are joined without a separator, so a pattern
    can be laid out over several lines. Doubled backslashes are then collapsed
    to single ones before compiling.

    Args

        query (str): name, without the `.rq` extension, of a file stored with
            this module under `rq/`

    Returns

        the compiled pattern

    Raises

        FileNotFoundError: if `rq/{query}.rq` does not exist next to this module
    """
    path = f"{os.path.dirname(os.path.abspath(__file__))}/rq/{query}.rq"
    if not os.path.exists(path):
        raise FileNotFoundError(f"Couldn't find file : {path}")

    # Query files may contain non-ASCII letters (e.g. Ö, ä), so read them as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(path, 'r', encoding="utf-8") as inf:
        q = ''.join(line.strip() for line in inf)
    q = q.replace("\\\\", "\\")
    return re.compile(q)
68 changes: 68 additions & 0 deletions valtiopy/rq/intros.rq
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
(
(
(B|b)iskop(en)?
|
(D|d)oktor
|
(D|d)omprosten
|
((F|f)ri)(H|h)err(e)?
|
(F|f)ri(h|b)?(,|\.)?
|
(g|G)refve
|
(H|h)err
|
(H|h)r(\.)?
|
(K|k)yrkoherden
|
(L|l)andtdagsmannen
|
(L|l)ektor
|
(O|Ö|o|ö)fverlärar(en)?
|
(P|p)rofessor(n)?
|
(P|p)rosten
|
(R|r)ektor
|
(S|s)tatsrådet
|
((V|v)ice\s?)?(P|p)astor(n)?
){s<=1}
\s
(
[A-Z]{1}\.\s
(
[A-Z]{1}\.\s
)?
)?
(
(
(
(
von
|
de\sla
|
af
)
\s
)?
[^\s,:;\.]+
)
(
,\s
(
[A-Z]{1}\.\s
(
[A-Z]{1}\.(\s)?
)?
)
)?
)
)
Loading