Skip to content

Commit 767149d

Browse files
authored
Merge pull request #1 from swerik-project/development
New version
2 parents 673dda5 + 0b61a23 commit 767149d

9 files changed

Lines changed: 91 additions & 15 deletions

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: rcr
22
Type: Package
33
Title: Working with the Riksdagen Corpus
4-
Version: 0.2.0
5-
Date: 2023-12-13
4+
Version: 0.3.0
5+
Date: 2024-03-07
66
Author: Mans Magnusson
77
Maintainer: Mans Magnusson <mans.magnusson@statistik.uu.se>
88
Description: The package contain helper functions to easily work with the Riksdagen corpus from R.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#' Extract Date from Record
2+
#'
3+
#' @description
4+
#' The function extract date from the Riksdagen Records.
5+
#'
6+
#' @param record_path a file path to a record XML file
7+
#' @param record_paths a vector of file paths to a record XML file
8+
#' @param mc.cores the number of cores to use (Linux and Mac only) in \code{mclapply}.
9+
#' Defaults to available cores - 1.
10+
#' @param ... further arguments supplied to \code{mclapply}.
11+
#'
12+
#' @return
13+
#' The function returns a \code{tibble} data frame with the following variables:
14+
#' \describe{
15+
#' \item{record_id}{The id of the record.}
16+
#' \item{record_date}{The date of the record.}
17+
#' }
18+
#'
19+
#' @importFrom xml2 read_xml xml_ns_strip, xml_attr
20+
#' @export
21+
extract_record_dates_from_record <- function(record_path, all=F){
22+
checkmate::assert_string(record_path)
23+
rcp <- get_riksdag_corpora_path()
24+
rcfp <- file.path(rcp, record_path)
25+
if(file.exists(rcfp)){
26+
record_path <- rcfp
27+
}
28+
checkmate::assert_file_exists(record_path)
29+
30+
x <- read_xml(record_path)
31+
x <- xml_ns_strip(x)
32+
33+
id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
34+
xs <- xml_find_all(x,".//docDate")
35+
df <- tibble("record_id" = id,
36+
"doc_date" = as.Date(xml_text(xs)))
37+
38+
return(df)
39+
}
40+
41+
42+
#' @rdname extract_speeches_from_record
43+
#' @export
44+
extract_record_dates_from_records <- function(record_paths, mc.cores = getOption("mc.cores", detectCores() - 1L), ...){
45+
checkmate::assert_character(record_paths)
46+
rcp <- get_riksdag_corpora_path()
47+
rcfp <- file.path(rcp, record_paths)
48+
for(i in seq_along(rcfp)){
49+
if(file.exists(rcfp[i])){
50+
record_paths[i] <- rcfp[i]
51+
}
52+
}
53+
checkmate::assert_file_exists(record_paths)
54+
55+
if(mc.cores > 1L & .Platform$OS.type == "unix"){
56+
message(mc.cores, " cores are used to process the data.")
57+
res <- parallel::mclapply(record_paths, extract_record_dates_from_record, mc.cores = mc.cores, ...)
58+
} else {
59+
res <- lapply(record_paths, extract_record_dates_from_record)
60+
}
61+
62+
res <- bind_rows(res)
63+
res[, c("record_id", "doc_date")]
64+
}

R/extract_speeches.R

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
#' @param ... further arguments supplied to \code{mclapply}.
1616
#'
1717
#' @return
18-
#' The function returns a tibble data frame with the following variables:
18+
#' The function returns a \code{tibble} data frame with the following variables:
1919
#' \describe{
20+
#' \item{record_id}{The id of the record.}
2021
#' \item{speech_no}{The speech number in the record.}
2122
#' \item{speech_id}{The id of the XML node to the introduction of the speaker.}
2223
#' \item{who}{The id of the person giving the speech.}
@@ -43,8 +44,10 @@ extract_speeches_from_record <- function(record_path){
4344
x <- xml_ns_strip(x)
4445

4546
# Extract speeches
47+
id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
4648
xs <- xml_find_all(x, ".//note[@type = 'speaker']|.//u|.//seg")
47-
df <- tibble("type_speaker" = xml_attr(xs, attr = "type") == "speaker",
49+
df <- tibble("record_id" = id,
50+
"type_speaker" = xml_attr(xs, attr = "type") == "speaker",
4851
"name" = xml_name(xs),
4952
"who" = xml_attr(xs, attr = "who"),
5053
"id" = xml_attr(xs, attr = "id"),
@@ -57,7 +60,7 @@ extract_speeches_from_record <- function(record_path){
5760
df <- df[df$name == "seg",]
5861
df$type_speaker <- NULL
5962
df$name <- NULL
60-
df[, c("speech_no", "speech_id", "who", "id", "text")]
63+
df[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
6164
}
6265

6366
#' @rdname extract_speeches_from_record
@@ -80,10 +83,7 @@ extract_speeches_from_records <- function(record_paths, mc.cores = getOption("mc
8083
res <- lapply(record_paths, extract_speeches_from_record)
8184
}
8285

83-
for(i in seq_along(res)){
84-
res[[i]]$record <- basename(record_paths[i])
85-
}
8686
res <- bind_rows(res)
87-
res[, c("record", "speech_no", "speech_id", "who", "id", "text")]
87+
res[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
8888
}
8989

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,7 @@ fps <-
2929
sp <- extract_speeches_from_records(fps)
3030
```
3131

32+
Similarly we can extract the dates from the records with
33+
```
34+
ds <- extract_record_dates_from_records(fps)
35+
```

tests/testthat/files/prot-1951--fk--029.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
</editorialDecl>
2929
</encodingDesc>
3030
</teiHeader>
31-
<TEI>
31+
<TEI xml:id="prot-1951--fk--029">
3232
<teiHeader>
3333
<fileDesc>
3434
<titleStmt>

tests/testthat/files/prot-1975--036.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
</editorialDecl>
2929
</encodingDesc>
3030
</teiHeader>
31-
<TEI>
31+
<TEI xml:id="prot-1975--036">
3232
<teiHeader>
3333
<fileDesc>
3434
<titleStmt>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
test_that("extracting dates works", {
2+
3+
tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
4+
tfp <- test_path(file.path("files", tfp))
5+
6+
expect_silent(sp <- extract_record_dates_from_record(record_path = tfp[1]))
7+
expect_error(sp <- extract_record_dates_from_record(tfp))
8+
expect_silent(sp <- extract_record_dates_from_records(tfp, mc.cores = 1L))
9+
expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))
10+
11+
})

tests/testthat/test-extract_speeches.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ test_that("extracting speeches works", {
1414
tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
1515
tfp <- test_path(file.path("files", tfp))
1616

17-
expect_silent(sp <- extract_speeches_from_record(tfp[1]))
17+
expect_silent(sp <- extract_speeches_from_record(record_path = tfp[1]))
1818
expect_error(sp <- extract_speeches_from_record(tfp))
1919
expect_silent(sp <- extract_speeches_from_records(tfp, mc.cores = 1L))
2020
expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))

tests/testthat/test-test.R

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)