Skip to content

Commit be0d812

Browse files
Merge pull request #42 from CSAFE-ISU/37-short-vs-long
37 short vs long
2 parents 66352e8 + d7446c9 commit be0d812

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+747
-254
lines changed

NEWS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# handwriterRF (development version)
22

3+
## New Features and Enhancements
4+
5+
* Enhanced `compare_writer_profiles()`, `get_distances()`, `get_ref_scores()`, and `train_rf()` functionality:
6+
- Now accepts an optional second dataframe of writer profiles
7+
- With a single dataframe input: Compares all possible pairs of writer profiles within that dataframe
8+
- With two dataframe inputs: Compares each profile from the first dataframe against each profile from the second dataframe (cross-comparison only, no within-dataframe comparisons)
9+
310
# handwriterRF 1.1.1
411

512
## Fixes

R/compare.R

Lines changed: 123 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,15 @@ compare_documents <- function(sample1,
138138
#'
139139
#' Compare the writer profiles from two handwritten documents to predict whether
140140
#' they were written by the same person. Use either a similarity score or a
141-
#' score-based likelihood ratio as a comparison method.
141+
#' score-based likelihood ratio (SLR) as a comparison method.
142142
#'
143143
#' @param writer_profiles A dataframe of writer profiles or cluster fill rates
144144
#' calculated with [get_cluster_fill_rates]
145+
#' @param writer_profiles2 Optional. A second dataframe of writer profiles or
146+
#' cluster fill rates. If it is not provided, a score or SLR will be
147+
#' calculated between every pair of rows in writer_profiles. If
148+
#' writer_profiles2 is provided, a score or SLR will be calculated between
149+
#' every row of writer_profiles and every row of writer_profiles2.
145150
#' @param score_only TRUE returns only the similarity score. FALSE returns the
146151
#' similarity score and a score-based likelihood ratio for that score,
147152
#' calculated using `reference_scores`.
@@ -166,12 +171,14 @@ compare_documents <- function(sample1,
166171
#' @md
167172
compare_writer_profiles <- function(
168173
writer_profiles,
174+
writer_profiles2 = NULL,
169175
score_only = TRUE,
170176
rforest = NULL,
171177
reference_scores = NULL) {
172178
params <- list(
173179
samples = NULL,
174180
writer_profiles = writer_profiles,
181+
writer_profiles2 = writer_profiles2,
175182
score_only = score_only,
176183
rforest = rforest,
177184
project_dir = NULL,
@@ -184,10 +191,14 @@ compare_writer_profiles <- function(
184191
stop("Writer profiles must be a rates dataframe created by get_writer_profiles() with measure = 'rates'.")
185192
}
186193

194+
if (!is.null(params$writer_profiles2) && !is_rates_df(params$writer_profiles2)) {
195+
stop("Writer profiles must be a rates dataframe created by get_writer_profiles() with measure = 'rates'.")
196+
}
197+
187198
params <- handle_null_values(params)
188199

189200
message("Calculating distance between samples...")
190-
params$dist <- get_distances(df = writer_profiles, distance_measures = params$rforest$distance_measures)
201+
params$dist <- get_distances(df = writer_profiles, distance_measures = params$rforest$distance_measures, df2 = writer_profiles2)
191202

192203
message("Calculating similarity score...")
193204
params$score <- get_score(d = params$dist, rforest = params$rforest)$score
@@ -205,6 +216,21 @@ compare_writer_profiles <- function(
205216

206217
# Internal Functions ------------------------------------------------------
207218

219+
#' Handle Null Values
220+
#'
221+
#' The following rules are applied:
222+
#' * If reference_scores are supplied and score_only is TRUE, score_only is changed to FALSE
223+
#' * If project_dir is NULL, the project_dir is set to tempdir() > comparison.
224+
#' * If rforest is NULL, the default random_forest is used
225+
#' * If reference_scores is NULL, the default ref_scores are used
226+
#'
227+
#' @param params A list of parameters: samples, writer_profiles,
228+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
229+
#' score, slr.
230+
#'
231+
#' @returns A list of parameters
232+
#'
233+
#' @noRd
208234
handle_null_values <- function(params) {
209235
if (!is.null(params$reference_scores) && params$score_only) {
210236
message("Reference scores were supplied so score_only will be changed to FALSE.")
@@ -226,13 +252,36 @@ handle_null_values <- function(params) {
226252
return(params)
227253
}
228254

229-
create_dirs <- function(params, subdirs = NULL) {
255+
#' Create Directories
256+
#'
257+
#' Subfolders clusters, docs, and graphs are created in the project directory.
258+
#'
259+
#' @param params A list of parameters: samples, writer_profiles,
260+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
261+
#' score, slr.
262+
#'
263+
#' @returns No value returned
264+
#'
265+
#' @noRd
266+
create_dirs <- function(params) {
230267
create_dir(params$project_dir)
231268
create_dir(file.path(params$project_dir, "clusters"))
232269
create_dir(file.path(params$project_dir, "docs"))
233270
create_dir(file.path(params$project_dir, "graphs"))
234271
}
235272

273+
#' Handle Samples with Same Name
274+
#'
275+
#' If the input samples are in different directories but have the same
276+
#' filenames, the samples are relabelled as sample1.png and sample2.png.
277+
#'
278+
#' @param params A list of parameters: samples, writer_profiles,
279+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
280+
#' score, slr.
281+
#'
282+
#' @returns A list of parameters
283+
#'
284+
#' @noRd
236285
handle_samples_w_same_name <- function(params) {
237286

238287
# samples in two different directories CAN have the same filename
@@ -253,6 +302,19 @@ handle_samples_w_same_name <- function(params) {
253302
return(params)
254303
}
255304

305+
#' Check Directory Contents
306+
#'
307+
#' Check the contents of project_dir > dir_name to ensure that it does not
308+
#' contain helper files for documents other than sample1 or sample2.
309+
#'
310+
#' @param params A list of parameters: samples, writer_profiles,
311+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
312+
#' score, slr.
313+
#' @param dir_name The name of the subdirectory in project_dir to check.
314+
#'
315+
#' @returns A list of parameters
316+
#'
317+
#' @noRd
256318
check_dir_contents <- function(params, dir_name) {
257319
if (!is.null(params$project_dir) && dir.exists(file.path(params$project_dir, dir_name))) {
258320
actual_files <- list.files(file.path(params$project_dir, dir_name))
@@ -296,6 +358,18 @@ is_rates_df <- function(df) {
296358
}
297359
}
298360

361+
#' Run Checks
362+
#'
363+
#' Check the contents of the clusters, docs, and graphs folders in the project
364+
#' directory.
365+
#'
366+
#' @param params A list of parameters: samples, writer_profiles,
367+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
368+
#' score, slr.
369+
#'
370+
#' @returns A list of parameters
371+
#'
372+
#' @noRd
299373
run_checks <- function(params) {
300374

301375
check_dir_contents(params, "clusters")
@@ -305,6 +379,17 @@ run_checks <- function(params) {
305379
return(params)
306380
}
307381

382+
#' Copy Samples to Project Directory
383+
#'
384+
#' Copy the samples to the project directory.
385+
#'
386+
#' @param params A list of parameters: samples, writer_profiles,
387+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
388+
#' score, slr.
389+
#'
390+
#' @returns A list of parameters
391+
#'
392+
#' @noRd
308393
copy_samples_to_project_dir <- function(params) {
309394
# Copy samples to project_dir > docs
310395
message("Copying samples to project directory > docs...\n")
@@ -319,7 +404,18 @@ copy_samples_to_project_dir <- function(params) {
319404
return(params)
320405
}
321406

322-
407+
#' Get Score-based Likelihood Ratio
408+
#'
409+
#' Calculate the score-based likelihood ratio for each score in params$score
410+
#' using params$rforest and params$reference_scores.
411+
#'
412+
#' @param params A list of parameters: samples, writer_profiles,
413+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
414+
#' score, slr.
415+
#'
416+
#' @returns A list of parameters
417+
#'
418+
#' @noRd
323419
get_slr <- function(params) {
324420
get_slr_for_single_score <- function(score, densities) {
325421
numerator <- eval_density_at_point(den = densities$same_writer, x = score, type = "numerator")
@@ -335,6 +431,18 @@ get_slr <- function(params) {
335431
return(params)
336432
}
337433

434+
#' Make Results Dataframe
435+
#'
436+
#' Format the comparison results in a dataframe with columns docname1, writer1,
437+
#' docname2, writer2, ground_truth, score, and optionally slr.
438+
#'
439+
#' @param params A list of parameters: samples, writer_profiles,
440+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
441+
#' score, slr.
442+
#'
443+
#' @returns A dataframe
444+
#'
445+
#' @noRd
338446
make_results_df <- function(params) {
339447
df <- params$dist
340448

@@ -356,6 +464,17 @@ make_results_df <- function(params) {
356464
return(df)
357465
}
358466

467+
#' Cleanup
468+
#'
469+
#' Delete the comparison folder and its contents from the tempdir().
470+
#'
471+
#' @param params A list of parameters: samples, writer_profiles,
472+
#' writer_profiles2, score_only, rforest, project_dir, reference_scores,
473+
#' score, slr.
474+
#'
475+
#' @returns No return value
476+
#'
477+
#' @noRd
359478
clean_up <- function(params) {
360479
# Optional. Delete comparison folder and contents in tempdir()
361480
if (params$project_dir == file.path(tempdir(), "comparison")) {

0 commit comments

Comments
 (0)