Skip to content

Commit c89f316

Browse files
committed
fix a substantial NA matching error when joining gender attribution
1 parent ff8a818 commit c89f316

File tree

2 files changed

+34
-13
lines changed

2 files changed

+34
-13
lines changed

06_gender.R

+21-10
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,20 @@ source('api_keys.R')
1616

1717
authors_df_unfltd = read_rds('03_authors.rds') %>%
1818
filter(!is.na(family))
19-
names_df = read_csv('04_names_verif.csv', na = 'Ignored')
19+
names_df = read_csv('04_names_verif.csv', na = 'Ignored') %>%
20+
filter(!duplicated(.)) %>%
21+
mutate(`Canonical Family` = ifelse(is.na(`Canonical Family`),
22+
`Orig Family`,
23+
`Canonical Family`),
24+
`Canonical Given` = ifelse(is.na(`Canonical Given`),
25+
`Orig Given`,
26+
`Canonical Given`))
2027

2128
## Combine author-level metadata and canonical names
22-
authors_df = left_join(authors_df_unfltd, names_df,
23-
by = c('family' = 'Orig Family',
24-
'given' = 'Orig Given')) %>%
29+
authors_df = authors_df_unfltd %>%
30+
left_join(names_df,
31+
by = c('family' = 'Orig Family',
32+
'given' = 'Orig Given')) %>%
2533
rename(family_orig = family,
2634
given_orig = given) %>%
2735
## Springer had some encoding errors that caused problems w/ deduping
@@ -294,7 +302,7 @@ if (!file.exists(genderize_file)) {
294302
gender_genderize = chunks_genderize %>%
295303
map_dfr(genderize_list, api_key = genderize.io_key)
296304
# tictoc::toc()
297-
305+
298306
gender_genderize = gender_genderize %>%
299307
## Rescale output variables
300308
mutate(gender = case_when(gender == 'male' ~ 'm',
@@ -304,7 +312,7 @@ if (!file.exists(genderize_file)) {
304312
rename(prob_f_genderize = probability,
305313
gender_genderize = gender,
306314
for_gender_attr = name)
307-
315+
308316
write_rds(gender_genderize, genderize_file)
309317
} else {
310318
gender_genderize = read_rds(genderize_file)
@@ -313,12 +321,15 @@ if (!file.exists(genderize_file)) {
313321

314322
## Combine ----
315323

316-
gender_combined = gender_blevins %>%
324+
gender_combined = phil_sci %>%
325+
select(for_gender_attr, given, family) %>%
326+
filter(!duplicated(.)) %>%
327+
left_join(gender_blevins) %>%
317328
select(-`F`, -M, -n) %>%
318-
full_join(gender_namsor) %>%
329+
left_join(gender_namsor) %>%
319330
select(-id) %>%
320-
full_join(gender_genderize) %>%
321-
select(-count) %>%
331+
left_join(gender_genderize) %>%
332+
select(-count) %>%
322333
rowwise() %>%
323334
mutate(avg = mean(c(prob_f_blevins,
324335
prob_f_namsor,

07_dataset.R

+13-3
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,13 @@ library(tidyverse)
33
authors_unfltd = read_rds('03_authors.rds') %>%
44
filter(!duplicated(.))
55
names_df = read_csv('04_names_verif.csv', na = 'Ignored') %>%
6-
filter(!duplicated(.))
6+
filter(!duplicated(.)) %>%
7+
mutate(`Canonical Family` = ifelse(is.na(`Canonical Family`),
8+
`Orig Family`,
9+
`Canonical Family`),
10+
`Canonical Given` = ifelse(is.na(`Canonical Given`),
11+
`Orig Given`,
12+
`Canonical Given`))
713

814
phil_sci = read_rds('06_phil_sci.Rds')
915

@@ -34,8 +40,12 @@ authors_phs = inner_join(authors_full, phil_sci)
3440

3541

3642
## Publication-wise formats
37-
pubs_full = nest(authors_full, given_orig:gender_attr, .key = 'author_data')
38-
pubs_phs = nest(authors_phs, given_orig:gender_attr, .key = 'author_data')
43+
pubs_full = nest(authors_full, given_orig:gender_attr,
44+
.key = 'author_data') %>%
45+
mutate(n_authors = map_int(author_data, nrow))
46+
pubs_phs = nest(authors_phs, given_orig:gender_attr,
47+
.key = 'author_data') %>%
48+
mutate(n_authors = map_int(author_data, nrow))
3949

4050

4151
## Output ----

0 commit comments

Comments
 (0)