@@ -16,12 +16,20 @@ source('api_keys.R')
16
16
17
17
## Load the author records saved in 03_authors.rds and keep only the rows
## that have a non-missing family name.
authors_df_unfltd = read_rds('03_authors.rds') %>%
    filter(!is.na(family))
19
## Load the verified-names table. Cells reading 'Ignored' are parsed as NA.
names_df = read_csv('04_names_verif.csv', na = 'Ignored') %>%
    ## Drop rows that are exact duplicates of an earlier row
    distinct() %>%
    ## Where no canonical form is recorded (NA), fall back to the original
    ## spelling so every row carries a usable canonical name
    mutate(`Canonical Family` = ifelse(is.na(`Canonical Family`),
                                       `Orig Family`,
                                       `Canonical Family`),
           `Canonical Given` = ifelse(is.na(`Canonical Given`),
                                      `Orig Given`,
                                      `Canonical Given`))
20
27
21
28
## Combine author-level metadata and canonical names
22
- authors_df = left_join(authors_df_unfltd , names_df ,
23
- by = c(' family' = ' Orig Family' ,
24
- ' given' = ' Orig Given' )) %> %
29
+ authors_df = authors_df_unfltd %> %
30
+ left_join(names_df ,
31
+ by = c(' family' = ' Orig Family' ,
32
+ ' given' = ' Orig Given' )) %> %
25
33
rename(family_orig = family ,
26
34
given_orig = given ) %> %
27
35
# # Springer had some encoding errors that caused problems w/ deduping
@@ -294,7 +302,7 @@ if (!file.exists(genderize_file)) {
294
302
gender_genderize = chunks_genderize %> %
295
303
map_dfr(genderize_list , api_key = genderize.io_key )
296
304
# tictoc::toc()
297
-
305
+
298
306
gender_genderize = gender_genderize %> %
299
307
# # Rescale output variables
300
308
mutate(gender = case_when(gender == ' male' ~ ' m' ,
@@ -304,7 +312,7 @@ if (!file.exists(genderize_file)) {
304
312
rename(prob_f_genderize = probability ,
305
313
gender_genderize = gender ,
306
314
for_gender_attr = name )
307
-
315
+
308
316
write_rds(gender_genderize , genderize_file )
309
317
} else {
310
318
gender_genderize = read_rds(genderize_file )
@@ -313,12 +321,15 @@ if (!file.exists(genderize_file)) {
313
321
314
322
## Combine ----
315
323
316
- gender_combined = gender_blevins %> %
324
+ gender_combined = phil_sci %> %
325
+ select(for_gender_attr , given , family ) %> %
326
+ filter(! duplicated(. )) %> %
327
+ left_join(gender_blevins ) %> %
317
328
select(- `F` , - M , - n ) %> %
318
- full_join (gender_namsor ) %> %
329
+ left_join (gender_namsor ) %> %
319
330
select(- id ) %> %
320
- full_join (gender_genderize ) %> %
321
- select(- count ) %> %
331
+ left_join (gender_genderize ) %> %
332
+ select(- count ) %> %
322
333
rowwise() %> %
323
334
mutate(avg = mean(c(prob_f_blevins ,
324
335
prob_f_namsor ,
0 commit comments