|
#' Retrieve Author Biographical Information from OpenAlex
#'
#' This function downloads author information from OpenAlex based on a DOI
#' and the numerical position of the author in the co-authors list. It
#' combines the author profile returned by OpenAlex with paper-specific
#' details taken from the article's authorship list.
#'
#' @param author_position Integer. The numerical position of the author in
#'   the authors list (default: 1). It must always be a single positive whole
#'   number; it is only checked against the number of authors when
#'   \code{return_all_authors = FALSE}.
#' @param doi Character. DOI of the article used to identify the authors.
#'   Must be a single, non-missing, non-empty string.
#' @param verbose Logical. Print informative messages during execution
#'   (default: FALSE)
#' @param return_all_authors Logical. If TRUE, returns information for all
#'   co-authors (default: FALSE)
#'
#' @return If \code{return_all_authors = FALSE}, returns the author profile
#' fetched from OpenAlex (requested with \code{output = "tibble"}) for the
#' specified author, with these columns added:
#' \itemize{
#'   \item \code{author_position_in_paper}, \code{original_author_name},
#'     \code{is_corresponding}, \code{author_position_type}
#'   \item \code{primary_affiliation}, \code{primary_affiliation_country},
#'     \code{primary_affiliation_ror} (first affiliation listed on the paper,
#'     \code{NA} when none is available)
#'   \item \code{affiliation_raw}
#'   \item \code{source_doi}, \code{source_title}, \code{query_timestamp}
#' }
#' If \code{return_all_authors = TRUE}, returns a list of such tibbles, one
#' for each co-author whose profile could be retrieved. Authors with a
#' missing ID or a failed profile request are omitted from the list.
#'
#' @details
#' The function first retrieves the work information using the provided DOI,
#' then extracts author IDs from the authorships data, and finally fetches
#' the author profiles from OpenAlex. Each profile is enriched with
#' paper-specific information such as authorship position, corresponding
#' author status, and affiliations as listed in the paper.
#'
#' Errors are raised for invalid arguments, a missing \code{openalexR}
#' installation, a DOI with no matching article, an article without author
#' information, an out-of-range position, and failed requests. When all
#' co-authors are requested, a failure for an individual author is reported
#' (if \code{verbose = TRUE}) and skipped; an error is raised only when no
#' author could be retrieved at all.
#'
#' @examples
#' \dontrun{
#' # Get information for the first author
#' first_author <- authorBio(doi = "10.1016/j.joi.2017.08.007")
#'
#' # Get information for the second author with verbose output
#' second_author <- authorBio(
#'   author_position = 2,
#'   doi = "10.1016/j.joi.2017.08.007",
#'   verbose = TRUE
#' )
#'
#' # Get information for all co-authors
#' all_authors <- authorBio(
#'   doi = "10.1016/j.joi.2017.08.007",
#'   return_all_authors = TRUE
#' )
#' }
#'
#' @export
#'
authorBio <- function(author_position = 1,
                      doi = "10.1016/j.joi.2017.08.007",
                      verbose = FALSE,
                      return_all_authors = FALSE) {
  # Input validation. Type, length and NA checks come first so that vector
  # or missing inputs yield these messages rather than an `||`/`if` error.
  if (!is.character(doi) || length(doi) != 1L || is.na(doi) ||
      !nzchar(trimws(doi))) {
    stop("The 'doi' parameter must be a non-empty character string",
         call. = FALSE)
  }

  if (!is.numeric(author_position) || length(author_position) != 1L ||
      !is.finite(author_position) || author_position < 1 ||
      author_position %% 1 != 0) {
    stop("The 'author_position' parameter must be a positive integer",
         call. = FALSE)
  }

  # Check library availability
  if (!requireNamespace("openalexR", quietly = TRUE)) {
    stop("The 'openalexR' library is not available. ",
         "Install it with: install.packages('openalexR')",
         call. = FALSE)
  }

  if (verbose) cat("Retrieving article information for DOI:", doi, "\n")

  # Retrieve article information with error handling
  au_work <- tryCatch(
    openalexR::oa_fetch(
      entity = "works",
      doi = doi,
      output = "tibble"
    ),
    error = function(e) {
      stop("Error retrieving article: ", conditionMessage(e),
           "\nPlease verify that the DOI is correct and OpenAlex is accessible",
           call. = FALSE)
    }
  )

  # Verify that the article was found
  if (is.null(au_work) || NROW(au_work) == 0) {
    stop("No article found for the provided DOI: ", doi, call. = FALSE)
  }

  # Authorship table of the first (only) matching work
  authorships <- au_work$authorships[[1]]

  # is.data.frame() also rejects NULL/NA placeholders, on which nrow()
  # would return NULL and break the comparison.
  if (!is.data.frame(authorships) || nrow(authorships) == 0) {
    stop("No author information found for this article", call. = FALSE)
  }

  n_authors <- nrow(authorships)

  # The requested position only matters when a single author is returned
  if (!return_all_authors && author_position > n_authors) {
    stop("Author position (", author_position,
         ") is greater than the total number of authors (", n_authors, ")",
         call. = FALSE)
  }

  if (verbose) {
    cat("Article found:", au_work$display_name[1], "\n")
    cat("Total number of authors:", n_authors, "\n")
    cat("Authors:\n")
    for (i in seq_len(n_authors)) {
      cat(" ", i, ".", authorships$display_name[i], "\n")
    }
  }

  source_title <- au_work$display_name[1]
  queried_at <- Sys.time()

  # If requested, return all authors
  if (return_all_authors) {
    if (verbose) cat("Retrieving information for all authors...\n")

    all_authors <- lapply(seq_len(n_authors), function(i) {
      if (verbose) {
        cat("Processing author", i, "of", n_authors, ":",
            authorships$display_name[i], "\n")
      }

      author_id <- authorships$id[i]
      if (!.author_bio_is_valid_id(author_id)) {
        if (verbose) cat("Invalid author ID for position", i, "\n")
        return(NULL)
      }

      # Best effort: a failure for one author must not abort the others
      author_info <- tryCatch(
        .author_bio_fetch_profile(author_id),
        error = function(e) {
          if (verbose) {
            cat("Error for author", i, ":", conditionMessage(e), "\n")
          }
          NULL
        }
      )

      if (is.null(author_info) || NROW(author_info) == 0) {
        return(NULL)
      }

      .author_bio_enrich(
        author_info, authorships, i, doi, source_title, queried_at
      )
    })

    # Filter() is safe on an empty list, unlike `x[!sapply(x, is.null)]`
    valid_authors <- Filter(Negate(is.null), all_authors)
    if (length(valid_authors) == 0L) {
      stop("Unable to retrieve information for any author", call. = FALSE)
    }
    return(valid_authors)
  }

  # Retrieve information for the specific author
  author_id <- authorships$id[author_position]

  if (!.author_bio_is_valid_id(author_id)) {
    stop("Invalid author ID at position ", author_position, call. = FALSE)
  }

  if (verbose) {
    cat("Retrieving information for author at position", author_position, "\n")
    cat("Author name:", authorships$display_name[author_position], "\n")
    cat("OpenAlex ID:", .author_bio_clean_id(author_id), "\n")
    cat("Position type:", authorships$author_position[author_position], "\n")
    cat("Is corresponding author:",
        authorships$is_corresponding[author_position], "\n")
  }

  # Retrieve author biographical data
  au_info <- tryCatch(
    .author_bio_fetch_profile(author_id),
    error = function(e) {
      stop("Error retrieving author information: ", conditionMessage(e),
           call. = FALSE)
    }
  )

  if (is.null(au_info) || NROW(au_info) == 0) {
    stop("No biographical information found for the author at position ",
         author_position, call. = FALSE)
  }

  au_info <- .author_bio_enrich(
    au_info, authorships, author_position, doi, source_title, queried_at
  )

  if (verbose) {
    cat("Information successfully retrieved for:", au_info$display_name[1], "\n")
    cat("Number of publications:", au_info$works_count[1], "\n")
    cat("Number of citations:", au_info$cited_by_count[1], "\n")
    cat("H-index:", au_info$h_index[1], "\n")
    cat("Primary affiliation:", au_info$primary_affiliation[1], "\n")
  }

  au_info
}

# Internal: TRUE when `author_id` is a single, non-missing, non-empty string.
.author_bio_is_valid_id <- function(author_id) {
  is.character(author_id) && length(author_id) == 1L &&
    !is.na(author_id) && nzchar(author_id)
}

# Internal: strip the OpenAlex URL prefix, leaving only the bare identifier.
# fixed = TRUE so the dots in the URL are matched literally, not as regex.
.author_bio_clean_id <- function(author_id) {
  gsub("https://openalex.org/", "", author_id, fixed = TRUE)
}

# Internal: fetch one author profile from OpenAlex by its (URL or bare) ID.
.author_bio_fetch_profile <- function(author_id) {
  openalexR::oa_fetch(
    entity = "authors",
    identifier = .author_bio_clean_id(author_id),
    output = "tibble"
  )
}

# Internal: add paper-specific columns (position, affiliation, query
# metadata) for the author at row `position` of `authorships`.
.author_bio_enrich <- function(author_info, authorships, position,
                               doi, source_title, queried_at) {
  author_info$author_position_in_paper <- position
  author_info$original_author_name <- authorships$display_name[position]
  author_info$is_corresponding <- authorships$is_corresponding[position]
  author_info$author_position_type <- authorships$author_position[position]

  # First affiliation listed on the paper, NA when unavailable
  affiliation <- authorships$affiliations[[position]]
  has_affiliation <- is.data.frame(affiliation) && nrow(affiliation) > 0
  first_value <- function(column) {
    if (has_affiliation && !is.null(affiliation[[column]])) {
      affiliation[[column]][1]
    } else {
      NA
    }
  }
  author_info$primary_affiliation <- first_value("display_name")
  author_info$primary_affiliation_country <- first_value("country_code")
  author_info$primary_affiliation_ror <- first_value("ror")

  # Raw affiliation string if available
  raw_affiliation <- authorships$affiliation_raw
  author_info$affiliation_raw <-
    if (!is.null(raw_affiliation) && length(raw_affiliation) >= position) {
      raw_affiliation[position]
    } else {
      NA
    }

  # Query metadata
  author_info$source_doi <- doi
  author_info$source_title <- source_title
  author_info$query_timestamp <- queried_at

  author_info
}
| 258 | + |
# Convenience wrapper around authorBio(): fetches the profiles of every
# co-author of the article identified by `doi`. `verbose` is passed through
# unchanged. Returns whatever authorBio() returns when called with
# return_all_authors = TRUE.
analyze_all_authors <- function(doi, verbose = FALSE) {
  authorBio(
    doi = doi,
    verbose = verbose,
    return_all_authors = TRUE
  )
}
| 263 | + |
#' Get Authors Summary from OpenAlex
#'
#' Retrieves a quick summary of all authors of a paper using only the
#' article record, without making additional API calls for individual author
#' profiles. Useful for getting an overview of the authorship structure.
#'
#' @param doi Character. DOI of the article. Must be a single, non-missing,
#'   non-empty string.
#' @param verbose Logical. Print informative messages during execution
#'   (default: FALSE)
#'
#' @return A data frame with one row per author and the columns:
#' \itemize{
#'   \item position: Author position in the paper
#'   \item display_name: Author name as listed in the authorship data
#'   \item author_position_type: Value of the authorship
#'     \code{author_position} field
#'   \item is_corresponding: Whether the author is a corresponding author
#'   \item orcid: ORCID identifier if available
#'   \item openalex_id: OpenAlex author identifier
#'   \item primary_affiliation: Name of the first affiliation listed for the
#'     author, \code{NA} when none is available
#' }
#' A column is filled with \code{NA} when the authorship data does not
#' provide the corresponding field.
#'
#' @details
#' An error is raised when \code{doi} is invalid, when the \code{openalexR}
#' package is not installed, when the request fails, when no article matches
#' the DOI, or when the article has no author information.
#'
#' @examples
#' \dontrun{
#' # Get a quick summary of all authors
#' summary <- get_authors_summary(doi = "10.1016/j.joi.2017.08.007")
#' print(summary)
#' }
#'
#' @export
get_authors_summary <- function(doi = "10.1016/j.joi.2017.08.007",
                                verbose = FALSE) {
  # Validate before any network access; length/NA checks come first so a
  # vector or missing DOI yields this message rather than an `||` error.
  if (!is.character(doi) || length(doi) != 1L || is.na(doi) ||
      !nzchar(trimws(doi))) {
    stop("The 'doi' parameter must be a non-empty character string",
         call. = FALSE)
  }

  if (!requireNamespace("openalexR", quietly = TRUE)) {
    stop("The 'openalexR' library is not available. ",
         "Install it with: install.packages('openalexR')",
         call. = FALSE)
  }

  if (verbose) cat("Retrieving author summary for DOI:", doi, "\n")

  au_work <- tryCatch(
    openalexR::oa_fetch(entity = "works", doi = doi, output = "tibble"),
    error = function(e) {
      stop("Error retrieving article: ", conditionMessage(e), call. = FALSE)
    }
  )

  if (is.null(au_work) || NROW(au_work) == 0) {
    stop("No article found for the provided DOI: ", doi, call. = FALSE)
  }

  authorships <- au_work$authorships[[1]]

  # Without this guard an empty authorship table would reach data.frame()
  # with mismatched column lengths and fail with an unrelated message.
  if (!is.data.frame(authorships) || nrow(authorships) == 0) {
    stop("No author information found for this article", call. = FALSE)
  }

  n_authors <- nrow(authorships)

  # data.frame() silently drops NULL arguments, so a missing field is
  # replaced by an NA column to keep the documented shape stable.
  column_or_na <- function(name) {
    values <- authorships[[name]]
    if (is.null(values)) rep(NA, n_authors) else values
  }

  # Create a summary without additional API calls
  summary_df <- data.frame(
    position = seq_len(n_authors),
    display_name = column_or_na("display_name"),
    author_position_type = column_or_na("author_position"),
    is_corresponding = column_or_na("is_corresponding"),
    orcid = column_or_na("orcid"),
    openalex_id = column_or_na("id"),
    stringsAsFactors = FALSE
  )

  # First listed affiliation per author; vapply() guarantees a character
  # vector of length n_authors whatever the individual entries look like.
  affiliations <- authorships[["affiliations"]]
  summary_df$primary_affiliation <- vapply(
    seq_len(n_authors),
    function(i) {
      affiliation <- if (is.null(affiliations)) NULL else affiliations[[i]]
      if (is.data.frame(affiliation) && nrow(affiliation) > 0 &&
          !is.null(affiliation[["display_name"]])) {
        as.character(affiliation[["display_name"]][1])
      } else {
        NA_character_
      }
    },
    character(1)
  )

  summary_df
}
| 329 | + |
0 commit comments