Merge pull request #565 from massimoaria/develop

massimoaria · web-flow · commit 95d7b7691979 · 2025-09-18T10:28:29.000+02:00
Issue with database coming from Scopus
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -62,6 +62,6 @@ Suggests:
     rmarkdown,
     testthat (>= 3.0.0),
     wordcloud2
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 NeedsCompilation: no
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -307,13 +307,18 @@ importFrom(rscopus,author_df_orig)
 importFrom(rscopus,author_search)
 importFrom(rscopus,get_complete_author_info)
 importFrom(stringdist,stringdistmatrix)
+importFrom(stringr,fixed)
+importFrom(stringr,str_detect)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_locate_all)
 importFrom(stringr,str_replace_all)
 importFrom(stringr,str_split)
+importFrom(stringr,str_squish)
+importFrom(stringr,str_to_upper)
 importFrom(stringr,str_trim)
 importFrom(tibble,rownames_to_column)
 importFrom(tidyr,drop_na)
+importFrom(tidyr,expand_grid)
 importFrom(tidyr,gather)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
diff --git a/R/csvScopus2df.R b/R/csvScopus2df.R
@@ -28,7 +28,8 @@ csvScopus2df <- function(file) {
 
   # Authors' names cleaning (surname and initials)
   DATA$AU <- gsub("\\.", "", DATA$AU)
-  DATA$AU <- gsub(",", ";", DATA$AU)
+  #DATA$AU <- gsub(",", ";", DATA$AU)
+  DATA$AU <- gsub(",", "", DATA$AU)
 
   ### store raw affiliation format to extract link among authors and affiliations
   DATA$C1raw <- DATA$C1
@@ -72,8 +73,9 @@ labelling <- function(DATA) {
   df_tag <- data.frame(
     rbind(
       c("Abbreviated Source Title", "JI"),
-      c("Authors with affiliations", "C1"),
-      c("Author Addresses", "C1"),
+      c("Affiliations", "C1"),
+      c("Authors with affiliations", "C1_raw"),
+      c("Author Addresses", "C1_raw"),
       c("Authors", "AU"),
       c("Author Names", "AU"),
       c("Author full names", "AF"),
@@ -115,38 +117,14 @@ labelling <- function(DATA) {
     mutate(tag = ifelse(is.na(tag), orig, tag))
 
   names(DATA) <- label$tag
+  
+  if (!"C1" %in% names(DATA)) {
+    if ("C1_raw" %in% names(DATA)) {
+      DATA$C1 <- DATA$C1_raw
+    } else {
+      DATA$C1 <- NA
+    }
+  }
 
-
-  # label <- names(DATA)
-  # label <- gsub("Abbreviated Source Title","JI",label)
-  # label <- gsub("Authors with affiliations","C1",label)
-  # label <- gsub("Author Addresses","C1",label)
-  # #label <- gsub("Affiliations","RP",label)
-  # label <- gsub("Authors","AU",label)
-  # label <- gsub("Author Names","AU",label)
-  # label <- gsub("Source title","SO",label)
-  # label <- gsub("Titles","TI",label)
-  # label <- gsub("Title","TI",label)
-  # label <- gsub("Publication Year","PY",label)
-  # label <- gsub("Year","PY",label)
-  # label <- gsub("Volume","VL",label)
-  # label <- gsub("Issue","IS",label)
-  # label <- gsub("Page count","PP",label)
-  # label <- gsub("Cited by","TC",label)
-  # label <- gsub("DOI","DI",label)
-  # label <- gsub("Link","URL",label)
-  # label <- gsub("Abstract","AB",label)
-  # label <- gsub("Author Keywords","DE",label)
-  # label <- gsub("Index Keywords","ID",label)
-  # label <- gsub("Funding Details","FU",label)
-  # label <- gsub("Funding Text 1","FX",label)
-  # label <- gsub("References","CR",label)
-  # label <- gsub("Correspondence Address","RP",label)
-  # label <- gsub("Funding Details","FU",label)
-  # label <- gsub("Language of Original Document","LA",label)
-  # label <- gsub("Document Type","DT",label)
-  # label <- gsub("Source","DB",label)
-  # label <- gsub("EID","UT",label)
-  # names(DATA) <- label
   return(DATA)
 }
diff --git a/R/histNetwork.R b/R/histNetwork.R
@@ -3,7 +3,7 @@ utils::globalVariables(c(
   "AU", "Page.start", "Page.end", "PP", "SR", "Included",
   "PP.y", "PP.x", "toRemove", "SR_cited", "LCS", "SR_FULL",
   "TI", "DE", "ID", "DI", "Year", "SR_citing", "ref", "n",
-  "id_oa", "UT", "PY"
+  "id_oa", "UT", "PY","TI_clean","CR_clean","is_match", "cited_SR", "citing_SR"
 ))
 #' Historical co-citation network
 #'
@@ -219,51 +219,20 @@ wos <- function(M, min.citations, sep, network, verbose) {
 }
 
 # New algorithm for Scopus
-# Local citation matching is based on First Author, Year and PP
 scopus <- function(M, min.citations, sep, network, verbose) {
   if (!("SR_FULL" %in% names(M))) {
     M <- metaTagExtraction(M, Field = "SR")
   }
+  
+  CR <- match_citations_fast(
+    titles_df=M %>% select(SR,TI),
+    references_df=M %>% select(SR,CR)
+  )
 
-  CR <- strsplit(M$CR, ";")
-
-  CR <- data.frame(SR_citing = rep(M$SR, lengths(CR)), ref = trimws(unlist(CR)))
-
-  CR$PY <- as.numeric(gsub(".*\\((\\d{4})\\).*", "\\1", CR$ref))
-
-  CR$AU <- trimws(gsub("\\.", "", gsub("\\. ", "", gsub("^(.*?),.*$", "\\1", CR$ref))))
-
-  CR$PP <- gsub(".*PP\\. ([0-9-]+).*", "\\1", CR$ref)
-
-  CR <- CR %>%
-    dplyr::filter(!is.na(PY), (substr(CR$PP, 1, 1) %in% 0:9))
-
-  M_merge <- M %>%
-    select(AU, PY, Page.start, Page.end, PP, SR) %>%
-    mutate(
-      AU = trimws(gsub("\\.", "", gsub("\\. ", "", gsub("^(.*?),.*$", "\\1", SR)))),
-      Page.start = as.numeric(Page.start),
-      Page.end = as.numeric(Page.end),
-      PP = ifelse(!is.na(Page.start), paste0(Page.start, "-", Page.end), NA),
-      Included = TRUE
-    ) %>%
-    rename(SR_cited = SR)
-
-  CR <- CR %>%
-    left_join(M_merge, join_by("PY", "AU"), relationship = "many-to-many") %>%
-    dplyr::filter(!is.na(Included)) %>%
-    group_by(PY, AU) %>%
-    mutate(toRemove = ifelse(!is.na(PP.y) & PP.x != PP.y, TRUE, FALSE)) %>% # to remove FALSE POSITIVE
-    ungroup() %>%
-    dplyr::filter(toRemove != TRUE) %>%
-    mutate(toRemove = ifelse(!is.na(PP.x) & is.na(PP.y), TRUE, FALSE)) %>%
-    dplyr::filter(toRemove != TRUE)
-
-  LCS <- CR %>%
+  LCS <- CR %>% 
     group_by(SR_cited) %>%
     count(name = "LCS")
 
-
   M <- M %>%
     left_join(LCS, by = c("SR" = "SR_cited")) %>%
     mutate(LCS = ifelse(is.na(LCS), 0, LCS))
@@ -459,3 +428,85 @@ lens <- function(M, min.citations = min.citations, sep = sep, network = network,
       LCS = M$LCS
     )
 }
+
+# Faster alternative function for very large datasets
+# match_citations_fast <- function(titles_df, references_df) {
+#   
+#   # Normalize
+#   titles_norm <- titles_df %>%
+#     mutate(TI_clean = normalize_text(TI))
+#   
+#   refs_norm <- references_df %>%
+#     mutate(CR_clean = normalize_text(CR))
+#   
+#   # Build every cited/citing pair and flag matches via fixed-string detection
+#   results <- expand_grid(
+#     SR_cited = titles_norm$SR,
+#     SR_citing = refs_norm$SR
+#   ) %>%
+#     left_join(titles_norm %>% select(SR, TI_clean), by = c("SR_cited" = "SR")) %>%
+#     left_join(refs_norm %>% select(SR, CR_clean), by = c("SR_citing" = "SR")) %>%
+#     mutate(
+#       is_match = str_detect(CR_clean, fixed(TI_clean))
+#     ) %>%
+#     filter(is_match) %>%
+#     select(SR_cited, SR_citing)
+#   
+#   return(results)
+# }
+# 
+# # Function to normalize text
+# normalize_text <- function(text) {
+#   text %>%
+#     str_to_upper() %>%
+#     str_replace_all("[^A-Z0-9\\s]", " ") %>%
+#     str_squish()
+# }
+match_citations_fast <- function(titles_df, references_df) {
+  # Pairs each titles_df record (SR, TI) with each references_df record (SR, CR); keeps pairs whose cleaned CR contains the cleaned TI.
+  # Normalize the titles: upper-case, replace anything other than A-Z, 0-9 or whitespace with a space, collapse whitespace
+  titles_norm <- titles_df %>%
+    dplyr::mutate(TI_clean = stringr::str_to_upper(TI) %>%
+                    stringr::str_replace_all("[^A-Z0-9\\s]", " ") %>%
+                    stringr::str_squish())
+  
+  # Normalize the reference strings (CR) with the same steps used for the titles
+  refs_norm <- references_df %>%
+    dplyr::mutate(CR_clean = stringr::str_to_upper(CR) %>%
+                    stringr::str_replace_all("[^A-Z0-9\\s]", " ") %>%
+                    stringr::str_squish())
+  
+  # Build every combination of titles_norm$SR and refs_norm$SR (one row per cited/citing pair)
+  all_combinations <- tidyr::expand_grid(
+    cited_SR = titles_norm$SR,
+    citing_SR = refs_norm$SR
+  )
+  
+  # Attach the normalized title of the cited record (joined on SR)
+  all_combinations <- dplyr::left_join(
+    all_combinations, 
+    titles_norm %>% dplyr::select(SR, TI_clean), 
+    by = c("cited_SR" = "SR")
+  )
+  
+  # Attach the normalized reference string of the citing record (joined on SR)
+  all_combinations <- dplyr::left_join(
+    all_combinations,
+    refs_norm %>% dplyr::select(SR, CR_clean), 
+    by = c("citing_SR" = "SR")
+  )
+  
+  # Flag a match when TI_clean occurs inside CR_clean as a literal (fixed, non-regex) string
+  all_combinations$is_match <- stringr::str_detect(
+    all_combinations$CR_clean, 
+    stringr::fixed(all_combinations$TI_clean)
+  )
+  
+  # Keep only the rows flagged as matches and return them as columns SR_cited / SR_citing
+  matches <- all_combinations %>%
+    dplyr::filter(is_match) %>%
+    dplyr::select(cited_SR, citing_SR) %>% 
+    rename("SR_cited" = cited_SR, "SR_citing" = citing_SR)
+  
+  return(matches)
+}
diff --git a/R/zzz.R b/R/zzz.R
@@ -79,6 +79,7 @@ utils::globalVariables(c("matches", "KW_Merged"))
 # #' @import shinycssloaders
 # #' @import shinythemes
 #' @importFrom openxlsx write.xlsx
+#' @importFrom tidyr expand_grid
 #' @importFrom tidyr gather
 #' @importFrom tidyr spread
 #' @importFrom tidyr pivot_wider
@@ -263,6 +264,10 @@ utils::globalVariables(c("matches", "KW_Merged"))
 #' @importFrom Matrix updown
 #' @importFrom Matrix which
 #' @importFrom Matrix writeMM
+#' @importFrom stringr str_detect
+#' @importFrom stringr fixed
+#' @importFrom stringr str_to_upper
+#' @importFrom stringr str_squish
 #' @importFrom stringr str_locate_all
 #' @importFrom stringr str_extract_all
 #' @importFrom stringr str_replace_all