@@ -3,7 +3,7 @@ utils::globalVariables(c(
33 " AU" , " Page.start" , " Page.end" , " PP" , " SR" , " Included" ,
44 " PP.y" , " PP.x" , " toRemove" , " SR_cited" , " LCS" , " SR_FULL" ,
55 " TI" , " DE" , " ID" , " DI" , " Year" , " SR_citing" , " ref" , " n" ,
6- " id_oa" , " UT" , " PY"
6+ " id_oa" , " UT" , " PY" , " TI_clean " , " CR_clean " , " is_match " , " cited_SR " , " citing_SR "
77))
88# ' Historical co-citation network
99# '
@@ -219,51 +219,20 @@ wos <- function(M, min.citations, sep, network, verbose) {
219219}
220220
221221# New algorithm for Scopus
222- # Local citation matching is based on First Author, Year and PP
223222scopus <- function (M , min.citations , sep , network , verbose ) {
224223 if (! (" SR_FULL" %in% names(M ))) {
225224 M <- metaTagExtraction(M , Field = " SR" )
226225 }
226+
227+ CR <- match_citations_fast(
228+ titles_df = M %> % select(SR ,TI ),
229+ references_df = M %> % select(SR ,CR )
230+ )
227231
228- CR <- strsplit(M $ CR , " ;" )
229-
230- CR <- data.frame (SR_citing = rep(M $ SR , lengths(CR )), ref = trimws(unlist(CR )))
231-
232- CR $ PY <- as.numeric(gsub(" .*\\ ((\\ d{4})\\ ).*" , " \\ 1" , CR $ ref ))
233-
234- CR $ AU <- trimws(gsub(" \\ ." , " " , gsub(" \\ . " , " " , gsub(" ^(.*?),.*$" , " \\ 1" , CR $ ref ))))
235-
236- CR $ PP <- gsub(" .*PP\\ . ([0-9-]+).*" , " \\ 1" , CR $ ref )
237-
238- CR <- CR %> %
239- dplyr :: filter(! is.na(PY ), (substr(CR $ PP , 1 , 1 ) %in% 0 : 9 ))
240-
241- M_merge <- M %> %
242- select(AU , PY , Page.start , Page.end , PP , SR ) %> %
243- mutate(
244- AU = trimws(gsub(" \\ ." , " " , gsub(" \\ . " , " " , gsub(" ^(.*?),.*$" , " \\ 1" , SR )))),
245- Page.start = as.numeric(Page.start ),
246- Page.end = as.numeric(Page.end ),
247- PP = ifelse(! is.na(Page.start ), paste0(Page.start , " -" , Page.end ), NA ),
248- Included = TRUE
249- ) %> %
250- rename(SR_cited = SR )
251-
252- CR <- CR %> %
253- left_join(M_merge , join_by(" PY" , " AU" ), relationship = " many-to-many" ) %> %
254- dplyr :: filter(! is.na(Included )) %> %
255- group_by(PY , AU ) %> %
256- mutate(toRemove = ifelse(! is.na(PP.y ) & PP.x != PP.y , TRUE , FALSE )) %> % # to remove FALSE POSITIVE
257- ungroup() %> %
258- dplyr :: filter(toRemove != TRUE ) %> %
259- mutate(toRemove = ifelse(! is.na(PP.x ) & is.na(PP.y ), TRUE , FALSE )) %> %
260- dplyr :: filter(toRemove != TRUE )
261-
262- LCS <- CR %> %
232+ LCS <- CR %> %
263233 group_by(SR_cited ) %> %
264234 count(name = " LCS" )
265235
266-
267236 M <- M %> %
268237 left_join(LCS , by = c(" SR" = " SR_cited" )) %> %
269238 mutate(LCS = ifelse(is.na(LCS ), 0 , LCS ))
@@ -459,3 +428,85 @@ lens <- function(M, min.citations = min.citations, sep = sep, network = network,
459428 LCS = M $ LCS
460429 )
461430}
431+
432+ # Funzione alternativa più veloce per dataset molto grandi
433+ # match_citations_fast <- function(titles_df, references_df) {
434+ #
435+ # # Normalizza
436+ # titles_norm <- titles_df %>%
437+ # mutate(TI_clean = normalize_text(TI))
438+ #
439+ # refs_norm <- references_df %>%
440+ # mutate(CR_clean = normalize_text(CR))
441+ #
442+ # # Crea una matrice di matching usando stringdist
443+ # results <- expand_grid(
444+ # SR_cited = titles_norm$SR,
445+ # SR_citing = refs_norm$SR
446+ # ) %>%
447+ # left_join(titles_norm %>% select(SR, TI_clean), by = c("SR_cited" = "SR")) %>%
448+ # left_join(refs_norm %>% select(SR, CR_clean), by = c("SR_citing" = "SR")) %>%
449+ # mutate(
450+ # is_match = str_detect(CR_clean, fixed(TI_clean))
451+ # ) %>%
452+ # filter(is_match) %>%
453+ # select(SR_cited, SR_citing)
454+ #
455+ # return(results)
456+ # }
457+ #
458+ # # Funzione to normalize text
459+ # normalize_text <- function(text) {
460+ # text %>%
461+ # str_to_upper() %>%
462+ # str_replace_all("[^A-Z0-9\\s]", " ") %>%
463+ # str_squish()
464+ # }
# Match citing documents to cited documents by title containment.
#
# A document in `references_df` is considered to cite a document in
# `titles_df` when the normalised title of the latter occurs as a literal
# substring of the normalised reference string of the former.
#
# Normalisation (applied to both titles and reference strings): upper-case,
# replace every character that is not A-Z, 0-9 or whitespace with a space,
# then trim and collapse runs of whitespace.
#
# Arguments:
#   titles_df      data frame with columns `SR` (document label) and `TI`
#                  (title).
#   references_df  data frame with columns `SR` (document label) and `CR`
#                  (reference string of that document).
#
# Returns a tibble with one row per matching (title row, reference row) pair
# and two columns: `SR_cited` (label taken from `titles_df`) and `SR_citing`
# (label taken from `references_df`). Rows are ordered by title row first and
# by reference row second.
#
# Matching is done by row index rather than by joining on `SR`, so duplicated
# `SR` labels cannot multiply rows, and the full titles x references grid is
# never held in memory. Titles that are NA or empty after normalisation can
# never match and are skipped; rows whose `CR` is NA never match.
match_citations_fast <- function(titles_df, references_df) {
  stopifnot(
    all(c("SR", "TI") %in% names(titles_df)),
    all(c("SR", "CR") %in% names(references_df))
  )

  clean_text <- function(x) {
    x <- stringr::str_to_upper(x)
    x <- stringr::str_replace_all(x, "[^A-Z0-9\\s]", " ")
    stringr::str_squish(x)
  }

  ti_clean <- clean_text(titles_df[["TI"]])
  cr_clean <- clean_text(references_df[["CR"]])

  # An empty pattern is not a valid fixed search pattern, and NA titles give
  # NA matches that would be dropped anyway: exclude both up front.
  usable <- which(!is.na(ti_clean) & nzchar(ti_clean))

  # For each usable title, the row indices of the reference strings that
  # contain it. which() discards the NA results produced by NA references.
  hits <- lapply(usable, function(i) {
    which(stringr::str_detect(cr_clean, stringr::fixed(ti_clean[i])))
  })

  cited_idx <- rep(usable, lengths(hits))
  # unlist() of an empty list is NULL; as.integer() turns it into integer(0).
  citing_idx <- as.integer(unlist(hits, use.names = FALSE))

  dplyr::tibble(
    SR_cited = titles_df[["SR"]][cited_idx],
    SR_citing = references_df[["SR"]][citing_idx]
  )
}
0 commit comments