Skip to content

Commit 95d7b76

Browse files
authored
Merge pull request #565 from massimoaria/develop
Issue with database coming from Scopus
2 parents 6321b95 + 7f760e1 commit 95d7b76

File tree

5 files changed

+113
-74
lines changed

5 files changed

+113
-74
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,6 @@ Suggests:
6262
rmarkdown,
6363
testthat (>= 3.0.0),
6464
wordcloud2
65-
RoxygenNote: 7.3.2
65+
RoxygenNote: 7.3.3
6666
NeedsCompilation: no
6767
Config/testthat/edition: 3

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,13 +307,18 @@ importFrom(rscopus,author_df_orig)
307307
importFrom(rscopus,author_search)
308308
importFrom(rscopus,get_complete_author_info)
309309
importFrom(stringdist,stringdistmatrix)
310+
importFrom(stringr,fixed)
311+
importFrom(stringr,str_detect)
310312
importFrom(stringr,str_extract_all)
311313
importFrom(stringr,str_locate_all)
312314
importFrom(stringr,str_replace_all)
313315
importFrom(stringr,str_split)
316+
importFrom(stringr,str_squish)
317+
importFrom(stringr,str_to_upper)
314318
importFrom(stringr,str_trim)
315319
importFrom(tibble,rownames_to_column)
316320
importFrom(tidyr,drop_na)
321+
importFrom(tidyr,expand_grid)
317322
importFrom(tidyr,gather)
318323
importFrom(tidyr,pivot_longer)
319324
importFrom(tidyr,pivot_wider)

R/csvScopus2df.R

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ csvScopus2df <- function(file) {
2828

2929
# Authors' names cleaning (surname and initials)
3030
DATA$AU <- gsub("\\.", "", DATA$AU)
31-
DATA$AU <- gsub(",", ";", DATA$AU)
31+
#DATA$AU <- gsub(",", ";", DATA$AU)
32+
DATA$AU <- gsub(",", "", DATA$AU)
3233

3334
### store raw affiliation format to extract link among authors and affiliations
3435
DATA$C1raw <- DATA$C1
@@ -72,8 +73,9 @@ labelling <- function(DATA) {
7273
df_tag <- data.frame(
7374
rbind(
7475
c("Abbreviated Source Title", "JI"),
75-
c("Authors with affiliations", "C1"),
76-
c("Author Addresses", "C1"),
76+
c("Affiliations", "C1"),
77+
c("Authors with affiliations", "C1_raw"),
78+
c("Author Addresses", "C1_raw"),
7779
c("Authors", "AU"),
7880
c("Author Names", "AU"),
7981
c("Author full names", "AF"),
@@ -115,38 +117,14 @@ labelling <- function(DATA) {
115117
mutate(tag = ifelse(is.na(tag), orig, tag))
116118

117119
names(DATA) <- label$tag
120+
121+
if (!"C1" %in% names(DATA)) {
122+
if ("C1_raw" %in% names(DATA)) {
123+
DATA$C1 <- DATA$C1_raw
124+
} else {
125+
DATA$C1 <- NA
126+
}
127+
}
118128

119-
120-
# label <- names(DATA)
121-
# label <- gsub("Abbreviated Source Title","JI",label)
122-
# label <- gsub("Authors with affiliations","C1",label)
123-
# label <- gsub("Author Addresses","C1",label)
124-
# #label <- gsub("Affiliations","RP",label)
125-
# label <- gsub("Authors","AU",label)
126-
# label <- gsub("Author Names","AU",label)
127-
# label <- gsub("Source title","SO",label)
128-
# label <- gsub("Titles","TI",label)
129-
# label <- gsub("Title","TI",label)
130-
# label <- gsub("Publication Year","PY",label)
131-
# label <- gsub("Year","PY",label)
132-
# label <- gsub("Volume","VL",label)
133-
# label <- gsub("Issue","IS",label)
134-
# label <- gsub("Page count","PP",label)
135-
# label <- gsub("Cited by","TC",label)
136-
# label <- gsub("DOI","DI",label)
137-
# label <- gsub("Link","URL",label)
138-
# label <- gsub("Abstract","AB",label)
139-
# label <- gsub("Author Keywords","DE",label)
140-
# label <- gsub("Index Keywords","ID",label)
141-
# label <- gsub("Funding Details","FU",label)
142-
# label <- gsub("Funding Text 1","FX",label)
143-
# label <- gsub("References","CR",label)
144-
# label <- gsub("Correspondence Address","RP",label)
145-
# label <- gsub("Funding Details","FU",label)
146-
# label <- gsub("Language of Original Document","LA",label)
147-
# label <- gsub("Document Type","DT",label)
148-
# label <- gsub("Source","DB",label)
149-
# label <- gsub("EID","UT",label)
150-
# names(DATA) <- label
151129
return(DATA)
152130
}

R/histNetwork.R

Lines changed: 89 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ utils::globalVariables(c(
33
"AU", "Page.start", "Page.end", "PP", "SR", "Included",
44
"PP.y", "PP.x", "toRemove", "SR_cited", "LCS", "SR_FULL",
55
"TI", "DE", "ID", "DI", "Year", "SR_citing", "ref", "n",
6-
"id_oa", "UT", "PY"
6+
"id_oa", "UT", "PY","TI_clean","CR_clean","is_match", "cited_SR", "citing_SR"
77
))
88
#' Historical co-citation network
99
#'
@@ -219,51 +219,20 @@ wos <- function(M, min.citations, sep, network, verbose) {
219219
}
220220

221221
# New algorithm for Scopus
222-
# Local citation matching is based on First Author, Year and PP
223222
scopus <- function(M, min.citations, sep, network, verbose) {
224223
if (!("SR_FULL" %in% names(M))) {
225224
M <- metaTagExtraction(M, Field = "SR")
226225
}
226+
227+
CR <- match_citations_fast(
228+
titles_df=M %>% select(SR,TI),
229+
references_df=M %>% select(SR,CR)
230+
)
227231

228-
CR <- strsplit(M$CR, ";")
229-
230-
CR <- data.frame(SR_citing = rep(M$SR, lengths(CR)), ref = trimws(unlist(CR)))
231-
232-
CR$PY <- as.numeric(gsub(".*\\((\\d{4})\\).*", "\\1", CR$ref))
233-
234-
CR$AU <- trimws(gsub("\\.", "", gsub("\\. ", "", gsub("^(.*?),.*$", "\\1", CR$ref))))
235-
236-
CR$PP <- gsub(".*PP\\. ([0-9-]+).*", "\\1", CR$ref)
237-
238-
CR <- CR %>%
239-
dplyr::filter(!is.na(PY), (substr(CR$PP, 1, 1) %in% 0:9))
240-
241-
M_merge <- M %>%
242-
select(AU, PY, Page.start, Page.end, PP, SR) %>%
243-
mutate(
244-
AU = trimws(gsub("\\.", "", gsub("\\. ", "", gsub("^(.*?),.*$", "\\1", SR)))),
245-
Page.start = as.numeric(Page.start),
246-
Page.end = as.numeric(Page.end),
247-
PP = ifelse(!is.na(Page.start), paste0(Page.start, "-", Page.end), NA),
248-
Included = TRUE
249-
) %>%
250-
rename(SR_cited = SR)
251-
252-
CR <- CR %>%
253-
left_join(M_merge, join_by("PY", "AU"), relationship = "many-to-many") %>%
254-
dplyr::filter(!is.na(Included)) %>%
255-
group_by(PY, AU) %>%
256-
mutate(toRemove = ifelse(!is.na(PP.y) & PP.x != PP.y, TRUE, FALSE)) %>% # to remove FALSE POSITIVE
257-
ungroup() %>%
258-
dplyr::filter(toRemove != TRUE) %>%
259-
mutate(toRemove = ifelse(!is.na(PP.x) & is.na(PP.y), TRUE, FALSE)) %>%
260-
dplyr::filter(toRemove != TRUE)
261-
262-
LCS <- CR %>%
232+
LCS <- CR %>%
263233
group_by(SR_cited) %>%
264234
count(name = "LCS")
265235

266-
267236
M <- M %>%
268237
left_join(LCS, by = c("SR" = "SR_cited")) %>%
269238
mutate(LCS = ifelse(is.na(LCS), 0, LCS))
@@ -459,3 +428,85 @@ lens <- function(M, min.citations = min.citations, sep = sep, network = network,
459428
LCS = M$LCS
460429
)
461430
}
431+
432+
# Faster alternative function for very large datasets
433+
# match_citations_fast <- function(titles_df, references_df) {
434+
#
435+
# # Normalize
436+
# titles_norm <- titles_df %>%
437+
# mutate(TI_clean = normalize_text(TI))
438+
#
439+
# refs_norm <- references_df %>%
440+
# mutate(CR_clean = normalize_text(CR))
441+
#
442+
# # Build the table of matches (cross join of all pairs + fixed-string detection; stringdist is not used here)
443+
# results <- expand_grid(
444+
# SR_cited = titles_norm$SR,
445+
# SR_citing = refs_norm$SR
446+
# ) %>%
447+
# left_join(titles_norm %>% select(SR, TI_clean), by = c("SR_cited" = "SR")) %>%
448+
# left_join(refs_norm %>% select(SR, CR_clean), by = c("SR_citing" = "SR")) %>%
449+
# mutate(
450+
# is_match = str_detect(CR_clean, fixed(TI_clean))
451+
# ) %>%
452+
# filter(is_match) %>%
453+
# select(SR_cited, SR_citing)
454+
#
455+
# return(results)
456+
# }
457+
#
458+
# # Function to normalize text
459+
# normalize_text <- function(text) {
460+
# text %>%
461+
# str_to_upper() %>%
462+
# str_replace_all("[^A-Z0-9\\s]", " ") %>%
463+
# str_squish()
464+
# }
465+
# Match documents of a collection against each other's reference lists by title.
#
# Arguments:
#   titles_df      data frame with columns `SR` (document identifier) and `TI` (title).
#   references_df  data frame with columns `SR` (document identifier) and `CR`
#                  (the reference list of that document, as a single string).
#
# Both `TI` and `CR` are normalized the same way: upper-cased, every character
# other than A-Z, 0-9 or whitespace replaced by a space, then whitespace squished.
# A document is considered cited by another one when its normalized title occurs
# as a literal (fixed, non-regex) substring of the other's normalized `CR`.
#
# Returns a data frame with one row per detected match and two columns:
#   SR_cited   the `SR` of the document whose title was found
#   SR_citing  the `SR` of the document whose reference list contains it
#
# NOTE(review): every (title, reference list) pair is materialized, so the
#   intermediate table has nrow(titles_df) * nrow(references_df) rows, each
#   carrying a full copy of the normalized strings - confirm memory use on large
#   collections.
# NOTE(review): titles that are NA, or that normalize to an empty string (e.g.
#   titles made only of non A-Z/0-9 characters), presumably produce NA in
#   `is_match` and are then dropped by the filter - TODO confirm how
#   stringr::fixed("") behaves.
# NOTE(review): very short or generic titles can occur inside unrelated
#   references and yield false positives - verify whether a minimum title length
#   is needed.
# NOTE(review): the joins assume `SR` is unique in both inputs; duplicated `SR`
#   values would multiply rows - verify against caller.
match_citations_fast <- function(titles_df, references_df) {
466+
467+
# Normalize the titles
468+
titles_norm <- titles_df %>%
469+
dplyr::mutate(TI_clean = stringr::str_to_upper(TI) %>%
470+
stringr::str_replace_all("[^A-Z0-9\\s]", " ") %>%
471+
stringr::str_squish())
472+
473+
# Normalize the reference lists (same transformation as the titles)
474+
refs_norm <- references_df %>%
475+
dplyr::mutate(CR_clean = stringr::str_to_upper(CR) %>%
476+
stringr::str_replace_all("[^A-Z0-9\\s]", " ") %>%
477+
stringr::str_squish())
478+
479+
# Build every possible (cited, citing) combination
480+
all_combinations <- tidyr::expand_grid(
481+
cited_SR = titles_norm$SR,
482+
citing_SR = refs_norm$SR
483+
)
484+
485+
# Attach the normalized titles
486+
all_combinations <- dplyr::left_join(
487+
all_combinations,
488+
titles_norm %>% dplyr::select(SR, TI_clean),
489+
by = c("cited_SR" = "SR")
490+
)
491+
492+
# Attach the normalized reference lists
493+
all_combinations <- dplyr::left_join(
494+
all_combinations,
495+
refs_norm %>% dplyr::select(SR, CR_clean),
496+
by = c("citing_SR" = "SR")
497+
)
498+
499+
# Find the matches: row-wise literal substring test (title inside reference list)
500+
all_combinations$is_match <- stringr::str_detect(
501+
all_combinations$CR_clean,
502+
stringr::fixed(all_combinations$TI_clean)
503+
)
504+
505+
# Keep only the matches and rename the columns to SR_cited / SR_citing
506+
matches <- all_combinations %>%
507+
dplyr::filter(is_match) %>%
508+
dplyr::select(cited_SR, citing_SR) %>%
# `rename` is the only unqualified verb here; presumably dplyr's, via package imports - confirm
509+
rename("SR_cited" = cited_SR, "SR_citing" = citing_SR)
510+
511+
return(matches)
512+
}

R/zzz.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ utils::globalVariables(c("matches", "KW_Merged"))
7979
# #' @import shinycssloaders
8080
# #' @import shinythemes
8181
#' @importFrom openxlsx write.xlsx
82+
#' @importFrom tidyr expand_grid
8283
#' @importFrom tidyr gather
8384
#' @importFrom tidyr spread
8485
#' @importFrom tidyr pivot_wider
@@ -263,6 +264,10 @@ utils::globalVariables(c("matches", "KW_Merged"))
263264
#' @importFrom Matrix updown
264265
#' @importFrom Matrix which
265266
#' @importFrom Matrix writeMM
267+
#' @importFrom stringr str_detect
268+
#' @importFrom stringr fixed
269+
#' @importFrom stringr str_to_upper
270+
#' @importFrom stringr str_squish
266271
#' @importFrom stringr str_locate_all
267272
#' @importFrom stringr str_extract_all
268273
#' @importFrom stringr str_replace_all

0 commit comments

Comments
 (0)