Skip to content

Commit b9af69d

Browse files
Merge pull request #15 from feddelegrand7/copilot/add-audio-scrap-function
2 parents 9c05d68 + 1800219 commit b9af69d

File tree

4 files changed

+254
-0
lines changed

4 files changed

+254
-0
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(attribute_scrap)
4+
export(audio_scrap)
45
export(comments_scrap)
56
export(csv_scrap)
67
export(images_noalt_scrap)

R/audio_scrap.R

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#' Scrape Audio Files from a Web Page
#'
#' @description This function is used to scrape audio file URLs from a web page
#' and optionally download them. It searches \code{<audio>} tags, the
#' \code{<source>} tags nested inside them, and \code{<a>} (anchor) tags for
#' links matching the specified audio extensions. A trailing query string or
#' fragment (e.g. \code{"track.mp3?dl=1"}) is ignored when matching.
#'
#' @param link the link of the web page to scrape. A character vector of
#'   several links is accepted; URLs found on a page are resolved against
#'   that page's own link.
#' @param extensions a character vector of audio file extensions to filter by
#'   (without the leading dot). Defaults to \code{c("mp3", "wav")}.
#' @param path the path where audio files will be downloaded. Defaults to the
#'   current working directory. Set to \code{NULL} to return URLs without
#'   downloading.
#' @param askRobot logical. Should the function ask the robots.txt if we're
#'   allowed or not to scrape the web page? Default is \code{FALSE}.
#'
#' @return called for the side effect of downloading audio files. Returns the
#'   vector of matched audio URLs invisibly, or \code{NULL} (invisibly) if
#'   none are found or if an error occurs while scraping or downloading.
#'
#' @examples \dontrun{
#'
#' # Scrape and download mp3 and wav files from a page
#' audio_scrap(
#'   link = "https://www.example.com/podcasts",
#'   extensions = c("mp3", "wav"),
#'   path = getwd()
#' )
#'
#' # Return audio URLs without downloading
#' audio_scrap(
#'   link = "https://www.example.com/podcasts",
#'   extensions = "mp3",
#'   path = NULL
#' )
#'
#' }
#'
#' @export
#' @importFrom rvest html_nodes html_attr %>%
#' @importFrom xml2 read_html
#' @importFrom robotstxt paths_allowed
#' @importFrom crayon green
#' @importFrom crayon bgRed
#' @importFrom curl has_internet
#' @importFrom utils download.file

audio_scrap <- function(link,
                        extensions = c("mp3", "wav"),
                        path = getwd(),
                        askRobot = FALSE) {

  # Argument validation ----
  # Done before any network access so bad input fails fast with a clear error.

  if (missing(link)) {
    stop("'link' is a mandatory parameter")
  }

  if (!is.character(link)) {
    stop("'link' must be provided as a character string")
  }

  if (!is.character(extensions) || length(extensions) == 0) {
    stop("'extensions' must be a non-empty character vector, e.g. c(\"mp3\", \"wav\")")
  }

  # NA or "" would otherwise be pasted into the regex and match garbage.
  if (anyNA(extensions) || !all(nzchar(extensions))) {
    stop("'extensions' must not contain missing or empty values")
  }

  if (any(grepl("^\\.", extensions))) {
    stop("No need to include the '.' in 'extensions', just provide the extension as it is (e.g. \"mp3\" not \".mp3\")")
  }

  if (!is.null(path)) {
    if (!is.character(path) || length(path) != 1 || is.na(path)) {
      stop("'path' must be a single character string or NULL")
    }
    if (!dir.exists(path)) {
      stop("the path: ", path, " doesn't seem to exist !")
    }
  }

  # Ask robot related ----
  if (askRobot) {
    # all() keeps the condition scalar when several links are supplied.
    if (isTRUE(all(paths_allowed(link)))) {
      message(green("the robot.txt doesn't prohibit scraping this web page"))
    } else {
      message(bgRed(
        "WARNING: the robot.txt doesn't allow scraping this web page"
      ))
    }
  }

  # Extension filter: ".<ext>" at the end of the URL, optionally followed by
  # a query string or fragment (e.g. "track.mp3?dl=1", "track.wav#t=10").
  ext_pattern <- paste0(
    "\\.(", paste(extensions, collapse = "|"), ")([?#].*)?$"
  )

  tryCatch(
    expr = {

      # For every page: collect candidate URLs from <audio src=>,
      # <source src=> inside <audio>, and <a href=>, keep those with a
      # requested extension, and format them with the package helper using
      # the page they came from (not the first link) as the base.
      urls_per_page <- lapply(link, function(page_link) {
        page <- read_html(page_link)

        candidates <- c(
          page %>% html_nodes("audio") %>% html_attr("src"),
          page %>% html_nodes("audio source") %>% html_attr("src"),
          page %>% html_nodes("a") %>% html_attr("href")
        )
        candidates <- candidates[!is.na(candidates)]

        hits <- candidates[grepl(ext_pattern, candidates, ignore.case = TRUE)]

        purrr::map_chr(hits, .format_url, link = page_link)
      })

      # Flatten across pages and remove duplicates.
      matched_urls <- unique(unlist(urls_per_page, use.names = FALSE))

      if (length(matched_urls) == 0) {
        message("No audio file has been found. Returning NULL.")
        return(invisible(NULL))
      }

      if (is.null(path)) {
        return(invisible(matched_urls))
      }

      for (audio_url in matched_urls) {
        # Drop any query string / fragment so the local file name is just
        # the last path component of the URL.
        file_name <- basename(sub("[?#].*$", "", audio_url))
        download.file(
          audio_url,
          destfile = file.path(path, file_name),
          mode = "wb"
        )
      }

      return(invisible(matched_urls))

    },

    error = function(cond) {

      # Any failure while scraping or downloading is reported as a message
      # and turned into an invisible NULL instead of an error.
      if (!has_internet()) {
        message(paste0("Please check your internet connexion: ", cond))
        return(invisible(NULL))
      } else if (grepl("current working directory", cond) ||
                 grepl("HTTP error 404", cond)) {
        message(paste0(
          "The URL doesn't seem to be a valid one: ",
          paste(link, collapse = ", ")
        ))
        message(paste0("Here the original error message: ", cond))
        return(invisible(NULL))
      } else {
        message(paste0("Undefined Error: ", cond))
        return(invisible(NULL))
      }

    }
  )
}

man/audio_scrap.Rd

Lines changed: 50 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-audio_scrap.R

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
2+
# Input-validation tests for audio_scrap().
#
# Every expectation pins the specific validation message rather than accepting
# any error, so a test cannot pass because of an unrelated failure (a missing
# helper, a typo, ...). All of these checks fire before any network access.

test_that("audio_scrap() errors when mandatory arguments are missing", {
  expect_error(
    audio_scrap(),
    "'link' is a mandatory parameter",
    fixed = TRUE
  )
})


test_that("audio_scrap() errors when 'link' is not a character string", {
  expect_error(
    audio_scrap(link = 12345),
    "'link' must be provided as a character string",
    fixed = TRUE
  )
})


test_that("audio_scrap() errors when 'extensions' is invalid", {
  # Wrong type.
  expect_error(
    audio_scrap(link = "https://www.example.com", extensions = 123),
    "'extensions' must be a non-empty character vector",
    fixed = TRUE
  )

  # Right type but empty.
  expect_error(
    audio_scrap(link = "https://www.example.com", extensions = character(0)),
    "'extensions' must be a non-empty character vector",
    fixed = TRUE
  )
})


test_that("audio_scrap() errors when extensions include a leading dot", {
  expect_error(
    audio_scrap(link = "https://www.example.com", extensions = ".mp3"),
    "No need to include the '.' in 'extensions'",
    fixed = TRUE
  )
})


test_that("audio_scrap() errors when path does not exist", {
  expect_error(
    audio_scrap(
      link = "https://www.example.com",
      extensions = "mp3",
      path = "/non/existent/path/xyz"
    ),
    "doesn't seem to exist",
    fixed = TRUE
  )
})

0 commit comments

Comments
 (0)