|
#' Scrape Audio Files from a Web Page
#'
#' @description This function is used to scrape audio file URLs from one or
#' more web pages and optionally download them. Candidate URLs are collected
#' from \code{<audio>} tags (including nested \code{<source>} tags) and from
#' \code{<a>} (anchor) tags, then filtered by the requested audio extensions.
#' URLs carrying a query string or fragment (e.g. \code{song.mp3?token=abc})
#' are matched as well.
#'
#' @param link the link(s) of the web page(s) to scrape, as a character
#' vector. Relative audio URLs are resolved against the page they were
#' found on.
#' @param extensions a character vector of audio file extensions to filter by
#' (without the leading dot). Defaults to \code{c("mp3", "wav")}.
#' @param path the path where audio files will be downloaded. Defaults to the
#' current working directory. Set to \code{NULL} to return URLs without
#' downloading.
#' @param askRobot logical. Should the function consult the site's robots.txt
#' file to check whether scraping the web page is allowed? Default is
#' \code{FALSE}.
#'
#' @return called for the side effect of downloading audio files. Returns the
#' vector of matched audio URLs invisibly, or \code{NULL} if none are found.
#'
#' @examples \dontrun{
#'
#' # Scrape and download mp3 and wav files from a page
#' audio_scrap(
#'   link = "https://www.example.com/podcasts",
#'   extensions = c("mp3", "wav"),
#'   path = getwd()
#' )
#'
#' # Return audio URLs without downloading
#' audio_scrap(
#'   link = "https://www.example.com/podcasts",
#'   extensions = "mp3",
#'   path = NULL
#' )
#'
#' }
#'
#' @export
#' @importFrom rvest html_nodes html_attr %>%
#' @importFrom xml2 read_html
#' @importFrom robotstxt paths_allowed
#' @importFrom crayon green
#' @importFrom crayon bgRed
#' @importFrom curl has_internet
#' @importFrom utils download.file
#' @importFrom purrr map_chr

audio_scrap <- function(link,
                        extensions = c("mp3", "wav"),
                        path = getwd(),
                        askRobot = FALSE) {

  # ---- input validation ---------------------------------------------------
  if (missing(link)) {
    stop("'link' is a mandatory parameter")
  }

  if (!is.character(link)) {
    stop("'link' must be provided as a character string")
  }

  if (!is.character(extensions) || length(extensions) == 0) {
    stop("'extensions' must be a non-empty character vector, e.g. c(\"mp3\", \"wav\")")
  }

  if (any(grepl("^\\.", extensions))) {
    stop("No need to include the '.' in 'extensions', just provide the extension as it is (e.g. \"mp3\" not \".mp3\")")
  }

  # getwd() always exists, so a single dir.exists() check covers every case
  if (!is.null(path) && !dir.exists(path)) {
    stop("the path: ", path, " doesn't seem to exist !")
  }

  ###################### Ask robot related ##################################
  if (askRobot) {
    # all(): when 'link' holds several URLs, every page must be allowed
    if (all(paths_allowed(link))) {
      message(green("the robot.txt doesn't prohibit scraping this web page"))
    } else {
      message(bgRed(
        "WARNING: the robot.txt doesn't allow scraping this web page"
      ))
    }
  }
  ###########################################################################

  tryCatch(
    expr = {

      # Match ".ext" at the end of the URL path, optionally followed by a
      # query string ("?...") or fragment ("#..."), e.g. "song.mp3?token=abc"
      ext_pattern <- paste0(
        "\\.(", paste(extensions, collapse = "|"), ")([?#]|$)"
      )

      # For each page: collect candidate URLs from <audio src=>,
      # <source src=> inside <audio>, and <a href=> anchors; filter by
      # extension; then resolve relative URLs against THAT page (resolving
      # everything against link[[1]] is wrong when several links are given)
      matched_urls <- unlist(lapply(link, function(url) {
        page <- url %>% read_html()

        candidates <- c(
          page %>% html_nodes("audio") %>% html_attr("src"),
          page %>% html_nodes("audio source") %>% html_attr("src"),
          page %>% html_nodes("a") %>% html_attr("href")
        )
        candidates <- candidates[!is.na(candidates)]

        hits <- candidates[grepl(ext_pattern, candidates, ignore.case = TRUE)]

        # Resolve relative URLs to absolute ones (project helper)
        purrr::map_chr(hits, .format_url, link = url)
      }))

      if (length(matched_urls) == 0) {
        message("No audio file has been found. Returning NULL.")
        return(invisible(NULL))
      }

      # Remove duplicates (the same file may be linked from several pages)
      matched_urls <- unique(matched_urls)

      if (is.null(path)) {
        return(invisible(matched_urls))
      }

      for (i in seq_along(matched_urls)) {
        # Strip any query string / fragment before building the local file
        # name, and use file.path() for a portable destination path
        destfile <- file.path(
          path,
          sub("[?#].*$", "", basename(matched_urls[i]))
        )
        download.file(matched_urls[i], destfile = destfile, mode = "wb")
      }

      return(invisible(matched_urls))

    },

    error = function(cond) {

      # Work on the plain message text, not the condition object itself
      msg <- conditionMessage(cond)

      if (!has_internet()) {
        message(paste0("Please check your internet connexion: ", msg))
        return(invisible(NULL))
      } else if (grepl("current working directory", msg) ||
                 grepl("HTTP error 404", msg)) {
        message(paste0("The URL doesn't seem to be a valid one: ", link))
        message(paste0("Here the original error message: ", msg))
        return(invisible(NULL))
      } else {
        message(paste0("Undefined Error: ", msg))
        return(invisible(NULL))
      }

    }
  )
}
0 commit comments