-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_scraper.py
More file actions
95 lines (74 loc) · 3.25 KB
/
Copy pathdata_scraper.py
File metadata and controls
95 lines (74 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import time
import requests
import progressbar
from bs4 import BeautifulSoup
# Define the base URL of the initial website and the directory to save downloaded files
base_url = 'https://whoicf2.whoi.edu/science/B/whalesounds/'
download_dir = 'Data'
widgets = ['\x1b[36m Processed: \t',
progressbar.Counter(),
progressbar.GranularBar(markers=" ▁▂▃▄▅▆▇█", left=' | ', right=' |'),
' (', progressbar.ETA(), ') ',
]
# Function to transform a string for use as a folder name
def transform_folder_name(name):
# Remove special characters and replace spaces with underscores
name = ''.join(char if char.isalnum() or char.isspace() else '' for char in name)
# Convert to lowercase
name = name.lower()
# Replace spaces with underscores
name = name.replace(' ', '_')
return name
# Function to download a file and save it to a specified directory
def download_file(url, save_path):
response = requests.get(url, stream=True)
with open(save_path, 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
file.write(chunk)
# Function to scrape a page, extract links, and download files
def scrape_page(url):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Find all links to follow on the page
links = soup.find_all('a', href=True, )
i = 0
print('Scraping Page')
for link in links:
link_url = link['href']
link_text = link.find('h3') # Extract the text within the h3 tag
if link_url and link_text:
i = i + 1
link_text = link_text.text
folder_name = transform_folder_name(link_text)
link_url = base_url + link_url
folder_path = os.path.join(download_dir, folder_name)
os.makedirs(folder_path, exist_ok=True)
print('Downloading (' + str(i) + '/32) : ' + link_text)
download_audio_files(link_url, folder_path)
# Function to download audio files for a given link and folder
def download_audio_files(url, folder_path):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Find and download audio files
audio_links = soup.find_all('a', href=True, string='Download')
total_links = len(audio_links)
with progressbar.ProgressBar(max_value=total_links,
widgets=widgets, ).start() as bar:
for i, audio_link in enumerate(audio_links, start=1):
audio_url = 'https://whoicf2.whoi.edu' + audio_link['href']
audio_name = os.path.basename(audio_url)
download_path = os.path.join(folder_path, audio_name)
# Check if the file already exists, and only download if it doesn't
if not os.path.exists(download_path):
download_file(audio_url, download_path)
else:
#Keeps the printing from showing weirdly when content is already downloaded
time.sleep(0.01)
bar.update(i)
# Main script to start the download process
if __name__ == '__main__':
initial_url = 'https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm'
scrape_page(initial_url)
print("Download completed.")