-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbat_img_scraper.py
More file actions
61 lines (47 loc) · 2.33 KB
/
bat_img_scraper.py
File metadata and controls
61 lines (47 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Scrape script for BAT galleries: saves each gallery's large images into a directory named after the page title.
import requests
from bs4 import BeautifulSoup
import json
import os
import re
special_id = 'data-gallery-items'  # attribute on the gallery <div> whose value is the JSON list of images
# The URLs of the pages to scrape (list)
urls = ['http://www.example12345667.com/page1','http://www.example12345667.com/page2','http://etc']

# Seconds to wait on any single HTTP request; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def _gallery_dir_name(soup, url):
    """Return a filesystem-safe directory name for the page parsed into *soup*.

    The page title is used with characters that are illegal in file/directory
    names removed. If the page has no usable title, a name derived from *url*
    is returned instead so the page can still be saved.
    """
    # soup.title is None when there is no <title>; .string may itself be None.
    raw_title = soup.title.string if soup.title is not None else None
    cleaned = re.sub(r'[\\/*?:"<>|]', "", raw_title or "").strip()
    if cleaned:
        return cleaned
    return re.sub(r'[^A-Za-z0-9._-]+', '_', url).strip('_') or 'gallery'


def _scrape_gallery(url):
    """Download every large gallery image found on the page at *url*.

    Images are written as ``image_large_<index>.jpg`` inside a directory named
    after the page title. Every failure (network error, non-200 status, missing
    gallery div, malformed JSON, item without a large image URL) is reported
    with ``print`` and skipped, so one bad page or image never aborts the run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'Failed to get content of the URL {url}: {exc}')
        return
    if response.status_code != 200:
        print(f'Failed to get content of the URL with status code: {response.status_code}')
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when nothing matches; subscripting that would raise
    # TypeError (not KeyError), so test for it explicitly.
    gallery_div = soup.find('div', attrs={special_id: True})
    if gallery_div is None:
        print("Could not find div with attribute ", special_id)
        return

    try:
        data = json.loads(gallery_div[special_id])
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        print(f'Could not parse JSON in attribute {special_id} of {url}: {exc}')
        return
    if not isinstance(data, list):
        print(f'Unexpected JSON in attribute {special_id} of {url}: expected a list')
        return

    # Create the output directory only once the gallery data is known to be usable.
    target_dir = _gallery_dir_name(soup, url)
    os.makedirs(target_dir, exist_ok=True)

    for idx, item in enumerate(data):
        try:
            img_url = item['large']['url']
        except (KeyError, TypeError):
            print(f'Gallery item {idx} has no large image URL, skipping it')
            continue

        try:
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f'Failed to get image from URL {img_url}: {exc}')
            continue
        if img_response.status_code != 200:
            print(f'Failed to get image from URL {img_url} with status code: {img_response.status_code}')
            continue

        img_path = os.path.join(target_dir, f'image_large_{idx}.jpg')
        with open(img_path, 'wb') as file:
            file.write(img_response.content)
        # Report the path that was written, not the file object's repr.
        print("WROTE FILE: ", img_path)


for url in urls:
    _scrape_gallery(url)
    print("FINISHED with ", url)
print("FINISHED OVERALL.")