scraper.py
import requests
from bs4 import BeautifulSoup
import json

# Austrian postal codes start with a digit from 1 to 9; each listing page
# groups municipalities by that leading digit.
base_url = "https://www.gemeinden.at/gemeinden/plz/"
urls = [f"{base_url}{x}" for x in range(1, 10)]

# Collect the extracted entries from all URLs.
all_data = []

for url in urls:
    response = requests.get(url, timeout=30)  # avoid hanging on a dead connection
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        data = []
        rows = soup.find_all('tr')
        for row in rows:
            try:
                # Each table row is expected to hold the municipality name
                # in a bold span and the postal code in a link.
                name = row.find('span', style='font-weight:bold;').text.strip()
                plz = row.find('a').text.strip()
                entry = {
                    'Name': name,
                    'PLZ': plz
                }
                data.append(entry)
            except AttributeError as e:
                # Rows without the expected markup (e.g. header rows) are skipped.
                print(f"Error occurred: {e}")
        # Append the data from this URL to the combined list.
        all_data.extend(data)
    else:
        print(f"Failed to retrieve the web page at URL: {url}. Status code: {response.status_code}")

# Store all the data in a single JSON file; ensure_ascii=False keeps
# umlauts and other special characters readable in the output.
with open('plz.json', 'w', encoding='utf-8') as json_file:
    json.dump(all_data, json_file, ensure_ascii=False, indent=4)

print("Data extracted and saved to 'plz.json'")
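The parsing step above assumes each table row carries the municipality name in a bold span and the postal code in a link. A minimal sketch of that assumption against hypothetical markup (the real structure of gemeinden.at may differ):

from bs4 import BeautifulSoup

# Hypothetical row markup matching what the scraper expects; the actual
# HTML served by gemeinden.at may be structured differently.
sample_row = """
<tr>
  <td><span style="font-weight:bold;">Wien</span></td>
  <td><a href="/gemeinden/plz/1010">1010</a></td>
</tr>
"""
row = BeautifulSoup(sample_row, 'html.parser').find('tr')
print(row.find('span', style='font-weight:bold;').text.strip())  # Wien
print(row.find('a').text.strip())  # 1010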
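Once the script has run, plz.json holds a flat list of name/PLZ pairs. A quick sanity check of the output, assuming the file was just written by the scraper:

import json

with open('plz.json', encoding='utf-8') as f:
    entries = json.load(f)

# Each entry is a dict of the form {'Name': ..., 'PLZ': ...}.
print(f"{len(entries)} municipalities scraped")
print(entries[:3])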