main.py
"""Archive news posts from the Herzog-Ernst-Gymnasium website.

Parse the site's RSS feed, save each post's HTML content block under
./data/<post-id>--<post-title>/, and download the PDFs and images
linked in each post.
"""
import os

import feedparser
import requests
import wget
from bs4 import BeautifulSoup

def main():
    get_extended_feed = input("Do you want to fetch the latest 20 posts instead of 10? [y/N] ").lower()
    FEED_URL = "https://heg-uelzen.de/hpp/rss.xml"
    feed = feedparser.parse(FEED_URL)
    feeditems = feed['entries']
    # base directory for the downloaded data
    datapath = "./data"

    # iterate over the feed items to fetch them
    for item in feeditems:
        # create a directory for the post, named <post-id>--<post-title>
        post_id = item.id.replace(" at https://heg-uelzen.de/hpp", "")
        dirname = post_id + "--" + item.title.replace(" ", "_").replace("/", "-").replace("!", "")
        newpath = os.path.join(datapath, dirname)
        try:
            os.makedirs(newpath)
            print("Created directory '%s'" % newpath)
        except OSError as error:
            print(error)

        # fetch the post and write its content block to an HTML file
        res = requests.get(item.link)
        soup = BeautifulSoup(res.content, 'html.parser')
        title = soup.title.text.replace(" | Herzog-Ernst-Gymnasium", "")
        content = soup.find('div', class_="node__content")
        if content is None:
            print("No content block found for '%s', skipping" % title)
            continue
        try:
            with open(os.path.join(newpath, "_post.html"), "w", encoding="utf-8") as output_file:
                output_file.write(str(content))
            print("Wrote output file for '%s'" % title)
        except OSError as error:
            print(error)

        # download the PDF files linked in the post (the site uses relative hrefs)
        for element in content.find_all('a'):
            href = element.get('href')
            if href and ".pdf" in href:
                link = "https://heg-uelzen.de" + href
                filename = link.split("/")[-1]
                response = requests.get(link)
                with open(os.path.join(newpath, filename), "wb") as pdf:
                    pdf.write(response.content)
                print("File " + filename + " downloaded")

        # download the images embedded in the post, dropping any query string
        for image in content.find_all('img'):
            src = image.get('src')
            if not src:
                continue
            image_link = "https://heg-uelzen.de" + src.split("?")[0]
            img_filename = image_link.split("/")[-1]
            try:
                img_download = wget.download(image_link, out=os.path.join(newpath, img_filename))
                print("\nDownloaded image: " + img_download)
            except Exception as err:
                print("ERROR: " + str(err))

    print("\n")
    # optionally scrape the second news page as well; the RSS feed itself only
    # contains the 10 newest posts. Accept English "y(es)" and German "j(a)".
    if get_extended_feed.startswith("y") or get_extended_feed.startswith("j"):
        print("Getting extended feed...")
        NEWSPAGE_TWO = "https://heg-uelzen.de/hpp/?page=1"
        page_two_res = requests.get(NEWSPAGE_TWO)
        page_two_soup = BeautifulSoup(page_two_res.content, 'html.parser')
        page_two_list = page_two_soup.find_all('div', class_="views-row")
        for post in page_two_list:
            post_title = post.find('span', class_="field field--name-title field--type-string field--label-hidden").text
            post_id = post.find('article', class_="node node--type-article node--promoted node--view-mode-teaser").get('data-history-node-id')
            dirname = post_id + "--" + post_title.replace(" ", "_").replace("/", "-").replace("!", "")
            newpath = os.path.join(datapath, dirname)
            try:
                os.makedirs(newpath)
                print("Created directory '%s'" % newpath)
            except OSError as error:
                print(error)

            # fetch the post via its node URL and write its content block to file
            page_url = "https://heg-uelzen.de/hpp/node/" + post_id
            page = requests.get(page_url)
            page_soup = BeautifulSoup(page.content, 'html.parser')
            title = page_soup.title.text.replace(" | Herzog-Ernst-Gymnasium", "")
            content = page_soup.find('div', class_="node__content")
            if content is None:
                print("No content block found for '%s', skipping" % title)
                continue
            try:
                with open(os.path.join(newpath, "_post.html"), "w", encoding="utf-8") as output_file:
                    output_file.write(str(content))
                print("Wrote output file for '%s'" % title)
            except OSError as error:
                print(error)

            # download the PDF files linked in the post
            for element in content.find_all('a'):
                href = element.get('href')
                if href and ".pdf" in href:
                    link = "https://heg-uelzen.de" + href
                    filename = link.split("/")[-1]
                    response = requests.get(link)
                    with open(os.path.join(newpath, filename), "wb") as pdf:
                        pdf.write(response.content)
                    print("File " + filename + " downloaded")

            # download the images embedded in the post, dropping any query string
            for image in content.find_all('img'):
                src = image.get('src')
                if not src:
                    continue
                image_link = "https://heg-uelzen.de" + src.split("?")[0]
                img_filename = image_link.split("/")[-1]
                try:
                    img_download = wget.download(image_link, out=os.path.join(newpath, img_filename))
                    print("\nDownloaded image: " + img_download)
                except Exception as err:
                    print("ERROR: " + str(err))

    print("\nFinished!")


if __name__ == "__main__":
    main()
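
# Usage (the package names below are the usual PyPI distributions for these
# imports; the repo itself does not pin them, so treat this as a sketch):
#
#   pip install requests wget beautifulsoup4 feedparser
#   python main.py
#
# Answer "y" (or German "j") at the prompt to also scrape the second news
# page, i.e. the latest 20 posts instead of 10.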