-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwebscraper.py
31 lines (20 loc) · 907 Bytes
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from urllib import urlopen
import requests
from bs4 import BeautifulSoup
import re
# Landing page of the Medium publication to scrape.
page = "https://medium.com/NameOfThePublication"

# Matches the target of a CSS ``url(...)`` value. CSS allows the URL to be
# double-quoted, single-quoted or bare, so the quotes are optional here and
# are kept out of the captured group.
_CSS_URL = re.compile(r"url\(\s*[\"']?(.*?)[\"']?\s*\)")


def extract_image_url(style):
    """Return the first ``url(...)`` target found in an inline CSS style.

    ``style`` is the raw text of an element's ``style`` attribute. It may be
    ``None`` or empty when the element has no such attribute. Returns the URL
    without surrounding quotes, or ``None`` when the style has no ``url(...)``.
    """
    match = _CSS_URL.search(style or "")
    if match is None:
        return None
    return match.group(1)


def main():
    """Fetch ``page`` and print image URL, link, title and summary per story.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so an error page is never parsed as if it were the story list.
    """
    # A browser-like User-Agent is sent with the request; the timeout keeps
    # the script from hanging forever on an unresponsive server.
    response = requests.get(
        page, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    story_links = soup.find_all('a', attrs={'class': 'u-block'})
    text_blocks = soup.find_all('div', attrs={'class': 'u-letterSpacingTight'})
    print(len(story_links), len(text_blocks))

    # The text blocks alternate title, summary, title, summary, ... so the
    # even-indexed entries are titles and the odd-indexed ones summaries.
    # zip() stops at the shortest sequence, so a page with an incomplete
    # trailing pair no longer ends in an IndexError.
    titles = text_blocks[0::2]
    summaries = text_blocks[1::2]
    for link, title, summary in zip(story_links, titles, summaries):
        # Fall back to an empty line when an attribute is missing so every
        # story still produces exactly four lines of output.
        print(extract_image_url(link.get('style')) or "")  # The url to the image for the story
        print(link.get('href', ""))  # The link to the story
        print(title.text)  # The title of the story
        print(summary.text)  # The summary of the story


if __name__ == "__main__":
    main()