-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
78 lines (64 loc) · 2.43 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import requests
from bs4 import BeautifulSoup
import pandas as pd
# page_url = 'https://www.politifact.com/'
# page = requests.get(page_url)
# Module-level accumulators; scrape() appends to these lists on every call.
authors = []  # author names parsed from each statement footer
date = []  # publication dates parsed from each statement footer
statement = []  # text of the link found inside each statement quote
sources = []  # text collected from the statement meta elements
truth_meter = []  # alt text of the image inside each statement meter
# soup = BeautifulSoup(page.content, "html.parser")
# ig_posts = soup.find_all("div", class_= 'm-statement__quote')# Statement location
# images = soup.find_all("div", class_ = 'm-statement__meter')
# footer = soup.find_all("footer", class_ ='m-statement__footer')
# meta_source = soup.find_all("a", class_= "m-statement__name")
# true_news = soup.find_all("div", class_ = 'm-statement__content')
# python_job_element = [h2_element.parent.parent.parent for h2_element in python_jobs]
# for posts in ig_posts:
# post = posts.find("a")
# print(post.text.strip())
# print()
# image_sources = []
# true_content = []
# for item in images:
# image_sources.append(item.get('src'))
# for pics in image_sources:
# if pics == 'https://static.politifact.com/politifact/rulings/meter-mostly-true.jpg':
# true_content.append(pics)
# print(true_content)
def _split_footer(text):
    """Split a statement footer into ``(author, published)`` strings.

    Assumes footers look like ``By <name> <separator> <month> <day>, <year>``,
    which is the layout the previous fixed-index parsing implied (TODO confirm
    against the live markup). Unlike fixed indices, this copes with author
    names of any length and never raises on short or empty text.
    """
    tokens = text.split()
    # Drop the leading "By" so only the name remains in the byline.
    if tokens and tokens[0].lower() == 'by':
        tokens = tokens[1:]
    # The byline and the date are separated by a punctuation-only token;
    # if none is present, everything is treated as the author.
    separator = next(
        (i for i, token in enumerate(tokens)
         if not any(ch.isalnum() for ch in token)),
        len(tokens),
    )
    return ' '.join(tokens[:separator]), ' '.join(tokens[separator + 1:])


def scrape(page_url='https://www.politifact.com/', timeout=30):
    """Download a page and append its fact-check data to the module lists.

    Appends to the module-level ``authors``, ``date``, ``statement``,
    ``sources`` and ``truth_meter`` lists; nothing is returned. Each list is
    filled from its own CSS class, so the lists are only index-aligned when
    every statement on the page carries all of the elements.

    Args:
        page_url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    page = requests.get(page_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    for footer in soup.find_all("footer", class_='m-statement__footer'):
        author, published = _split_footer(footer.text.strip())
        authors.append(author)
        date.append(published)

    for quote in soup.find_all("div", class_='m-statement__quote'):
        anchor = quote.find("a")
        # Fall back to the quote's own text when it holds no link.
        holder = quote if anchor is None else anchor
        statement.append(holder.text.strip())

    # Match the meta container on any tag: the previous selector required an
    # <a> with this class and then looked for another <a> nested inside it,
    # which yields no usable result.
    for meta in soup.find_all(class_="m-statement__meta"):
        anchor = meta.find("a")
        holder = meta if anchor is None else anchor
        sources.append(holder.text.strip())

    for meter in soup.find_all("div", class_='m-statement__meter'):
        wrapper = meter.find("div", class_='c-image')
        img = wrapper.find('img') if wrapper is not None else None
        # Record None for a missing image rather than raising AttributeError.
        truth_meter.append(img.get('alt') if img is not None else None)
# Populate the module-level lists as soon as this file is executed or imported.
scrape()