-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhumblebee.py
67 lines (51 loc) · 1.83 KB
/
humblebee.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 22:57:50 2020
@author: crystalhansen
"""
from lxml import html
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
# Capture ONE timestamp at import time so every log and article file written
# during this run shares the same "_MM-DD-YYYY_HH-MM-SS" suffix.
dt = datetime.now() #+ timedelta(hours=1) # adds an hour, for a condition where the time is less than one hour ahead
# Filesystem-safe formatting (no ":" or spaces), used in file names below.
d = dt.strftime("%m-%d-%Y_%H-%M-%S")
def logInfo(action, link):
    """Append one trace entry to this run's log file.

    Parameters
    ----------
    action : str
        Short label for the event; also becomes part of the log file name.
    link : str
        URL (or other text payload) being recorded.

    Notes
    -----
    Uses the module-level run timestamp ``d`` so every entry from one run
    lands in the same file.  Assumes the ``humblebee/log/`` directory
    already exists — TODO confirm it is created before the first call.
    """
    print("info Logged" + link)
    fileName = "humblebee/log/" + action + "_" + d + ".txt"
    # Context manager guarantees the handle is closed even if write() raises
    # (the original leaked the descriptor on any exception before close()).
    with open(fileName, "a") as f:
        f.write(action + "\n" + link + ";\n\n")
# Crawl the recipe index page, then for every article link found:
#   step 1: build the index URL
#   step 2: request it and parse the response
#   step 3: iterate <article> elements and their anchors
#   step 4: fetch each linked page, log it, and write its title + body
#           text ("entry-content" divs) to a per-article file.
basicURL = "https://www.humblebeeandme.com/recipe-index/"
response = requests.get(basicURL)
soup = BeautifulSoup(response.text, 'lxml')
#websiteBaseUrl = "https://www.humblebeeandme.com/"

for articleTag in soup.find_all('article'):
    # NOTE: distinct names for the <article> Tag and the extracted URL —
    # the original reused one variable for both, clobbering the Tag.
    for anchor in articleTag.find_all('a', href=True):
        link = anchor['href']
        logInfo("href: ", link)
        articleResponse = requests.get(link)
        articleSoup = BeautifulSoup(articleResponse.text, 'lxml')
        # Some pages may lack a <title> (or have an empty one); fall back to
        # the URL so file naming below still works.  TODO confirm fallback
        # choice against the site's actual pages.
        if articleSoup.title is not None and articleSoup.title.string:
            soupTitle = articleSoup.title.string
        else:
            soupTitle = link
        print(soupTitle)
        logInfo("title", soupTitle)
        # "/" in a title would be read as a path separator in the file name.
        soupTitle = soupTitle.replace("/", "_")
        fileN = "humblebee/articles/" + soupTitle + "_" + d + ".txt"
        # Context manager closes the file even if a write raises mid-loop
        # (the original could leak the handle on any exception).
        with open(fileN, "w") as f:
            f.write(soupTitle + "\n" + link + "\n")
            for contentDiv in articleSoup.find_all('div', class_="entry-content"):
                f.write(contentDiv.text.strip())