-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_scape_CSx.py
More file actions
74 lines (50 loc) · 1.76 KB
/
text_scape_CSx.py
File metadata and controls
74 lines (50 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Script for scraping text off a specific website configuration; not general purpose.
# Saves the scraped text to a text file.
import time
import urllib.request
from bs4 import BeautifulSoup, Comment
import re
import random
# Site root and the sub-path under it. Their concatenation, mainurl, is both
# the index page that getlinks() fetches and the string a link must contain
# to be kept.
toplevel = "http://*******URL REMOVED*********"
subfolder = "/************/"
mainurl = toplevel + subfolder
# Output file; the main loop opens it in append mode.
savefile = "data.txt"
# Start and end marker words on each page; the text between them is scraped.
start = "************ "
end = "********"
# Upper bound of the random pause between requests, in milliseconds
# (the main loop picks randint(500, delay) and divides by 1000 for time.sleep).
delay = 2000
def getlinks(url_in):
    """Return the href of every anchor on the page at url_in that contains mainurl.

    Args:
        url_in: URL of the page whose links are collected.

    Returns:
        A list of href strings in document order (duplicates are kept).

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # The with-block closes the HTTP response once BeautifulSoup has read it.
    with urllib.request.urlopen(url_in) as resp:
        soup = BeautifulSoup(
            resp,
            'html.parser',
            from_encoding=resp.info().get_param('charset'),
        )
    # Literal substring test rather than re.search(mainurl, ...): a URL is not
    # a regex, and its '.', '?', '+' etc. would otherwise be interpreted as
    # metacharacters (over-matching or raising re.error).
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if mainurl in anchor['href']
    ]
def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    """
    non_rendered = ['style', 'script', 'head', 'title', 'meta', '[document]']
    # The parent check runs first; the Comment check only runs if it passes.
    hidden = element.parent.name in non_rendered or isinstance(element, Comment)
    return not hidden
def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    Args:
        body: HTML markup, passed straight to BeautifulSoup.

    Returns:
        Every text node accepted by tag_visible, stripped of surrounding
        whitespace and joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields the same text nodes.
    text_nodes = soup.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))
# Collect the links to scrape from the index page.
all_links = getlinks(mainurl)

# start/end are literal marker words, so escape them before building the
# pattern; compiled once here instead of on every loop iteration.
entry_pattern = re.compile(f'{re.escape(start)}(.*?){re.escape(end)}')

# Append one entry per link. The with-block guarantees the file is flushed
# and closed even if a fetch or parse raises part-way through. UTF-8 is set
# explicitly so scraped non-ASCII text cannot fail on a narrow locale encoding.
with open(savefile, "a", encoding="utf-8") as f:
    for url in all_links:
        # Close each HTTP response as soon as its body has been read.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        full_text = text_from_html(html)
        # Concatenate every start..end capture, then append the end marker once.
        result = ''.join(entry_pattern.findall(full_text)) + end
        f.writelines([("ENTRY: " + url + "\n"), (result + "\n"), "\n\n"])
        print("Wrote entry for: ", url)
        # Random pause of 0.5 s up to delay/1000 s between requests.
        time.sleep(random.randint(500, delay) / 1000)