
Commit e879c1b

Readability and improvements for future implementations
1 parent f85d9de commit e879c1b

File tree

3 files changed: +203 -180 lines changed


.idea/vcs.xml (+6)
(Generated file: diff not rendered by default.)

finviz/async_connector.py (+34)
@@ -0,0 +1,34 @@
Added (new file, 34 lines):

import asyncio
import aiohttp


class Connector(object):

    def __init__(self, scrape_function, tasks):

        self.scrape_function = scrape_function
        self.tasks = tasks
        self.data = []

    async def __http_request__async(self, url, session):

        async with session.get(url) as response:
            page_html = await response.read()

        return self.scrape_function(page_html)

    async def __async_scraper(self):

        tasks = []
        async with aiohttp.ClientSession() as session:
            for n in self.tasks:
                tasks.append(self.__http_request__async(n, session))

            self.data = await asyncio.gather(*tasks)

    def run_connector(self):

        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.__async_scraper())

        return self.data
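
The new Connector wraps the aiohttp fan-out that previously lived inside Screener: it takes a synchronous parsing callback and a list of URLs, fetches all of them concurrently in one ClientSession, and returns the parsed results. A minimal usage sketch, assuming a throwaway parser and example screener URLs that are not part of this commit:

from finviz.async_connector import Connector


def parse_page(page_html):
    # Placeholder callback; Screener passes its own __get_table_data here.
    return len(page_html)


# Hypothetical page URLs of the kind Screener.__get_page_urls builds
# (v=110 is the 'Overview' view, r is the first row shown on each page).
urls = ['https://finviz.com/screener.ashx?v=110&r=1',
        'https://finviz.com/screener.ashx?v=110&r=21']

connector = Connector(parse_page, urls)
results = connector.run_connector()  # one parsed result per URL, in request order

Because asyncio.gather preserves task order, results line up with the URL list that was passed in.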
Screener module: +163 -180
@@ -1,180 +1,163 @@
Removed (previous implementation, 180 lines):

from lxml import html
from lxml import etree
import asyncio
import aiohttp
import requests
import urllib3
import os

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TABLE = {
    'Overview': '110',
    'Valuation': '120',
    'Ownership': '130',
    'Performance': '140',
    'Custom': '150',
    'Financial': '160',
    'Technical': '170'
}


def http_request(url, payload=None):

    if payload is None:
        payload = {}

    content = requests.get(url, params=payload, verify=False)
    content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)

    return content, content.url


class Screener(object):

    def __init__(self, tickers=None, filters=None, rows=None, order='', signal='', table='Overview'):

        if tickers is None:
            self.tickers = []
        else:
            self.tickers = tickers

        if filters is None:
            self.filters = []
        else:
            self.filters = filters

        self.rows = rows
        self.order = order
        self.signal = signal
        self.table = table
        self.page_content = None
        self.url = None
        self.headers = None
        self.page_urls = None
        self.data = None

        self.__search_screener()

    def to_csv(self, directory=None):

        from save_data import export_to_csv

        if directory is None:
            directory = os.getcwd()

        export_to_csv(self.headers, self.data, directory)

    def __get_total_rows(self):

        total_element = self.page_content.cssselect('td[width="140"]')
        self.rows = int(etree.tostring(total_element[0]).decode("utf-8").split('</b>')[1].split(' ')[0])

    def __get_page_urls(self):

        try:
            total_pages = int([i.text.split('/')[1] for i in self.page_content.cssselect('option[value="1"]')][0])
        except IndexError:  # No results found
            return None

        urls = []

        for page_number in range(1, total_pages + 1):  # 5

            sequence = 1 + (page_number - 1) * 20  # 1

            if sequence - 20 <= self.rows < sequence:  # -20 <= 3 < 1
                break
            else:
                urls.append(self.url + '&r={}'.format(str(sequence)))

        self.page_urls = urls

    def __get_table_headers(self):

        first_row = self.page_content.cssselect('tr[valign="middle"]')

        headers = []
        for table_content in first_row[0]:

            if table_content.text is None:
                sorted_text_list = etree.tostring(table_content.cssselect('img')[0]).decode("utf-8").split('/>')
                headers.append(sorted_text_list[1])
            else:
                headers.append(table_content.text)

        self.headers = headers

    def __get_table_data(self, page):

        def parse_row(line):

            row_data = []

            for tags in line:
                if tags.text is not None:
                    row_data.append(tags.text)
                else:
                    row_data.append([span.text for span in tags.cssselect('span')][0])

            return row_data

        data_sets = []
        page = html.fromstring(page)
        all_rows = [i.cssselect('a') for i in page.cssselect('tr[valign="top"]')[1:]]

        for row in all_rows:

            if int(row[0].text) is self.rows:
                values = dict(zip(self.headers, parse_row(row)))
                data_sets.append(values)
                break

            else:
                values = dict(zip(self.headers, parse_row(row)))
                data_sets.append(values)

        return data_sets

    async def __http_request__async(self, url, session):

        async with session.get(url) as response:
            page = await response.read()

        return self.__get_table_data(page)

    async def __async_scraper(self):

        tasks = []
        async with aiohttp.ClientSession() as session:
            for n in self.page_urls:
                tasks.append(self.__http_request__async(n, session))

            self.data = await asyncio.gather(*tasks)

    def __search_screener(self):

        payload = {
            'v': TABLE[self.table],
            't': ','.join(self.tickers),
            'f': ','.join(self.filters),
            'o': self.order,
            's': self.signal
        }

        self.page_content, self.url = http_request('https://finviz.com/screener.ashx', payload)
        self.page_content = html.fromstring(self.page_content.text)  # Parses the page with the default lxml parser

        self.__get_table_headers()

        if self.rows is None:
            self.__get_total_rows()

        self.__get_page_urls()

        if self.page_urls is None:
            raise Exception("No results matching the criteria: {}"
                            .format(self.url.split('?', 1)[1]))

        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.__async_scraper())
Added (refactored implementation, 163 lines):

from finviz.async_connector import Connector
from lxml import html
from lxml import etree
import requests
import urllib3
import os

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TABLE = {
    'Overview': '110',
    'Valuation': '120',
    'Ownership': '130',
    'Performance': '140',
    'Custom': '150',
    'Financial': '160',
    'Technical': '170'
}


def http_request(url, payload=None):

    if payload is None:
        payload = {}

    content = requests.get(url, params=payload, verify=False)
    content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)

    return content, content.url


class Screener(object):

    def __init__(self, tickers=None, filters=None, rows=None, order='', signal='', table='Overview'):

        if tickers is None:
            self.tickers = []
        else:
            self.tickers = tickers

        if filters is None:
            self.filters = []
        else:
            self.filters = filters

        self.rows = rows
        self.order = order
        self.signal = signal
        self.table = table
        self.page_content = None
        self.url = None
        self.headers = None
        self.page_urls = None
        self.data = None

        self.__search_screener()

    def to_csv(self, directory=None):

        from save_data import export_to_csv

        if directory is None:
            directory = os.getcwd()

        export_to_csv(self.headers, self.data, directory)

    def __get_total_rows(self):

        total_element = self.page_content.cssselect('td[width="140"]')
        self.rows = int(etree.tostring(total_element[0]).decode("utf-8").split('</b>')[1].split(' ')[0])

    def __get_page_urls(self):

        try:
            total_pages = int([i.text.split('/')[1] for i in self.page_content.cssselect('option[value="1"]')][0])
        except IndexError:  # No results found
            return None

        urls = []

        for page_number in range(1, total_pages + 1):

            sequence = 1 + (page_number - 1) * 20

            if sequence - 20 <= self.rows < sequence:
                break
            else:
                urls.append(self.url + '&r={}'.format(str(sequence)))

        self.page_urls = urls

    def __get_table_headers(self):

        first_row = self.page_content.cssselect('tr[valign="middle"]')

        headers = []
        for table_content in first_row[0]:

            if table_content.text is None:
                sorted_text_list = etree.tostring(table_content.cssselect('img')[0]).decode("utf-8").split('/>')
                headers.append(sorted_text_list[1])
            else:
                headers.append(table_content.text)

        self.headers = headers

    def __get_table_data(self, page=None):

        def parse_row(line):

            row_data = []

            for tags in line:
                if tags.text is not None:
                    row_data.append(tags.text)
                else:
                    row_data.append([span.text for span in tags.cssselect('span')][0])

            return row_data

        data_sets = []
        page = html.fromstring(page)
        all_rows = [i.cssselect('a') for i in page.cssselect('tr[valign="top"]')[1:]]

        for row in all_rows:

            if int(row[0].text) is self.rows:
                values = dict(zip(self.headers, parse_row(row)))
                data_sets.append(values)
                break

            else:
                values = dict(zip(self.headers, parse_row(row)))
                data_sets.append(values)

        return data_sets

    def __search_screener(self):

        payload = {
            'v': TABLE[self.table],
            't': ','.join(self.tickers),
            'f': ','.join(self.filters),
            'o': self.order,
            's': self.signal
        }

        self.page_content, self.url = http_request('https://finviz.com/screener.ashx', payload)
        self.page_content = html.fromstring(self.page_content.text)  # Parses the page with the default lxml parser

        self.__get_table_headers()

        if self.rows is None:
            self.__get_total_rows()

        self.__get_page_urls()

        if self.page_urls is None:
            raise Exception("No results matching the criteria: {}"
                            .format(self.url.split('?', 1)[1]))

        async_connector = Connector(self.__get_table_data, self.page_urls)
        self.data = async_connector.run_connector()
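
With this refactoring, __search_screener builds the page URLs and hands them to Connector together with the private row parser, so the event loop handling no longer lives in Screener. A minimal usage sketch; the module path, tickers, and filter string below are illustrative assumptions rather than values taken from the commit:

from finviz.screener import Screener  # assumed module path for the file shown above

stocks = Screener(tickers=['AAPL', 'MSFT'],  # hypothetical tickers
                  filters=['exch_nasd'],     # hypothetical finviz filter code
                  table='Performance')       # any key of TABLE selects the view

print(stocks.data)  # list of per-page lists of {header: value} row dictionaries
stocks.to_csv()     # export via the project's save_data.export_to_csv helper

The scrape runs inside the constructor (__init__ calls __search_screener), so the data is available as soon as the object exists.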

0 commit comments
