-from lxml import html
-from lxml import etree
-import asyncio
-import aiohttp
-import requests
-import urllib3
-import os
-
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-TABLE = {
-    'Overview': '110',
-    'Valuation': '120',
-    'Ownership': '130',
-    'Performance': '140',
-    'Custom': '150',
-    'Financial': '160',
-    'Technical': '170'
-}
-
-
-def http_request(url, payload=None):
-
-    if payload is None:
-        payload = {}
-
-    content = requests.get(url, params=payload, verify=False)
-    content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
-
-    return content, content.url
-
-
-class Screener(object):
-
-    def __init__(self, tickers=None, filters=None, rows=None, order='', signal='', table='Overview'):
-
-        if tickers is None:
-            self.tickers = []
-        else:
-            self.tickers = tickers
-
-        if filters is None:
-            self.filters = []
-        else:
-            self.filters = filters
-
-        self.rows = rows
-        self.order = order
-        self.signal = signal
-        self.table = table
-        self.page_content = None
-        self.url = None
-        self.headers = None
-        self.page_urls = None
-        self.data = None
-
-        self.__search_screener()
-
-    def to_csv(self, directory=None):
-
-        from save_data import export_to_csv
-
-        if directory is None:
-            directory = os.getcwd()
-
-        export_to_csv(self.headers, self.data, directory)
-
-    def __get_total_rows(self):
-
-        total_element = self.page_content.cssselect('td[width="140"]')
-        self.rows = int(etree.tostring(total_element[0]).decode("utf-8").split('</b>')[1].split(' ')[0])
-
-    def __get_page_urls(self):
-
-        try:
-            total_pages = int([i.text.split('/')[1] for i in self.page_content.cssselect('option[value="1"]')][0])
-        except IndexError:  # No results found
-            return None
-
-        urls = []
-
-        for page_number in range(1, total_pages + 1):  # 5
-
-            sequence = 1 + (page_number - 1) * 20  # 1
-
-            if sequence - 20 <= self.rows < sequence:  # -20 <= 3 < 1
-                break
-            else:
-                urls.append(self.url + '&r={}'.format(str(sequence)))
-
-        self.page_urls = urls
-
-    def __get_table_headers(self):
-
-        first_row = self.page_content.cssselect('tr[valign="middle"]')
-
-        headers = []
-        for table_content in first_row[0]:
-
-            if table_content.text is None:
-                sorted_text_list = etree.tostring(table_content.cssselect('img')[0]).decode("utf-8").split('/>')
-                headers.append(sorted_text_list[1])
-            else:
-                headers.append(table_content.text)
-
-        self.headers = headers
-
-    def __get_table_data(self, page):
-
-        def parse_row(line):
-
-            row_data = []
-
-            for tags in line:
-                if tags.text is not None:
-                    row_data.append(tags.text)
-                else:
-                    row_data.append([span.text for span in tags.cssselect('span')][0])
-
-            return row_data
-
-        data_sets = []
-        page = html.fromstring(page)
-        all_rows = [i.cssselect('a') for i in page.cssselect('tr[valign="top"]')[1:]]
-
-        for row in all_rows:
-
-            if int(row[0].text) is self.rows:
-                values = dict(zip(self.headers, parse_row(row)))
-                data_sets.append(values)
-                break
-
-            else:
-                values = dict(zip(self.headers, parse_row(row)))
-                data_sets.append(values)
-
-        return data_sets
-
-    async def __http_request__async(self, url, session):
-
-        async with session.get(url) as response:
-            page = await response.read()
-
-        return self.__get_table_data(page)
-
-    async def __async_scraper(self):
-
-        tasks = []
-        async with aiohttp.ClientSession() as session:
-            for n in self.page_urls:
-                tasks.append(self.__http_request__async(n, session))
-
-            self.data = await asyncio.gather(*tasks)
-
-    def __search_screener(self):
-
-        payload = {
-            'v': TABLE[self.table],
-            't': ','.join(self.tickers),
-            'f': ','.join(self.filters),
-            'o': self.order,
-            's': self.signal
-        }
-
-        self.page_content, self.url = http_request('https://finviz.com/screener.ashx', payload)
-        self.page_content = html.fromstring(self.page_content.text)  # Parses the page with the default lxml parser
-
-        self.__get_table_headers()
-
-        if self.rows is None:
-            self.__get_total_rows()
-
-        self.__get_page_urls()
-
-        if self.page_urls is None:
-            raise Exception("No results matching the criteria: {}"
-                            .format(self.url.split('?', 1)[1]))
-
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(self.__async_scraper())
+from finviz.async_connector import Connector
+from lxml import html
+from lxml import etree
+import requests
+import urllib3
+import os
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+TABLE = {
+    'Overview': '110',
+    'Valuation': '120',
+    'Ownership': '130',
+    'Performance': '140',
+    'Custom': '150',
+    'Financial': '160',
+    'Technical': '170'
+}
+
+
+def http_request(url, payload=None):
+
+    if payload is None:
+        payload = {}
+
+    content = requests.get(url, params=payload, verify=False)
+    content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
+
+    return content, content.url
+
+
+class Screener(object):
+
+    def __init__(self, tickers=None, filters=None, rows=None, order='', signal='', table='Overview'):
+
+        if tickers is None:
+            self.tickers = []
+        else:
+            self.tickers = tickers
+
+        if filters is None:
+            self.filters = []
+        else:
+            self.filters = filters
+
+        self.rows = rows
+        self.order = order
+        self.signal = signal
+        self.table = table
+        self.page_content = None
+        self.url = None
+        self.headers = None
+        self.page_urls = None
+        self.data = None
+
+        self.__search_screener()
+
+    def to_csv(self, directory=None):
+
+        from save_data import export_to_csv
+
+        if directory is None:
+            directory = os.getcwd()
+
+        export_to_csv(self.headers, self.data, directory)
+
+    def __get_total_rows(self):
+
+        total_element = self.page_content.cssselect('td[width="140"]')
+        self.rows = int(etree.tostring(total_element[0]).decode("utf-8").split('</b>')[1].split(' ')[0])
+
+    def __get_page_urls(self):
+
+        try:
+            total_pages = int([i.text.split('/')[1] for i in self.page_content.cssselect('option[value="1"]')][0])
+        except IndexError:  # No results found
+            return None
+
+        urls = []
+
+        for page_number in range(1, total_pages + 1):
+
+            sequence = 1 + (page_number - 1) * 20
+
+            if sequence - 20 <= self.rows < sequence:
+                break
+            else:
+                urls.append(self.url + '&r={}'.format(str(sequence)))
+
+        self.page_urls = urls
+
+    def __get_table_headers(self):
+
+        first_row = self.page_content.cssselect('tr[valign="middle"]')
+
+        headers = []
+        for table_content in first_row[0]:
+
+            if table_content.text is None:
+                sorted_text_list = etree.tostring(table_content.cssselect('img')[0]).decode("utf-8").split('/>')
+                headers.append(sorted_text_list[1])
+            else:
+                headers.append(table_content.text)
+
+        self.headers = headers
+
+    def __get_table_data(self, page=None):
+
+        def parse_row(line):
+
+            row_data = []
+
+            for tags in line:
+                if tags.text is not None:
+                    row_data.append(tags.text)
+                else:
+                    row_data.append([span.text for span in tags.cssselect('span')][0])
+
+            return row_data
+
+        data_sets = []
+        page = html.fromstring(page)
+        all_rows = [i.cssselect('a') for i in page.cssselect('tr[valign="top"]')[1:]]
+
+        for row in all_rows:
+
+            if int(row[0].text) is self.rows:
+                values = dict(zip(self.headers, parse_row(row)))
+                data_sets.append(values)
+                break
+
+            else:
+                values = dict(zip(self.headers, parse_row(row)))
+                data_sets.append(values)
+
+        return data_sets
+
+    def __search_screener(self):
+
+        payload = {
+            'v': TABLE[self.table],
+            't': ','.join(self.tickers),
+            'f': ','.join(self.filters),
+            'o': self.order,
+            's': self.signal
+        }
+
+        self.page_content, self.url = http_request('https://finviz.com/screener.ashx', payload)
+        self.page_content = html.fromstring(self.page_content.text)  # Parses the page with the default lxml parser
+
+        self.__get_table_headers()
+
+        if self.rows is None:
+            self.__get_total_rows()
+
+        self.__get_page_urls()
+
+        if self.page_urls is None:
+            raise Exception("No results matching the criteria: {}"
+                            .format(self.url.split('?', 1)[1]))
+
+        async_connector = Connector(self.__get_table_data, self.page_urls)
+        self.data = async_connector.run_connector()
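
The new `finviz.async_connector` module itself is not part of this commit, so its exact contents are unknown here. Judging only from the call sites (`Connector(self.__get_table_data, self.page_urls)` followed by `run_connector()`), it presumably repackages the aiohttp/asyncio logic that the removed `__http_request__async` and `__async_scraper` methods used to hold. A minimal sketch of what such a class could look like, under that assumption:

```python
# Hypothetical sketch only: the real finviz/async_connector.py is not shown in
# this diff. It assumes Connector wraps the aiohttp logic removed above, taking
# a scrape callback and a list of URLs and returning the gathered results.
import asyncio
import aiohttp


class Connector(object):

    def __init__(self, scrape_function, urls):
        self.scrape_function = scrape_function  # e.g. Screener.__get_table_data
        self.urls = urls                        # e.g. Screener.page_urls

    async def __http_request__async(self, url, session):
        # Download one result page and hand the raw bytes to the scrape callback
        async with session.get(url) as response:
            page = await response.read()

        return self.scrape_function(page)

    async def __async_scraper(self):
        # Schedule one request per URL and gather the parsed rows concurrently
        tasks = []
        async with aiohttp.ClientSession() as session:
            for url in self.urls:
                tasks.append(self.__http_request__async(url, session))

            return await asyncio.gather(*tasks)

    def run_connector(self):
        # Synchronous entry point used by Screener.__search_screener
        loop = asyncio.get_event_loop()
        return loop.run_until_complete(self.__async_scraper())
```

Keeping the event-loop plumbing in one place like this lets `Screener` stay synchronous from the caller's point of view while the page downloads still happen concurrently.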
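
The public interface of `Screener` is unchanged by this refactor. For illustration only (the module path and ticker list below are assumptions, not values taken from this diff), a caller would still drive the whole flow the same way:

```python
# Illustrative usage; the import path and tickers are assumed, not shown in this diff.
from finviz.screener import Screener

stocks = Screener(tickers=['AAPL', 'TSLA'], table='Performance')

# __search_screener() runs in the constructor, so .data already holds the rows
# parsed from every result page fetched by the async Connector.
print(stocks.data)

# Writes the table headers and rows to a CSV in the current working directory.
stocks.to_csv()
```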