@@ -18,47 +18,39 @@ def printtime():
1818 return 0
1919
2020
21- def getsccodecore (eachLine ):
22- # 伪装成浏览器
21+ def getsccodecore (datajson ):
2322 headers = {
2423 "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
2524 }
2625
27- failedTimes = 100
28- while True : # 在制定次数内一直循环,直到访问站点成功
29-
26+ failedTimes = 50
27+ while True :
3028 if failedTimes <= 0 :
3129 printtime ()
32- print ("失败次数过多,请检查网络环境! " )
30+ print ("final failed times exceed 50! " )
3331 break
3432
3533 failedTimes -= 1
3634 try :
37- # 以下except都是用来捕获当requests请求出现异常时,
38- # 通过捕获然后等待网络情况的变化,以此来保护程序的不间断运行
35+
3936 printtime ()
40- # eachLineurl = eachLine[:80]
41- # 使用正则表达式匹配完整的网址
42- match = re .search (r"https?://\S+/address/0x[a-fA-F0-9]{40}#code" , eachLine )
43- if match :
44- eachLineurl = match .group (0 ) # 提取完整的URL
45- print ("正在连接的的网址链接是 " + eachLineurl )
46- response = requests .get (eachLineurl , headers = headers , timeout = 5 )
37+ print ("fecth:" + datajson .get ("codeURL" ))
38+ response = requests .get (datajson .get ("codeURL" ), headers = headers , timeout = 5 )
4739 break
4840
4941 except requests .exceptions .ConnectionError :
5042 printtime ()
51- print ("ConnectionError!请等待3秒! " )
43+ print ("ConnectionError! please wait 3 second! " )
5244 time .sleep (3 )
5345
5446 except requests .exceptions .ChunkedEncodingError :
5547 printtime ()
56- print ("ChunkedEncodingError!请等待3秒! " )
48+ print ("ChunkedEncodingError! please wait 3 second! " )
5749 time .sleep (3 )
5850
5951 except :
6052 printtime ()
61- print ("Unfortunitely,出现未知错误!请等待3秒! " )
53+ print ("Unfortunitely Error! please wait 3 second! " )
6254 time .sleep (3 )
6355
6456 response .encoding = response .apparent_encoding
@@ -68,146 +60,120 @@ def getsccodecore(eachLine):
6860 filepath = getPathCodeDirectory ()
6961 os .makedirs (filepath , exist_ok = True )
7062
71- # filename为合约地址 '0x5f3ed22b53ac0a001f0feedc2a3985999377c2ab'
72- match = re .search (r"0x[a-fA-F0-9]{40}" , eachLine )
73- if match :
74- filename = match .group (0 )
75-
63+ filename = datajson .get ("address" )
7664 if os .path .exists (filepath + filename + ".sol" ):
7765 printtime ()
78- print (filename + ".sol已存在! " )
66+ print (filename + ".sol already exists! " )
7967 return 0
8068
8169 fo = open (filepath + filename + ".sol" , "w+" , encoding = "utf-8" )
8270 for eachpre in targetPRE :
8371 try :
84- # 尝试将 eachpre.text 解析为 JSON
8572 json .loads (eachpre .text )
8673 print ("Skipping Settings JSON" )
87- continue # 如果成功解析为 JSON,跳过写入
74+ continue
8875 except json .JSONDecodeError :
89- # 如果解析失败,说明它不是 JSON
9076 fo .write (eachpre .text )
9177
9278 fo .close ()
9379 printtime ()
94- print (filename + ".sol新建完成! " )
80+ print (filename + ".sol created! " )
9581
9682 return 0
9783
9884
def getsccode():
    """Fetch the source code for every contract entry in the URL JSON file.

    Loads the JSON document found at the path returned by
    ``getUrlJsonFilePath()`` and hands each of its entries to
    ``getsccodecore``.

    Returns:
        0 once every entry has been passed to ``getsccodecore``;
        1 when the JSON file could not be opened or parsed.
    """
    jsonfilepath = getUrlJsonFilePath()
    try:
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(jsonfilepath, "r") as file:
            datas = json.load(file)
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable.
        # ValueError: covers json.JSONDecodeError and decoding failures.
        printtime()
        print("read json file error! ")
        # Stop here: without parsed data there is nothing to iterate, and
        # falling through would raise NameError on ``datas``.
        return 1

    for data in datas:
        getsccodecore(data)

    return 0
116100
117101
def getSCAddress(eachurl):
    """Scrape one listing page and collect its Solidity contract entries.

    The page at ``eachurl`` is requested up to 50 times, sleeping 3 seconds
    after each failed attempt.  Every ``tr`` of the first
    ``tbody.align-middle.text-nowrap`` table body is then inspected, and rows
    whose compiler column contains "solidity" (case-insensitive) are turned
    into dictionaries.

    Args:
        eachurl: URL of the listing page to fetch.

    Returns:
        A ``(json_array, status)`` tuple.  ``status`` is 0 on success and
        ``json_array`` holds one dict per Solidity contract with the keys
        ``contractnName``, ``address``, ``codeURL``, ``version`` and
        ``verified``.  ``status`` is 1 (with an empty list) when the page
        could not be fetched or the expected table body was not found.
    """
    json_array = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
    }

    failedTimes = 50
    response = None

    while True:
        if failedTimes <= 0:
            printtime()
            print("final failed times exceed 50! ")
            break

        failedTimes -= 1
        try:
            print("fecth:" + eachurl)
            response = requests.get(url=eachurl, headers=headers, timeout=5)
            break

        except requests.exceptions.ConnectionError:
            printtime()
            print("connectionError! waiting 3 second! ")
            time.sleep(3)

        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print("ChunkedEncodingError! waiting 3 second! ")
            time.sleep(3)

        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) can still stop the retry loop.
            printtime()
            print("error! waiting 3 second! ")
            time.sleep(3)

    if response is None:
        # Every attempt failed; report failure instead of dereferencing an
        # unbound ``response`` below.
        return [], 1

    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")

    try:
        targetTBody = soup.find_all("tbody", "align-middle text-nowrap")[0]
    except IndexError:
        printtime()
        print("get targetTBody failed! ")
        return [], 1

    address_pattern = re.compile(r"0x[a-fA-F0-9]{40}")

    for targetTR in targetTBody:
        # Children of the tbody also include text nodes; only rows matter.
        if targetTR.name != "tr":
            continue

        data = targetTR.find_all("td")
        if len(data) < 8:
            # The row does not have the columns read below (index 7 is used).
            continue

        # The link looks like /address/0xceea...1ace#code
        link = data[0].find("a", "me-1")
        herf = link.attrs.get("href") if link is not None else None
        if not herf:
            continue

        match = address_pattern.search(herf)
        if not match:
            # Without an address the entry is unusable; skip it rather than
            # reusing the address of a previous row.
            continue

        Address = match.group(0)
        Name = data[1].getText()
        Compiler = data[2].getText()
        Version = data[3].getText()
        VerifiedTime = data[7].getText()

        if "Solidity".lower() in Compiler.lower():
            json_array.append(
                {
                    "contractnName": Name,
                    "address": Address,
                    "codeURL": etherscan_url + herf,
                    "version": Version,
                    "verified": VerifiedTime,
                }
            )

    return json_array, 0
211177
212178
213179def getUrlList ():
@@ -248,75 +214,69 @@ def getUrlList10000():
248214 return urlList
249215
250216
def getUrlJsonFilePath():
    """Return the path of the JSON file that stores the contract URL list.

    The file lives in a ``temp_address`` directory next to this script; the
    directory is created when it does not exist yet.  The file name is
    ``address_10000.json`` when ``listNumber`` equals
    ``config.ListNumber.TenThousand`` and ``address_500.json`` otherwise.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.join(script_dir, "temp_address")
    os.makedirs(target_dir, exist_ok=True)

    if listNumber == config.ListNumber.TenThousand:
        json_name = "address_10000.json"
    else:
        json_name = "address_500.json"

    return os.path.join(target_dir, json_name)
263226
264227
def getPathCodeDirectory():
    """Return the directory (next to this script) that holds contract sources.

    The sub-directory is ``temp_code/code10000/`` when ``listNumber`` equals
    ``config.ListNumber.TenThousand`` and ``temp_code/code500/`` otherwise.
    The directory is not created here.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    sub_dir = (
        "temp_code/code10000/"
        if listNumber == config.ListNumber.TenThousand
        else "temp_code/code500/"
    )
    return os.path.join(script_dir, sub_dir)
276235
277236
def updatescurl():
    """Rebuild the JSON file listing the contracts to download.

    Any existing file at ``getUrlJsonFilePath()`` is removed first.  Each URL
    from ``getUrlList()`` is then scraped with ``getSCAddress``, trying a URL
    at most 10 times before moving on to the next one, and all collected
    entries are written to the JSON file with an indent of 2.

    Returns:
        0 on completion, 1 when removing the old file raised ``IOError``.
    """
    urlList = getUrlList()
    filepath = getUrlJsonFilePath()

    # Drop the previous result file, if there is one.
    try:
        if os.path.exists(filepath):
            os.remove(filepath)
            printtime()
            print("Remove", filepath)
    except IOError:
        printtime()
        print("IOError!")
        return 1

    collected = []
    for page_url in urlList:
        # Up to 10 attempts per page; stop at the first successful scrape.
        for _ in range(10):
            entries, status = getSCAddress(page_url)
            if status == 0:
                collected.extend(entries)
                break

    serialized = json.dumps(collected, indent=2)
    with open(filepath, "w") as out_file:
        out_file.write(serialized)

    return 0
313273
314274
def main():
    """Run the two stages in order: refresh the URL list, then download."""
    # Stage 1: get the smart contract code url list.
    updatescurl()

    # Stage 2: get the smart contract code for each listed entry.
    getsccode()
321281
322282
0 commit comments