Skip to content

Commit df382f2

Browse files
authored
Merge pull request #2 from EthanOK/dev_v1.1.1
refact: code
2 parents 800c9cd + 963466c commit df382f2

1 file changed

Lines changed: 67 additions & 107 deletions

File tree

SmartContactSpider.py

Lines changed: 67 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -18,47 +18,39 @@ def printtime():
1818
return 0
1919

2020

21-
def getsccodecore(eachLine):
22-
# 伪装成浏览器
21+
def getsccodecore(datajson):
2322
headers = {
2423
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
2524
}
2625

27-
failedTimes = 100
28-
while True: # 在制定次数内一直循环,直到访问站点成功
29-
26+
failedTimes = 50
27+
while True:
3028
if failedTimes <= 0:
3129
printtime()
32-
print("失败次数过多,请检查网络环境!")
30+
print("final failed times exceed 50!")
3331
break
3432

3533
failedTimes -= 1
3634
try:
37-
# 以下except都是用来捕获当requests请求出现异常时,
38-
# 通过捕获然后等待网络情况的变化,以此来保护程序的不间断运行
35+
3936
printtime()
40-
# eachLineurl = eachLine[:80]
41-
# 使用正则表达式匹配完整的网址
42-
match = re.search(r"https?://\S+/address/0x[a-fA-F0-9]{40}#code", eachLine)
43-
if match:
44-
eachLineurl = match.group(0) # 提取完整的URL
45-
print("正在连接的的网址链接是 " + eachLineurl)
46-
response = requests.get(eachLineurl, headers=headers, timeout=5)
37+
print("fecth:" + datajson.get("codeURL"))
38+
response = requests.get(datajson.get("codeURL"), headers=headers, timeout=5)
4739
break
4840

4941
except requests.exceptions.ConnectionError:
5042
printtime()
51-
print("ConnectionError!请等待3秒!")
43+
print("ConnectionError! please wait 3 second!")
5244
time.sleep(3)
5345

5446
except requests.exceptions.ChunkedEncodingError:
5547
printtime()
56-
print("ChunkedEncodingError!请等待3秒!")
48+
print("ChunkedEncodingError! please wait 3 second!")
5749
time.sleep(3)
5850

5951
except:
6052
printtime()
61-
print("Unfortunitely,出现未知错误!请等待3秒!")
53+
print("Unfortunitely Error! please wait 3 second!")
6254
time.sleep(3)
6355

6456
response.encoding = response.apparent_encoding
@@ -68,146 +60,120 @@ def getsccodecore(eachLine):
6860
filepath = getPathCodeDirectory()
6961
os.makedirs(filepath, exist_ok=True)
7062

71-
# filename为合约地址 '0x5f3ed22b53ac0a001f0feedc2a3985999377c2ab'
72-
match = re.search(r"0x[a-fA-F0-9]{40}", eachLine)
73-
if match:
74-
filename = match.group(0)
75-
63+
filename = datajson.get("address")
7664
if os.path.exists(filepath + filename + ".sol"):
7765
printtime()
78-
print(filename + ".sol已存在!")
66+
print(filename + ".sol already exists!")
7967
return 0
8068

8169
fo = open(filepath + filename + ".sol", "w+", encoding="utf-8")
8270
for eachpre in targetPRE:
8371
try:
84-
# 尝试将 eachpre.text 解析为 JSON
8572
json.loads(eachpre.text)
8673
print("Skipping Settings JSON")
87-
continue # 如果成功解析为 JSON,跳过写入
74+
continue
8875
except json.JSONDecodeError:
89-
# 如果解析失败,说明它不是 JSON
9076
fo.write(eachpre.text)
9177

9278
fo.close()
9379
printtime()
94-
print(filename + ".sol新建完成!")
80+
print(filename + ".sol created!")
9581

9682
return 0
9783

9884

9985
def getsccode():
100-
filepath = getFilePathAddress_txt()
86+
jsonfilepath = getUrlJsonFilePath()
10187
try:
102-
SCAddress = open(filepath, "r")
88+
with open(jsonfilepath, "r") as file:
89+
datas = json.load(file)
10390

10491
except:
10592
printtime()
106-
print("打开智能合约URL地址仓库错误!请检查文件目录是否正确!")
93+
print("read json file error!")
10794

108-
# 一行一行的读取 address.txt 的内容
109-
for eachLine in SCAddress:
110-
# print("test:" + eachLine[:80])
111-
# address.txt每一行前80字段 https://cn.etherscan.com/address/0xEeE690AAA67d1eE33365c02C3Bf477A93867052f#code
112-
getsccodecore(eachLine) # 这个才是获取智能合约代码的核心函数
95+
for data in datas:
96+
getsccodecore(data)
11397

114-
SCAddress.close()
98+
file.close()
11599
return 0
116100

117101

118-
def getSCAddress(eachurl, filepath):
119-
# 伪装成某种浏览器,防止被服务器拒绝服务
102+
def getSCAddress(eachurl):
103+
json_array = []
104+
120105
headers = {
121106
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
122107
}
123108

124-
# 设置访问网址失败的最高次数,达到制定次数后,报告错误,停止程序
125109
failedTimes = 50
126110

127-
while True: # 一直循环,直到在制定的次数内访问站点成功
111+
while True:
128112

129113
if failedTimes <= 0:
130114
printtime()
131-
print("失败次数过多,请检查网络环境!")
115+
print("final failed times exceed 50!")
132116
break
133117

134-
failedTimes -= 1 # 每执行一次就要减1
118+
failedTimes -= 1
135119
try:
136-
# 以下except都是用来捕获当requests请求出现异常时,
137-
# 通过捕获然后等待网络情况的变化,以此来保护程序的不间断运行
138-
print("正在连接的的网址链接是 " + eachurl)
139-
120+
print("fecth:" + eachurl)
140121
response = requests.get(url=eachurl, headers=headers, timeout=5)
141-
142-
# 执行到这一句意味着成功访问,于是退出while循环
143122
break
123+
144124
except requests.exceptions.ConnectionError:
145125
printtime()
146-
print("ConnectionError!请等待3秒!")
126+
print("connectionError! waiting 3 second!")
147127
time.sleep(3)
148128

149129
except requests.exceptions.ChunkedEncodingError:
150130
printtime()
151-
print("ChunkedEncodingError!请等待3秒!")
131+
print("ChunkedEncodingError! waiting 3 second!")
152132
time.sleep(3)
153133

154134
except:
155135
printtime()
156-
print("出现未知错误!请等待3秒!")
136+
print("error! waiting 3 second!")
157137
time.sleep(3)
158138

159-
# 转换成UTF-8编码
160139
response.encoding = response.apparent_encoding
161140

162-
# 煲汤 爬虫解析
163141
soup = BeautifulSoup(response.text, "html.parser")
164142

165-
# 查找这个字段,这个字段下,包含智能合约代码的URL地址
166143
# targetDiv = soup.find_all('div', 'table-responsive')
167144

168145
try:
169146
targetTBody = soup.find_all("tbody", "align-middle text-nowrap")[0]
170147

171148
except:
172149
printtime()
173-
print("targetTBody未成功获取!")
174-
return 1
150+
print("get targetTBody failed!")
151+
return [], 1
175152

176-
# 以追加的方式打开文件。
177-
# 如果文件不存在,则新建;如果文件已存在,则在文件指针末尾追加
178-
fo = open(filepath, "a")
179-
180-
# 把每一个地址,都写到文件里面保存下来
181153
for targetTR in targetTBody:
182-
# print(targetTR)
183-
# 获取每一行
184154
if targetTR.name == "tr":
185-
# 获取一行所有列
186155
data = targetTR.find_all("td")
187-
188-
Address = data[0].getText()
156+
# /address/0xceea87307db481e5ffbd412784e17e02e3851ace#code
157+
herf = targetTR.td.find("a", "me-1").attrs["href"]
158+
match = re.search(r"0x[a-fA-F0-9]{40}", herf)
159+
if match:
160+
Address = match.group(0)
189161
Name = data[1].getText()
190162
Compiler = data[2].getText()
191163
Version = data[3].getText()
192164
Balance = data[4].getText()
193165
VerifiedTime = data[7].getText()
194166
if "Solidity".lower() in Compiler.lower():
195-
fo.write(
196-
etherscan_url
197-
+ targetTR.td.find("a", "me-1").attrs["href"]
198-
+ " "
199-
+ VerifiedTime
200-
+ ":"
201-
+ Name
202-
+ "==>Version:"
203-
+ Version
204-
+ "==>Balance:"
205-
+ Balance
206-
+ "\n"
207-
)
167+
item = {
168+
"contractnName": Name,
169+
"address": Address,
170+
"codeURL": etherscan_url + herf,
171+
"version": Version,
172+
"verified": VerifiedTime,
173+
}
174+
json_array.append(item)
208175

209-
fo.close()
210-
return 0
176+
return json_array, 0
211177

212178

213179
def getUrlList():
@@ -248,75 +214,69 @@ def getUrlList10000():
248214
return urlList
249215

250216

251-
def getFilePathAddress_txt():
252-
# 获取当前脚本所在的目录
217+
def getUrlJsonFilePath():
253218
current_directory = os.path.dirname(os.path.abspath(__file__))
254219
address_directory_path = os.path.join(current_directory, "temp_address")
255220
os.makedirs(address_directory_path, exist_ok=True)
256-
# 构建相对路径到address.txt文件
257-
relative_file_path = "address_500.txt"
221+
relative_file_path = "address_500.json"
258222
if listNumber == config.ListNumber.TenThousand:
259-
relative_file_path = "address_10000.txt"
260-
# 构建访问address.txt的完整路径
223+
relative_file_path = "address_10000.json"
261224
address_file_path = os.path.join(address_directory_path, relative_file_path)
262225
return address_file_path
263226

264227

265228
def getPathCodeDirectory():
266-
# 获取当前脚本所在的目录
267229
current_directory = os.path.dirname(os.path.abspath(__file__))
268-
# 构建相对路径到code文件夹
269230
relative_file_path = "temp_code/code500/"
270231
if listNumber == config.ListNumber.TenThousand:
271232
relative_file_path = "temp_code/code10000/"
272-
273-
# 构建访问code目录的完整路径
274233
code_directory_path = os.path.join(current_directory, relative_file_path)
275234
return code_directory_path
276235

277236

278237
def updatescurl():
279-
# TODO:
280-
# getUrlList10000 getUrlList500
281238
urlList = getUrlList()
282239

283-
# filepath是保存要爬取的智能合约地址的文件的存放路径
284-
# 请根据自己的需求改成自己想要的路径。
285-
286-
filepath = getFilePathAddress_txt()
240+
filepath = getUrlJsonFilePath()
287241

288-
# 把旧的存放合约地址的文件清除干净
289242
try:
290243
if os.path.exists(filepath):
291244
os.remove(filepath)
292245
printtime()
293-
print("已清除%s目录下的旧文件(仓库)!" % filepath)
246+
print("Remove", filepath)
294247
except IOError:
295248

296249
printtime()
297-
print("出现一个不能处理的错误,终止程序:IOError!")
250+
print("IOError!")
298251

299-
# 函数不正常执行,返回1
300252
return 1
301253

302-
# 读取urlList里的每一个URL网页里的智能合约地址
254+
all_json_data = []
303255
for eachurl in urlList:
304256
time = 0
305-
while 1 == getSCAddress(eachurl, filepath):
257+
while True:
258+
json_array, result = getSCAddress(eachurl)
259+
if result == 0:
260+
all_json_data.extend(json_array)
261+
break
306262
time += 1
307263
if time == 10:
308264
break
265+
309266
pass
310267

311-
# 函数正常执行,返回0
268+
json_data = json.dumps(all_json_data, indent=2)
269+
with open(filepath, "w") as fo:
270+
fo.write(json_data)
271+
312272
return 0
313273

314274

315275
def main():
316-
# 更新要爬取的智能合约的地址
276+
# get the smart contract code url list
317277
updatescurl()
318278

319-
# 根据智能合约的地址去爬取智能合约的代码
279+
# get the smart contract code
320280
getsccode()
321281

322282

0 commit comments

Comments
 (0)