
Commit f9227b9

welcome optimism 🥳

committed
0 parents  commit f9227b9

7 files changed: +759 -0 lines changed

README.md (+27 lines)
@@ -0,0 +1,27 @@
<sup>
👉 Go check out the parent repository - https://github.com/tintinweb/smart-contract-sanctuary - too!
</sup>

# Smart Contract Sanctuary - Optimism
🐦🌴🌴🌴🦕 A home for Ethereum smart contracts verified on Etherscan. 🏠

This repository is part of the 💡 **[smart-contract-sanctuary](https://github.com/tintinweb/smart-contract-sanctuary)** project. Check out the [parent repository](https://github.com/tintinweb/smart-contract-sanctuary) for more information.


## 🎓 Citation

If you are using this dataset in your research or paper, here's how you can cite it:

- APA6
```
Ortner, M., Eskandari, S. (n.d.). Smart Contract Sanctuary. Retrieved from https://github.com/tintinweb/smart-contract-sanctuary.
```

- LaTeX (BibTeX)
```
@article{smart_contract_sanctuary,
  title={Smart Contract Sanctuary},
  url={https://github.com/tintinweb/smart-contract-sanctuary},
  author={Ortner, Martin and Eskandari, Shayan}}
```

utils/connector/__init__.py

Whitespace-only changes.

utils/connector/etherscan.py (+286 lines)
@@ -0,0 +1,286 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# github.com/tintinweb
#
"""

HACKy - non-production - script to download contracts from etherscan.io with throttling.
Will eventually be turned into a simple etherscan.io api library. Feel free to take over that part and
contribute if interested.

"""
import re
import os
import requests
import random
from retry import retry
from bs4 import BeautifulSoup
import json

import logging

logger = logging.getLogger(__name__)
DEBUG_RAISE = False
DEBUG_PRINT_CONTRACTS = False


def is_json(myjson):
    try:
        json.loads(myjson)
    except ValueError:
        return False
    return True


class ContractNotFound(BaseException):
    def __init__(self, msg):
        self.msg = msg


class UserAgent(object):
    """
    User-Agent handling retries and errors ...
    """

    UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"

    def __init__(self, baseurl, proxies={}):
        self.baseurl, self.proxies = baseurl, proxies
        self.session = None
        self.initialize()

    def initialize(self):
        self.session = requests.session()
        self.session.headers.update({
            "user-agent": self.UA + str(random.randint(0, 100))
        })

    def get(self, path, params={}, headers={}, proxies={}):
        new_headers = self.session.headers.copy()
        new_headers.update(headers)

        proxies = proxies or self.proxies

        resp = self.session.get("%s%s%s" % (self.baseurl, "/" if not path.startswith("/") else "", path),
                                params=params, headers=new_headers, proxies=proxies)
        if resp.status_code != 200:
            raise Exception("Unexpected Status Code: %s!=200" % resp.status_code)
        return resp

    def post(self, path, params={}, headers={}, json=None):
        new_headers = self.session.headers.copy()
        new_headers.update(headers)

        resp = self.session.post("%s%s%s" % (self.baseurl, "/" if not path.startswith("/") else "", path),
                                 params=params, headers=new_headers, json=json)
        if resp.status_code != 200:
            raise Exception("Unexpected Status Code: %s!=200" % resp.status_code)
        return resp


class EtherScanIoApi(object):
    """
    Base EtherScan.io Api implementation
    """

    def __init__(self, baseurl=None, proxies={}):
        baseurl = baseurl or "https://www.etherscan.io"
        self.session = UserAgent(baseurl=baseurl, proxies=proxies)

    # retry on any Exception with exponential backoff: 1s, 2s, 4s, ... capped at 10s, plus 1-4s random jitter
    @retry(Exception, delay=1, backoff=2, max_delay=10, tries=5, jitter=(1, 4), logger=logger)
    def _request_contract_list(self, page, amount=100):
        resp = self.session.get("/contractsVerified/%d?ps=%s" % (page, amount))
        pageResult = re.findall(r'Page <strong(?:[^>]+)>(\d+)</strong> of <strong(?:[^>]+)>(\d+)</strong>', resp.text)
        if len(pageResult) > 0:
            return resp, pageResult
        raise Exception("Invalid html response: Page marker not found")

    @retry(Exception, delay=1, backoff=2, max_delay=10, tries=10, jitter=(1, 4), logger=logger)
    def _request_contract_source(self, address):
        resp = self.session.get("/address/%s" % address).text
        if "You have reached your maximum request limit for this resource. Please try again later" in resp:
            print("[[THROTTLING]]")
            raise Exception("Throttling")

        print("=======================================================")
        print(address)
        # print(resp)
        sources = []
        # remove the settings box; it is not solidity source
        if "<span class='text-secondary'>Settings</span><pre class='js-sourcecopyarea editor' id='editor' style='margin-top: 5px;'>" in resp:
            resp = resp.split("<span class='text-secondary'>Settings</span><pre class='js-sourcecopyarea editor' id='editor' style='margin-top: 5px;'>", 1)[0]

        for rawSource in re.split(r"<pre class='js-sourcecopyarea editor' id='editor\d*' style='margin-top: 5px;'>", resp)[1:]:
            src = rawSource.split("</pre><br>", 1)[0]
            soup = BeautifulSoup(src, features="html.parser")
            source = soup.get_text()  # normalize html.

            if source.startswith("{") and "outputSelection" in source and "pragma" not in source and is_json(source):
                continue  # ignore settings

            if DEBUG_PRINT_CONTRACTS:
                print(source)
            if "&lt;" in source or "&gt;" in source or "&le;" in source or "&ge;" in source or "&amp;" in source or "&vert;" in source or "&quot;" in source:
                raise Exception("HTML IN OUTPUT!! - BeautifulSoup messed up..")
            source = source.replace("&lt;", "<").replace("&gt;", ">").replace("&le;", "<=").replace("&ge;", ">=").replace("&amp;", "&").replace("&vert;", "|").replace("&quot;", '"')
            sources.append(source)
        if not sources:
            raise Exception("unable to find source-code. rate limited? retry..")
        return "\n\n".join(sources)

    def get_contracts(self, start=0, end=None):
        page = start

        while not end or page <= end:
            resp, pageResult = self._request_contract_list(page)
            page, lastpage = pageResult[0]
            page, lastpage = int(page), int(lastpage)
            if not end:
                end = lastpage
            rows = self._parse_tbodies(resp.text)[0]  # only use first tbody
            for col in rows:
                contract = {'address': self._extract_text_from_html(col[0]).split(" ", 1)[0],
                            'name': self._extract_text_from_html(col[1]),
                            'compiler': self._extract_text_from_html(col[3]),
                            'balance': self._extract_text_from_html(col[4]),
                            'txcount': int(self._extract_text_from_html(col[5])),
                            'settings': self._extract_text_from_html(col[6]),
                            'date': self._extract_text_from_html(col[7]),
                            }
                yield contract
            page += 1

    def get_contract_source(self, address):
        return self._request_contract_source(address)

    def _extract_text_from_html(self, s):
        return re.sub('<[^<]+?>', '', s).strip()
        # return ''.join(re.findall(r">(.+?)</", s)) if ">" in s and "</" in s else s

    def _extract_hexstr_from_html_attrib(self, s):
        return ''.join(re.findall(r".+/([^']+)'", s)) if ">" in s and "</" in s else s

    def _get_pageable_data(self, path, start=0, length=10):
        params = {
            "start": start,
            "length": length,
        }
        resp = self.session.get(path, params=params).json()
        # cleanup HTML from response
        for item in resp['data']:
            keys = item.keys()
            for san_k in set(keys).intersection(set(("account", "blocknumber", "type", "direction"))):
                item[san_k] = self._extract_text_from_html(item[san_k])
            for san_k in set(keys).intersection(("parenthash", "from", "to", "address")):
                item[san_k] = self._extract_hexstr_from_html_attrib(item[san_k])
        return resp

    def _parse_tbodies(self, data):
        # crude regex-based table parser: returns a list of tables, each a list of rows of raw <td> html
        tbodies = []
        for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data, re.DOTALL):
            # print(tbody)
            rows = []
            for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody):
                rows.append(re.findall(r"<td.*?>(.+?)</td>", tr))
            tbodies.append(rows)
        return tbodies


class TronScanApi(object):
    """
    Base TronScan.org Api implementation
    """

    def __init__(self, baseurl=None, proxies={}):
        baseurl = baseurl or "http://apilist.tronscan.org"
        self.session = UserAgent(baseurl=baseurl, proxies=proxies)

    @retry(Exception, delay=1, backoff=2, max_delay=10, tries=5, jitter=(1, 4), logger=logger)
    def _request_contract_list(self, start, amount=1000):
        resp = self.session.get("api/contracts?count=true&limit=%s&confirm=0&start=%s&verified-only=true&open-source-only=false&sort=-verify_time&search=" % (amount, start))
        return resp.json()

    @retry(Exception, delay=1, backoff=2, max_delay=10, tries=10, jitter=(1, 4), logger=logger)
    def _request_contract_source(self, address):
        resp = self.session.post("api/solidity/contract/info", json={"contractAddress": address})

        print("=======================================================")
        print(address)
        # print(resp)
        respj = resp.json()
        if respj["code"] < 0:
            raise ContractNotFound("server error: %r" % respj)

        sources = respj["data"]["contract_code"]

        if not sources:
            raise Exception("unable to find source-code. rate limited? retry..")

        import base64
        sources = ["//SourceUnit: %s\n\n%s" % (s["name"], base64.b64decode(s["code"]).decode("utf-8")) for s in sources]
        return "\n\n".join(sources)

    def get_contracts(self, start=0, end=None):
        entry = start

        while not end or entry <= end:
            pageResult = self._request_contract_list(entry)
            if not len(pageResult["data"]):
                break  # no more entries

            for entryData in pageResult["data"]:
                entry += 1
                yield entryData

    def get_contract_source(self, address):
        return self._request_contract_source(address)


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        prefix = sys.argv.pop()
    else:
        prefix = "www"

    output_directory = "../contracts/%s/" % ("mainnet" if prefix == "www" else prefix)
    overwrite = False
    amount = 1000000

    e = EtherScanIoApi(baseurl="https://%s.etherscan.io" % (prefix))
    for nr, c in enumerate(e.get_contracts()):
        with open(os.path.join(output_directory, "contracts.json"), 'a') as f:
            f.write("%s\n" % json.dumps(c))
        print("got contract: %s" % c)
        dst = os.path.join(output_directory, c["address"].replace("0x", "")[:2].lower())  # index by 1st byte
        if not os.path.isdir(dst):
            os.makedirs(dst)
        fpath = os.path.join(dst, "%s_%s.sol" % (
            c["address"].replace("0x", ""), str(c['name']).replace("\\", "_").replace("/", "_")))
        if not overwrite and os.path.exists(fpath):
            print(
                "[%d/%d] skipping, already exists --> %s (%-20s) -> %s" % (nr, amount, c["address"], c["name"], fpath))
            continue

        try:
            source = e.get_contract_source(c["address"]).strip()
            if not len(source):
                raise Exception(c)
        except Exception as ex:  # renamed from 'e' so the EtherScanIoApi instance is not shadowed
            print(ex)
            if DEBUG_RAISE:
                raise
            continue

        with open(fpath, "wb") as fw:
            fw.write(bytes(source, "utf8"))

        print("[%d/%d] dumped --> %s (%-20s) -> %s" % (nr, amount, c["address"], c["name"], fpath))

        nr += 1
        if nr >= amount:
            print("[%d/%d] finished. maximum amount of contracts to download reached." % (nr, amount))
            break
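
The `__main__` driver above only exercises the `EtherScanIoApi` connector. As a rough illustration of how the `TronScanApi` class from the same file might be driven in the same way, here is a minimal sketch; it is not part of this commit, and the `address`/`name` keys on the yielded entries as well as the output path are assumptions:

```
# Hypothetical driver for the TronScanApi connector defined above (not part of this commit).
# Assumptions: tronscan entries expose "address" and "name" keys; "../contracts/tron/" is illustrative.
import os

from etherscan import TronScanApi, ContractNotFound  # import path depends on where the script is run from

t = TronScanApi()
for nr, c in enumerate(t.get_contracts()):
    address = c.get("address")                                # assumed key
    name = str(c.get("name", "unknown")).replace("/", "_")    # assumed key
    if not address:
        continue
    try:
        # returns the base64-decoded source units joined into one string
        source = t.get_contract_source(address)
    except (ContractNotFound, Exception) as ex:               # ContractNotFound derives from BaseException
        print(ex)
        continue
    dst = os.path.join("../contracts/tron/", address[:2].lower())  # index by address prefix
    os.makedirs(dst, exist_ok=True)
    with open(os.path.join(dst, "%s_%s.sol" % (address, name)), "w", encoding="utf8") as fw:
        fw.write(source)
    print("[%d] dumped --> %s (%s)" % (nr, address, name))
```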
