-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscholarly_scraper.py
executable file
·42 lines (32 loc) · 1.32 KB
/
scholarly_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
from libraries.scraper_api import ScraperAPIClient
from libraries.scholarly import scholarly, ProxyGenerator
import argparse
class ScraperAPI(ProxyGenerator):
    """ProxyGenerator backed by the ScraperAPI proxy service.

    Routes scholarly's HTTP traffic through a ScraperAPIClient session so
    Google Scholar requests go out via the proxy service instead of the
    local IP.
    """

    def __init__(self, api_key):
        # Validate BEFORE use: the original asserted only after already
        # constructing the client with the key, and `assert` is stripped
        # under `python -O` — raise explicitly instead.
        if api_key is None:
            raise ValueError("api_key must not be None")
        super().__init__()
        self._api_key = api_key
        self._client = ScraperAPIClient(api_key)
        self._TIMEOUT = 120  # request timeout in seconds
        # The ScraperAPI client is itself the session; it handles proxying
        # internally, so no explicit proxies mapping is needed.
        self._session = self._client
        self._session.proxies = {}

    def _new_session(self):
        # Called by scholarly when it wants a (fresh) session; we always
        # hand back the single shared ScraperAPI client.
        self.got_403 = False
        return self._session

    def _close_session(self):
        pass  # no need to close the ScraperAPI client
if __name__ == '__main__':
    # CLI: find a publication on Google Scholar, collect the publications
    # that cite it, and write their bibliographic data to a CSV file.
    parser = argparse.ArgumentParser(
        description="Export the citations of a Google Scholar publication to CSV.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-s', '--search', required=True, type=str, dest="search",
                        help="search query identifying the publication")
    parser.add_argument('-n', '--name', required=True, type=str, dest="name",
                        help="path of the output CSV file")
    parser.add_argument('-k', '--api_key', required=True, type=str, dest="api_key",
                        help="ScraperAPI key used to proxy Scholar requests")
    args = parser.parse_args()

    # Route all scholarly traffic through the ScraperAPI proxy.
    pg = ScraperAPI(args.api_key)
    scholarly.use_proxy(pg)
    scholarly.set_timeout(120)  # seconds; proxied Scholar requests can be slow

    search_query = scholarly.search_pubs(args.search)
    try:
        # Only the first search hit is used.
        publi = next(search_query)
    except StopIteration:
        # Exit cleanly with a message instead of an opaque traceback.
        parser.error(f"no publication found for search: {args.search!r}")
    cites = scholarly.citedby(publi)

    # One row per citing publication; columns come from its "bib" dict.
    df = pd.DataFrame([p["bib"] for p in cites])
    df.to_csv(args.name, index=False)