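"""getPubMedAbstracts_multithread.py

Download PubMed abstracts through the NCBI E-utilities (esearch + efetch) and
write them to per-thread CSV files, using several threads to parallelise the
efetch stage.
"""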
import os
import json
import csv
import threading
from time import sleep
from collections import defaultdict
# import eventlet
import requests
from bs4 import BeautifulSoup
# output = os.path.join('data/pudmed', 'abstracts.csv')
output = 'data/pudmed'
os.makedirs(output, exist_ok=True)  # make sure the output directory exists before the worker threads write to it
# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'
# esearch specific settings
search_eutil = 'esearch.fcgi?'
# search_term = '&term=' + query
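# NOTE: esearch normally needs a &term= query; if the response comes back without
# a querykey/webenv, re-enable the commented-out search_term line with an actual query string.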
search_date = '&mindate=1800/01/01&maxdate=2021/12/31'
search_usehistory = '&usehistory=y'
search_rettype = '&retmode=json'
# search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype
search_url = base_url+search_eutil+db+search_date+search_usehistory+search_rettype
print('{}\n esearch stage: \n\turl:{}'.format('-'*200, search_url))
ret = requests.get(search_url)
search_data = json.loads(ret.content.decode('utf-8'))
total_abstract_count = search_data['esearchresult']['count']
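# query_key and WebEnv identify the result set cached on NCBI's history server,
# so the efetch calls below can page through it without repeating the search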
fetch_querykey = "&query_key=" + search_data['esearchresult']['querykey']
fetch_webenv = "&WebEnv=" + search_data['esearchresult']['webenv']
print('\tparse the return:')
formater = '{:>20s} {:>20s} {:>50s}'
header = ['total abstract count', 'query key', 'webenv']
print(formater.format(*header))
print(formater.format(total_abstract_count, fetch_querykey, fetch_webenv))
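# efetch stage: each worker thread pulls batches of abstracts for its assigned
# retstart offsets and appends them to data/pudmed/abstract<thread>.csv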
def get_abstract(retstarts, thd, retmax=60):
    """Fetch batches of abstracts for the given retstart offsets and write them
    to this thread's own CSV file."""
    _file_path = output + '/abstract' + str(thd) + '.csv'
    # start this thread's CSV from scratch and write the header row
    with open(_file_path, "w", newline='') as fw:
        writer = csv.writer(fw)
        writer.writerow(['PMID', 'Title', 'Abstract'])
    for retstart in retstarts:
        print('thread_{} running, retstart={}\n'.format(thd, retstart))
        # efetch-specific settings: fetch `retmax` records starting at offset `retstart`
        fetch_eutil = 'efetch.fcgi?'
        fetch_retmode = "&retmode=xml"
        fetch_rettype = "&rettype=abstract"
        fetch_retstart = "&retstart=" + str(retstart)
        fetch_retmax = "&retmax=" + str(retmax)
        fetch_url = base_url + fetch_eutil + db + fetch_querykey + fetch_webenv + fetch_retstart + fetch_retmax + fetch_retmode + fetch_rettype
        fetch_data = None
        # eventlet is not working on macOS, so rely on requests' built-in timeout instead
        # with eventlet.Timeout(20, False):
        try:
            ret = requests.get(fetch_url, timeout=20)  # in case the request times out
            fetch_data = ret.content.decode('utf-8')
        except requests.exceptions.Timeout:
            pass
        if fetch_data is None:
            continue
        soup = BeautifulSoup(fetch_data, 'xml')
        with open(_file_path, "a", newline='') as fw:
            writer = csv.writer(fw)
            i = 1  # simple counter of records written in this batch
            for article in soup.find_all('PubmedArticle'):
                try:
                    pmid = article.find('PMID')
                    title = article.find('ArticleTitle')
                    abstract = article.find('AbstractText')
                    if pmid is None or title is None or abstract is None:
                        # skip records missing any required field
                        continue
                    i += 1
                    writer.writerow([pmid.text.strip(), title.text.strip(), abstract.text.strip()])
                except Exception:
                    # something is missing or malformed in this record; skip it
                    pass
        sleep(1)  # be polite to the NCBI servers between requests
retmax = 600
# ceiling division so the final partial page of results is not dropped
page_nums = (int(total_abstract_count) + retmax - 1) // retmax
thd_nums = 10
# distribute the page offsets round-robin across the worker threads
pools = defaultdict(list)
for j in range(page_nums):
    pools[j % thd_nums].append(j * retmax)
threads = []
for i in range(thd_nums):
    thd = threading.Thread(target=get_abstract, args=(pools[i], i, retmax))
    thd.start()
    threads.append(thd)
for thd in threads:
    thd.join()
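# Example (assumed) usage once a search term is configured:
#     python getPubMedAbstracts_multithread.py
# which leaves abstract0.csv ... abstract9.csv under data/pudmed/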