-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: tweeter_scraper.py
99 lines (91 loc) · 2.87 KB
/
tweeter_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import urllib2
import time
import os
import datetime
import sys
from random import randint
def menu():
sane = 1
while sane == 1:
print "[ - ] Please enter absolute path to output directory: "
in_path = raw_input()+"\\tweeter_scraper_out"
if os.path.exists(in_path):
sane = 0
else:
try:
os.mkdir(in_path)
sane = 0
except:
os.system('cls' if os.name == 'nt' else 'clear')
print "[ - ] Invalid path, try again."
return(in_path)
def main(in_path):
print "[ + ] Gathering information..."
in_path = in_path
target_list = []
done_list = []
cnt = 0
while True:
if cnt != 0:
rand = randint(5,180)
print "[ - ] Sleeping "+str(rand)+" seconds until check for new items."
time.sleep(rand)
try:
resp = urllib2.urlopen("https://twitter.com/dumpmon")
except:
tmp_t = randint(360,720)
time.sleep(tmp_t)
print "[ - ] Communication error, sleeping "+str(tmp_t)+" seconds..."
html = resp.readlines()
out_log = in_path+"\\out_log.txt"
out_log_fo = open(out_log, 'a+')
out_log_items = out_log_fo.readlines()
for done in out_log_items:
if done.strip() not in done_list:
done_list.append(done.strip())
for line in html:
if "data-expanded-url=" in line:
startCut = line.find('data-expanded-url=')+18
endCut = line[startCut:len(line)].find(' class=')+startCut
target = line[startCut+1:endCut-1]
target_list.append(target)
for targ in target_list:
if targ not in done_list:
try:
time.sleep(randint(1,15))
resp = urllib2.urlopen(targ)
except urllib2.HTTPError:
print "[ - ] Caught a 404, will try one more time in 2-4 minutes..."
time.sleep(randint(120,240))
try:
resp = urllib2.urlopen(targ)
except urllib2.HTTPError:
print "[ - ] 404, "+targ+", skipping, "+str(time.strftime("%m%d%y_%H%M%S"))
out_log_fo.write(targ+"\n")
continue
html = resp.read()
if html.strip() == "Please refresh the page to continue...":
page = "http://pastebin.com/"+targ[targ.rfind("=")+1:len(targ)]
print "[ - ] Attempting... "+page
resp = urllib2.urlopen(page)
html = resp.read()
start_raw_cut = html.find('<textarea id="paste_code" class="paste_code" name="paste_code" onkeydown="return catchTab(this,event)">')+103
end_raw_cut = html[start_raw_cut:len(html)].find('</textarea>')+start_raw_cut
html = html[start_raw_cut:end_raw_cut]
time_det = str(time.strftime("%m%d%y_%H%M%S"))
dump_file = in_path+"\\"+time_det+'.txt'
dump_file_fo = open(dump_file, 'w')
dump_file_fo.write(html)
dump_file_fo.close()
done_list.append(targ)
out_log_fo.write(targ+"\n")
print "[ + ] Dump "+targ+" grabbed @ "+str(time.strftime("%m%d%y_%H%M%S"))
out_log_fo.close()
cnt+=1
print "[ - ] Checked "+str(cnt)+" times."
out_log_fo.close()
try:
main(menu())
except KeyboardInterrupt:
print "[ - ] Interrupt caught, exiting."
sys.exit(0)