1
+ import os ,re ,tqdm ,requests ,sys ,time ,colorama
2
+ from urllib import request
3
+ from lxml import etree
4
+
5
def httpget(url):
    """Fetch ``url`` with browser-like headers and return the body as text.

    The request is attempted up to three times, each with a 3-second
    timeout. Every failed attempt is reported on stdout. When all attempts
    fail, the user is asked to press Enter and the process terminates.

    Args:
        url: Absolute URL to request.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        SystemExit: After three consecutive failed attempts.
    """
    # Built once: the headers do not change between attempts.
    # 'br' is deliberately NOT advertised in accept-encoding. requests only
    # decodes Brotli when an optional brotli package is installed; without
    # it a Brotli-compressed reply would reach the caller undecoded.
    headers = {
        b'accept': b'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        b'accept-encoding': b'gzip, deflate',
        b'accept-language': b'zh-CN,zh;q=0.9',
        b'cache-control': b'max-age=0',
        b'cookie': b'UM_distinctid=17075d85cff10a-01607eb9ead8a8-376b4502-100200-17075d85d00120; CNZZDATA1255357127=319138885-1582524254-%7C1583800577',
        b'referer': b'https://www.meitulu.com/',
        b'sec-fetch-mode': b'navigate',
        b'sec-fetch-site': b'same-origin',
        b'sec-fetch-user': b'?1',
        b'upgrade-insecure-requests': b'1',
        b'user-agent': b'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }

    for attempt in range(1, 4):
        try:
            r = requests.get(url=url, headers=headers, timeout=3)
            # Treat HTTP 4xx/5xx as failures so they are retried as well.
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            print(colorama.Back.RED + '发生错误:' + str(e))
            print('[{}/3]正在尝试重连!'.format(str(attempt)))

    # Every attempt failed: tell the user, wait for Enter so the console
    # window stays readable, then stop the program.
    print(colorama.Back.RED + '重连失败,请复制错误信息报告作者!')
    input('请按Enter键退出!')
    sys.exit()
20
+
21
def get_input(maxint, text):
    """Prompt until the user enters an integer in ``1..maxint``; return it.

    Surrounding whitespace is ignored. Entering ``q`` (any case) exits the
    program. Any other input prints an error and prompts again.

    Args:
        maxint: Largest accepted value (inclusive).
        text: Prompt shown to the user.

    Returns:
        The accepted value as an ``int`` with ``0 < value <= maxint``.

    Raises:
        SystemExit: When the user enters ``q``.
    """
    while True:
        userin = input(text).strip()
        if userin.lower() == 'q':
            sys.exit()
        if userin.isdecimal():
            value = int(userin)  # parse once, reuse for the check and result
            if 0 < value <= maxint:
                return value
        print(colorama.Back.RED + '您的输入非法,请重新输入!')
30
+
31
class mtl():
    """Interactive command-line scraper for galleries on meitulu.com.

    ``run`` drives the whole workflow: ``search`` asks for a keyword and
    stores the matching gallery nodes, ``makeurls`` turns the first N of
    them into lists of image URLs, and ``download`` saves those images
    below ``./Photos/<gallery title>/``.
    """

    def __init__(self):
        self.host = 'https://www.meitulu.com'
        # Gallery <li> nodes from the latest search; filled by search().
        self.results = []
        # Parallel lists: titles[k] is the gallery whose images are allurls[k].
        self.titles, self.allurls = [], []

    @staticmethod
    def _safe_dirname(title):
        """Return ``title`` made safe for use as a single directory name.

        Path separators, characters that Windows forbids in file names and
        control characters become ``_``; leading/trailing spaces and dots
        are removed. ``_`` is returned if nothing is left, so a scraped
        title can neither escape ``./Photos/`` nor make ``makedirs`` fail.
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip(' .')
        return cleaned or '_'

    def search(self):
        """Ask for a keyword until a search yields at least one gallery.

        The keyword must be non-empty and alphanumeric; ``q`` exits the
        program. Matching ``<li>`` nodes are stored in ``self.results``.
        """
        while True:
            # Inner loop: keep prompting until the keyword is acceptable.
            while True:
                keyword = input('请输入搜索关键词: ')
                if keyword == '':
                    print(colorama.Back.RED + '关键词不能为空!')
                elif keyword == 'q':
                    sys.exit()
                elif not keyword.isalnum():
                    print(colorama.Back.RED + '您的输入非法,请重新输入!')
                else:
                    break

            t0 = time.time()
            url = self.host + '/search/' + keyword
            html = httpget(url)
            ehtml = etree.HTML(html)
            self.results = ehtml.xpath("//ul[@class='img']/li")
            if len(self.results) == 0:
                print(colorama.Back.RED + '没有匹配的结果,换个关键词试试吧!')
            else:
                t1 = time.time()
                print(colorama.Back.GREEN + '共找到匹配结果{}条,耗时{}秒'.format(str(len(self.results)), str(round(t1 - t0, 3))))
                break

    def makeurls(self):
        """Build the image URL list for the first N search results.

        N is requested from the user via ``get_input``. For each selected
        gallery the title is appended to ``self.titles`` and its list of
        image URLs to ``self.allurls``. Does nothing when there are no
        search results.
        """
        if not self.results:
            # Nothing to choose from; get_input(0, ...) could never succeed.
            return
        wanted = get_input(len(self.results), '请输入爬取图集数量: ')
        for result in self.results[:wanted]:  # one node per gallery
            title = result.xpath("./p[@class='p_title']/a/text()")[0]  # gallery title
            str_num = result.xpath("./p[1]/text()")[0]
            # Image count: the text between ':' and '张' in the first <p>.
            num = re.search(r'(?<=:).*(?=张)', str_num).group().strip()
            # The thumbnail URL ends in 0.jpg; images are taken to be
            # 1.jpg .. <num>.jpg under the same path.
            url0 = result.xpath("./a/img/@src")[0].replace('0.jpg', '{}.jpg')

            urls = [url0.format(str(n)) for n in range(1, int(num) + 1)]

            self.titles.append(title)
            self.allurls.append(urls)

    def download(self):
        """Download every collected image into ``./Photos/<title>/``.

        Files that already exist are skipped. Each image is first written
        to ``<name>.part`` and renamed only after the transfer completed,
        so an interrupted download is never mistaken for a finished file
        on a later run. A failed image is reported on the progress bar and
        skipped (best effort); the remaining images are still processed.
        """
        finished = 0  # galleries processed so far
        saved = 0     # images actually downloaded
        t0 = time.time()
        print('已开始下载任务!')
        for title, urls in zip(self.titles, self.allurls):
            print('-------------------->>正在下载第{}组,还剩{}组<<--------------------'.format(str(finished + 1), str(len(self.titles) - finished - 1)))
            print(' ·图册标题:' + title)
            # The title comes from the web page, so sanitise it before it
            # becomes part of a filesystem path.
            fdir = './Photos/' + self._safe_dirname(title) + '/'
            os.makedirs(fdir, exist_ok=True)
            pbar = tqdm.tqdm(urls, ascii=True, ncols=90)
            for number, img_url in enumerate(pbar, start=1):
                path = fdir + '{}.jpg'.format(str(number))
                if not os.path.isfile(path):
                    pbar.set_description_str(colorama.Fore.GREEN + ' ·下载进度')
                    tmp_path = path + '.part'
                    try:
                        request.urlretrieve(img_url, tmp_path)
                        os.replace(tmp_path, path)
                        saved += 1
                    except Exception:
                        # Deliberately broad: one bad image must not abort
                        # the whole run. Drop any partial file.
                        try:
                            os.remove(tmp_path)
                        except OSError:
                            pass
                        pbar.set_description_str(colorama.Fore.RED + ' ·下载出错')
                        time.sleep(1.5)
                else:
                    pbar.set_description_str(colorama.Fore.YELLOW + ' ·图片已存在')
                    time.sleep(0.05)
            finished += 1
        t1 = time.time()
        print(colorama.Back.GREEN + '\n 已完成下载任务,共下载图集{}个(图片{}张),耗时{}秒'.format(str(finished), str(saved), str(round(t1 - t0, 3))))

    def run(self):
        """Run the full workflow: search, build URLs, download."""
        # autoreset=True: colours are reset after every print call.
        colorama.init(autoreset=True)
        self.search()
        self.makeurls()
        self.download()
106
+
107
if __name__ == "__main__":
    # `title` is a cmd.exe built-in that sets the console window title;
    # on other systems the command does not exist and only yields an error.
    if os.name == 'nt':
        os.system('title MeiTuLuSpider[V2] @吾爱破解 lihaisanhui')
    print('欢迎使用美图录Spider[V2,2020.03.11]!\n 前往数据源:https://www.meitulu.com 下载更多精彩图片!\n ')
    # Use a distinct name so the class `mtl` is not rebound to an instance.
    spider = mtl()
    spider.run()
    # Keep the console window open until the user confirms.
    input('请按Enter键退出!')
0 commit comments