diff --git a/crawler.py b/crawler.py index 1d814e8f..ec1e6d35 100644 --- a/crawler.py +++ b/crawler.py @@ -29,7 +29,7 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False): - base_url = "https://www.google.com/search?tbm=isch" + base_url = "https://www.google.com/search?tbm=isch&hl=en" keywords_str = "&q=" + quote(keywords) query_url = base_url + keywords_str if face_only is True: @@ -42,7 +42,7 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False): def google_image_url_from_webpage(driver): - time.sleep(10) + # time.sleep(10) image_elements = driver.find_elements_by_class_name("rg_l") image_urls = list() url_pattern = "imgurl=\S*&imgrefurl" diff --git a/downloader.py b/downloader.py index 9c2a72ab..9286bc73 100644 --- a/downloader.py +++ b/downloader.py @@ -12,8 +12,12 @@ headers = { - 'Connection': 'close', - 'User-Agent': 'Chrome/54.0.2840.100' + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Proxy-Connection": "keep-alive", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36", + "Accept-Encoding": "gzip, deflate, sdch", + # 'Connection': 'close', } @@ -59,6 +63,7 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time :param concurrency: number of requests process simultaneously :return: none """ + with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor: futures = list() count = 0 diff --git a/mainwindow.ui b/mainwindow.ui index e5a133e6..db0f39d8 100644 --- a/mainwindow.ui +++ b/mainwindow.ui @@ -737,6 +737,9 @@ per keywords HTTP + + true + buttonGroup_2 @@ -762,7 +765,7 @@ per keywords Socks5 - true + false buttonGroup_2 @@ -776,6 +779,15 @@ per keywords 12 + + input ip:port + + + xxx.xxx.xxx.xx:port + + + xxx.xxx.xxx.xx:port + diff --git a/ui_mainwindow.py b/ui_mainwindow.py index bafc89a2..fecde092 100644 --- a/ui_mainwindow.py +++ b/ui_mainwindow.py @@ -416,6 +416,7 @@ def setupUi(self, MainWindow): font.setPointSize(12) self.radioButton_http.setFont(font) self.radioButton_http.setFocusPolicy(QtCore.Qt.TabFocus) + self.radioButton_http.setChecked(True) self.radioButton_http.setObjectName(_fromUtf8("radioButton_http")) self.buttonGroup_2 = QtGui.QButtonGroup(MainWindow) self.buttonGroup_2.setObjectName(_fromUtf8("buttonGroup_2")) @@ -431,7 +432,7 @@ def setupUi(self, MainWindow): font.setPointSize(12) self.radioButton_socks5.setFont(font) self.radioButton_socks5.setFocusPolicy(QtCore.Qt.TabFocus) - self.radioButton_socks5.setChecked(True) + self.radioButton_socks5.setChecked(False) self.radioButton_socks5.setObjectName(_fromUtf8("radioButton_socks5")) self.buttonGroup_2.addButton(self.radioButton_socks5) self.horizontalLayout_3.addWidget(self.radioButton_socks5) @@ -557,6 +558,9 @@ def retranslateUi(self, MainWindow): self.checkBox_proxy.setText(_translate("MainWindow", "&Proxy:", None)) self.radioButton_http.setText(_translate("MainWindow", "HTTP", None)) self.radioButton_socks5.setText(_translate("MainWindow", "Socks5", None)) + self.lineEdit_proxy.setToolTip(_translate("MainWindow", "input ip:port", None)) + self.lineEdit_proxy.setStatusTip(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None)) + self.lineEdit_proxy.setPlaceholderText(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None)) self.menuAbout.setTitle(_translate("MainWindow", "Help", None)) self.actionAbout.setText(_translate("MainWindow", "About", None)) diff --git a/utils.py b/utils.py index a4210a30..9cf5c718 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- +def gen_valid_dir_name_for_keywords(keywords): + keep = ["-", "_", "."] + keywords = keywords.replace(" ", "_").replace(":", "-") + return "".join(c for c in keywords if c.isalnum() or c in keep).rstrip() + + class AppConfig(object): def __init__(self): self.engine = "Google" @@ -29,7 +35,7 @@ def to_command_paras(self): str_paras += ' -j ' + str(self.num_threads) - str_paras += ' -o "' + self.output_dir + '/' + self.keywords + '"' + str_paras += ' -o "' + self.output_dir + '/' + gen_valid_dir_name_for_keywords(self.keywords) + '"' if self.face_only: str_paras += ' -F '