diff --git a/crawler.py b/crawler.py
index 1d814e8f..ec1e6d35 100644
--- a/crawler.py
+++ b/crawler.py
@@ -29,7 +29,7 @@
def google_gen_query_url(keywords, face_only=False, safe_mode=False):
- base_url = "https://www.google.com/search?tbm=isch"
+ base_url = "https://www.google.com/search?tbm=isch&hl=en"
keywords_str = "&q=" + quote(keywords)
query_url = base_url + keywords_str
if face_only is True:
@@ -42,7 +42,7 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False):
def google_image_url_from_webpage(driver):
- time.sleep(10)
+ # time.sleep(10)
image_elements = driver.find_elements_by_class_name("rg_l")
image_urls = list()
url_pattern = "imgurl=\S*&imgrefurl"
diff --git a/downloader.py b/downloader.py
index 9c2a72ab..9286bc73 100644
--- a/downloader.py
+++ b/downloader.py
@@ -12,8 +12,12 @@
headers = {
- 'Connection': 'close',
- 'User-Agent': 'Chrome/54.0.2840.100'
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+ "Proxy-Connection": "keep-alive",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
+ "Accept-Encoding": "gzip, deflate, sdch",
+ # 'Connection': 'close',
}
@@ -59,6 +63,7 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time
:param concurrency: number of requests process simultaneously
:return: none
"""
+
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
futures = list()
count = 0
diff --git a/mainwindow.ui b/mainwindow.ui
index e5a133e6..db0f39d8 100644
--- a/mainwindow.ui
+++ b/mainwindow.ui
@@ -737,6 +737,9 @@ per keywords
HTTP
+
+ true
+
buttonGroup_2
@@ -762,7 +765,7 @@ per keywords
Socks5
- true
+ false
buttonGroup_2
@@ -776,6 +779,15 @@ per keywords
12
+
+ input ip:port
+
+
+ xxx.xxx.xxx.xx:port
+
+
+ xxx.xxx.xxx.xx:port
+
diff --git a/ui_mainwindow.py b/ui_mainwindow.py
index bafc89a2..fecde092 100644
--- a/ui_mainwindow.py
+++ b/ui_mainwindow.py
@@ -416,6 +416,7 @@ def setupUi(self, MainWindow):
font.setPointSize(12)
self.radioButton_http.setFont(font)
self.radioButton_http.setFocusPolicy(QtCore.Qt.TabFocus)
+ self.radioButton_http.setChecked(True)
self.radioButton_http.setObjectName(_fromUtf8("radioButton_http"))
self.buttonGroup_2 = QtGui.QButtonGroup(MainWindow)
self.buttonGroup_2.setObjectName(_fromUtf8("buttonGroup_2"))
@@ -431,7 +432,7 @@ def setupUi(self, MainWindow):
font.setPointSize(12)
self.radioButton_socks5.setFont(font)
self.radioButton_socks5.setFocusPolicy(QtCore.Qt.TabFocus)
- self.radioButton_socks5.setChecked(True)
+ self.radioButton_socks5.setChecked(False)
self.radioButton_socks5.setObjectName(_fromUtf8("radioButton_socks5"))
self.buttonGroup_2.addButton(self.radioButton_socks5)
self.horizontalLayout_3.addWidget(self.radioButton_socks5)
@@ -557,6 +558,9 @@ def retranslateUi(self, MainWindow):
self.checkBox_proxy.setText(_translate("MainWindow", "&Proxy:", None))
self.radioButton_http.setText(_translate("MainWindow", "HTTP", None))
self.radioButton_socks5.setText(_translate("MainWindow", "Socks5", None))
+ self.lineEdit_proxy.setToolTip(_translate("MainWindow", "input ip:port", None))
+ self.lineEdit_proxy.setStatusTip(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None))
+ self.lineEdit_proxy.setPlaceholderText(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None))
self.menuAbout.setTitle(_translate("MainWindow", "Help", None))
self.actionAbout.setText(_translate("MainWindow", "About", None))
diff --git a/utils.py b/utils.py
index a4210a30..9cf5c718 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,12 @@
# -*- coding: utf-8 -*-
+def gen_valid_dir_name_for_keywords(keywords):
+    """Map keywords to a directory-safe name (" " -> "_", ":" -> "-", rest filtered)."""
+    swapped = keywords.replace(" ", "_").replace(":", "-")
+    return "".join([ch for ch in swapped if ch in "-_." or ch.isalnum()]).rstrip()
+
+
class AppConfig(object):
def __init__(self):
self.engine = "Google"
@@ -29,7 +35,7 @@ def to_command_paras(self):
str_paras += ' -j ' + str(self.num_threads)
- str_paras += ' -o "' + self.output_dir + '/' + self.keywords + '"'
+ str_paras += ' -o "' + self.output_dir + '/' + gen_valid_dir_name_for_keywords(self.keywords) + '"'
if self.face_only:
str_paras += ' -F '