Skip to content

Commit 868da3c

Browse files
committed
fix: [title BeautifulSoup] add signal, BeautifulSoup html.parser is stuck
1 parent 8692d9b commit 868da3c

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

bin/lib/crawlers.py

+24-5
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,19 @@
2727

2828
from pyfaup.faup import Faup
2929

30+
31+
import signal
32+
33+
class TimeoutException(Exception):
34+
pass
35+
36+
def timeout_handler(signum, frame):
37+
raise TimeoutException
38+
39+
40+
signal.signal(signal.SIGALRM, timeout_handler)
41+
42+
3043
# interact with splash_crawler API
3144
import requests
3245
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -310,12 +323,18 @@ def extract_favicon_from_html(html, url):
310323
# # # # # # # #
311324

312325
def extract_title_from_html(html):
313-
soup = BeautifulSoup(html, 'html.parser')
314-
title = soup.title
315-
if title:
316-
title = title.string
326+
signal.alarm(60)
327+
try:
328+
soup = BeautifulSoup(html, 'html.parser')
329+
title = soup.title
317330
if title:
318-
return str(title)
331+
title = title.string
332+
if title:
333+
return str(title)
334+
except TimeoutException:
335+
pass
336+
else:
337+
signal.alarm(0)
319338
return ''
320339

321340
def extract_description_from_html(html):

0 commit comments

Comments
 (0)