Skip to content

Commit 109ce56

Browse files
committed
fix: [crawler] title extraction; handle SIGALRM raised by signal.alarm during time.sleep
1 parent 9425e01 commit 109ce56

File tree

4 files changed

+41
-29
lines changed

4 files changed

+41
-29
lines changed

bin/crawlers/Crawler.py

+28-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from lib import ail_logger
1818
from lib import crawlers
1919
from lib.ConfigLoader import ConfigLoader
20+
from lib.exceptions import TimeoutException
2021
from lib.Tag import get_domain_vanity_tags
2122
from lib.objects import CookiesNames
2223
from lib.objects import Etags
@@ -30,6 +31,15 @@
3031

3132
logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
3233

34+
# SIGNAL ALARM
35+
import signal
36+
def timeout_handler(signum, frame):
37+
raise TimeoutException
38+
39+
40+
signal.signal(signal.SIGALRM, timeout_handler)
41+
42+
3343
class Crawler(AbstractModule):
3444

3545
def __init__(self):
@@ -104,7 +114,10 @@ def refresh_lacus_status(self):
104114
self.is_lacus_up = False
105115
if not self.is_lacus_up:
106116
print("Can't reach lacus server", int(time.time()))
107-
time.sleep(30)
117+
try:
118+
time.sleep(30)
119+
except TimeoutException:
120+
pass
108121

109122
def print_crawler_start_info(self, url, domain_url):
110123
print()
@@ -183,7 +196,10 @@ def get_message(self):
183196
capture.update(-1)
184197
self.refresh_lacus_status()
185198

186-
time.sleep(self.pending_seconds)
199+
try:
200+
time.sleep(self.pending_seconds)
201+
except TimeoutException:
202+
pass
187203

188204
def enqueue_capture(self, task_uuid, priority):
189205
task = crawlers.CrawlerTask(task_uuid)
@@ -364,7 +380,16 @@ def save_capture_response(self, parent_id, entries):
364380
dom_hash.add(self.date.replace('/', ''), item)
365381
dom_hash.add_correlation('domain', '', self.domain.id)
366382

367-
title_content = crawlers.extract_title_from_html(entries['html'], item_id)
383+
# TITLE
384+
signal.alarm(60)
385+
try:
386+
title_content = crawlers.extract_title_from_html(entries['html'])
387+
except TimeoutException:
388+
self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
389+
title_content = None
390+
else:
391+
signal.alarm(0)
392+
368393
if title_content:
369394
title = Titles.create_title(title_content)
370395
title.add(item.get_date(), item)

bin/lib/crawlers.py

+3-23
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,6 @@
2828

2929
from pyfaup.faup import Faup
3030

31-
32-
import signal
33-
34-
class TimeoutException(Exception):
35-
pass
36-
37-
def timeout_handler(signum, frame):
38-
raise TimeoutException
39-
40-
41-
signal.signal(signal.SIGALRM, timeout_handler)
42-
43-
4431
# interact with splash_crawler API
4532
import requests
4633
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@ def timeout_handler(signum, frame):
7360

7461
faup = Faup()
7562

76-
logger_crawler = logging.getLogger('crawlers.log')
63+
# logger_crawler = logging.getLogger('crawlers.log')
7764

7865
# # # # # # # #
7966
# #
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
325312
# #
326313
# # # # # # # #
327314

328-
def extract_title_from_html(html, item_id):
329-
# signal.alarm(60)
330-
# try:
315+
# /!\ REQUIRE ALARM SIGNAL
316+
def extract_title_from_html(html):
331317
soup = BeautifulSoup(html, 'html.parser')
332318
title = soup.title
333319
if title:
334320
title = title.string
335321
if title:
336322
return str(title)
337-
# except TimeoutException:
338-
# signal.alarm(0)
339-
# logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
340-
# else:
341-
# signal.alarm(0)
342-
# signal.alarm(0)
343323
return ''
344324

345325
def extract_description_from_html(html):

bin/lib/exceptions.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
#!/usr/bin/env python3
22
# -*-coding:UTF-8 -*
33

4-
from pymisp import PyMISPError
4+
# from pymisp import PyMISPError
5+
6+
# SIGNAL ALARM
7+
class TimeoutException(Exception):
8+
pass
59

610
class AILError(Exception):
711
def __init__(self, message):

bin/modules/abstract_module.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from lib import ail_logger
2222
from lib.ail_queues import AILQueue
2323
from lib import regex_helper
24-
from lib.exceptions import ModuleQueueError
24+
from lib.exceptions import ModuleQueueError, TimeoutException
2525
from lib.objects.ail_objects import get_obj_from_global_id
2626

2727
logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ def run(self):
193193
self.computeNone()
194194
# Wait before next process
195195
self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
196-
time.sleep(self.pending_seconds)
196+
try:
197+
time.sleep(self.pending_seconds)
198+
except TimeoutException:
199+
pass
197200

198201
def _module_name(self):
199202
"""

0 commit comments

Comments (0)