
Commit 0287a13

fix: [crawler] log timeout + debug signal timeout
1 parent 38d1d01 commit 0287a13

2 files changed, +7 −2 lines changed

bin/crawlers/Crawler.py (+1 −1)

@@ -364,7 +364,7 @@ def save_capture_response(self, parent_id, entries):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
 
-            title_content = crawlers.extract_title_from_html(entries['html'])
+            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
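On the caller side the only change is that the crawled item's identifier is now forwarded into the title-extraction helper, so a parser timeout can be attributed to a specific item instead of disappearing silently. A minimal, self-contained sketch of that pattern, with illustrative names rather than the project's API:

import logging

logger = logging.getLogger('crawlers.log')

def extract_title(html, item_id):
    """Toy stand-in for the helper: return the <title> text, logging the item id on failure."""
    try:
        start = html.index('<title>') + len('<title>')
        end = html.index('</title>', start)
        return html[start:end]
    except ValueError:
        # The id forwarded by the caller is what makes this log line actionable.
        logger.warning(f'title extraction failed: {item_id}')
        return ''

# Caller side, mirroring the Crawler.py change: pass the item id alongside the HTML.
title_content = extract_title('<html><body>no title here</body></html>', 'item-id-example')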

bin/lib/crawlers.py (+6 −1)

@@ -10,6 +10,7 @@
 import gzip
 import hashlib
 import json
+import logging
 import os
 import pickle
 import re
@@ -72,6 +73,8 @@ def timeout_handler(signum, frame):
 
 faup = Faup()
 
+logger_crawler = logging.getLogger('crawlers.log')
+
 # # # # # # # #
 #             #
 #   DOMAINS   #
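The new module-level logger is only a named lookup: 'crawlers.log' is a logger name here, not a file path, and nothing in this diff attaches a handler to it. Whether the warning added below actually reaches a file or the console depends on logging configuration done elsewhere in the project. A small, self-contained sketch of the pattern, with the handler setup included purely for illustration:

import logging

# Same lookup as in the patch: fetch (or create) a logger by name at import time.
logger_crawler = logging.getLogger('crawlers.log')

# Illustrative only: configure the root logger so the warning is visible when run standalone.
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s')

logger_crawler.warning('BeautifulSoup HTML parser timeout: %s', 'example-item-id')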
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html):
+def extract_title_from_html(html, item_id):
     signal.alarm(60)
     try:
         soup = BeautifulSoup(html, 'html.parser')
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
                 return str(title)
     except TimeoutException:
         signal.alarm(0)
+        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
     else:
         signal.alarm(0)
+    signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
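Putting the hunks together: extract_title_from_html bounds BeautifulSoup parsing with a SIGALRM-based timeout, and the patch both logs which item hit the timeout and adds a defensive signal.alarm(0) immediately before the final return (the "debug signal timeout" part of the commit message). The sketch below reconstructs the surrounding code; the TimeoutException class, the signal.signal registration, and the lines between the two hunks are assumptions inferred from the hunk headers, not lines shown in this diff.

import logging
import signal

from bs4 import BeautifulSoup

logger_crawler = logging.getLogger('crawlers.log')

class TimeoutException(Exception):
    # Assumed: the exception raised by the alarm handler.
    pass

def timeout_handler(signum, frame):
    # Signature taken from the hunk header; body assumed.
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)  # assumed registration, not shown in this diff

def extract_title_from_html(html, item_id):
    signal.alarm(60)                 # give the parser at most 60 seconds
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title           # the middle of the function is not shown in the diff;
        if title and title.string:   # these two lines are plausible filler only
            return str(title.string)
    except TimeoutException:
        signal.alarm(0)
        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
    else:
        signal.alarm(0)
    signal.alarm(0)                  # added by this commit: defensively cancel the alarm before returning
    return ''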
