
Commit 0287a13

fix: [crawler] log timeout + debug signal timeout
1 parent 38d1d01 commit 0287a13

2 files changed, +7 −2 lines changed

bin/crawlers/Crawler.py (+1 −1)

@@ -364,7 +364,7 @@ def save_capture_response(self, parent_id, entries):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
 
-            title_content = crawlers.extract_title_from_html(entries['html'])
+            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
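On the caller side the only change is that the crawled item's identifier is now forwarded into the title-extraction helper, so a parser timeout can be attributed to a specific item instead of disappearing silently. A minimal, self-contained sketch of that pattern, with illustrative names rather than the project's API:

import logging

logger = logging.getLogger('crawlers.log')

def extract_title(html, item_id):
    """Toy stand-in for the helper: return the <title> text, logging the item id on failure."""
    try:
        start = html.index('<title>') + len('<title>')
        end = html.index('</title>', start)
        return html[start:end]
    except ValueError:
        # The id forwarded by the caller is what makes this log line actionable.
        logger.warning(f'title extraction failed: {item_id}')
        return ''

# Caller side, mirroring the Crawler.py change: pass the item id alongside the HTML.
title_content = extract_title('<html><body>no title here</body></html>', 'item-id-example')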

bin/lib/crawlers.py (+6 −1)

@@ -10,6 +10,7 @@
 import gzip
 import hashlib
 import json
+import logging
 import os
 import pickle
 import re
@@ -72,6 +73,8 @@ def timeout_handler(signum, frame):
 
 faup = Faup()
 
+logger_crawler = logging.getLogger('crawlers.log')
+
 # # # # # # # #
 #             #
 #   DOMAINS   #
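The new module-level logger is only a named lookup: 'crawlers.log' is a logger name here, not a file path, and nothing in this diff attaches a handler to it. Whether the warning added below actually reaches a file or the console depends on logging configuration done elsewhere in the project. A small, self-contained sketch of the pattern, with the handler setup included purely for illustration:

import logging

# Same lookup as in the patch: fetch (or create) a logger by name at import time.
logger_crawler = logging.getLogger('crawlers.log')

# Illustrative only: configure the root logger so the warning is visible when run standalone.
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s')

logger_crawler.warning('BeautifulSoup HTML parser timeout: %s', 'example-item-id')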
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html):
+def extract_title_from_html(html, item_id):
     signal.alarm(60)
     try:
         soup = BeautifulSoup(html, 'html.parser')
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
                 return str(title)
     except TimeoutException:
         signal.alarm(0)
+        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
     else:
         signal.alarm(0)
+    signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
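Putting the hunks together: extract_title_from_html bounds BeautifulSoup parsing with a SIGALRM-based timeout, and the patch both logs which item hit the timeout and adds a defensive signal.alarm(0) immediately before the final return (the "debug signal timeout" part of the commit message). The sketch below reconstructs the surrounding code; the TimeoutException class, the signal.signal registration, and the lines between the two hunks are assumptions inferred from the hunk headers, not lines shown in this diff.

import logging
import signal

from bs4 import BeautifulSoup

logger_crawler = logging.getLogger('crawlers.log')

class TimeoutException(Exception):
    # Assumed: the exception raised by the alarm handler.
    pass

def timeout_handler(signum, frame):
    # Signature taken from the hunk header; body assumed.
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)  # assumed registration, not shown in this diff

def extract_title_from_html(html, item_id):
    signal.alarm(60)                 # give the parser at most 60 seconds
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title           # the middle of the function is not shown in the diff;
        if title and title.string:   # these two lines are plausible filler only
            return str(title.string)
    except TimeoutException:
        signal.alarm(0)
        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
    else:
        signal.alarm(0)
    signal.alarm(0)                  # added by this commit: defensively cancel the alarm before returning
    return ''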
