Merge pull request #37 from VinciGit00/refactor-fetch-node

lurenss · web-flow · commit 03e6881f008b · 2024-03-08T17:51:19.000+01:00
Refactor fetch node, add integration for preprocessing
diff --git a/scrapegraphai/nodes/fetch_html_node.py b/scrapegraphai/nodes/fetch_html_node.py
@@ -1,8 +1,24 @@
 """ 
 Module for fetching the HTML node
 """
+from typing import Any
 from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..utils.remover import remover
+
+
+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build a metadata dict (source, title, description, language) from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):  # "title" is set only when a <title> tag is found
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get(
+            "content", "No description found.")  # fallback when the tag has no content attribute
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", "No language found.")  # fallback when lang is absent
+    return metadata
 
 
 class FetchHTMLNode(BaseNode):
@@ -65,7 +81,10 @@ def execute(self, state: dict) -> dict:
 
         loader = AsyncHtmlLoader(url)
         document = loader.load()
+        metadata = document[0].metadata
+        document = remover(str(document[0]))
 
-        state["document"] = document
+        state["document"] = [
+            Document(page_content=document, metadata=metadata)]
 
         return state
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py
@@ -1,40 +1,32 @@
 """
 Module for removing the unused html tags
 """
+from bs4 import BeautifulSoup
 
 
-def remover(file: str, only_body: bool = False) -> str:
+def remover(html_content: str) -> str:
     """
-    This function elaborates the HTML file and remove all the not necessary tag
+    This function parses the HTML content, removes every <script> tag,
+    and returns the title text followed by the body markup.
 
     Parameters:
-        file (str): the file to parse
+        html_content (str): the HTML content to parse
 
     Returns:
-        str: the parsed file
+        str: the parsed title followed by the body content without script tags
     """
 
-    res = ""
+    soup = BeautifulSoup(html_content, 'html.parser')
 
-    if only_body:
-        is_body = True
-    else:
-        is_body = False
+    # Extract the title text (empty string when there is no <title> tag)
+    title_tag = soup.find('title')
+    title = title_tag.get_text() if title_tag else ""
 
-    for elem in file.splitlines():
-        if "<title>" in elem:
-            res = res + elem
+    # Remove <script> tags from the whole document
+    [script.extract() for script in soup.find_all('script')]
 
-        if "<body>" in elem:
-            is_body = True
+    # Extract the document body (empty string when there is no <body> tag)
+    body_content = soup.find('body')
+    body = str(body_content) if body_content else ""
 
-        if "</body>" in elem:
-            break
-
-        if "<script>" in elem:
-            continue
-
-        if is_body:
-            res = res + elem
-
-    return res.replace("\\n", "")
+    return title + body