Skip to content

Commit 03e6881

Browse files
authored
Merge pull request #37 from VinciGit00/refactor-fetch-node
Refactor fetch node, add integration for preprocessing
2 parents a64850d + cffd954 commit 03e6881

File tree

2 files changed

+36
-25
lines changed

2 files changed

+36
-25
lines changed

scrapegraphai/nodes/fetch_html_node.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,24 @@
11
"""
22
Module defining the node that fetches HTML
33
"""
4+
from typing import Any
45
from langchain_community.document_loaders import AsyncHtmlLoader
6+
from langchain_core.documents import Document
57
from .base_node import BaseNode
8+
from ..utils.remover import remover
9+
10+
11+
def _build_metadata(soup: Any, url: str) -> dict:
    """Collect page-level metadata from a parsed HTML document.

    Parameters:
        soup (Any): parsed document exposing a BeautifulSoup-style ``find``
        url (str): address the document came from; stored under ``source``

    Returns:
        dict: always holds ``source``; ``title``, ``description`` and
        ``language`` are added only when the matching tag is found
    """
    info = {"source": url}

    title_tag = soup.find("title")
    if title_tag:
        info["title"] = title_tag.get_text()

    description_tag = soup.find("meta", attrs={"name": "description"})
    if description_tag:
        # The meta tag can exist without a content attribute.
        info["description"] = description_tag.get(
            "content", "No description found.")

    html_tag = soup.find("html")
    if html_tag:
        # Same fallback idea for a missing lang attribute.
        info["language"] = html_tag.get("lang", "No language found.")

    return info
622

723

824
class FetchHTMLNode(BaseNode):
@@ -65,7 +81,10 @@ def execute(self, state: dict) -> dict:
6581

6682
loader = AsyncHtmlLoader(url)
6783
document = loader.load()
84+
metadata = document[0].metadata
85+
document = remover(str(document[0]))
6886

69-
state["document"] = document
87+
state["document"] = [
88+
Document(page_content=document, metadata=metadata)]
7089

7190
return state

scrapegraphai/utils/remover.py

+16-24
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,32 @@
11
"""
22
Module for removing unused HTML tags
33
"""
4+
from bs4 import BeautifulSoup
45

56

6-
def remover(html_content: str) -> str:
    """
    Reduce an HTML document to its title text followed by its body markup.

    Every <script> element is removed from the parsed document before the
    body is serialised, so the returned markup carries no script tags.

    Parameters:
        html_content (str): the HTML content to parse

    Returns:
        str: the text of the <title> tag (empty string if there is none)
        immediately followed by the serialised <body> element (empty string
        if there is none); no separator is inserted between the two parts
    """

    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title text
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else ""

    # Remove the <script> tags from the whole document. A plain loop is used
    # because extract() is called only for its side effect on the tree.
    for script in soup.find_all('script'):
        script.extract()

    # Extract the document body
    body_content = soup.find('body')
    body = str(body_content) if body_content is not None else ""

    return title + body

0 commit comments

Comments
 (0)