File tree 2 files changed +36
-25
lines changed
2 files changed +36
-25
lines changed Original file line number Diff line number Diff line change 1
1
"""
2
2
Module for fetching the HTML node
3
3
"""
4
+ from typing import Any
4
5
from langchain_community .document_loaders import AsyncHtmlLoader
6
+ from langchain_core .documents import Document
5
7
from .base_node import BaseNode
8
+ from ..utils .remover import remover
9
+
10
+
11
def _build_metadata(soup: Any, url: str) -> dict:
    """Collect page-level metadata from a parsed HTML document.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        url: Address the document came from; stored under ``"source"``.

    Returns:
        dict: Always contains ``"source"``. The ``"title"``,
        ``"description"`` and ``"language"`` keys are added only when the
        matching element is found in the document.
    """
    metadata = {"source": url}

    title_tag = soup.find("title")
    if title_tag:
        metadata["title"] = title_tag.get_text()

    description_tag = soup.find("meta", attrs={"name": "description"})
    if description_tag:
        # The meta tag may exist without a "content" attribute.
        metadata["description"] = description_tag.get(
            "content", "No description found.")

    html_tag = soup.find("html")
    if html_tag:
        # The <html> element may exist without a "lang" attribute.
        metadata["language"] = html_tag.get("lang", "No language found.")

    return metadata
6
22
7
23
8
24
class FetchHTMLNode (BaseNode ):
@@ -65,7 +81,10 @@ def execute(self, state: dict) -> dict:
65
81
66
82
loader = AsyncHtmlLoader (url )
67
83
document = loader .load ()
84
+ metadata = document [0 ].metadata
85
+ document = remover (str (document [0 ]))
68
86
69
- state ["document" ] = document
87
+ state ["document" ] = [
88
+ Document (page_content = document , metadata = metadata )]
70
89
71
90
return state
Original file line number Diff line number Diff line change 1
1
"""
2
2
Module for removing the unused html tags
3
3
"""
4
+ from bs4 import BeautifulSoup
4
5
5
6
6
def remover(html_content: str) -> str:
    """
    Extract the title text and the script-free body markup from HTML.

    Parameters:
        html_content (str): the HTML content to parse

    Returns:
        str: the text of the first <title> element (empty string when there
        is none) immediately followed by the serialized <body> element with
        every <script> tag removed (empty string when there is no <body>)
    """

    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title text; the <title> tags themselves are not kept.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag else ""

    # Remove <script> tags from the whole document. A plain loop is used
    # because extract() is called only for its side effect on the tree.
    for script in soup.find_all('script'):
        script.extract()

    # Serialize the body element, including its own <body> tags.
    body_content = soup.find('body')
    body = str(body_content) if body_content else ""

    return title + body
You can’t perform that action at this time.
0 commit comments