6
6
from .base_node import BaseNode
7
7
8
8
9
class ParseNode(BaseNode):
    """
    A node responsible for parsing a document and splitting it into chunks.

    HTML input is converted with Html2TextTransformer before splitting; text
    input is split as it is. The text is split with a
    RecursiveCharacterTextSplitter built via ``from_tiktoken_encoder`` and the
    resulting chunks are written to the state under the "parsed_document" key.

    Attributes:
        doc_type (str): type of the input document, "html" or "text".
        chunks_size (int): value passed to the text splitter as ``chunk_size``.
    """

    def __init__(self, doc_type: str = "html", chunks_size: int = 4000, node_name: str = "ParseHTMLNode"):
        """
        Initializes the ParseNode.

        Args:
            doc_type (str): type of the input document, "html" or "text".
            chunks_size (int): size of the chunks to split the document into.
            node_name (str): name of the node. The default is still
                "ParseHTMLNode" so that existing callers relying on the
                default keep seeing the same node name.
        """
        # The node type is always "node" for this class.
        super().__init__(node_name, "node")
        self.doc_type = doc_type
        self.chunks_size = chunks_size

    def execute(self, state):
        """
        Parses the document stored in the state and splits it into chunks.

        Args:
            state (dict): the current state of the graph; must contain the
                key "document".

        Returns:
            dict: the same state object, updated with the key
                "parsed_document" holding the list of text chunks.

        Raises:
            KeyError: if "document" is missing from the state.
            ValueError: if ``self.doc_type`` is neither "html" nor "text".
        """
        print("---PARSING DOCUMENT---")
        try:
            document = state["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        # Parse the document based on the specified doc_type.
        if self.doc_type == "html":
            docs_transformed = Html2TextTransformer().transform_documents(document)[0]
        elif self.doc_type == "text":
            # The html branch receives a sequence of documents and keeps the
            # first one; accept the same shape here as well as a single
            # document, so that `.page_content` below works in both cases.
            if isinstance(document, (list, tuple)):
                docs_transformed = document[0]
            else:
                docs_transformed = document
        else:
            # Without this branch an unknown doc_type surfaced later as an
            # UnboundLocalError on `docs_transformed`.
            raise ValueError(
                f"Unsupported doc_type {self.doc_type!r}; expected 'html' or 'text'."
            )

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.chunks_size,
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(docs_transformed.page_content)

        state.update({"parsed_document": chunks})

        return state
0 commit comments