Merge pull request #39 from VinciGit00/fix-bug-merge

VinciGit00 · web-flow · commit 55702b28a0bd · 2024-03-12T11:57:47.000+01:00
Fix bug merge
diff --git a/examples/.env.example b/examples/.env.example
diff --git a/examples/graph_examples/custom_graph_example.py b/examples/graph_examples/custom_graph_example.py
@@ -6,7 +6,7 @@
 from dotenv import load_dotenv
 from scrapegraphai.models import OpenAI
 from scrapegraphai.graphs import BaseGraph
-from scrapegraphai.nodes import FetchHTMLNode, ParseHTMLNode, GenerateAnswerNode
+from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNode
 
 load_dotenv()
 
@@ -22,26 +22,29 @@
 
 # define the nodes for the graph
 fetch_html_node = FetchHTMLNode("fetch_html")
-parse_document_node = ParseHTMLNode("parse_document")
+parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document")
+rag_node = RAGNode(model, "rag")
 generate_answer_node = GenerateAnswerNode(model, "generate_answer")
 
 # create the graph
 graph = BaseGraph(
     nodes={
         fetch_html_node,
         parse_document_node,
+        rag_node,
         generate_answer_node
     },
     edges={
         (fetch_html_node, parse_document_node),
-        (parse_document_node, generate_answer_node)
+        (parse_document_node, rag_node),
+        (rag_node, generate_answer_node)
     },
     entry_point=fetch_html_node
 )
 
 # execute the graph
-inputs = {"user_input": "Give me the news",
-          "url": "https://www.ansa.it/sito/notizie/topnews/index.shtml"}
+inputs = {"user_input": "List me the projects with their description",
+          "url": "https://perinim.github.io/projects/"}
 result = graph.execute(inputs)
 
 # get the answer from the result
diff --git a/examples/graph_examples/smart_scraper_example.py b/examples/graph_examples/smart_scraper_example.py
@@ -16,8 +16,8 @@
 }
 
 # Define URL and PROMPT
-URL = "https://www.google.com/search?client=safari&rls=en&q=ristoranti+trento&ie=UTF-8&oe=UTF-8"
-PROMPT = "List me all the https inside the page"
+URL = "https://www.ansa.it/veneto/"
+PROMPT = "List me all the news with their description."
 
 # Create the SmartScraperGraph instance
 smart_scraper_graph = SmartScraperGraph(PROMPT, URL, llm_config)
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -5,6 +5,7 @@
 from .base_graph import BaseGraph
 from ..nodes import (
     FetchHTMLNode,
+    ParseNode,
     RAGNode,
     GenerateAnswerNode
     )
@@ -73,18 +74,22 @@ def _create_graph(self):
         Returns:
             BaseGraph: An instance of the BaseGraph class.
         """
+        # define the nodes for the graph
         fetch_html_node = FetchHTMLNode("fetch_html")
+        parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document")
         rag_node = RAGNode(self.llm, "rag")
         generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
 
         return BaseGraph(
             nodes={
                 fetch_html_node,
+                parse_document_node,
                 rag_node,
                 generate_answer_node,
             },
             edges={
-                (fetch_html_node, rag_node),
+                (fetch_html_node, parse_document_node),
+                (parse_document_node, rag_node),
                 (rag_node, generate_answer_node)
             },
             entry_point=fetch_html_node
diff --git a/scrapegraphai/graphs/speech_summary_graph.py b/scrapegraphai/graphs/speech_summary_graph.py
@@ -6,6 +6,7 @@
 from .base_graph import BaseGraph
 from ..nodes import (
     FetchHTMLNode,
+    ParseNode,
     RAGNode,
     GenerateAnswerNode,
     TextToSpeechNode,
@@ -79,6 +80,7 @@ def _create_graph(self):
             BaseGraph: An instance of the BaseGraph class.
         """
         fetch_html_node = FetchHTMLNode("fetch_html")
+        parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document")
         rag_node = RAGNode(self.llm, "rag")
         generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
         text_to_speech_node = TextToSpeechNode(
@@ -87,12 +89,14 @@ def _create_graph(self):
         return BaseGraph(
             nodes={
                 fetch_html_node,
+                parse_document_node,
                 rag_node,
                 generate_answer_node,
                 text_to_speech_node
             },
             edges={
-                (fetch_html_node, rag_node),
+                (fetch_html_node, parse_document_node),
+                (parse_document_node, rag_node),
                 (rag_node, generate_answer_node),
                 (generate_answer_node, text_to_speech_node)
             },
diff --git a/scrapegraphai/helpers/nodes_metadata.py b/scrapegraphai/helpers/nodes_metadata.py
@@ -20,12 +20,12 @@
         },
         "returns": "Updated state with probable HTML tags under 'tags' key."
     },
-    "ParseHTMLNode": {
-        "description": "Parses HTML content to extract specific data.",
+    "ParseNode": {
+        "description": "Parses document content to extract specific data.",
         "type": "node",
         "args": {
-            "document": "HTML content as a string.",
-            "tags": "List of HTML tags to focus on during parsing."
+            "doc_type": "Type of the input document. Default is 'html'.",
+            "document": "The document content to be parsed.",
         },
         "returns": "Updated state with extracted data under 'parsed_document' key."
     },
@@ -38,7 +38,7 @@
         "type": "node",
         "args": {
             "user_input": "The user's query or question guiding the retrieval.",
-            "document": "The HTML content to be processed and compressed."
+            "document": "The document content to be processed and compressed."
         },
         "returns": """Updated state with 'relevant_chunks' key containing
          the most relevant text chunks."""
@@ -48,7 +48,7 @@
         "type": "node",
         "args": {
             "user_input": "User's query or question.",
-            "parsed_document": "Data extracted from the HTML document."
+            "parsed_document": "Data extracted from the input document."
         },
         "returns": "Updated state with the answer under 'answer' key."
     },
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -5,9 +5,8 @@
 from .conditional_node import ConditionalNode
 from .get_probable_tags_node import GetProbableTagsNode
 from .generate_answer_node import GenerateAnswerNode
-from .parse_html_node import ParseHTMLNode
+from .parse_node import ParseNode
 from .rag_node import RAGNode
 from .text_to_speech_node import TextToSpeechNode
 from .image_to_text_node import ImageToTextNode
-from .fetch_text_node import FetchTextNode
-from .parse_text_node import ParseTextNode
+from .fetch_text_node import FetchTextNode
diff --git a/scrapegraphai/nodes/fetch_html_node.py b/scrapegraphai/nodes/fetch_html_node.py
@@ -81,10 +81,11 @@ def execute(self, state: dict) -> dict:
 
         loader = AsyncHtmlLoader(url)
         document = loader.load()
-        metadata = document[0].metadata
-        document = remover(str(document[0]))
+        # metadata = document[0].metadata
+        # document = remover(str(document[0]))
 
-        state["document"] = [
-            Document(page_content=document, metadata=metadata)]
+        # state["document"] = [
+        #     Document(page_content=document, metadata=metadata)]
+        state["document"] = document
 
         return state
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -11,7 +11,6 @@
 
 # Imports from the library
 from .base_node import BaseNode
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class GenerateAnswerNode(BaseNode):
@@ -71,7 +70,7 @@ def execute(self, state: dict) -> dict:
         print("---GENERATING ANSWER---")
         try:
             user_input = state["user_input"]
-            document = state["document_chunks"]
+            document = state["document"]
         except KeyError as e:
             print(f"Error: {e} not found in state.")
             raise
@@ -111,34 +110,28 @@ def execute(self, state: dict) -> dict:
             prompt = PromptTemplate(
                 template=template_chunks,
                 input_variables=["question"],
-                partial_variables={"context": chunk,
+                partial_variables={"context": chunk.page_content,
                                    "chunk_id": i + 1, "format_instructions": format_instructions},
             )
             # Dynamically name the chains based on their index
-            chains_dict[f"chunk{i+1}"] = prompt | self.llm | output_parser
+            chain_name = f"chunk{i+1}"
+            chains_dict[chain_name] = prompt | self.llm | output_parser
 
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=4000,
-            chunk_overlap=0,
-        )
-
-        chunks = text_splitter.split_text(str(chains_dict))
+        # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel
+        map_chain = RunnableParallel(**chains_dict)
+        # Chain
+        answer_map = map_chain.invoke({"question": user_input})
 
+        # Merge the answers from the chunks
         merge_prompt = PromptTemplate(
             template=template_merge,
             input_variables=["context", "question"],
             partial_variables={"format_instructions": format_instructions},
         )
         merge_chain = merge_prompt | self.llm | output_parser
+        answer = merge_chain.invoke(
+            {"context": answer_map, "question": user_input})
 
-        answer_lines = []
-        for chunk in chunks:
-            answer_temp = merge_chain.invoke(
-                {"context": chunk, "question": user_input})
-            answer_lines.append(answer_temp)
-
-        unique_answer_lines = list(set(answer_lines))
-        answer = '\n'.join(unique_answer_lines)
-
+        # Update the state with the generated answer
         state.update({"answer": answer})
-        return state
+        return state
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -6,11 +6,11 @@
 from .base_node import BaseNode
 
 
-class ParseHTMLNode(BaseNode):
+class ParseNode(BaseNode):
     """
-    A node responsible for parsing HTML content from a document using specified tags. 
+    A node responsible for parsing HTML content from a document. 
     It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting
-    specific parts of an HTML document based on the tags provided in the state.
+    specific parts of an HTML document.
 
     This node enhances the scraping workflow by allowing for targeted extraction of 
     content, thereby optimizing the processing of large HTML documents.
@@ -28,14 +28,18 @@ class ParseHTMLNode(BaseNode):
         the specified tags, if provided, and updates the state with the parsed content.
     """
 
-    def __init__(self, node_name: str):
+    def __init__(self, doc_type: str = "html", chunks_size: int = 4000, node_name: str = "ParseHTMLNode"):
         """
         Initializes the ParseHTMLNode with a node name.
         Args:
+            doc_type (str): type of the input document
+            chunks_size (int): size of the chunks to split the document
             node_name (str): name of the node
             node_type (str, optional): type of the node
         """
         super().__init__(node_name, "node")
+        self.doc_type = doc_type
+        self.chunks_size = chunks_size
 
     def execute(self,  state):
         """
@@ -57,23 +61,27 @@ def execute(self,  state):
                       information for parsing is missing.
         """
 
-        print("---PARSING HTML DOCUMENT---")
+        print("---PARSING DOCUMENT---")
         try:
             document = state["document"]
         except KeyError as e:
             print(f"Error: {e} not found in state.")
             raise
-
+        
         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=4000,
+            chunk_size=self.chunks_size,
             chunk_overlap=0,
         )
 
-        docs_transformed = Html2TextTransformer(
-        ).transform_documents(document)[0]
+        # Parse the document based on the specified doc_type
+        if self.doc_type == "html":
+            docs_transformed = Html2TextTransformer(
+            ).transform_documents(document)[0]
+        elif self.doc_type == "text":
+            docs_transformed = document
 
         chunks = text_splitter.split_text(docs_transformed.page_content)
 
-        state.update({"document_chunks": chunks})
+        state.update({"parsed_document": chunks})
 
         return state
diff --git a/scrapegraphai/nodes/parse_text_node.py b/scrapegraphai/nodes/parse_text_node.py
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py

Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,8 @@`
`16`	`16`	`}`
`17`	`17`
`18`	`18`	`# Define URL and PROMPT`
`19`		`-URL = "https://www.google.com/search?client=safari&rls=en&q=ristoranti+trento&ie=UTF-8&oe=UTF-8"`
`20`		`-PROMPT = "List me all the https inside the page"`
	`19`	`+URL = "https://www.ansa.it/veneto/"`
	`20`	`+PROMPT = "List me all the news with their description."`
`21`	`21`
`22`	`22`	`# Create the SmartScraperGraph instance`
`23`	`23`	`smart_scraper_graph = SmartScraperGraph(PROMPT, URL, llm_config)`