Merge pull request #31 from VinciGit00/refactoring-parse-html

PeriniM · web-flow · commit 20a78acede8c · 2024-02-28T17:57:39.000+01:00
Refactoring parse html
diff --git a/README.md b/README.md
@@ -24,20 +24,13 @@ Is it possible to try also the colab version
 
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sEZBonBMGP44CtO6GQTwAlL0BGJXjtfd?usp=sharing)
 
-Try out ScrapeGraphAI in your browser:
-
-[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/Scrapegraph-ai)
-
+Follow the procedure at the following link to set up your OpenAI API key: [link](https://scrapegraph-ai.readthedocs.io/en/latest/index.html).
 
 ## 📖 Documentation
 
 The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.readthedocs.io/en/latest/).
 Behind this there is also the Docusaurus documentation [here](https://scrapegraph-doc.onrender.com/).
 
-## Setup the api keys
-
-Follow the procedure on the following link to setup your OpenAI API key: [link](https://scrapegraph-ai.readthedocs.io/en/latest/index.html).
-
 ## 💻 Usage
 
 ### Case 1: Extracting information using a prompt
diff --git a/examples/graph_examples/graph_evaluation_example.py b/examples/graph_examples/graph_evaluation_example.py
@@ -2,8 +2,8 @@
 Module for evaluating the graph
 """
 import os
-from scrapegraphai.evaluators import TrulensEvaluator
 from dotenv import load_dotenv
+from scrapegraphai.evaluators import TrulensEvaluator
 
 load_dotenv()
 
@@ -26,6 +26,7 @@
 # Create the TrulensEvaluator instance
 trulens_evaluator = TrulensEvaluator(openai_key)
 # Evaluate SmartScraperGraph on the list of inputs
-(results_df, answer) = trulens_evaluator.evaluate(list_of_inputs, dashboard=False)
+(results_df, answer) = trulens_evaluator.evaluate(
+    list_of_inputs, dashboard=False)
 
 print(answer)
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -65,7 +65,7 @@ def execute(self, state: dict) -> dict:
         print("---GENERATING ANSWER---")
         try:
             user_input = state["user_input"]
-            document = state["document"]
+            document = state["document_chunks"]
         except KeyError as e:
             print(f"Error: {e} not found in state.")
             raise
@@ -104,7 +104,7 @@ def execute(self, state: dict) -> dict:
             prompt = PromptTemplate(
                 template=template_chunks,
                 input_variables=["question"],
-                partial_variables={"context": chunk.page_content,
+                partial_variables={"context": chunk,
                                    "chunk_id": i + 1, "format_instructions": format_instructions},
             )
             # Dynamically name the chains based on their index
diff --git a/scrapegraphai/nodes/parse_html_node.py b/scrapegraphai/nodes/parse_html_node.py
@@ -1,7 +1,8 @@
 """
 Module for parsing the HTML node
 """
-from langchain_community.document_transformers import BeautifulSoupTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_transformers import Html2TextTransformer
 from .base_node import BaseNode
 
 
@@ -36,7 +37,7 @@ def __init__(self, node_name: str, node_type: str = "ParseHTMLNode"):
         """
         super().__init__(node_name, node_type)
 
-    def execute(self, state):
+    def execute(self,  state):
         """
         Executes the node's logic to parse the HTML document based on specified tags. 
         If tags are provided in the state, the document is parsed accordingly; otherwise, 
@@ -63,15 +64,16 @@ def execute(self, state):
             print(f"Error: {e} not found in state.")
             raise
 
-        tags = state.get("tags", None)
+        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=4000,
+            chunk_overlap=0,
+        )
 
-        if not tags:
-            print("No specific tags provided; returning document as is.")
-            return state
+        docs_transformed = Html2TextTransformer(
+        ).transform_documents(document)[0]
+
+        chunks = text_splitter.split_text(docs_transformed.page_content)
+
+        state.update({"document_chunks": chunks})
 
-        bs_transformer = BeautifulSoupTransformer()
-        parsed_document = bs_transformer.transform_documents(
-            document, tags_to_extract=tags)
-        print("Document parsed with specified tags.")
-        state.update({"parsed_document": parsed_document})
         return state