Commit 69fb6ce

Merge pull request #26 from VinciGit00/multi-chunks-implementation
Multi chunks implementation

2 parents 74acc64 + ed48f27

10 files changed: +234 -41 lines

examples/custom_graph_example.py (+1 -1)

@@ -40,7 +40,7 @@
 )

 # execute the graph
-inputs = {"user_input": "What is the title of the page?", "url": "https://example.com"}
+inputs = {"user_input": "Give me the news", "url": "https://www.ansa.it/sito/notizie/topnews/index.shtml"}
 result = graph.execute(inputs)

 # get the answer from the result

examples/smart_scraper_example.py (+6 -2)

@@ -15,8 +15,12 @@
     "model_name": "gpt-3.5-turbo",
 }

-smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
-                                        "https://perinim.github.io/projects/", llm_config)
+# Define URL and prompt
+url = "https://perinim.github.io/projects/"
+prompt = "List me all the titles and project descriptions"
+
+# Create the SmartScraperGraph instance
+smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)

 answer = smart_scraper_graph.run()
 print(answer)
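
For reference, a minimal sketch of how the whole updated example might read end to end. The dotenv loading, the "api_key" entry of llm_config, and the SmartScraperGraph import path are assumptions; only the "model_name" line and the changed lines above appear in this diff.

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph  # assumed import path

load_dotenv()

# llm_config shape is partly assumed: only "model_name" is visible in the diff above
llm_config = {
    "api_key": os.getenv("OPENAI_API_KEY"),  # assumption: key read from a .env file
    "model_name": "gpt-3.5-turbo",
}

# Define URL and prompt
url = "https://perinim.github.io/projects/"
prompt = "List me all the titles and project descriptions"

# Create the SmartScraperGraph instance and run it
smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)
answer = smart_scraper_graph.run()
print(answer)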

poetry.lock (+56 -11)

Generated file; the diff is not rendered by default.

pyproject.toml (+3 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapegraphai"
-version = "0.0.3"
+version = "0.0.4"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra <mvincig11@gmail.com>",
@@ -26,6 +26,8 @@ langchain = "0.1.6"
 langchain_community = "0.0.19"
 langchain_core = "0.1.22"
 langchain_openai = "0.0.5"
+html2text = "2020.1.16"
+faiss-cpu = "1.7.4"
 beautifulsoup4 = "4.12.3"
 pandas = "2.0.3"
 python-dotenv = "1.0.1"

requirements.txt (+2)

@@ -2,6 +2,8 @@ langchain==0.1.6
 langchain_community==0.0.19
 langchain_core==0.1.22
 langchain_openai==0.0.5
+faiss-cpu==1.7.4
+html2text==2020.1.16
 beautifulsoup4==4.12.3
 pandas==2.0.3
 python-dotenv==1.0.1

scrapegraphai/graphs/smart_scraper_graph.py (+7 -17)

@@ -5,12 +5,9 @@
 from .base_graph import BaseGraph
 from ..nodes import (
     FetchHTMLNode,
-    ConditionalNode,
-    GetProbableTagsNode,
-    GenerateAnswerNode,
-    ParseHTMLNode
-)
-
+    RAGNode,
+    GenerateAnswerNode
+)

 class SmartScraperGraph:
     """
@@ -77,25 +74,18 @@ def _create_graph(self):
             BaseGraph: An instance of the BaseGraph class.
         """
         fetch_html_node = FetchHTMLNode("fetch_html")
-        get_probable_tags_node = GetProbableTagsNode(
-            self.llm, "get_probable_tags")
-        parse_document_node = ParseHTMLNode("parse_document")
+        rag_node = RAGNode(self.llm, "rag")
         generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
-        conditional_node = ConditionalNode(
-            "conditional", [parse_document_node, generate_answer_node])

         return BaseGraph(
             nodes={
                 fetch_html_node,
-                get_probable_tags_node,
-                conditional_node,
-                parse_document_node,
+                rag_node,
                 generate_answer_node,
             },
             edges={
-                (fetch_html_node, get_probable_tags_node),
-                (get_probable_tags_node, conditional_node),
-                (parse_document_node, generate_answer_node)
+                (fetch_html_node, rag_node),
+                (rag_node, generate_answer_node)
             },
             entry_point=fetch_html_node
         )
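
The net effect is a straight pipeline, fetch_html -> rag -> generate_answer, with the probable-tags/conditional branch removed. For illustration, a rough sketch of the same graph assembled by hand in the style of examples/custom_graph_example.py; the import paths and the llm construction are assumptions, since neither appears in this diff.

from scrapegraphai.graphs.base_graph import BaseGraph  # assumed import path
from scrapegraphai.nodes import FetchHTMLNode, RAGNode, GenerateAnswerNode
from langchain_openai import ChatOpenAI

# Assumption: the nodes accept a LangChain chat model as their llm argument
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

fetch_html_node = FetchHTMLNode("fetch_html")
rag_node = RAGNode(llm, "rag")
generate_answer_node = GenerateAnswerNode(llm, "generate_answer")

graph = BaseGraph(
    nodes={fetch_html_node, rag_node, generate_answer_node},
    edges={(fetch_html_node, rag_node), (rag_node, generate_answer_node)},
    entry_point=fetch_html_node,
)

# Input keys mirror examples/custom_graph_example.py above
result = graph.execute({"user_input": "Give me the news",
                        "url": "https://www.ansa.it/sito/notizie/topnews/index.shtml"})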

scrapegraphai/helpers/nodes_metadata.py (+12)

@@ -29,6 +29,18 @@
         },
         "returns": "Updated state with extracted data under 'parsed_document' key."
     },
+    "RAGNode": {
+        "description": """A node responsible for reducing the amount of text to be processed
+        by identifying and retrieving the most relevant chunks of text based on the user's query.
+        Utilizes RecursiveCharacterTextSplitter for chunking, Html2TextTransformer for HTML to text
+        conversion, and a combination of FAISS and OpenAIEmbeddings for efficient information retrieval.""",
+        "type": "node",
+        "args": {
+            "user_input": "The user's query or question guiding the retrieval.",
+            "document": "The HTML content to be processed and compressed."
+        },
+        "returns": "Updated state with 'relevant_chunks' key containing the most relevant text chunks."
+    },
     "GenerateAnswerNode": {
         "description": "Generates an answer based on the user's input and parsed document.",
         "type": "node",

scrapegraphai/nodes/__init__.py (+1)

@@ -6,5 +6,6 @@
 from .get_probable_tags_node import GetProbableTagsNode
 from .generate_answer_node import GenerateAnswerNode
 from .parse_html_node import ParseHTMLNode
+from .rag_node import RAGNode
 from .text_to_speech_node import TextToSpeechNode
 from .image_to_text_node import ImageToTextNode

scrapegraphai/nodes/generate_answer_node.py (+36 -9)

@@ -3,6 +3,7 @@
 """
 from langchain_core.output_parsers import JsonOutputParser
 from langchain.prompts import PromptTemplate
+from langchain_core.runnables import RunnableParallel
 from .base_node import BaseNode


@@ -78,22 +79,48 @@ def execute(self, state: dict) -> dict:
         output_parser = JsonOutputParser()
         format_instructions = output_parser.get_format_instructions()

-        template = """You are a website scraper and you have just scraped the
+        template_chunks = """You are a website scraper and you have just scraped the
         following content from a website.
-        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n The content is as follows: {context}
+        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n
+        The website is big so I am giving you one chunk at the time to be merged later with the other chunks.\n
+        Content of {chunk_id}: {context}
         Question: {question}
         """
+
+        template_merge = """You are a website scraper and you have just scraped the
+        following content from a website.
+        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n
+        You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n
+        Content to merge: {context}
+        Question: {question}
+        """
+
+        chains_dict = {}
+
+        for i, chunk in enumerate(context):
+            prompt = PromptTemplate(
+                template=template_chunks,
+                input_variables=["question"],
+                partial_variables={"context": chunk.page_content, "chunk_id": i + 1, "format_instructions": format_instructions},
+            )
+            # Dynamically name the chains based on their index
+            chain_name = f"chunk{i+1}"
+            chains_dict[chain_name] = prompt | self.llm | output_parser

-        schema_prompt = PromptTemplate(
-            template=template,
+        # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel
+        map_chain = RunnableParallel(**chains_dict)
+        # Chain
+        answer_map = map_chain.invoke({"question": user_input})
+
+        # Merge the answers from the chunks
+        merge_prompt = PromptTemplate(
+            template=template_merge,
             input_variables=["context", "question"],
             partial_variables={"format_instructions": format_instructions},
         )
-
-        # Chain
-        schema_chain = schema_prompt | self.llm | output_parser
-        answer = schema_chain.invoke(
-            {"context": context, "question": user_input})
+        merge_chain = merge_prompt | self.llm | output_parser
+        answer = merge_chain.invoke(
+            {"context": answer_map, "question": user_input})

         # Update the state with the generated answer
         state.update({"answer": answer})
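
The new logic is a small map-reduce: each chunk gets its own prompt/LLM chain, RunnableParallel fans the chains out, and a final merge chain combines the per-chunk answers. A stripped-down sketch of the same pattern outside the node, with simplified placeholder prompts and an assumed ChatOpenAI model (not part of this diff):

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo")  # assumption: any LangChain chat model works here
chunks = ["first chunk of page text ...", "second chunk of page text ..."]
question = "What is this page about?"

# One chain per chunk, keyed chunk1, chunk2, ... so the merged dict stays readable
chunk_prompt = PromptTemplate.from_template(
    "Answer using only chunk {chunk_id}:\n{context}\nQuestion: {question}")
chains_dict = {
    f"chunk{i + 1}": chunk_prompt.partial(context=chunk, chunk_id=i + 1) | llm | StrOutputParser()
    for i, chunk in enumerate(chunks)
}

# Fan out over all chunks in parallel, then merge the partial answers in a second pass
answer_map = RunnableParallel(**chains_dict).invoke({"question": question})
merge_prompt = PromptTemplate.from_template(
    "Merge these partial answers without repetitions:\n{context}\nQuestion: {question}")
answer = (merge_prompt | llm | StrOutputParser()).invoke(
    {"context": answer_map, "question": question})
print(answer)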
