
Commit a64850d

Merge pull request #36 from VinciGit00/multiple-chunking-for-generating-answer
Multiple chunking for generating answer
2 parents 5f20299 + cd176ff commit a64850d

3 files changed (+20 -13)

commit_and_push.sh (+1 -1)

@@ -21,7 +21,7 @@ commit_message="$1"
 
 # Run Pylint on the specified Python files
 pylint scrapegraphai/**/*.py scrapegraphai/*.py examples/**/*.py tests/**/*.py
-#Maket the pull
+#Make the pull
 git pull
 
 # Add the modified files to the Git repository

examples/graph_examples/smart_scraper_example.py (+2 -2)

@@ -16,8 +16,8 @@
 }
 
 # Define URL and PROMPT
-URL = "https://perinim.github.io/projects/"
-PROMPT = "List me all the titles and project descriptions"
+URL = "https://www.google.com/search?client=safari&rls=en&q=ristoranti+trento&ie=UTF-8&oe=UTF-8"
+PROMPT = "List me all the https inside the page"
 
 # Create the SmartScraperGraph instance
 smart_scraper_graph = SmartScraperGraph(PROMPT, URL, llm_config)
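
For context, a hedged sketch of how the updated example might be driven end to end. The import path, the llm_config keys, and the run() call are assumptions based on the library's other examples; only the URL and PROMPT values come from this diff.

# Hypothetical driver for the example above; import path, llm_config keys,
# and run() are assumptions, not shown in this diff.
from scrapegraphai.graphs import SmartScraperGraph  # import path assumed

llm_config = {
    "api_key": "YOUR_OPENAI_API_KEY",  # placeholder credential
}

URL = "https://www.google.com/search?client=safari&rls=en&q=ristoranti+trento&ie=UTF-8&oe=UTF-8"
PROMPT = "List me all the https inside the page"

smart_scraper_graph = SmartScraperGraph(PROMPT, URL, llm_config)
answer = smart_scraper_graph.run()  # assumed entry point
print(answer)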

scrapegraphai/nodes/generate_answer_node.py (+17 -10)

@@ -11,6 +11,7 @@
 
 # Imports from the library
 from .base_node import BaseNode
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class GenerateAnswerNode(BaseNode):
@@ -114,24 +115,30 @@ def execute(self, state: dict) -> dict:
                 "chunk_id": i + 1, "format_instructions": format_instructions},
             )
             # Dynamically name the chains based on their index
-            chain_name = f"chunk{i+1}"
-            chains_dict[chain_name] = prompt | self.llm | output_parser
+            chains_dict[f"chunk{i+1}"] = prompt | self.llm | output_parser
 
-        # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel
-        map_chain = RunnableParallel(**chains_dict)
-        # Chain
-        answer_map = map_chain.invoke({"question": user_input})
+        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=4000,
+            chunk_overlap=0,
+        )
+
+        chunks = text_splitter.split_text(str(chains_dict))
 
-        # Merge the answers from the chunks
         merge_prompt = PromptTemplate(
            template=template_merge,
            input_variables=["context", "question"],
            partial_variables={"format_instructions": format_instructions},
         )
         merge_chain = merge_prompt | self.llm | output_parser
-        answer = merge_chain.invoke(
-            {"context": answer_map, "question": user_input})
 
-        # Update the state with the generated answer
+        answer_lines = []
+        for chunk in chunks:
+            answer_temp = merge_chain.invoke(
+                {"context": chunk, "question": user_input})
+            answer_lines.append(answer_temp)
+
+        unique_answer_lines = list(set(answer_lines))
+        answer = '\n'.join(unique_answer_lines)
+
         state.update({"answer": answer})
         return state
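
The new flow splits the serialized chains dictionary into token-bounded chunks, asks the merge chain for an answer per chunk, then deduplicates and joins the partial answers. Below is a minimal, self-contained sketch of that chunk-then-merge pattern; it assumes langchain and tiktoken are installed, replaces the LLM chain with a stub so the control flow runs without an API key, and splits a plain context string rather than the node's str(chains_dict).

# Minimal sketch of the chunk-then-merge pattern this commit introduces.
# The real node builds merge_chain as merge_prompt | self.llm | output_parser;
# a stub stands in for it here so the example runs without an LLM.
from langchain.text_splitter import RecursiveCharacterTextSplitter


def stub_merge_chain(inputs: dict) -> str:
    # Stand-in for merge_chain.invoke(...): report what it was given.
    return f"answer from a chunk of {len(inputs['context'])} characters"


user_input = "List me all the https inside the page"
long_context = "scraped page text with https://example.com links ... " * 300

# Same splitter settings as the node: ~4000 tokens per chunk, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,
    chunk_overlap=0,
)
chunks = text_splitter.split_text(long_context)

# One answer per chunk, then deduplicate and join, as in the updated execute().
answer_lines = []
for chunk in chunks:
    answer_lines.append(stub_merge_chain({"context": chunk, "question": user_input}))

unique_answer_lines = list(set(answer_lines))
answer = '\n'.join(unique_answer_lines)
print(answer)

As in the diff, duplicate per-chunk answers are removed with set(), which does not preserve the original chunk order.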
