Commit 69fb6ce

Merge pull request #26 from VinciGit00/multi-chunks-implementation
Multi chunks implementation

2 parents 74acc64 + ed48f27

10 files changed: +234 -41 lines

examples/custom_graph_example.py (+1 -1)

@@ -40,7 +40,7 @@
 )

 # execute the graph
-inputs = {"user_input": "What is the title of the page?", "url": "https://example.com"}
+inputs = {"user_input": "Give me the news", "url": "https://www.ansa.it/sito/notizie/topnews/index.shtml"}
 result = graph.execute(inputs)

 # get the answer from the result

examples/smart_scraper_example.py (+6 -2)

@@ -15,8 +15,12 @@
     "model_name": "gpt-3.5-turbo",
 }

-smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
-                                        "https://perinim.github.io/projects/", llm_config)
+# Define URL and prompt
+url = "https://perinim.github.io/projects/"
+prompt = "List me all the titles and project descriptions"
+
+# Create the SmartScraperGraph instance
+smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)

 answer = smart_scraper_graph.run()
 print(answer)
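
For reference, a minimal sketch of how the whole updated example might read end to end. The dotenv loading, the "api_key" entry of llm_config, and the SmartScraperGraph import path are assumptions; only the "model_name" line and the changed lines above appear in this diff.

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph  # assumed import path

load_dotenv()

# llm_config shape is partly assumed: only "model_name" is visible in the diff above
llm_config = {
    "api_key": os.getenv("OPENAI_API_KEY"),  # assumption: key read from a .env file
    "model_name": "gpt-3.5-turbo",
}

# Define URL and prompt
url = "https://perinim.github.io/projects/"
prompt = "List me all the titles and project descriptions"

# Create the SmartScraperGraph instance and run it
smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)
answer = smart_scraper_graph.run()
print(answer)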

poetry.lock (+56 -11)

Generated file; the diff is not rendered by default.

pyproject.toml (+3 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapegraphai"
-version = "0.0.3"
+version = "0.0.4"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra <mvincig11@gmail.com>",
@@ -26,6 +26,8 @@ langchain = "0.1.6"
 langchain_community = "0.0.19"
 langchain_core = "0.1.22"
 langchain_openai = "0.0.5"
+html2text = "2020.1.16"
+faiss-cpu = "1.7.4"
 beautifulsoup4 = "4.12.3"
 pandas = "2.0.3"
 python-dotenv = "1.0.1"

requirements.txt (+2)

@@ -2,6 +2,8 @@ langchain==0.1.6
 langchain_community==0.0.19
 langchain_core==0.1.22
 langchain_openai==0.0.5
+faiss-cpu==1.7.4
+html2text==2020.1.16
 beautifulsoup4==4.12.3
 pandas==2.0.3
 python-dotenv==1.0.1

scrapegraphai/graphs/smart_scraper_graph.py (+7 -17)

@@ -5,12 +5,9 @@
 from .base_graph import BaseGraph
 from ..nodes import (
     FetchHTMLNode,
-    ConditionalNode,
-    GetProbableTagsNode,
-    GenerateAnswerNode,
-    ParseHTMLNode
-)
-
+    RAGNode,
+    GenerateAnswerNode
+)

 class SmartScraperGraph:
     """
@@ -77,25 +74,18 @@ def _create_graph(self):
             BaseGraph: An instance of the BaseGraph class.
         """
         fetch_html_node = FetchHTMLNode("fetch_html")
-        get_probable_tags_node = GetProbableTagsNode(
-            self.llm, "get_probable_tags")
-        parse_document_node = ParseHTMLNode("parse_document")
+        rag_node = RAGNode(self.llm, "rag")
         generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
-        conditional_node = ConditionalNode(
-            "conditional", [parse_document_node, generate_answer_node])

         return BaseGraph(
             nodes={
                 fetch_html_node,
-                get_probable_tags_node,
-                conditional_node,
-                parse_document_node,
+                rag_node,
                 generate_answer_node,
             },
             edges={
-                (fetch_html_node, get_probable_tags_node),
-                (get_probable_tags_node, conditional_node),
-                (parse_document_node, generate_answer_node)
+                (fetch_html_node, rag_node),
+                (rag_node, generate_answer_node)
             },
             entry_point=fetch_html_node
         )
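
The net effect is a straight pipeline, fetch_html -> rag -> generate_answer, with the probable-tags/conditional branch removed. For illustration, a rough sketch of the same graph assembled by hand in the style of examples/custom_graph_example.py; the import paths and the llm construction are assumptions, since neither appears in this diff.

from scrapegraphai.graphs.base_graph import BaseGraph  # assumed import path
from scrapegraphai.nodes import FetchHTMLNode, RAGNode, GenerateAnswerNode
from langchain_openai import ChatOpenAI

# Assumption: the nodes accept a LangChain chat model as their llm argument
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

fetch_html_node = FetchHTMLNode("fetch_html")
rag_node = RAGNode(llm, "rag")
generate_answer_node = GenerateAnswerNode(llm, "generate_answer")

graph = BaseGraph(
    nodes={fetch_html_node, rag_node, generate_answer_node},
    edges={(fetch_html_node, rag_node), (rag_node, generate_answer_node)},
    entry_point=fetch_html_node,
)

# Input keys mirror examples/custom_graph_example.py above
result = graph.execute({"user_input": "Give me the news",
                        "url": "https://www.ansa.it/sito/notizie/topnews/index.shtml"})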

scrapegraphai/helpers/nodes_metadata.py (+12)

@@ -29,6 +29,18 @@
         },
         "returns": "Updated state with extracted data under 'parsed_document' key."
     },
+    "RAGNode": {
+        "description": """A node responsible for reducing the amount of text to be processed
+        by identifying and retrieving the most relevant chunks of text based on the user's query.
+        Utilizes RecursiveCharacterTextSplitter for chunking, Html2TextTransformer for HTML to text
+        conversion, and a combination of FAISS and OpenAIEmbeddings for efficient information retrieval.""",
+        "type": "node",
+        "args": {
+            "user_input": "The user's query or question guiding the retrieval.",
+            "document": "The HTML content to be processed and compressed."
+        },
+        "returns": "Updated state with 'relevant_chunks' key containing the most relevant text chunks."
+    },
     "GenerateAnswerNode": {
         "description": "Generates an answer based on the user's input and parsed document.",
         "type": "node",

scrapegraphai/nodes/__init__.py (+1)

@@ -6,5 +6,6 @@
 from .get_probable_tags_node import GetProbableTagsNode
 from .generate_answer_node import GenerateAnswerNode
 from .parse_html_node import ParseHTMLNode
+from .rag_node import RAGNode
 from .text_to_speech_node import TextToSpeechNode
 from .image_to_text_node import ImageToTextNode

scrapegraphai/nodes/generate_answer_node.py (+36 -9)

@@ -3,6 +3,7 @@
 """
 from langchain_core.output_parsers import JsonOutputParser
 from langchain.prompts import PromptTemplate
+from langchain_core.runnables import RunnableParallel
 from .base_node import BaseNode


@@ -78,22 +79,48 @@ def execute(self, state: dict) -> dict:
         output_parser = JsonOutputParser()
         format_instructions = output_parser.get_format_instructions()

-        template = """You are a website scraper and you have just scraped the
+        template_chunks = """You are a website scraper and you have just scraped the
         following content from a website.
-        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n The content is as follows: {context}
+        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n
+        The website is big so I am giving you one chunk at the time to be merged later with the other chunks.\n
+        Content of {chunk_id}: {context}
         Question: {question}
         """
+
+        template_merge = """You are a website scraper and you have just scraped the
+        following content from a website.
+        You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n
+        You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n
+        Content to merge: {context}
+        Question: {question}
+        """
+
+        chains_dict = {}
+
+        for i, chunk in enumerate(context):
+            prompt = PromptTemplate(
+                template=template_chunks,
+                input_variables=["question"],
+                partial_variables={"context": chunk.page_content, "chunk_id": i + 1, "format_instructions": format_instructions},
+            )
+            # Dynamically name the chains based on their index
+            chain_name = f"chunk{i+1}"
+            chains_dict[chain_name] = prompt | self.llm | output_parser

-        schema_prompt = PromptTemplate(
-            template=template,
+        # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel
+        map_chain = RunnableParallel(**chains_dict)
+        # Chain
+        answer_map = map_chain.invoke({"question": user_input})
+
+        # Merge the answers from the chunks
+        merge_prompt = PromptTemplate(
+            template=template_merge,
             input_variables=["context", "question"],
             partial_variables={"format_instructions": format_instructions},
         )
-
-        # Chain
-        schema_chain = schema_prompt | self.llm | output_parser
-        answer = schema_chain.invoke(
-            {"context": context, "question": user_input})
+        merge_chain = merge_prompt | self.llm | output_parser
+        answer = merge_chain.invoke(
+            {"context": answer_map, "question": user_input})

         # Update the state with the generated answer
         state.update({"answer": answer})
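
The new logic is a small map-reduce: each chunk gets its own prompt/LLM chain, RunnableParallel fans the chains out, and a final merge chain combines the per-chunk answers. A stripped-down sketch of the same pattern outside the node, with simplified placeholder prompts and an assumed ChatOpenAI model (not part of this diff):

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo")  # assumption: any LangChain chat model works here
chunks = ["first chunk of page text ...", "second chunk of page text ..."]
question = "What is this page about?"

# One chain per chunk, keyed chunk1, chunk2, ... so the merged dict stays readable
chunk_prompt = PromptTemplate.from_template(
    "Answer using only chunk {chunk_id}:\n{context}\nQuestion: {question}")
chains_dict = {
    f"chunk{i + 1}": chunk_prompt.partial(context=chunk, chunk_id=i + 1) | llm | StrOutputParser()
    for i, chunk in enumerate(chunks)
}

# Fan out over all chunks in parallel, then merge the partial answers in a second pass
answer_map = RunnableParallel(**chains_dict).invoke({"question": question})
merge_prompt = PromptTemplate.from_template(
    "Merge these partial answers without repetitions:\n{context}\nQuestion: {question}")
answer = (merge_prompt | llm | StrOutputParser()).invoke(
    {"context": answer_map, "question": question})
print(answer)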
