
Commit 98a3598

Merge pull request #24 from VinciGit00/graph-builder
OpenAI Model Wrappers, Graph Building Tools, and Speech Summary Graph Pipeline
2 parents 0c1b22e + 3dc7bf1 commit 98a3598

33 files changed (+824, -189 lines)

.gitignore  (+4)

@@ -21,3 +21,7 @@ docs/source/_static/
 .env
 venv/
 .vscode/
+
+# exclude pdf, mp3
+*.pdf
+*.mp3

examples/.env.example  (+1, -1)

@@ -1 +1 @@
-API_KEY="your openai.com api key"
+OPENAI_APIKEY="your openai.com api key"
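The example scripts below all read the renamed variable through python-dotenv. A minimal sketch of that shared pattern, not part of the diff (it assumes a .env file sits next to the script):

    import os
    from dotenv import load_dotenv

    load_dotenv()                            # reads the .env file in the working directory
    openai_key = os.getenv("OPENAI_APIKEY")  # was API_KEY before this commit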

examples/fetch_html_node_example.py → examples/custom_graph_example.py  (renamed, +5, -9)

@@ -4,25 +4,21 @@

 import os
 from dotenv import load_dotenv
-
-from langchain_openai import ChatOpenAI
+from scrapegraphai.models import OpenAI
 from scrapegraphai.graphs import BaseGraph
 from scrapegraphai.nodes import FetchHTMLNode, ParseHTMLNode, GenerateAnswerNode

-# load the environment variables
 load_dotenv()
-openai_key = os.getenv("API_KEY")
-if not openai_key:
-    print("Error: OpenAI API key not found in environment variables.")

 # Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
 llm_config = {
     "api_key": openai_key,
     "model_name": "gpt-3.5-turbo",
     "temperature": 0,
     "streaming": True
 }
-model = ChatOpenAI(**llm_config)
+model = OpenAI(llm_config)

 # define the nodes for the graph
 fetch_html_node = FetchHTMLNode("fetch_html")

@@ -44,9 +40,9 @@
 )

 # execute the graph
-inputs = {"keys": {"user_input": "What is the title of the page?", "url": "https://example.com"}}
+inputs = {"user_input": "What is the title of the page?", "url": "https://example.com"}
 result = graph.execute(inputs)

 # get the answer from the result
-answer = result["keys"].get("answer", "No answer found.")
+answer = result.get("answer", "No answer found.")
 print(answer)

examples/graph_builder_example.py  (new file, +25)

@@ -0,0 +1,25 @@
+import os
+from dotenv import load_dotenv
+from scrapegraphai.builders import GraphBuilder
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-3.5-turbo",
+    "temperature": 0,
+    "streaming": True
+}
+
+# Example usage of GraphBuilder
+user_prompt = "Extract the news and generate a text summary with a voiceover."
+graph_builder = GraphBuilder(user_prompt, llm_config)
+graph_json = graph_builder.build_graph()
+
+# Convert the resulting JSON to Graphviz format
+graphviz_graph = graph_builder.convert_json_to_graphviz(graph_json)
+
+# Save the graph to a file and open it in the default viewer
+graphviz_graph.render('ScrapeGraphAI_generated_graph', view=True)

examples/smart_scraper_example.py  (new file, +22)

@@ -0,0 +1,22 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-3.5-turbo",
+}
+
+smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
+                                        "https://perinim.github.io/projects/", llm_config)
+
+answer = smart_scraper_graph.run()
+print(answer)
examples/speech_summary_example.py  (new file, +26)

@@ -0,0 +1,26 @@
+"""
+Basic example of scraping pipeline using SpeechSummaryGraph
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SpeechSummaryGraph
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+}
+
+# Save the audio to a file
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+output_file_path = os.path.join(curr_dir, "website_summary.mp3")
+
+speech_summary_graph = SpeechSummaryGraph("Make a summary of the webpage to be converted to audio for blind people.",
+                                          "https://perinim.github.io/projects/", llm_config,
+                                          output_file_path)
+
+final_state = speech_summary_graph.run()
+print(final_state.get("answer", "No answer found."))

examples/vision_speech_example.py  (new file, +34)

@@ -0,0 +1,34 @@
+"""
+Basic example using the OpenAIImageToText and OpenAITextToSpeech models
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.models import OpenAIImageToText, OpenAITextToSpeech
+from scrapegraphai.utils import save_audio_from_bytes
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-4-vision-preview",
+}
+
+model = OpenAIImageToText(llm_config)
+answer = model.run("https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/scrapegraphai_logo.png")
+print(answer)
+
+text_to_speech = OpenAITextToSpeech(llm_config, model="tts-1", voice="alloy")
+
+text = "Today is a wonderful day to build something people love!"
+audio = text_to_speech.run(text)
+
+# Save the audio to a file
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, "text2speech.mp3")
+
+save_audio_from_bytes(audio, file_path)
+
+print(f"Speech file saved to: {file_path}")

poetry.lock  (+20, -4)

Generated file; diff not rendered by default.

pyproject.toml  (+1)

@@ -31,6 +31,7 @@ pandas = "2.0.3"
 python-dotenv = "1.0.1"
 tiktoken = {version = ">=0.5.2,<0.6.0"}
 tqdm = "4.66.1"
+graphviz = "0.20.1"

 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"

scrapegraphai/builders/__init__.py  (new file, +5)

@@ -0,0 +1,5 @@
+"""
+__init__.py file for builders folder
+"""
+
+from .graph_builder import GraphBuilder
scrapegraphai/builders/graph_builder.py  (new file, +145)

@@ -0,0 +1,145 @@
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_extraction_chain
+from ..models import OpenAI
+from ..utils import nodes_metadata, graph_schema
+
+class GraphBuilder:
+    """
+    GraphBuilder is a dynamic tool for constructing web scraping graphs based on user prompts.
+    It utilizes a natural language understanding model to interpret user prompts and
+    automatically generates a graph configuration for scraping web content.
+
+    Attributes:
+        user_prompt (str): The user's natural language prompt for the scraping task.
+        llm (OpenAI): An instance of the OpenAI class configured with the specified llm_config.
+        nodes_description (str): A string description of all available nodes and their arguments.
+        chain (LLMChain): The extraction chain responsible for processing the prompt and creating the graph.
+
+    Methods:
+        build_graph(): Executes the graph creation process based on the user prompt and returns the graph configuration.
+        convert_json_to_graphviz(json_data): Converts a JSON graph configuration to a Graphviz object for visualization.
+
+    Args:
+        user_prompt (str): The user's natural language prompt describing the desired scraping operation.
+        llm_config (dict): Configuration parameters for the language model, where 'api_key' is mandatory
+            and 'model_name', 'temperature', and 'streaming' can be optionally included.
+
+    Raises:
+        ValueError: If 'api_key' is not included in llm_config.
+    """
+
+    def __init__(self, user_prompt: str, llm_config: dict):
+        """
+        Initializes the GraphBuilder with a user prompt and language model configuration.
+        """
+        self.user_prompt = user_prompt
+        self.llm_config = llm_config
+        self.llm = self._create_llm()
+        self.nodes_description = self._generate_nodes_description()
+        self.chain = self._create_extraction_chain()
+
+    def _create_llm(self):
+        """
+        Creates an instance of the OpenAI class with the provided language model configuration.
+
+        Returns:
+            OpenAI: An instance of the OpenAI class.
+
+        Raises:
+            ValueError: If 'api_key' is not provided in llm_config.
+        """
+        llm_defaults = {
+            "model_name": "gpt-3.5-turbo",
+            "temperature": 0,
+            "streaming": True
+        }
+        # Update defaults with any LLM parameters that were provided
+        llm_params = {**llm_defaults, **self.llm_config}
+        # Ensure the api_key is set, raise an error if it's not
+        if "api_key" not in llm_params:
+            raise ValueError("LLM configuration must include an 'api_key'.")
+        # Create the OpenAI instance with the provided and default parameters
+        return OpenAI(llm_params)
+
+    def _generate_nodes_description(self):
+        """
+        Generates a string description of all available nodes and their arguments.
+
+        Returns:
+            str: A string description of all available nodes and their arguments.
+        """
+
+        return "\n".join([
+            f'- {node}: {data["description"]} (Type: {data["type"]}, Args: {", ".join(data["args"].keys())})'
+            for node, data in nodes_metadata.items()
+        ])
+
+    def _create_extraction_chain(self):
+        """
+        Creates an extraction chain for processing the user prompt and generating the graph configuration.
+
+        Returns:
+            LLMChain: An instance of the LLMChain class.
+        """
+
+        create_graph_prompt_template = """
+        You are an AI that designs directed graphs for web scraping tasks. Your goal is to create a web scraping pipeline that is efficient and tailored to the user's requirements. You have access to a set of default nodes, each with specific capabilities:
+
+        {nodes_description}
+
+        Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
+        """.format(nodes_description=self.nodes_description, input="{input}")
+        extraction_prompt = ChatPromptTemplate.from_template(create_graph_prompt_template)
+        return create_extraction_chain(prompt=extraction_prompt, schema=graph_schema, llm=self.llm)
+
+    def build_graph(self):
+        """
+        Executes the graph creation process based on the user prompt and returns the graph configuration.
+
+        Returns:
+            dict: A JSON representation of the graph configuration.
+        """
+        return self.chain.invoke(self.user_prompt)
+
+    @staticmethod
+    def convert_json_to_graphviz(json_data, format='pdf'):
+        """
+        Converts a JSON graph configuration to a Graphviz object for visualization.
+
+        Args:
+            json_data (dict): A JSON representation of the graph configuration.
+
+        Returns:
+            graphviz.Digraph: A Graphviz object representing the graph configuration.
+        """
+        import graphviz
+
+        graph = graphviz.Digraph(comment='ScrapeGraphAI Generated Graph', format=format,
+                                 node_attr={'color': 'lightblue2', 'style': 'filled'})
+
+        graph_config = json_data["text"][0]
+
+        # Retrieve nodes, edges, and the entry point from the JSON data
+        nodes = graph_config.get('nodes', [])
+        edges = graph_config.get('edges', [])
+        entry_point = graph_config.get('entry_point')
+
+        # Add nodes to the graph
+        for node in nodes:
+            # If this node is the entry point, use a double circle to denote it
+            if node['node_name'] == entry_point:
+                graph.node(node['node_name'], shape='doublecircle')
+            else:
+                graph.node(node['node_name'])
+
+        # Add edges to the graph
+        for edge in edges:
+            # An edge could potentially have multiple 'to' nodes if it's from a conditional node
+            if isinstance(edge['to'], list):
+                for to_node in edge['to']:
+                    graph.edge(edge['from'], to_node)
+            else:
+                graph.edge(edge['from'], edge['to'])
+
+        return graph
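convert_json_to_graphviz only touches a few fields of the chain output: json_data["text"][0] and its nodes, edges, and entry_point entries. A hypothetical input shape inferred from those reads, purely illustrative (the node and edge names below are made up, not actual model output):

    graph_json = {
        "text": [{
            "entry_point": "fetch_html",
            "nodes": [
                {"node_name": "fetch_html"},
                {"node_name": "parse_document"},
                {"node_name": "generate_answer"},
            ],
            "edges": [
                {"from": "fetch_html", "to": "parse_document"},
                {"from": "parse_document", "to": "generate_answer"},
            ],
        }]
    }

    # Rendering such a structure produces the PDF opened by graph_builder_example.py
    GraphBuilder.convert_json_to_graphviz(graph_json).render('example_graph', view=False)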

scrapegraphai/graphs/__init__.py  (+2, -1)

@@ -2,4 +2,5 @@
 __init__.py file for graphs folder
 """
 from .base_graph import BaseGraph
-from .smart_scraper_graph import SmartScraper
+from .smart_scraper_graph import SmartScraperGraph
+from .speech_summary_graph import SpeechSummaryGraph
