
Commit 98a3598

Merge pull request #24 from VinciGit00/graph-builder
OpenAI Model Wrappers, Graph Building Tools, and Speech Summary Graph Pipeline
2 parents 0c1b22e + 3dc7bf1 commit 98a3598

33 files changed (+824, -189 lines)

.gitignore  (+4)

@@ -21,3 +21,7 @@ docs/source/_static/
 .env
 venv/
 .vscode/
+
+# exclude pdf, mp3
+*.pdf
+*.mp3

examples/.env.example  (+1, -1)

@@ -1 +1 @@
-API_KEY="your openai.com api key"
+OPENAI_APIKEY="your openai.com api key"
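The example scripts below all read the renamed variable through python-dotenv. A minimal sketch of that shared pattern, not part of the diff (it assumes a .env file sits next to the script):

    import os
    from dotenv import load_dotenv

    load_dotenv()                            # reads the .env file in the working directory
    openai_key = os.getenv("OPENAI_APIKEY")  # was API_KEY before this commit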

examples/fetch_html_node_example.py → examples/custom_graph_example.py  (renamed, +5, -9)

@@ -4,25 +4,21 @@

 import os
 from dotenv import load_dotenv
-
-from langchain_openai import ChatOpenAI
+from scrapegraphai.models import OpenAI
 from scrapegraphai.graphs import BaseGraph
 from scrapegraphai.nodes import FetchHTMLNode, ParseHTMLNode, GenerateAnswerNode

-# load the environment variables
 load_dotenv()
-openai_key = os.getenv("API_KEY")
-if not openai_key:
-    print("Error: OpenAI API key not found in environment variables.")

 # Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
 llm_config = {
     "api_key": openai_key,
     "model_name": "gpt-3.5-turbo",
     "temperature": 0,
     "streaming": True
 }
-model = ChatOpenAI(**llm_config)
+model = OpenAI(llm_config)

 # define the nodes for the graph
 fetch_html_node = FetchHTMLNode("fetch_html")

@@ -44,9 +40,9 @@
 )

 # execute the graph
-inputs = {"keys": {"user_input": "What is the title of the page?", "url": "https://example.com"}}
+inputs = {"user_input": "What is the title of the page?", "url": "https://example.com"}
 result = graph.execute(inputs)

 # get the answer from the result
-answer = result["keys"].get("answer", "No answer found.")
+answer = result.get("answer", "No answer found.")
 print(answer)

examples/graph_builder_example.py  (new file, +25)

@@ -0,0 +1,25 @@
+import os
+from dotenv import load_dotenv
+from scrapegraphai.builders import GraphBuilder
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-3.5-turbo",
+    "temperature": 0,
+    "streaming": True
+}
+
+# Example usage of GraphBuilder
+user_prompt = "Extract the news and generate a text summary with a voiceover."
+graph_builder = GraphBuilder(user_prompt, llm_config)
+graph_json = graph_builder.build_graph()
+
+# Convert the resulting JSON to Graphviz format
+graphviz_graph = graph_builder.convert_json_to_graphviz(graph_json)
+
+# Save the graph to a file and open it in the default viewer
+graphviz_graph.render('ScrapeGraphAI_generated_graph', view=True)

examples/smart_scraper_example.py  (new file, +22)

@@ -0,0 +1,22 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-3.5-turbo",
+}
+
+smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
+                                        "https://perinim.github.io/projects/", llm_config)
+
+answer = smart_scraper_graph.run()
+print(answer)
examples/speech_summary_example.py  (new file, +26)

@@ -0,0 +1,26 @@
+"""
+Basic example of scraping pipeline using SpeechSummaryGraph
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SpeechSummaryGraph
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+}
+
+# Save the audio to a file
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+output_file_path = os.path.join(curr_dir, "website_summary.mp3")
+
+speech_summary_graph = SpeechSummaryGraph("Make a summary of the webpage to be converted to audio for blind people.",
+                                          "https://perinim.github.io/projects/", llm_config,
+                                          output_file_path)
+
+final_state = speech_summary_graph.run()
+print(final_state.get("answer", "No answer found."))

examples/vision_speech_example.py  (new file, +34)

@@ -0,0 +1,34 @@
+"""
+Basic example using the OpenAIImageToText and OpenAITextToSpeech models
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.models import OpenAIImageToText, OpenAITextToSpeech
+from scrapegraphai.utils import save_audio_from_bytes
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-4-vision-preview",
+}
+
+model = OpenAIImageToText(llm_config)
+answer = model.run("https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/scrapegraphai_logo.png")
+print(answer)
+
+text_to_speech = OpenAITextToSpeech(llm_config, model="tts-1", voice="alloy")
+
+text = "Today is a wonderful day to build something people love!"
+audio = text_to_speech.run(text)
+
+# Save the audio to a file
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, "text2speech.mp3")
+
+save_audio_from_bytes(audio, file_path)
+
+print(f"Speech file saved to: {file_path}")

poetry.lock  (+20, -4)

Generated file; diff not rendered by default.

pyproject.toml  (+1)

@@ -31,6 +31,7 @@ pandas = "2.0.3"
 python-dotenv = "1.0.1"
 tiktoken = {version = ">=0.5.2,<0.6.0"}
 tqdm = "4.66.1"
+graphviz = "0.20.1"

 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"

scrapegraphai/builders/__init__.py  (new file, +5)

@@ -0,0 +1,5 @@
+"""
+__init__.py file for builders folder
+"""
+
+from .graph_builder import GraphBuilder
scrapegraphai/builders/graph_builder.py  (new file, +145)

@@ -0,0 +1,145 @@
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_extraction_chain
+from ..models import OpenAI
+from ..utils import nodes_metadata, graph_schema
+
+class GraphBuilder:
+    """
+    GraphBuilder is a dynamic tool for constructing web scraping graphs based on user prompts.
+    It utilizes a natural language understanding model to interpret user prompts and
+    automatically generates a graph configuration for scraping web content.
+
+    Attributes:
+        user_prompt (str): The user's natural language prompt for the scraping task.
+        llm (OpenAI): An instance of the OpenAI class configured with the specified llm_config.
+        nodes_description (str): A string description of all available nodes and their arguments.
+        chain (LLMChain): The extraction chain responsible for processing the prompt and creating the graph.
+
+    Methods:
+        build_graph(): Executes the graph creation process based on the user prompt and returns the graph configuration.
+        convert_json_to_graphviz(json_data): Converts a JSON graph configuration to a Graphviz object for visualization.
+
+    Args:
+        user_prompt (str): The user's natural language prompt describing the desired scraping operation.
+        llm_config (dict): Configuration parameters for the language model, where 'api_key' is mandatory
+            and 'model_name', 'temperature', and 'streaming' can be optionally included.
+
+    Raises:
+        ValueError: If 'api_key' is not included in llm_config.
+    """
+
+    def __init__(self, user_prompt: str, llm_config: dict):
+        """
+        Initializes the GraphBuilder with a user prompt and language model configuration.
+        """
+        self.user_prompt = user_prompt
+        self.llm_config = llm_config
+        self.llm = self._create_llm()
+        self.nodes_description = self._generate_nodes_description()
+        self.chain = self._create_extraction_chain()
+
+    def _create_llm(self):
+        """
+        Creates an instance of the OpenAI class with the provided language model configuration.
+
+        Returns:
+            OpenAI: An instance of the OpenAI class.
+
+        Raises:
+            ValueError: If 'api_key' is not provided in llm_config.
+        """
+        llm_defaults = {
+            "model_name": "gpt-3.5-turbo",
+            "temperature": 0,
+            "streaming": True
+        }
+        # Update defaults with any LLM parameters that were provided
+        llm_params = {**llm_defaults, **self.llm_config}
+        # Ensure the api_key is set, raise an error if it's not
+        if "api_key" not in llm_params:
+            raise ValueError("LLM configuration must include an 'api_key'.")
+        # Create the OpenAI instance with the provided and default parameters
+        return OpenAI(llm_params)
+
+    def _generate_nodes_description(self):
+        """
+        Generates a string description of all available nodes and their arguments.
+
+        Returns:
+            str: A string description of all available nodes and their arguments.
+        """
+
+        return "\n".join([
+            f'- {node}: {data["description"]} (Type: {data["type"]}, Args: {", ".join(data["args"].keys())})'
+            for node, data in nodes_metadata.items()
+        ])
+
+    def _create_extraction_chain(self):
+        """
+        Creates an extraction chain for processing the user prompt and generating the graph configuration.
+
+        Returns:
+            LLMChain: An instance of the LLMChain class.
+        """
+
+        create_graph_prompt_template = """
+        You are an AI that designs directed graphs for web scraping tasks. Your goal is to create a web scraping pipeline that is efficient and tailored to the user's requirements. You have access to a set of default nodes, each with specific capabilities:
+
+        {nodes_description}
+
+        Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
+        """.format(nodes_description=self.nodes_description, input="{input}")
+        extraction_prompt = ChatPromptTemplate.from_template(create_graph_prompt_template)
+        return create_extraction_chain(prompt=extraction_prompt, schema=graph_schema, llm=self.llm)
+
+    def build_graph(self):
+        """
+        Executes the graph creation process based on the user prompt and returns the graph configuration.
+
+        Returns:
+            dict: A JSON representation of the graph configuration.
+        """
+        return self.chain.invoke(self.user_prompt)
+
+    @staticmethod
+    def convert_json_to_graphviz(json_data, format='pdf'):
+        """
+        Converts a JSON graph configuration to a Graphviz object for visualization.
+
+        Args:
+            json_data (dict): A JSON representation of the graph configuration.
+
+        Returns:
+            graphviz.Digraph: A Graphviz object representing the graph configuration.
+        """
+        import graphviz
+
+        graph = graphviz.Digraph(comment='ScrapeGraphAI Generated Graph', format=format,
+                                 node_attr={'color': 'lightblue2', 'style': 'filled'})
+
+        graph_config = json_data["text"][0]
+
+        # Retrieve nodes, edges, and the entry point from the JSON data
+        nodes = graph_config.get('nodes', [])
+        edges = graph_config.get('edges', [])
+        entry_point = graph_config.get('entry_point')
+
+        # Add nodes to the graph
+        for node in nodes:
+            # If this node is the entry point, use a double circle to denote it
+            if node['node_name'] == entry_point:
+                graph.node(node['node_name'], shape='doublecircle')
+            else:
+                graph.node(node['node_name'])
+
+        # Add edges to the graph
+        for edge in edges:
+            # An edge could potentially have multiple 'to' nodes if it's from a conditional node
+            if isinstance(edge['to'], list):
+                for to_node in edge['to']:
+                    graph.edge(edge['from'], to_node)
+            else:
+                graph.edge(edge['from'], edge['to'])
+
+        return graph
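convert_json_to_graphviz only touches a few fields of the chain output: json_data["text"][0] and its nodes, edges, and entry_point entries. A hypothetical input shape inferred from those reads, purely illustrative (the node and edge names below are made up, not actual model output):

    graph_json = {
        "text": [{
            "entry_point": "fetch_html",
            "nodes": [
                {"node_name": "fetch_html"},
                {"node_name": "parse_document"},
                {"node_name": "generate_answer"},
            ],
            "edges": [
                {"from": "fetch_html", "to": "parse_document"},
                {"from": "parse_document", "to": "generate_answer"},
            ],
        }]
    }

    # Rendering such a structure produces the PDF opened by graph_builder_example.py
    GraphBuilder.convert_json_to_graphviz(graph_json).render('example_graph', view=False)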

scrapegraphai/graphs/__init__.py  (+2, -1)

@@ -2,4 +2,5 @@
 __init__.py file for graphs folder
 """
 from .base_graph import BaseGraph
-from .smart_scraper_graph import SmartScraper
+from .smart_scraper_graph import SmartScraperGraph
+from .speech_summary_graph import SpeechSummaryGraph
