1
+ from langchain_core .prompts import ChatPromptTemplate
2
+ from langchain .chains import create_extraction_chain
3
+ from ..models import OpenAI
4
+ from ..utils import nodes_metadata , graph_schema
5
+
6
class GraphBuilder:
    """
    Build a web scraping graph configuration from a natural language prompt.

    On construction the builder creates the language model wrapper, renders a
    textual catalogue of the nodes listed in ``nodes_metadata`` and prepares an
    extraction chain constrained by ``graph_schema``. Calling ``build_graph``
    then runs the user prompt through that chain.

    Attributes:
        user_prompt (str): The user's natural language prompt for the scraping task.
        llm_config (dict): The language model configuration passed by the caller.
        llm (OpenAI): The model wrapper built from ``llm_config`` merged over the defaults.
        nodes_description (str): One line per available node with its description,
            type and argument names.
        chain: The extraction chain returned by ``create_extraction_chain``.

    Methods:
        build_graph(): Runs the extraction chain on the user prompt and returns its output.
        convert_json_to_graphviz(json_data, format): Converts a graph configuration
            into a Graphviz object for visualization.

    Args:
        user_prompt (str): The user's natural language prompt describing the
            desired scraping operation.
        llm_config (dict): Configuration parameters for the language model, where
            'api_key' is mandatory, and 'model_name', 'temperature', and
            'streaming' can be optionally included.

    Raises:
        ValueError: If 'api_key' is not included in llm_config.
    """

    # Prompt skeleton. ``{nodes_description}`` is filled in when the chain is
    # created; ``{input}`` is left as a placeholder for the prompt template.
    _PROMPT_TEMPLATE = (
        "\n"
        "You are an AI that designs direct graphs for web scraping tasks. "
        "Your goal is to create a web scraping pipeline that is efficient and "
        "tailored to the user's requirements. You have access to a set of "
        "default nodes, each with specific capabilities:\n"
        "\n"
        "{nodes_description}\n"
        "\n"
        "Based on the user's input: \"{input}\", identify the essential nodes "
        "required for the task and suggest a graph configuration that outlines "
        "the flow between the chosen nodes.\n"
    )

    def __init__(self, user_prompt: str, llm_config: dict):
        """
        Initializes the GraphBuilder with a user prompt and language model configuration.

        Args:
            user_prompt (str): The natural language description of the scraping task.
            llm_config (dict): Language model settings; must contain 'api_key'.

        Raises:
            ValueError: If 'api_key' is not included in llm_config.
        """
        self.user_prompt = user_prompt
        self.llm_config = llm_config
        self.llm = self._create_llm()
        self.nodes_description = self._generate_nodes_description()
        self.chain = self._create_extraction_chain()

    def _create_llm(self):
        """
        Creates an instance of the OpenAI class with the provided language model configuration.

        Returns:
            OpenAI: An instance of the OpenAI class.

        Raises:
            ValueError: If 'api_key' is not provided in llm_config.
        """
        llm_defaults = {
            "model_name": "gpt-3.5-turbo",
            "temperature": 0,
            "streaming": True,
        }
        # Caller-supplied values take precedence over the defaults.
        llm_params = {**llm_defaults, **self.llm_config}
        # The defaults never contain an api_key, so it must come from the caller.
        if "api_key" not in llm_params:
            raise ValueError("LLM configuration must include an 'api_key'.")
        return OpenAI(llm_params)

    @staticmethod
    def _format_node_description(name, data):
        """
        Formats the catalogue line for a single node.

        Args:
            name (str): The node name.
            data (dict): The node metadata; must provide 'description', 'type'
                and a mapping under 'args'.

        Returns:
            str: A one-line description of the node and its argument names.
        """
        return (
            f'- {name} : {data["description"]} (Type: {data["type"]} , '
            f'Args: {", ".join(data["args"].keys())} )'
        )

    def _generate_nodes_description(self):
        """
        Generates a string description of all available nodes and their arguments.

        Returns:
            str: A string description of all available nodes and their arguments.
        """
        return "\n ".join(
            self._format_node_description(node, data)
            for node, data in nodes_metadata.items()
        )

    def _build_prompt_text(self):
        """
        Renders the prompt text with the node catalogue inlined.

        Returns:
            str: The prompt text, still containing the ``{input}`` placeholder.
        """
        # The rendered text is parsed as a template afterwards, so literal
        # braces coming from node metadata must be doubled; otherwise they
        # would be mistaken for template variables.
        safe_description = self.nodes_description.replace("{", "{{").replace("}", "}}")
        return self._PROMPT_TEMPLATE.format(
            nodes_description=safe_description, input="{input}"
        )

    def _create_extraction_chain(self):
        """
        Creates an extraction chain for processing the user prompt and generating the graph configuration.

        Returns:
            The chain object returned by ``create_extraction_chain``.
        """
        extraction_prompt = ChatPromptTemplate.from_template(self._build_prompt_text())
        return create_extraction_chain(
            prompt=extraction_prompt, schema=graph_schema, llm=self.llm
        )

    def build_graph(self):
        """
        Executes the graph creation process based on the user prompt and returns the graph configuration.

        Returns:
            The output of the extraction chain. ``convert_json_to_graphviz``
            expects it to hold the extracted configuration(s) as a list under
            the 'text' key.
        """
        return self.chain.invoke(self.user_prompt)

    @staticmethod
    def convert_json_to_graphviz(json_data, format='pdf'):
        """
        Converts a JSON graph configuration to a Graphviz object for visualization.

        Args:
            json_data (dict): The chain output; the first element of
                ``json_data["text"]`` is used as the graph configuration.
            format (str): The output format handed to ``graphviz.Digraph``.
                Defaults to 'pdf'.

        Returns:
            graphviz.Digraph: A Graphviz object representing the graph configuration.

        Raises:
            KeyError: If 'text' is missing, or a node/edge lacks a required key.
            IndexError: If ``json_data["text"]`` is empty.
        """
        # Imported lazily so graphviz is only needed when visualizing.
        import graphviz

        graph = graphviz.Digraph(
            comment='ScrapeGraphAI Generated Graph',
            format=format,
            node_attr={'color': 'lightblue2', 'style': 'filled'},
        )

        graph_config = json_data["text"][0]

        # Retrieve nodes, edges, and the entry point from the JSON data
        nodes = graph_config.get('nodes', [])
        edges = graph_config.get('edges', [])
        entry_point = graph_config.get('entry_point')

        for node in nodes:
            node_name = node['node_name']
            if node_name == entry_point:
                # The entry point is drawn as a double circle to stand out.
                graph.node(node_name, shape='doublecircle')
            else:
                graph.node(node_name)

        for edge in edges:
            # An edge from a conditional node may list several targets;
            # treat a single target as a one-element list.
            targets = edge['to'] if isinstance(edge['to'], list) else [edge['to']]
            for target in targets:
                graph.edge(edge['from'], target)

        return graph
0 commit comments