Skip to content

Commit 71bd53d

Browse files
committed
refactoring of the code, add tests and examples
1 parent 98a3598 commit 71bd53d

28 files changed

+384
-105
lines changed

.github/workflows/pylint.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
pip install pylint
2121
pip install -r requirements.txt
2222
- name: Analysing the code with pylint
23-
run: pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py scrapegraphai/*.py examples/*.py tests/*.py
23+
run: pylint scrapegraphai/**/*.py scrapegraphai/*.py examples/*.py tests/**/*.py

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Try out ScrapeGraphAI in your browser:
2424
## 📖 Documentation
2525

2626
The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.readthedocs.io/en/latest/).
27-
Behind this there is also the docusaurus documentation [here]([https://scrapegraph-ai.readthedocs.io/en/latest/](https://scrapegraph-doc.onrender.com/)).
27+
Behind this there is also the docusaurus documentation [here](https://scrapegraph-doc.onrender.com/).
2828

2929
## Setup the api keys
3030

commit_and_push.sh

+1-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ fi
2020
commit_message="$1"
2121

2222
# Run Pylint on the specified Python files
23-
pylint scrapegraphai/**/*.py scrapegraphai/*.py examples/*.py tests/*.py
24-
23+
pylint scrapegraphai/**/*.py scrapegraphai/*.py examples/*.py tests/**/*.py
2524
# Make the pull
2625
git pull
2726

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Test for convert_to_csv
3+
"""
4+
import os
5+
from scrapegraphai.utils.convert_to_csv import convert_to_csv
6+
7+
8+
def main():
9+
"""
10+
Example usage of the convert_to_csv function.
11+
"""
12+
# Example data
13+
data = {
14+
'Name': ['John', 'Alice', 'Bob'],
15+
'Age': [30, 25, 35],
16+
'City': ['New York', 'San Francisco', 'Seattle']
17+
}
18+
19+
# Example filename and position
20+
filename = "example_data"
21+
position = "./output"
22+
23+
try:
24+
# Convert data to CSV and save
25+
convert_to_csv(data, filename, position)
26+
print(
27+
f"Data saved successfully to {os.path.join(position, filename)}.csv")
28+
except ValueError as ve:
29+
print(f"ValueError: {ve}")
30+
except FileNotFoundError as fnfe:
31+
print(f"FileNotFoundError: {fnfe}")
32+
except PermissionError as pe:
33+
print(f"PermissionError: {pe}")
34+
except Exception as e:
35+
print(f"An unexpected error occurred: {e}")
36+
37+
38+
if __name__ == "__main__":
39+
main()
+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""
2+
Example of using convert_to_json function to save data in JSON format.
3+
"""
4+
import os
5+
from scrapegraphai.utils.convert_to_json import convert_to_json
6+
7+
# Data to save in JSON format
8+
data_to_save = {
9+
"name": "John Doe",
10+
"age": 30,
11+
"city": "New York"
12+
}
13+
14+
FILENAME = "example_data"
15+
DIRECTORY = "data_output"
16+
17+
try:
18+
convert_to_json(data_to_save, FILENAME, DIRECTORY)
19+
print(
20+
f"Data has been successfully saved to {os.path.join(DIRECTORY, FILENAME)}.json")
21+
except ValueError as value_error:
22+
print(value_error)
23+
except FileNotFoundError as file_not_found_error:
24+
print(file_not_found_error)
25+
except PermissionError as permission_error:
26+
print(permission_error)
27+
except Exception as exception:
28+
print(f"An error occurred: {exception}")

examples/utils/remover_example.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""
2+
Example of the remover method
3+
"""
4+
from scrapegraphai.utils.remover import remover
5+
6+
HTML_CONTENT = """
7+
<html>
8+
<head>
9+
<title>Test Page</title>
10+
</head>
11+
<body>
12+
<h1>This is a Test</h1>
13+
<p>Hello, World!</p>
14+
<script>alert("This is a script");</script>
15+
</body>
16+
</html>
17+
"""
18+
19+
parsed_content = remover(HTML_CONTENT)
20+
21+
print(parsed_content)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""
2+
Example for the file save_audio_from_bytes
3+
"""
4+
from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
5+
6+
BYTE_RESPONSE = b'\x12\x34\x56\x78\x90'
7+
8+
OUTPUT_PATH = "generated_speech.wav"
9+
10+
save_audio_from_bytes(BYTE_RESPONSE, OUTPUT_PATH)
+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""
2+
Example for calculating the tokenizer
3+
"""
4+
from scrapegraphai.utils.token_calculator import truncate_text_tokens
5+
6+
INPUT_TEXT = "http://nba.com"
7+
8+
MODEL_NAME = "gpt-3.5-turbo"
9+
ENCODING_NAME = "EMBEDDING_ENCODING"
10+
11+
tokenized_chunks = truncate_text_tokens(INPUT_TEXT, MODEL_NAME, ENCODING_NAME)
12+
13+
for i, chunk in enumerate(tokenized_chunks):
14+
print(f"Chunk {i+1}: {chunk}")
File renamed without changes.

scrapegraphai/builders/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
__init__.py file for builders folder
33
"""
44

5-
from .graph_builder import GraphBuilder
5+
from .graph_builder import GraphBuilder

scrapegraphai/builders/graph_builder.py

+29-17
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
1+
"""
2+
Module for making the graph building
3+
"""
4+
import graphviz
15
from langchain_core.prompts import ChatPromptTemplate
26
from langchain.chains import create_extraction_chain
37
from ..models import OpenAI
4-
from ..utils import nodes_metadata, graph_schema
8+
from ..helpers import nodes_metadata, graph_schema
9+
510

611
class GraphBuilder:
712
"""
@@ -11,19 +16,24 @@ class GraphBuilder:
1116
1217
Attributes:
1318
prompt (str): The user's natural language prompt for the scraping task.
14-
llm (ChatOpenAI): An instance of the ChatOpenAI class configured with the specified llm_config.
19+
llm (ChatOpenAI): An instance of the ChatOpenAI class configured
20+
with the specified llm_config.
1521
nodes_description (str): A string description of all available nodes and their arguments.
16-
chain (LLMChain): The extraction chain responsible for processing the prompt and creating the graph.
22+
chain (LLMChain): The extraction chain responsible for
23+
processing the prompt and creating the graph.
1724
1825
Methods:
19-
build_graph(): Executes the graph creation process based on the user prompt and returns the graph configuration.
20-
convert_json_to_graphviz(json_data): Converts a JSON graph configuration to a Graphviz object for visualization.
26+
build_graph(): Executes the graph creation process based on the user prompt
27+
and returns the graph configuration.
28+
convert_json_to_graphviz(json_data): Converts a JSON graph configuration
29+
to a Graphviz object for visualization.
2130
2231
Args:
2332
prompt (str): The user's natural language prompt describing the desired scraping operation.
2433
url (str): The target URL from which data is to be scraped.
25-
llm_config (dict): Configuration parameters for the language model, where 'api_key' is mandatory,
26-
and 'model_name', 'temperature', and 'streaming' can be optionally included.
34+
llm_config (dict): Configuration parameters for the
35+
language model, where 'api_key' is mandatory,
36+
and 'model_name', 'temperature', and 'streaming' can be optionally included.
2737
2838
Raises:
2939
ValueError: If 'api_key' is not included in llm_config.
@@ -38,7 +48,7 @@ def __init__(self, user_prompt: str, llm_config: dict):
3848
self.llm = self._create_llm()
3949
self.nodes_description = self._generate_nodes_description()
4050
self.chain = self._create_extraction_chain()
41-
51+
4252
def _create_llm(self):
4353
"""
4454
Creates an instance of the OpenAI class with the provided language model configuration.
@@ -77,7 +87,8 @@ def _generate_nodes_description(self):
7787

7888
def _create_extraction_chain(self):
7989
"""
80-
Creates an extraction chain for processing the user prompt and generating the graph configuration.
90+
Creates an extraction chain for processing the user prompt and
91+
generating the graph configuration.
8192
8293
Returns:
8394
LLMChain: An instance of the LLMChain class.
@@ -90,20 +101,22 @@ def _create_extraction_chain(self):
90101
91102
Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
92103
""".format(nodes_description=self.nodes_description, input="{input}")
93-
extraction_prompt = ChatPromptTemplate.from_template(create_graph_prompt_template)
104+
extraction_prompt = ChatPromptTemplate.from_template(
105+
create_graph_prompt_template)
94106
return create_extraction_chain(prompt=extraction_prompt, schema=graph_schema, llm=self.llm)
95107

96108
def build_graph(self):
97109
"""
98-
Executes the graph creation process based on the user prompt and returns the graph configuration.
110+
Executes the graph creation process based on the user prompt and
111+
returns the graph configuration.
99112
100113
Returns:
101114
dict: A JSON representation of the graph configuration.
102115
"""
103116
return self.chain.invoke(self.user_prompt)
104-
117+
105118
@staticmethod
106-
def convert_json_to_graphviz(json_data, format='pdf'):
119+
def convert_json_to_graphviz(json_data, format: str = 'pdf'):
107120
"""
108121
Converts a JSON graph configuration to a Graphviz object for visualization.
109122
@@ -113,11 +126,10 @@ def convert_json_to_graphviz(json_data, format='pdf'):
113126
Returns:
114127
graphviz.Digraph: A Graphviz object representing the graph configuration.
115128
"""
116-
import graphviz
117129

118130
graph = graphviz.Digraph(comment='ScrapeGraphAI Generated Graph', format=format,
119-
node_attr={'color': 'lightblue2', 'style': 'filled'})
120-
131+
node_attr={'color': 'lightblue2', 'style': 'filled'})
132+
121133
graph_config = json_data["text"][0]
122134

123135
# Retrieve nodes, edges, and the entry point from the JSON data
@@ -142,4 +154,4 @@ def convert_json_to_graphviz(json_data, format='pdf'):
142154
else:
143155
graph.edge(edge['from'], edge['to'])
144156

145-
return graph
157+
return graph

scrapegraphai/graphs/speech_summary_graph.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Module for extracting the summary from the speech
3+
"""
4+
from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
15
from ..models import OpenAI, OpenAITextToSpeech
26
from .base_graph import BaseGraph
37
from ..nodes import (
@@ -7,8 +11,8 @@
711
GenerateAnswerNode,
812
ParseHTMLNode,
913
TextToSpeechNode,
10-
)
11-
from scrapegraphai.utils import save_audio_from_bytes
14+
)
15+
1216

1317
class SpeechSummaryGraph:
1418
"""
@@ -17,7 +21,8 @@ class SpeechSummaryGraph:
1721
1822
Attributes:
1923
url (str): The URL of the web page to scrape and summarize.
20-
llm_config (dict): Configuration parameters for the language model, with 'api_key' mandatory.
24+
llm_config (dict): Configuration parameters for the language model,
25+
with 'api_key' mandatory.
2126
summary_prompt (str): The prompt used to guide the summarization process.
2227
output_path (Path): The path where the generated MP3 file will be saved.
2328
@@ -40,9 +45,9 @@ def __init__(self, prompt: str, url: str, llm_config: dict, output_path: str):
4045
self.llm_config = llm_config
4146
self.llm = self._create_llm()
4247
self.output_path = output_path
43-
self.text_to_speech_model = OpenAITextToSpeech(llm_config, model="tts-1", voice="alloy")
48+
self.text_to_speech_model = OpenAITextToSpeech(
49+
llm_config, model="tts-1", voice="alloy")
4450
self.graph = self._create_graph()
45-
4651

4752
def _create_llm(self):
4853
"""

scrapegraphai/helpers/__init__.py

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""
2+
__init__.py for the helpers folder
3+
4+
"""
5+
from .nodes_metadata import nodes_metadata
6+
from .schemas import graph_schema

scrapegraphai/utils/nodes_metadata.py scrapegraphai/helpers/nodes_metadata.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -43,24 +43,27 @@
4343
"type": "conditional_node",
4444
"args": {
4545
"key_name": "The key in the state to check for a condition.",
46-
"next_nodes": "A list of two nodes specifying the next node to execute based on the condition's outcome."
46+
"next_nodes": """A list of two nodes specifying the next node
47+
to execute based on the condition's outcome."""
4748
},
4849
"returns": "The name of the next node to execute."
4950
},
5051
"ImageToTextNode": {
51-
"description": "Converts image content to text by extracting visual information and interpreting it.",
52+
"description": """Converts image content to text by
53+
extracting visual information and interpreting it.""",
5254
"type": "node",
5355
"args": {
5456
"image_data": "Data of the image to be processed."
5557
},
5658
"returns": "Updated state with the textual description of the image under 'image_text' key."
5759
},
5860
"TextToSpeechNode": {
59-
"description": "Converts text into spoken words, allowing for auditory representation of the text.",
61+
"description": """Converts text into spoken words, allowing
62+
for auditory representation of the text.""",
6063
"type": "node",
6164
"args": {
6265
"text": "The text to be converted into speech."
6366
},
6467
"returns": "Updated state with the speech audio file or data under 'speech_audio' key."
6568
}
66-
}
69+
}

0 commit comments

Comments
 (0)