Skip to content

Commit 22d636c

Browse files
authored
Merge pull request #35 from VinciGit00/creation_dummy_node
Creation dummy node
2 parents 527c810 + ab71a6b commit 22d636c

File tree

7 files changed

+261
-3
lines changed

7 files changed

+261
-3
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# 🕷️ ScrapeGraphAI: You Only Scrape Once
22

3-
ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines.
3+
ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents.
44
Just say which information you want to extract and the library will do it for you!
55

66
<p align="center">
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Example of custom graph using existing nodes
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.models import OpenAI
8+
from scrapegraphai.graphs import BaseGraph
9+
from scrapegraphai.nodes import FetchTextNode, ParseTextNode, GenerateAnswerNode
10+
11+
load_dotenv()

# Language-model settings; the API key comes from the environment (.env file)
openai_key = os.getenv("OPENAI_APIKEY")
llm_config = dict(
    api_key=openai_key,
    model_name="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
)
model = OpenAI(llm_config)

# Read the raw text that will be fed to the graph
with open("text_example.txt", "r", encoding="utf-8") as source_file:
    text = source_file.read()

# Build the three nodes of the pipeline: load -> split -> answer
fetch_text_node = FetchTextNode("load_html")
parse_text_node = ParseTextNode("parse_document")
answer_node = GenerateAnswerNode(model, "generate_answer")

# Wire the nodes together into a linear graph
pipeline_nodes = {fetch_text_node, parse_text_node, answer_node}
pipeline_edges = {
    (fetch_text_node, parse_text_node),
    (parse_text_node, answer_node),
}
graph = BaseGraph(
    nodes=pipeline_nodes,
    edges=pipeline_edges,
    entry_point=fetch_text_node,
)

# Run the graph with the user question and the loaded text
inputs = {
    "user_input": "Give me the name of all the news",
    "text": text,
}
result = graph.execute(inputs)

# Show the generated answer, or a fallback when none was produced
print(result.get("answer", "No answer found."))
+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<body class="section-hp hpsection-page base-editorial-page base-page page basicpage"><div id="dtm_script" data-src="https://assets.adobedtm.com/launch-EN578e54a3f0c6478aa41c89022e30863c.min.js"></div>
2+
3+
<a class="c-card c-card--CA25-m c-card--CA25-t c-card--CA25-d c-card--no-abstract-m c-card--media c-card--base" href="https://sport.sky.it/nba/nba-classifica-marcatori-ogni-epoca" data-no-autoplay-policy-article-enabled="false">
4+
<article class="c-card__wrapper aem_card_check_wrapper">
5+
<div class="c-card__content">
6+
7+
<h2 class="c-card__title">Countdown LeBron: mancano 8 punti ai 40.000</h2>
8+
9+
10+
11+
12+
13+
14+
<div class="c-card__label-wrapper c-label-wrapper">
15+
16+
<span class="c-label c-label--article-heading">CLASSIFICA</span>
17+
18+
</div>
19+
20+
21+
22+
<p class="c-card__abstract">Nella notte, per battere Washington in una sfida meno semplice del previsto, ai Lakers sono...</p>
23+
24+
<div class="c-card__info">
25+
<time class="c-card__date" datetime="01 mar - 09:10">01 mar - 09:10
26+
</time>
27+
<span class="c-card__content-data">
28+
<i class="icon icon--media-outline icon--gallery-outline icon--xxsmall icon--c-neutral">
29+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery-outline"><path d="M26.174 32.174v31.975h44.588V32.174H26.174zm-3.08-9.238h50.747A6.159 6.159 0 0 1 80 29.095v38.134a6.159 6.159 0 0 1-6.159 6.158H23.095a6.159 6.159 0 0 1-6.159-6.158V29.095a6.159 6.159 0 0 1 6.159-6.159zM9.239 55.665a4.619 4.619 0 0 1-9.238 0V16.777C0 10.825 4.825 6 10.777 6H64.08a4.619 4.619 0 1 1 0 9.238H10.777c-.85 0-1.54.69-1.54 1.54v38.887z" fill="currentColor" fill-rule="evenodd"></path></svg>
30+
</i>
31+
26 foto
32+
</span>
33+
</div>
34+
35+
36+
37+
38+
<!-- end -->
39+
</div>
40+
41+
<div class="c-card__img-wrapper">
42+
<figure class="o-aspect-ratio o-aspect-ratio--16-10 ">
43+
<img crossorigin="anonymous" class="c-card__img j-lazyload" alt="Countdown LeBron: mancano 8 punti ai 40.000" data-srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 750w" sizes="(max-width: 1024px) 100vw, 30vw" loading="lazy" data-src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg" src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg" srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 
314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 750w">
44+
45+
<noscript>
46+
<img crossorigin="anonymous" class="c-card__img" alt="Countdown LeBron: mancano 8 punti ai 40.000" srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 750w" sizes="(max-width: 1024px) 100vw, 30vw" src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg"/>
47+
</noscript>
48+
</figure>
49+
50+
51+
52+
53+
<i class="icon icon--media icon--gallery icon--medium icon--c-primary">
54+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery">
55+
<path d="M17.005 20.221h60.211c1.538 0 2.784 1.28 2.784 2.858v48.317c0 1.578-1.246 2.858-2.784 2.858H17.005c-1.537 0-2.784-1.28-2.784-2.858V23.079c0-1.578 1.247-2.858 2.784-2.858zM5.873 11.873V60.62a2.937 2.937 0 0 1-5.873 0V11.286A5.286 5.286 0 0 1 5.286 6h61.08a2.937 2.937 0 1 1 0 5.873H5.873z"></path>
56+
</svg>
57+
</i>
58+
59+
</div>
60+
</article>
61+
</a>
62+
63+
</body>

scrapegraphai/nodes/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99
from .rag_node import RAGNode
1010
from .text_to_speech_node import TextToSpeechNode
1111
from .image_to_text_node import ImageToTextNode
12+
from .fetch_text_node import FetchTextNode
13+
from .parse_text_node import ParseTextNode
+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
""" 
2+
Module for FetchTextNode
3+
"""
4+
from .base_node import BaseNode
5+
6+
7+
class FetchTextNode(BaseNode):
    """
    A node for loading raw text into the state.

    The node does not fetch anything itself: it takes the text that the caller
    already stored in the state under the 'text' key and exposes it under the
    'document' key, so that subsequent nodes in the graph can process it.

    Args:
        node_name (str): The unique identifier for the node.

    Methods:
        execute(state): Copies the value stored under the 'text' key of the
            state to the 'document' key. Requires the 'text' key to be present
            in the state.
    """

    def __init__(self, node_name: str):
        """
        Initializes the FetchTextNode with a node name.

        Args:
            node_name (str): The unique name for the node.
        """
        # "node" is the node type handed to the base class.
        super().__init__(node_name, "node")

    def execute(self, state: dict) -> dict:
        """
        Loads raw text content into the state.

        Args:
            state (dict): The current state, expected to contain a 'text' key
                holding the raw text to load.

        Returns:
            dict: The same state object, updated with the text content stored
                under the 'document' key.

        Raises:
            KeyError: If the 'text' key is missing from the state.
        """
        print("---LOADING TEXT CODE---")

        # The error message names the key that is actually checked ('text').
        if "text" not in state:
            raise KeyError("The 'text' key is required to load the text.")

        state["document"] = state["text"]
        return state

scrapegraphai/nodes/generate_answer_node.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
"""
22
Module for generating the answer node
33
"""
4-
from langchain_core.output_parsers import JsonOutputParser
4+
# Imports from third-party libraries
5+
from tqdm import tqdm
6+
7+
# Imports from Langchain
58
from langchain.prompts import PromptTemplate
9+
from langchain_core.output_parsers import JsonOutputParser
610
from langchain_core.runnables import RunnableParallel
11+
12+
# Imports from the library
713
from .base_node import BaseNode
814

915

@@ -99,7 +105,8 @@ def execute(self, state: dict) -> dict:
99105

100106
chains_dict = {}
101107

102-
for i, chunk in enumerate(context):
108+
# Use tqdm to add progress bar
109+
for i, chunk in enumerate(tqdm(context, desc="Processing chunks")):
103110
prompt = PromptTemplate(
104111
template=template_chunks,
105112
input_variables=["question"],
+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
Module for parsing the text node
3+
"""
4+
from langchain.text_splitter import RecursiveCharacterTextSplitter
5+
from .base_node import BaseNode
6+
7+
8+
class ParseTextNode(BaseNode):
    """
    A node that splits a text document into chunks.

    The node reads the text stored under the 'document' key of the state and
    splits it with LangChain's RecursiveCharacterTextSplitter, so that later
    nodes can work on one chunk at a time. No tag-based extraction is performed.

    Args:
        node_name (str, optional): Custom name for the node
            (defaults to "ParseHTMLNode").

    Methods:
        execute(state):
            * Reads the text from the 'document' field of the state.
            * Splits it with RecursiveCharacterTextSplitter.
            * Stores the resulting chunks in the 'document_chunks' field.
    """

    def __init__(self, node_name: str = "ParseHTMLNode"):
        """
        Initializes the ParseTextNode.

        Args:
            node_name (str, optional): Custom name for the node
                (defaults to "ParseHTMLNode").
        """
        # "node" is the node type handed to the base class.
        super().__init__(node_name, "node")

    def execute(self, state):
        """
        Splits the text document into chunks and updates the state.

        Args:
            state (dict): Expects the following key:
                'document': The text content to split.

        Returns:
            dict: The same state object, updated with:
                'document_chunks': The document split into chunks
                    (using RecursiveCharacterTextSplitter).

        Raises:
            KeyError: If the required 'document' key is missing from the state.
        """
        print("---PARSING TEXT DOCUMENT---")

        try:
            raw_text = state["document"]
        except KeyError as missing_key:
            # Report the missing key, then let the caller handle the error.
            print(f"Error: {missing_key} not found in state.")
            raise

        # TODO: tag-based parsing (e.g. with BeautifulSoup) is not implemented.

        # Splitter built via the tiktoken-based constructor; chunks do not overlap.
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=4000,
            chunk_overlap=0,
        )
        chunks = splitter.split_text(raw_text)
        state["document_chunks"] = chunks

        return state

0 commit comments

Comments
 (0)