Merge pull request #35 from VinciGit00/creation_dummy_node

lurenss · web-flow · commit 22d636c47bc5 · 2024-03-03T21:54:33.000+01:00
Creation dummy node
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # 🕷️ ScrapeGraphAI: You Only Scrape Once
 
-ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines.
+ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLMs and directed graph logic to create scraping pipelines for websites and documents.
 Just say which information you want to extract and the library will do it for you!
 
 <p align="center">
diff --git a/examples/graph_examples/graph_from_text_example.py b/examples/graph_examples/graph_from_text_example.py
@@ -0,0 +1,53 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.models import OpenAI
+from scrapegraphai.graphs import BaseGraph
+from scrapegraphai.nodes import FetchTextNode, ParseTextNode, GenerateAnswerNode
+
+load_dotenv()
+
+# Define the configuration for the language model
+openai_key = os.getenv("OPENAI_APIKEY")
+llm_config = {
+    "api_key": openai_key,
+    "model_name": "gpt-3.5-turbo",
+    "temperature": 0,
+    "streaming": True
+}
+model = OpenAI(llm_config)
+
+# Load the sample next to this script so the example runs from any working directory
+text_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "text_example.txt")
+with open(text_path, "r", encoding="utf-8") as file:
+    text = file.read()
+
+# define the nodes for the graph
+fetch_text_node = FetchTextNode("load_html")
+parse_document_node = ParseTextNode("parse_document")
+generate_answer_node = GenerateAnswerNode(model, "generate_answer")
+
+# create the graph
+graph = BaseGraph(
+    nodes={
+        fetch_text_node,
+        parse_document_node,
+        generate_answer_node
+    },
+    edges={
+        (fetch_text_node, parse_document_node),
+        (parse_document_node, generate_answer_node)
+    },
+    entry_point=fetch_text_node
+)
+
+# execute the graph
+inputs = {"user_input": "Give me the name of all the news", "text": text}
+result = graph.execute(inputs)
+
+# get the answer from the result
+answer = result.get("answer", "No answer found.")
+print(answer)
diff --git a/examples/graph_examples/text_example.txt b/examples/graph_examples/text_example.txt
@@ -0,0 +1,63 @@
+<body class="section-hp hpsection-page base-editorial-page base-page page basicpage"><div id="dtm_script" data-src="https://assets.adobedtm.com/launch-EN578e54a3f0c6478aa41c89022e30863c.min.js"></div>
+    
+<a class="c-card  c-card--CA25-m c-card--CA25-t c-card--CA25-d c-card--no-abstract-m c-card--media  c-card--base" href="https://sport.sky.it/nba/nba-classifica-marcatori-ogni-epoca" data-no-autoplay-policy-article-enabled="false">
+        <article class="c-card__wrapper aem_card_check_wrapper">
+            <div class="c-card__content">
+                
+                <h2 class="c-card__title">Countdown LeBron: mancano 8 punti ai 40.000</h2>
+
+                
+                
+
+
+                
+                    <div class="c-card__label-wrapper c-label-wrapper">
+                        
+                        <span class="c-label c-label--article-heading">CLASSIFICA</span>
+                        
+                    </div>
+                    
+                
+
+                <p class="c-card__abstract">Nella notte, per battere Washington in una sfida meno semplice del previsto, ai Lakers sono...</p>
+                
+                    <div class="c-card__info">
+                        <time class="c-card__date" datetime="01 mar - 09:10">01 mar - 09:10
+                        </time>
+                        <span class="c-card__content-data">
+                                <i class="icon icon--media-outline icon--gallery-outline icon--xxsmall icon--c-neutral">
+                                    <svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery-outline"><path d="M26.174 32.174v31.975h44.588V32.174H26.174zm-3.08-9.238h50.747A6.159 6.159 0 0 1 80 29.095v38.134a6.159 6.159 0 0 1-6.159 6.158H23.095a6.159 6.159 0 0 1-6.159-6.158V29.095a6.159 6.159 0 0 1 6.159-6.159zM9.239 55.665a4.619 4.619 0 0 1-9.238 0V16.777C0 10.825 4.825 6 10.777 6H64.08a4.619 4.619 0 1 1 0 9.238H10.777c-.85 0-1.54.69-1.54 1.54v38.887z" fill="currentColor" fill-rule="evenodd"></path></svg>
+                                </i>
+                                26 foto
+                            </span>
+                    </div>
+                    
+                
+                
+                
+                <!-- end -->
+            </div>
+
+            <div class="c-card__img-wrapper">
+                <figure class="o-aspect-ratio o-aspect-ratio--16-10 ">
+                    <img crossorigin="anonymous" class="c-card__img j-lazyload" alt="Countdown LeBron: mancano 8 punti ai 40.000" data-srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  750w" sizes="(max-width: 1024px) 100vw, 30vw" loading="lazy" data-src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg" src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg" srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 
314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  750w">
+                    
+                    <noscript>
+                        <img crossorigin="anonymous" class="c-card__img" alt="Countdown LeBron: mancano 8 punti ai 40.000" srcset="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 314w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-desktop-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 628w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  416w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 832w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg 375w,https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-mobile-2x/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg  750w" sizes="(max-width: 1024px) 100vw, 30vw" src="https://static.sky.it/images/skysport/it/nba/nba-classifica-marcatori-ogni-epoca/LeBron_Record_Punti.jpg.transform/card-tablet/ebcd4e23041ef9af59c046ce3e919cf9000541a9/img.jpeg"/>
+                    </noscript>
+                </figure>
+
+                
+                    
+
+                    <i class="icon icon--media icon--gallery icon--medium icon--c-primary">
+                        <svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery">
+                            <path d="M17.005 20.221h60.211c1.538 0 2.784 1.28 2.784 2.858v48.317c0 1.578-1.246 2.858-2.784 2.858H17.005c-1.537 0-2.784-1.28-2.784-2.858V23.079c0-1.578 1.247-2.858 2.784-2.858zM5.873 11.873V60.62a2.937 2.937 0 0 1-5.873 0V11.286A5.286 5.286 0 0 1 5.286 6h61.08a2.937 2.937 0 1 1 0 5.873H5.873z"></path>
+                        </svg>
+                    </i>
+                
+            </div>
+        </article>
+    </a>
+
+</body>
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -9,3 +9,5 @@
 from .rag_node import RAGNode
 from .text_to_speech_node import TextToSpeechNode
 from .image_to_text_node import ImageToTextNode
+from .fetch_text_node import FetchTextNode
+from .parse_text_node import ParseTextNode
diff --git a/scrapegraphai/nodes/fetch_text_node.py b/scrapegraphai/nodes/fetch_text_node.py
@@ -0,0 +1,57 @@
+""" 
+Module for FetchTextNode
+"""
+from .base_node import BaseNode
+
+
+class FetchTextNode(BaseNode):
+    """
+    A node for loading raw text into the state.
+
+    Primarily used in scraping workflows, this node prepares the state by copying
+    raw text that the caller already placed in the state, making it available for
+    further processing by subsequent nodes in the graph.
+
+    Attributes:
+      node_name (str): The unique identifier for the node.
+      node_type (str): The type of the node ("node" in this case).
+
+    Args:
+      node_name (str): The unique identifier for the node.
+
+    Methods:
+      execute(state): Copies the text stored under the 'text' key of the state
+          to the 'document' key. Requires the 'text' key to be present in
+          the state, holding the raw text content itself.
+    """
+
+    def __init__(self, node_name: str):
+        """
+        Initializes the FetchTextNode with a node name.
+
+        Args:
+          node_name (str): The unique name for the node.
+        """
+        super().__init__(node_name, "node")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Loads raw text content into the state.
+
+        Args:
+          state (dict): The current state, expected to contain a 'text' key
+              holding the raw text to load.
+
+        Returns:
+          dict: The same state object, with the text also stored under the 'document' key.
+
+        Raises:
+          KeyError: If the 'text' key is missing from the state.
+        """
+        print("---LOADING TEXT CODE---")
+
+        if "text" not in state:
+            raise KeyError("The 'text' key is required to load the text.")
+
+        state["document"] = state["text"]
+        return state
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,9 +1,15 @@
 """
 Module for generating the answer node
 """
-from langchain_core.output_parsers import JsonOutputParser
+# Imports from third-party libraries
+from tqdm import tqdm
+
+# Imports from Langchain
 from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
+
+# Imports from the library
 from .base_node import BaseNode
 
 
@@ -99,7 +105,8 @@ def execute(self, state: dict) -> dict:
 
         chains_dict = {}
 
-        for i, chunk in enumerate(context):
+        # Use tqdm to add progress bar
+        for i, chunk in enumerate(tqdm(context, desc="Processing chunks")):
             prompt = PromptTemplate(
                 template=template_chunks,
                 input_variables=["question"],
diff --git a/scrapegraphai/nodes/parse_text_node.py b/scrapegraphai/nodes/parse_text_node.py
@@ -0,0 +1,76 @@
+"""
+Module for ParseTextNode
+"""
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from .base_node import BaseNode
+
+
+class ParseTextNode(BaseNode):
+    """
+    A node for splitting a text document into chunks.
+
+    This node reads the raw text stored in the state and splits it into
+    token-bounded chunks with RecursiveCharacterTextSplitter, so that later
+    nodes in the graph can process each chunk separately.
+
+    Attributes:
+        node_name (str): Unique name for the node (defaults to "ParseHTMLNode").
+        node_type (str): Indicates a standard operational node (set to "node").
+
+    Args:
+        node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").
+
+    Methods:
+        execute(state):
+            * Reads the text stored in the 'document' field of the state.
+            * Splits it with RecursiveCharacterTextSplitter (chunk_size=4000, no overlap).
+            * Stores the resulting list of chunks in the 'document_chunks' field of the state.
+    """
+
+    def __init__(self, node_name: str = "ParseHTMLNode"):
+        """
+        Initializes the ParseTextNode.
+
+        Args:
+            node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").
+        """
+        super().__init__(node_name, "node")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Splits the text document into chunks and updates the state.
+
+        Tag-based extraction is not implemented: the document is split as-is
+        and no 'parsed_document' key is written.
+
+        Args:
+            state (dict): Expects the following key:
+                'document': The text content to split.
+
+        Returns:
+            dict: The same state object, updated with:
+                'document_chunks': The document split into chunks
+                (using RecursiveCharacterTextSplitter.from_tiktoken_encoder with
+                chunk_size=4000 and chunk_overlap=0).
+
+        Raises:
+            KeyError: If the required 'document' key is missing from the state.
+        """
+
+        print("---PARSING TEXT DOCUMENT---")
+
+        try:
+            document = state["document"]
+        except KeyError as e:
+            print(f"Error: {e} not found in state.")
+            raise
+
+        # TODO: add tag-based extraction here if 'tags' support is introduced
+
+        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=4000,
+            chunk_overlap=0,
+        )
+        state["document_chunks"] = text_splitter.split_text(document)
+
+        return state