Skip to content

Commit 20a78ac

Browse files
authored
Merge pull request #31 from VinciGit00/refactoring-parse-html
Refactoring parse html
2 parents a46f45d + 5735f3e commit 20a78ac

File tree

4 files changed

+19
-23
lines changed

4 files changed

+19
-23
lines changed

README.md

+1-8
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,13 @@ Is it possible to try also the colab version
2424

2525
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sEZBonBMGP44CtO6GQTwAlL0BGJXjtfd?usp=sharing)
2626

27-
Try out ScrapeGraphAI in your browser:
28-
29-
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/Scrapegraph-ai)
30-
27+
Follow the procedure at the following link to set up your OpenAI API key: [link](https://scrapegraph-ai.readthedocs.io/en/latest/index.html).
3128

3229
## 📖 Documentation
3330

3431
The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.readthedocs.io/en/latest/).
3532
Behind this there is also the Docusaurus documentation [here](https://scrapegraph-doc.onrender.com/).
3633

37-
## Setup the api keys
38-
39-
Follow the procedure on the following link to setup your OpenAI API key: [link](https://scrapegraph-ai.readthedocs.io/en/latest/index.html).
40-
4134
## 💻 Usage
4235

4336
### Case 1: Extracting information using a prompt

examples/graph_examples/graph_evaluation_example.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Module for evaluating the graph
33
"""
44
import os
5-
from scrapegraphai.evaluators import TrulensEvaluator
65
from dotenv import load_dotenv
6+
from scrapegraphai.evaluators import TrulensEvaluator
77

88
load_dotenv()
99

@@ -26,6 +26,7 @@
2626
# Create the TrulensEvaluator instance
2727
trulens_evaluator = TrulensEvaluator(openai_key)
2828
# Evaluate SmartScraperGraph on the list of inputs
29-
(results_df, answer) = trulens_evaluator.evaluate(list_of_inputs, dashboard=False)
29+
(results_df, answer) = trulens_evaluator.evaluate(
30+
list_of_inputs, dashboard=False)
3031

3132
print(answer)

scrapegraphai/nodes/generate_answer_node.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def execute(self, state: dict) -> dict:
6565
print("---GENERATING ANSWER---")
6666
try:
6767
user_input = state["user_input"]
68-
document = state["document"]
68+
document = state["document_chunks"]
6969
except KeyError as e:
7070
print(f"Error: {e} not found in state.")
7171
raise
@@ -104,7 +104,7 @@ def execute(self, state: dict) -> dict:
104104
prompt = PromptTemplate(
105105
template=template_chunks,
106106
input_variables=["question"],
107-
partial_variables={"context": chunk.page_content,
107+
partial_variables={"context": chunk,
108108
"chunk_id": i + 1, "format_instructions": format_instructions},
109109
)
110110
# Dynamically name the chains based on their index

scrapegraphai/nodes/parse_html_node.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""
22
Module for parsing the HTML node
33
"""
4-
from langchain_community.document_transformers import BeautifulSoupTransformer
4+
from langchain.text_splitter import RecursiveCharacterTextSplitter
5+
from langchain_community.document_transformers import Html2TextTransformer
56
from .base_node import BaseNode
67

78

@@ -36,7 +37,7 @@ def __init__(self, node_name: str, node_type: str = "ParseHTMLNode"):
3637
"""
3738
super().__init__(node_name, node_type)
3839

39-
def execute(self, state):
40+
def execute(self, state):
4041
"""
4142
Executes the node's logic to parse the HTML document based on specified tags.
4243
If tags are provided in the state, the document is parsed accordingly; otherwise,
@@ -63,15 +64,16 @@ def execute(self, state):
6364
print(f"Error: {e} not found in state.")
6465
raise
6566

66-
tags = state.get("tags", None)
67+
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
68+
chunk_size=4000,
69+
chunk_overlap=0,
70+
)
6771

68-
if not tags:
69-
print("No specific tags provided; returning document as is.")
70-
return state
72+
docs_transformed = Html2TextTransformer(
73+
).transform_documents(document)[0]
74+
75+
chunks = text_splitter.split_text(docs_transformed.page_content)
76+
77+
state.update({"document_chunks": chunks})
7178

72-
bs_transformer = BeautifulSoupTransformer()
73-
parsed_document = bs_transformer.transform_documents(
74-
document, tags_to_extract=tags)
75-
print("Document parsed with specified tags.")
76-
state.update({"parsed_document": parsed_document})
7779
return state

0 commit comments

Comments
 (0)