Skip to content

Commit e2af232

Browse files
committed
fix: fetch node regex
1 parent 86bf4f2 commit e2af232

File tree

3 files changed

+42
-20
lines changed

3 files changed

+42
-20
lines changed

requirements-dev.lock

+11-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ anyio==4.4.0
3030
astroid==3.2.4
3131
# via pylint
3232
async-timeout==4.0.3
33+
# via aiohttp
34+
# via langchain
3335
# via scrapegraphai
3436
attrs==24.2.0
3537
# via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
7880
# via openai
7981
docutils==0.19
8082
# via sphinx
83+
exceptiongroup==1.2.2
84+
# via anyio
85+
# via pytest
8186
fastapi==0.112.0
8287
# via burr
8388
fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
131136
# via burr
132137
greenlet==3.0.3
133138
# via playwright
134-
# via sqlalchemy
135139
grpcio==1.65.4
136140
# via google-api-core
137141
# via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
500504
# via transformers
501505
toml==0.10.2
502506
# via streamlit
507+
tomli==2.1.0
508+
# via pylint
509+
# via pytest
503510
tomlkit==0.13.0
504511
# via pylint
505512
tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
517524
# via scrapegraphai
518525
typing-extensions==4.12.2
519526
# via altair
527+
# via anyio
528+
# via astroid
520529
# via fastapi
521530
# via fastapi-pagination
522531
# via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
531540
# via sqlalchemy
532541
# via streamlit
533542
# via typing-inspect
543+
# via uvicorn
534544
typing-inspect==0.9.0
535545
# via dataclasses-json
536546
# via sf-hamilton

requirements.lock

+5-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ anyio==4.4.0
1919
# via httpx
2020
# via openai
2121
async-timeout==4.0.3
22+
# via aiohttp
23+
# via langchain
2224
# via scrapegraphai
2325
attrs==23.2.0
2426
# via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
4850
# via multiprocess
4951
distro==1.9.0
5052
# via openai
53+
exceptiongroup==1.2.2
54+
# via anyio
5155
fastembed==0.3.6
5256
# via scrapegraphai
5357
filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
8791
# via scrapegraphai
8892
greenlet==3.0.3
8993
# via playwright
90-
# via sqlalchemy
9194
grpcio==1.65.1
9295
# via google-api-core
9396
# via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
368371
transformers==4.44.2
369372
# via scrapegraphai
370373
typing-extensions==4.12.2
374+
# via anyio
371375
# via google-generativeai
372376
# via huggingface-hub
373377
# via langchain-core

scrapegraphai/nodes/fetch_node.py

+26-18
Original file line numberDiff line numberDiff line change
@@ -80,28 +80,30 @@ def __init__(
8080
None if node_config is None else node_config.get("scrape_do", None)
8181
)
8282

83+
def is_valid_url(self, source: str) -> bool:
    """
    Validates if the source string is a valid URL using regex.

    Parameters:
        source (str): The URL string to validate.

    Returns:
        bool: True if the URL is valid.

    Raises:
        ValueError: If the URL does not start with http:// or https://
            or does not contain a plausible domain.
    """
    import re  # local import keeps the module's import surface unchanged

    # scheme, then one char that is not whitespace or a delimiter
    # (/, $, ., ?, #), then at least one more char with no whitespace
    url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
    # re.match returns None on failure; truthiness test needs no bool() wrapper
    if not re.match(url_pattern, source):
        raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
    return True
8399
def execute(self, state):
84100
"""
85101
Executes the node's logic to fetch HTML content from a specified URL and
86102
update the state with this content.
87-
88-
Args:
89-
state (dict): The current state of the graph. The input keys will be used
90-
to fetch the correct data types from the state.
91-
92-
Returns:
93-
dict: The updated state with a new output key containing the fetched HTML content.
94-
95-
Raises:
96-
KeyError: If the input key is not found in the state, indicating that the
97-
necessary information to perform the operation is missing.
98103
"""
99-
100104
self.logger.info(f"--- Executing {self.node_name} Node ---")
101105

102-
# Interpret input keys based on the provided input expression
103106
input_keys = self.get_input_keys(state)
104-
# Fetching data from the state based on the input keys
105107
input_data = [state[key] for key in input_keys]
106108

107109
source = input_data[0]
@@ -124,10 +126,16 @@ def execute(self, state):
124126
return handlers[input_type](state, input_type, source)
125127
elif self.input == "pdf_dir":
126128
return state
127-
elif not source.startswith("http") and not source.startswith("www"):
128-
return self.handle_local_source(state, source)
129-
else:
130-
return self.handle_web_source(state, source)
129+
130+
# For web sources, validate URL before proceeding
131+
try:
132+
if self.is_valid_url(source):
133+
return self.handle_web_source(state, source)
134+
except ValueError as e:
135+
# Re-raise the exception from is_valid_url
136+
raise
137+
138+
return self.handle_local_source(state, source)
131139

132140
def handle_directory(self, state, input_type, source):
133141
"""

0 commit comments

Comments
 (0)