Commit 419020d

Update demo to v0.2.2

1 parent 134082d
File tree: 5 files changed, +431 -911 lines

demo/app.py (+41 -41)

@@ -1,26 +1,23 @@
 import argparse
 import random
+from uuid import uuid4

 from sneakpeek.logging import configure_logging
-from sneakpeek.models import Scraper, ScraperJobPriority, ScraperSchedule
-from sneakpeek.plugins.rate_limiter_plugin import (
-    RateLimiterPlugin,
-    RateLimiterPluginConfig,
+from sneakpeek.middleware.parser import ParserMiddleware
+from sneakpeek.middleware.rate_limiter_middleware import (
+    RateLimiterMiddleware,
+    RateLimiterMiddlewareConfig,
 )
-from sneakpeek.plugins.requests_logging_plugin import RequestsLoggingPlugin
-from sneakpeek.plugins.robots_txt_plugin import RobotsTxtPlugin
-from sneakpeek.plugins.user_agent_injecter_plugin import (
-    UserAgentInjecterPlugin,
-    UserAgentInjecterPluginConfig,
-)
-from sneakpeek.scraper_config import ScraperConfig
+from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware
+from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage
+from sneakpeek.queue.model import TaskPriority
+from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage
+from sneakpeek.scheduler.model import TaskSchedule
+from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage
+from sneakpeek.scraper.model import Scraper, ScraperConfig, ScraperStorageABC
 from sneakpeek.server import SneakpeekServer
-from sneakpeek.storage.base import ScrapersStorage
-from sneakpeek.storage.in_memory_storage import (
-    InMemoryLeaseStorage,
-    InMemoryScraperJobsStorage,
-    InMemoryScrapersStorage,
-)
+from sneakpeek.session_loggers.base import SessionLogger
+from sneakpeek.session_loggers.file_logger import FileLoggerHandler

 from demo.demo_scraper import DemoScraper

@@ -34,9 +31,7 @@
     help="URLs to create demo scrapers for",
     default=[
         "https://google.com",
-        "https://www.blogger.com",
-        "https://youtube.com",
-        "https://www.ycombinator.com/",
+        "https://www.docker.com/",
     ],
 )
 parser.add_argument(
@@ -50,50 +45,55 @@
 def get_scrapers(urls: list[str]) -> list[Scraper]:
     return [
         Scraper(
-            id=id,
+            id=str(uuid4()),
             name=f"Demo Scraper ({url})",
-            schedule=ScraperSchedule.EVERY_MINUTE,
+            schedule=TaskSchedule.EVERY_MINUTE,
             handler=DemoScraper().name,
             config=ScraperConfig(params={"start_url": url, "max_pages": 5}),
             schedule_priority=random.choice(
                 [
-                    ScraperJobPriority.HIGH,
-                    ScraperJobPriority.UTMOST,
-                    ScraperJobPriority.NORMAL,
+                    TaskPriority.HIGH,
+                    TaskPriority.UTMOST,
+                    TaskPriority.NORMAL,
                 ]
             ),
         )
-        for id, url in enumerate(urls)
+        for url in urls
     ]


-def get_scrapers_storage(urls: list[str], is_read_only: bool) -> ScrapersStorage:
-    return InMemoryScrapersStorage(
-        scrapers=get_scrapers(urls), is_read_only=is_read_only
+def get_scraper_storage(urls: list[str], is_read_only: bool) -> ScraperStorageABC:
+    return InMemoryScraperStorage(
+        initial_scrapers=get_scrapers(urls),
+        is_read_only=is_read_only,
     )


-def get_server(urls: list[str], is_read_only: bool) -> SneakpeekServer:
+def get_server(
+    urls: list[str],
+    is_read_only: bool,
+    session_logger: SessionLogger,
+) -> SneakpeekServer:
     return SneakpeekServer.create(
         handlers=[DemoScraper()],
-        scrapers_storage=get_scrapers_storage(urls, is_read_only),
-        jobs_storage=InMemoryScraperJobsStorage(),
+        scraper_storage=get_scraper_storage(urls, is_read_only),
+        queue_storage=InMemoryQueueStorage(),
         lease_storage=InMemoryLeaseStorage(),
-        plugins=[
-            RequestsLoggingPlugin(),
-            RobotsTxtPlugin(),
-            RateLimiterPlugin(RateLimiterPluginConfig(max_rpm=60)),
-            UserAgentInjecterPlugin(
-                UserAgentInjecterPluginConfig(use_external_data=False)
-            ),
+        middlewares=[
+            RequestsLoggingMiddleware(),
+            RateLimiterMiddleware(RateLimiterMiddlewareConfig(max_rpm=60)),
+            ParserMiddleware(),
         ],
+        add_dynamic_scraper_handler=True,
+        session_logger_handler=session_logger,
     )


 def main():
+    session_logger = FileLoggerHandler(f"logs/{uuid4()}/")
     args = parser.parse_args()
-    server = get_server(args.urls, args.read_only)
-    configure_logging()
+    server = get_server(args.urls, args.read_only, session_logger)
+    configure_logging(session_logger_handler=session_logger)
     server.serve()
demo/demo_scraper.py (+24 -20)

@@ -4,15 +4,16 @@
 from urllib.parse import urljoin

 from pydantic import BaseModel
-from sneakpeek.scraper_context import ScraperContext
-from sneakpeek.scraper_handler import ScraperHandler
-from sneakpeek.runner import LocalRunner
-from sneakpeek.scraper_config import ScraperConfig
-from sneakpeek.plugins.requests_logging_plugin import RequestsLoggingPlugin
-from sneakpeek.plugins.rate_limiter_plugin import (
-    RateLimiterPlugin,
-    RateLimiterPluginConfig,
+from sneakpeek.logging import configure_logging
+from sneakpeek.middleware.parser import ParserMiddleware
+from sneakpeek.middleware.rate_limiter_middleware import (
+    RateLimiterMiddleware,
+    RateLimiterMiddlewareConfig,
 )
+from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware
+from sneakpeek.scraper.context import ScraperContext
+from sneakpeek.scraper.model import ScraperConfig, ScraperContextABC, ScraperHandler
+from sneakpeek.scraper.runner import ScraperRunner


 # Demo class that holds information that
@@ -53,8 +54,8 @@ def process_page(
         url: str,
         page: str,
     ) -> PageMetadata:
-        title = context.regex(page, r"<title>(?P<title>[^<]+)")
-        description = context.regex(
+        title = context.parser.regex(page, r"<title>(?P<title>[^<]+)")
+        description = context.parser.regex(
             page, r'meta content="(?P<description>[^"]+)" property="og:description'
         )

@@ -67,20 +68,20 @@ def process_page(
     # Extract all links in the page
     def extract_next_links(
         self,
-        context: ScraperContext,
+        context: ScraperContextABC,
         start_url: str,
         page: str,
     ) -> list[str]:
         return [
             urljoin(start_url, link.groups["href"])
-            for link in context.regex(page, r'a[^<]+href="(?P<href>[^"]+)')
+            for link in context.parser.regex(page, r'a[^<]+href="(?P<href>[^"]+)')
         ]

     # This function is called by the worker to execute the logic
-    # The only argument that is passed is `sneakpeek.scraper_context.ScraperContext`
+    # The only argument that is passed is `sneakpeek.scraper_context.ScraperContextABC`
     # It implements basic async HTTP client and also provides parameters
     # that are defined in the scraper config
-    async def run(self, context: ScraperContext) -> str:
+    async def run(self, context: ScraperContextABC) -> str:
         params = DemoScraperParams.parse_obj(context.params)

         # Download start URL
@@ -115,20 +116,23 @@ async def run(self, context: ScraperContext) -> str:
         )


-def main():
-    LocalRunner.run(
+async def main():
+    configure_logging(logging.DEBUG)
+    result = await ScraperRunner.debug_handler(
         DemoScraper(),
-        ScraperConfig(
+        config=ScraperConfig(
             params=DemoScraperParams(
                 start_url="https://www.ycombinator.com/",
                 max_pages=20,
             ).dict(),
         ),
-        plugins=[
-            RequestsLoggingPlugin(),
-            RateLimiterPlugin(RateLimiterPluginConfig()),
+        middlewares=[
+            RequestsLoggingMiddleware(),
+            RateLimiterMiddleware(RateLimiterMiddlewareConfig()),
+            ParserMiddleware(),
         ],
     )
+    logging.info(f"Finished scraper with result: {result}")


 if __name__ == "__main__":
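
For reference, the new debug entry point comes out roughly as sketched below, assembled from the added lines of the hunk above. DemoScraper and DemoScraperParams are defined earlier in the same module (their hunks are unchanged here), and the asyncio.run(main()) call under the __main__ guard is an assumption, since the guard's body is outside the diff.

# Sketch of the post-change module tail of demo/demo_scraper.py.
import asyncio  # assumption: an async main() needs an event loop entry point
import logging

from sneakpeek.logging import configure_logging
from sneakpeek.middleware.parser import ParserMiddleware
from sneakpeek.middleware.rate_limiter_middleware import (
    RateLimiterMiddleware,
    RateLimiterMiddlewareConfig,
)
from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware
from sneakpeek.scraper.model import ScraperConfig
from sneakpeek.scraper.runner import ScraperRunner

# DemoScraper and DemoScraperParams are defined earlier in this module.


async def main():
    configure_logging(logging.DEBUG)
    # ScraperRunner.debug_handler runs a single handler locally with the given
    # config and middlewares (call shape taken from the added lines above).
    result = await ScraperRunner.debug_handler(
        DemoScraper(),
        config=ScraperConfig(
            params=DemoScraperParams(
                start_url="https://www.ycombinator.com/",
                max_pages=20,
            ).dict(),
        ),
        middlewares=[
            RequestsLoggingMiddleware(),
            RateLimiterMiddleware(RateLimiterMiddlewareConfig()),
            ParserMiddleware(),
        ],
    )
    logging.info(f"Finished scraper with result: {result}")


if __name__ == "__main__":
    asyncio.run(main())  # assumption: guard body not shown in the hunk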
