@@ -1,26 +1,23 @@
 import argparse
 import random
+from uuid import uuid4

 from sneakpeek.logging import configure_logging
-from sneakpeek.models import Scraper, ScraperJobPriority, ScraperSchedule
-from sneakpeek.plugins.rate_limiter_plugin import (
-    RateLimiterPlugin,
-    RateLimiterPluginConfig,
+from sneakpeek.middleware.parser import ParserMiddleware
+from sneakpeek.middleware.rate_limiter_middleware import (
+    RateLimiterMiddleware,
+    RateLimiterMiddlewareConfig,
 )
-from sneakpeek.plugins.requests_logging_plugin import RequestsLoggingPlugin
-from sneakpeek.plugins.robots_txt_plugin import RobotsTxtPlugin
-from sneakpeek.plugins.user_agent_injecter_plugin import (
-    UserAgentInjecterPlugin,
-    UserAgentInjecterPluginConfig,
-)
-from sneakpeek.scraper_config import ScraperConfig
+from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware
+from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage
+from sneakpeek.queue.model import TaskPriority
+from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage
+from sneakpeek.scheduler.model import TaskSchedule
+from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage
+from sneakpeek.scraper.model import Scraper, ScraperConfig, ScraperStorageABC
 from sneakpeek.server import SneakpeekServer
-from sneakpeek.storage.base import ScrapersStorage
-from sneakpeek.storage.in_memory_storage import (
-    InMemoryLeaseStorage,
-    InMemoryScraperJobsStorage,
-    InMemoryScrapersStorage,
-)
+from sneakpeek.session_loggers.base import SessionLogger
+from sneakpeek.session_loggers.file_logger import FileLoggerHandler

 from demo.demo_scraper import DemoScraper

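The import hunk above is essentially a rename pass: the plugins package becomes middleware, the monolithic models/storage modules are split into queue, scheduler, and scraper packages, and a session_loggers package is introduced. A comment-only summary of the mapping, read off this diff alone (not an exhaustive migration table):

    # Old name (plugins-era API)             ->  new name, per this diff
    # sneakpeek.plugins.rate_limiter_plugin.RateLimiterPlugin
    #     -> sneakpeek.middleware.rate_limiter_middleware.RateLimiterMiddleware
    # sneakpeek.plugins.requests_logging_plugin.RequestsLoggingPlugin
    #     -> sneakpeek.middleware.requests_logging_middleware.RequestsLoggingMiddleware
    # sneakpeek.models.Scraper               -> sneakpeek.scraper.model.Scraper
    # sneakpeek.models.ScraperSchedule       -> sneakpeek.scheduler.model.TaskSchedule
    # sneakpeek.models.ScraperJobPriority    -> sneakpeek.queue.model.TaskPriority
    # sneakpeek.scraper_config.ScraperConfig -> sneakpeek.scraper.model.ScraperConfig
    # sneakpeek.storage.base.ScrapersStorage -> sneakpeek.scraper.model.ScraperStorageABC
    # sneakpeek.storage.in_memory_storage.InMemoryScrapersStorage
    #     -> sneakpeek.scraper.in_memory_storage.InMemoryScraperStorage
    # sneakpeek.storage.in_memory_storage.InMemoryScraperJobsStorage
    #     -> sneakpeek.queue.in_memory_storage.InMemoryQueueStorage
    # sneakpeek.storage.in_memory_storage.InMemoryLeaseStorage
    #     -> sneakpeek.scheduler.in_memory_lease_storage.InMemoryLeaseStorage
    #
    # RobotsTxtPlugin and UserAgentInjecterPlugin are dropped from this file
    # with no replacement here; ParserMiddleware and the SessionLogger /
    # FileLoggerHandler pair are new.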
@@ -34,9 +31,7 @@
     help="URLs to create demo scrapers for",
     default=[
         "https://google.com",
-        "https://www.blogger.com",
-        "https://youtube.com",
-        "https://www.ycombinator.com/",
+        "https://www.docker.com/",
     ],
 )
 parser.add_argument(
@@ -50,50 +45,55 @@
 def get_scrapers(urls: list[str]) -> list[Scraper]:
     return [
         Scraper(
-            id=id,
+            id=str(uuid4()),
             name=f"Demo Scraper ({url})",
-            schedule=ScraperSchedule.EVERY_MINUTE,
+            schedule=TaskSchedule.EVERY_MINUTE,
             handler=DemoScraper().name,
             config=ScraperConfig(params={"start_url": url, "max_pages": 5}),
             schedule_priority=random.choice(
                 [
-                    ScraperJobPriority.HIGH,
-                    ScraperJobPriority.UTMOST,
-                    ScraperJobPriority.NORMAL,
+                    TaskPriority.HIGH,
+                    TaskPriority.UTMOST,
+                    TaskPriority.NORMAL,
                 ]
             ),
         )
-        for id, url in enumerate(urls)
+        for url in urls
    ]


-def get_scrapers_storage(urls: list[str], is_read_only: bool) -> ScrapersStorage:
-    return InMemoryScrapersStorage(
-        scrapers=get_scrapers(urls), is_read_only=is_read_only
+def get_scraper_storage(urls: list[str], is_read_only: bool) -> ScraperStorageABC:
+    return InMemoryScraperStorage(
+        initial_scrapers=get_scrapers(urls),
+        is_read_only=is_read_only,
     )


-def get_server(urls: list[str], is_read_only: bool) -> SneakpeekServer:
+def get_server(
+    urls: list[str],
+    is_read_only: bool,
+    session_logger: SessionLogger,
+) -> SneakpeekServer:
     return SneakpeekServer.create(
         handlers=[DemoScraper()],
-        scrapers_storage=get_scrapers_storage(urls, is_read_only),
-        jobs_storage=InMemoryScraperJobsStorage(),
+        scraper_storage=get_scraper_storage(urls, is_read_only),
+        queue_storage=InMemoryQueueStorage(),
         lease_storage=InMemoryLeaseStorage(),
-        plugins=[
-            RequestsLoggingPlugin(),
-            RobotsTxtPlugin(),
-            RateLimiterPlugin(RateLimiterPluginConfig(max_rpm=60)),
-            UserAgentInjecterPlugin(
-                UserAgentInjecterPluginConfig(use_external_data=False)
-            ),
+        middlewares=[
+            RequestsLoggingMiddleware(),
+            RateLimiterMiddleware(RateLimiterMiddlewareConfig(max_rpm=60)),
+            ParserMiddleware(),
         ],
+        add_dynamic_scraper_handler=True,
+        session_logger_handler=session_logger,
     )


 def main():
+    session_logger = FileLoggerHandler(f"logs/{uuid4()}/")
     args = parser.parse_args()
-    server = get_server(args.urls, args.read_only)
-    configure_logging()
+    server = get_server(args.urls, args.read_only, session_logger)
+    configure_logging(session_logger_handler=session_logger)
     server.serve()

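Two behavioral details of the new code deserve a note: scraper ids switch from the enumerate index to random UUIDs, and the session logger writes into a fresh per-process directory. A stdlib-only sketch of both uuid4 uses (nothing below touches sneakpeek):

    from uuid import uuid4

    # Ids are no longer the list position (0, 1, ...), so they stay unique
    # and stable even when the URL list is reordered or extended between runs:
    scraper_id = str(uuid4())  # e.g. '2f1a9c64-7e0b-4b8e-...', random per call

    # Each run logs into its own directory, so sessions from separate runs
    # never interleave in one file tree:
    log_dir = f"logs/{uuid4()}/"
    print(scraper_id, log_dir)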
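For completeness, a hypothetical invocation of the updated demo. Both the module path and the flag spellings are assumptions: the diff elides the parser.add_argument names (args.urls and args.read_only merely suggest --urls and --read-only), and the entry-point path is guessed from the demo.demo_scraper import:

    # Hypothetical module path and flags -- neither is shown in the diff:
    python -m demo.main --urls https://google.com https://www.docker.com/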