|
| 1 | +import pytest |
| 2 | + |
| 3 | +import logging |
| 4 | +import scrapy |
| 5 | +from scrapy import Item, Field |
| 6 | +from scrapy.linkextractors import LinkExtractor |
| 7 | +from scrapy.spiders import CrawlSpider, Rule |
| 8 | +from scrapy.crawler import CrawlerProcess |
| 9 | + |
class Docs404Item(Item):
    """Record describing a single crawled URL that responded with HTTP 404."""

    # Referer header of the request that produced the 404 response.
    referer = Field()
    # HTTP status code of the response.
    status = Field()
    # URL of the response.
    url = Field()
| 14 | + |
class Docs404Spider(CrawlSpider):
    """Crawl the locally served docs and report every page that returns 404.

    The crawl starts at ``http://localhost:1313/docs`` and follows links whose
    URL matches ``/docs/`` but not ``/docs/contribute``. Each followed response
    is handed to :meth:`parse_item`.
    """

    name = 'docs404'
    allowed_domains = ['localhost']
    start_urls = ['http://localhost:1313/docs']
    # Allow 404 responses to reach the callback so they can be reported.
    handle_httpstatus_list = [404]

    # Delay if server is returning lots of 500s
    # DOWNLOAD_DELAY=0.1

    rules = (
        Rule(
            LinkExtractor(allow=r'/docs/', deny=r'/docs/contribute'),
            callback='parse_item',
            follow=True,
        ),
    )

    def __init__(self, *args, **kwargs):
        """Raise noisy Scrapy loggers to WARNING, then initialise the spider."""
        noisy_logger_names = (
            'scrapy.core.engine',
            'scrapy.downloadermiddlewares.redirect',
            'scrapy.spidermiddlewares.offsite',
            'scrapy.middleware',
        )
        for logger_name in noisy_logger_names:
            logging.getLogger(logger_name).setLevel(logging.WARNING)
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        """Return a ``Docs404Item`` for a 404 response, otherwise ``None``.

        The item records the request's ``Referer`` header, the response status
        and the response URL.
        """
        if response.status != 404:
            return None
        return Docs404Item(
            referer=response.request.headers.get('Referer'),
            status=response.status,
            url=response.url,
        )
| 47 | + |
| 48 | + |
def test_404():
    """Crawl the local docs site and fail if the CSV feed reports any 404s.

    Runs ``Docs404Spider`` in a ``CrawlerProcess`` configured to export items
    to ``temp.csv``, counts the lines of that file, deletes it, and asserts
    that the count is exactly one.

    Raises:
        AssertionError: if the feed does not contain exactly one line.
    """
    import os

    feed_path = 'temp.csv'
    process = CrawlerProcess({
        'USER_AGENT': 'docs404',
        'FEED_URI': feed_path,
        'FEED_FORMAT': 'csv',
    })
    process.crawl(Docs404Spider)
    process.start()

    # Read and close the feed before deleting it: the handle is released
    # deterministically, and removing a still-open file fails on Windows.
    with open(feed_path) as feed:
        line_count = sum(1 for _ in feed)
    # Clean up before asserting so a failing run leaves no stale feed behind.
    os.remove(feed_path)

    # NOTE(review): exactly one line presumably means "header only, no 404
    # rows" -- confirm against what the CSV exporter writes for a clean crawl.
    assert line_count == 1, '404 response in HTML - see scraper logs'
| 59 | + |
| 60 | + |
0 commit comments