Skip to content

Commit 3911e78

Browse files
committed
Pull sitemaps from disk
* Allow a directory of XML sitemaps on disk to be used as the source when initializing SitemapToContentDatabase * Allow multiple sitemap URLs
1 parent 318412b commit 3911e78

File tree

5 files changed

+125
-6
lines changed

5 files changed

+125
-6
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ following:
2727
from contentmap.sitemap import SitemapToContentDatabase
2828

2929
database = SitemapToContentDatabase(
30-
sitemap_url="https://yourblog.com/sitemap.xml",
30+
sitemap_sources=["https://yourblog.com/sitemap.xml"],
3131
concurrency=10,
3232
include_vss=True
3333
)

contentmap/sitemap.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import asyncio
22
import logging
3+
from typing import Literal
34
import requests
5+
import os
46

57
import aiohttp
68
import trafilatura
@@ -11,10 +13,17 @@
1113

1214

1315
class SitemapToContentDatabase:
16+
SOURCE_TYPE_URL: Literal['url'] = 'url'
17+
SOURCE_TYPE_DISK: Literal['disk'] = 'disk'
18+
SourceType = Literal['url', 'disk']
1419

15-
def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None,
20+
def __init__(self, sitemap_sources: list,
21+
source_type: SourceType = SOURCE_TYPE_URL,
22+
seconds_timeout=10,
23+
concurrency=None,
1624
include_vss=False):
17-
self.sitemap_url = sitemap_url
25+
self.sitemap_sources = sitemap_sources
26+
self.source_type = source_type
1827
self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None
1928
self.timeout = aiohttp.ClientTimeout(
2029
sock_connect=seconds_timeout,
@@ -30,13 +39,34 @@ def build(self):
3039
cm.build()
3140

3241
def get_urls(self):
    """Collect page URLs from every configured sitemap source.

    ``self.sitemap_sources`` is treated as a list of sitemap URLs when
    ``self.source_type`` equals ``SOURCE_TYPE_URL``, and as a list of
    directories holding ``*.xml`` sitemap files when it equals
    ``SOURCE_TYPE_DISK``.

    Returns:
        list: URLs from all sources, concatenated in source order. Within
        a directory, sitemap files are read in sorted filename order.

    Raises:
        ValueError: If ``self.source_type`` is not a supported value.
    """
    if self.source_type == self.SOURCE_TYPE_URL:
        all_urls = []
        for sitemap_url in self.sitemap_sources:
            all_urls.extend(self._get_urls_from_url(sitemap_url))
        return all_urls

    if self.source_type == self.SOURCE_TYPE_DISK:
        all_urls = []
        for directory in self.sitemap_sources:
            # os.listdir() yields entries in arbitrary order; sorting keeps
            # the result reproducible across filesystems and platforms.
            for filename in sorted(os.listdir(directory)):
                if filename.endswith('.xml'):
                    filepath = os.path.join(directory, filename)
                    all_urls.extend(self._get_urls_from_disk(filepath))
        return all_urls

    # A misspelled source type used to yield an empty list silently, which
    # then looked like "the sitemap has no pages".
    raise ValueError(
        f"Unsupported source_type {self.source_type!r}; expected "
        f"{self.SOURCE_TYPE_URL!r} or {self.SOURCE_TYPE_DISK!r}"
    )
55+
56+
def _get_urls_from_url(self, sitemap_url):
    """Download one sitemap over HTTP and return the URLs it lists.

    Args:
        sitemap_url: Address of the XML sitemap to fetch.

    Returns:
        list: Whatever ``_extract_urls_from_tree`` finds in the parsed
        response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    r = requests.get(sitemap_url)
    # Surface the HTTP failure itself rather than letting an error page
    # reach the XML parser and fail with an unrelated syntax error.
    r.raise_for_status()
    tree = etree.fromstring(r.content)
    return self._extract_urls_from_tree(tree)
60+
61+
def _get_urls_from_disk(self, filepath):
    """Parse the sitemap file at ``filepath`` and return the URLs it lists."""
    return self._extract_urls_from_tree(etree.parse(filepath))
64+
65+
def _extract_urls_from_tree(self, tree):
    """Return the text of every sitemap ``<loc>`` element under ``tree``.

    Args:
        tree: A parsed XML element or element tree exposing ``findall``.

    Returns:
        list: The ``<loc>`` texts in document order, with surrounding
        whitespace removed. Empty ``<loc>`` elements are skipped.
    """
    loc_elements = tree.findall(
        ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    )
    # ``.text`` is None for an empty element, and pretty-printed sitemaps
    # often pad the URL with newlines or indentation.
    stripped = ((loc.text or "").strip() for loc in loc_elements)
    return [url for url in stripped if url]
4070

4171
async def get_contents(self, urls):
4272
async with aiohttp.ClientSession(timeout=self.timeout) as session:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<?xml version="1.0"?>
2+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
3+
<url>
4+
<loc>https://www.example.com/docs/en/example/?topic=testing</loc>
5+
</url>
6+
<url>
7+
<loc>https://www.example.com/docs/en/example/?topic=contact-us</loc>
8+
</url>
9+
</urlset>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<?xml version="1.0"?>
2+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
3+
<url>
4+
<loc>https://www.example.com/docs/en/example/?topic=library-overview</loc>
5+
</url>
6+
<url>
7+
<loc>https://www.example.com/docs/en/example/?topic=about-this-content</loc>
8+
</url>
9+
</urlset>

tests/test_sitemap.py

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import os
2+
import unittest
3+
import pytest
4+
5+
from unittest.mock import patch, MagicMock
6+
from contentmap.sitemap import SitemapToContentDatabase
7+
8+
9+
class TestSitemapToContentDatabase(unittest.TestCase):
    """Check ``SitemapToContentDatabase.get_urls`` for URL and disk sources."""

    def create_mock_response(self, content):
        """Build a stand-in HTTP response whose ``content`` is ``content``."""
        response = MagicMock()
        response.content = content
        return response

    def generate_sample_sitemap_xml(self, url):
        """Return a minimal sitemap document listing the single ``url``."""
        return f'''
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
            <url>
                <loc>{url}</loc>
            </url>
        </urlset>'''

    @patch('contentmap.sitemap.requests.get')
    def test_get_urls_given_one_sitemap_url(self, mock_get):
        page_url = 'https://www.example.com/docs/en/example/?topic=testing'
        sitemap_xml = self.generate_sample_sitemap_xml(page_url)
        mock_get.return_value = self.create_mock_response(sitemap_xml)

        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=['https://example.com/sitemap.xml'],
            source_type='url',
        )
        found = sitemap_db.get_urls()

        self.assertEqual(found, [page_url])
        mock_get.assert_called_once_with('https://example.com/sitemap.xml')

    @patch('contentmap.sitemap.requests.get')
    def test_get_urls_given_multiple_sitemap_urls(self, mock_get):
        first_page = 'https://www.example.com/docs/en/example/?topic=testing'
        second_page = 'https://www.anotherexample.com/docs/en/example/?topic=contact-us'
        # One canned response per expected request, consumed in call order.
        mock_get.side_effect = [
            self.create_mock_response(self.generate_sample_sitemap_xml(first_page)),
            self.create_mock_response(self.generate_sample_sitemap_xml(second_page)),
        ]

        sources = [
            'https://example.com/sitemap.xml',
            'https://anotherexample.com/sitemap.xml',
        ]
        sitemap_db = SitemapToContentDatabase(sitemap_sources=sources, source_type='url')
        found = sitemap_db.get_urls()

        self.assertEqual(found, [first_page, second_page])
        mock_get.assert_any_call('https://example.com/sitemap.xml')
        mock_get.assert_any_call('https://anotherexample.com/sitemap.xml')
        self.assertEqual(mock_get.call_count, 2)

    def test_get_urls_given_one_location_on_disk(self):
        here = os.path.dirname(__file__)
        sitemap_folder_a_path = os.path.join(here, 'fixtures', 'sitemap_folder_a')

        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=[sitemap_folder_a_path],
            source_type='disk',
        )
        found = sitemap_db.get_urls()

        expected = [
            'https://www.example.com/docs/en/example/?topic=testing',
            'https://www.example.com/docs/en/example/?topic=contact-us',
        ]
        self.assertEqual(found, expected)

    def test_get_urls_given_multiple_locations_on_disk(self):
        here = os.path.dirname(__file__)
        sitemap_folder_a_path = os.path.join(here, 'fixtures', 'sitemap_folder_a')
        sitemap_folder_b_path = os.path.join(here, 'fixtures', 'sitemap_folder_b')

        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=[sitemap_folder_a_path, sitemap_folder_b_path],
            source_type='disk',
        )
        found = sitemap_db.get_urls()

        expected = [
            'https://www.example.com/docs/en/example/?topic=testing',
            'https://www.example.com/docs/en/example/?topic=contact-us',
            'https://www.example.com/docs/en/example/?topic=library-overview',
            'https://www.example.com/docs/en/example/?topic=about-this-content',
        ]
        self.assertEqual(found, expected)

0 commit comments

Comments
 (0)