Commit b0418fd

derpferd and mpenkov authored
Updated iter_bucket to use concurrent futures. (#368)
* Updated iter_bucket to use concurrent futures.

  This commit addresses issue #340. AWS Lambda environments do not support
  multiprocessing.Queue or multiprocessing.Pool, which are used by iter_bucket
  to optimize the pulling of files from s3.

  Solution: switch to concurrent.futures.ThreadPoolExecutor instead. This still
  optimizes the pulling of files from s3 without spawning new processes.

* disable test_old when mocks are disabled

* favor multiprocessing over concurrent.futures

* make imap_unordered return an iterator instead of a list

* skip tests when their respective features are unavailable

* Revert "disable test_old when mocks are disabled"

  This reverts commit 6506562.

* tweak imap_unordered

* remove tests_require pins

Co-authored-by: Michael Penkov <m@penkov.dev>
1 parent 68a39d9 commit b0418fd
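
The heart of the change is easy to see in isolation: replace multiprocessing.pool.Pool.imap_unordered with a ThreadPoolExecutor that yields results as they finish. Below is a minimal, self-contained sketch of that idea; download_key and the example key list are placeholders for illustration, not smart_open code.

import concurrent.futures


def download_key(key):
    # Placeholder for the per-key work iter_bucket performs (an S3 GET returning the body).
    return key, b'...'


def thread_imap_unordered(function, items, max_workers=16):
    # Mimics multiprocessing.pool.Pool.imap_unordered using only threads,
    # which is all that AWS Lambda allows.
    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
        futures = [executor.submit(function, item) for item in items]
        for future in concurrent.futures.as_completed(futures):
            # Results arrive in completion order, not submission order.
            yield future.result()


for key, body in thread_imap_unordered(download_key, ['key_0', 'key_1', 'key_2']):
    print(key, len(body))

The actual implementation, ConcurrentFuturesPool in smart_open/s3.py below, wraps the same pattern behind the imap_unordered/terminate interface that _create_process_pool already expects.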

File tree

3 files changed: +80 -10 lines

setup.py  (+2 -6)

@@ -41,14 +41,10 @@ def read(fname):
 
 tests_require = [
     'mock',
-    'moto==1.3.4',
+    'moto[server]',
     'pathlib2',
     'responses',
-    # Temporary pin boto3 & botocore, because moto doesn't work with new version
-    # See https://github.com/spulec/moto/issues/1793 and
-    # https://github.com/RaRe-Technologies/smart_open/issues/227
-    'boto3 < 1.8.0',
-    # 'botocore < 1.11.0'
+    'boto3',
     # Not used directly but allows boto GCE plugins to load.
     # https://github.com/GoogleCloudPlatform/compute-image-packages/issues/262
     'google-compute-engine==2.8.12',

smart_open/s3.py  (+28 -1)

@@ -23,6 +23,16 @@
 
 logger = logging.getLogger(__name__)
 
+# AWS Lambda environments do not support multiprocessing.Queue or multiprocessing.Pool.
+# However they do support Threads and therefore concurrent.futures's ThreadPoolExecutor.
+# We use this flag to allow python 2 backward compatibility, where concurrent.futures doesn't exist.
+_CONCURRENT_FUTURES = False
+try:
+    import concurrent.futures
+    _CONCURRENT_FUTURES = True
+except ImportError:
+    warnings.warn("concurrent.futures could not be imported and won't be used")
+
 # Multiprocessing is unavailable in App Engine (and possibly other sandboxes).
 # The only method currently relying on it is iter_bucket, which is instructed
 # whether to use it by the MULTIPROCESSING flag.

@@ -832,11 +842,28 @@ def terminate(self):
         pass
 
 
+class ConcurrentFuturesPool(object):
+    """A class that mimics multiprocessing.pool.Pool but uses concurrent futures instead of processes."""
+    def __init__(self, max_workers):
+        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers)
+
+    def imap_unordered(self, function, items):
+        futures = [self.executor.submit(function, item) for item in items]
+        for future in concurrent.futures.as_completed(futures):
+            yield future.result()
+
+    def terminate(self):
+        self.executor.shutdown(wait=True)
+
+
 @contextlib.contextmanager
 def _create_process_pool(processes=1):
     if _MULTIPROCESSING and processes:
-        logger.info("creating pool with %i workers", processes)
+        logger.info("creating multiprocessing pool with %i workers", processes)
         pool = multiprocessing.pool.Pool(processes=processes)
+    elif _CONCURRENT_FUTURES and processes:
+        logger.info("creating concurrent futures pool with %i workers", processes)
+        pool = ConcurrentFuturesPool(max_workers=processes)
     else:
         logger.info("creating dummy pool")
         pool = DummyPool()
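
For context, iter_bucket is the entry point that drives this pool. A hedged usage sketch follows; the bucket name is a placeholder, and iter_bucket yields (key, content) pairs, as the tests below confirm.

import smart_open.s3

# iter_bucket lists the bucket and downloads each key on the pool chosen by
# _create_process_pool: multiprocessing when _MULTIPROCESSING is set, otherwise
# the new ConcurrentFuturesPool, otherwise the serial DummyPool.
num_bytes = 0
for key, content in smart_open.s3.iter_bucket('my-example-bucket'):
    num_bytes += len(content)
print('downloaded %d bytes' % num_bytes)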

smart_open/tests/test_s3.py  (+50 -3)

@@ -512,7 +512,7 @@ def test_old(self):
 
         # first, create some keys in the bucket
         expected = {}
-        for key_no in range(200):
+        for key_no in range(42):
             key_name = "mykey%s" % key_no
             with smart_open.smart_open("s3://%s/%s" % (BUCKET_NAME, key_name), 'wb') as fout:
                 content = '\n'.join("line%i%i" % (key_no, line_no) for line_no in range(10)).encode('utf8')

@@ -542,16 +542,63 @@ def test_old(self):
         self.assertEqual(result, expected)
 
 
+@maybe_mock_s3
+@unittest.skipIf(not smart_open.s3._CONCURRENT_FUTURES, 'concurrent.futures unavailable')
+class IterBucketConcurrentFuturesTest(unittest.TestCase):
+    def setUp(self):
+        self.old_flag_multi = smart_open.s3._MULTIPROCESSING
+        smart_open.s3._MULTIPROCESSING = False
+        ignore_resource_warnings()
+
+    def tearDown(self):
+        smart_open.s3._MULTIPROCESSING = self.old_flag_multi
+        cleanup_bucket()
+
+    def test(self):
+        num_keys = 101
+        populate_bucket(num_keys=num_keys)
+        keys = list(smart_open.s3.iter_bucket(BUCKET_NAME))
+        self.assertEqual(len(keys), num_keys)
+
+        expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)]
+        self.assertEqual(sorted(keys), sorted(expected))
+
+
+@maybe_mock_s3
+@unittest.skipIf(not smart_open.s3._MULTIPROCESSING, 'multiprocessing unavailable')
+class IterBucketMultiprocessingTest(unittest.TestCase):
+    def setUp(self):
+        self.old_flag_concurrent = smart_open.s3._CONCURRENT_FUTURES
+        smart_open.s3._CONCURRENT_FUTURES = False
+        ignore_resource_warnings()
+
+    def tearDown(self):
+        smart_open.s3._CONCURRENT_FUTURES = self.old_flag_concurrent
+        cleanup_bucket()
+
+    def test(self):
+        num_keys = 101
+        populate_bucket(num_keys=num_keys)
+        keys = list(smart_open.s3.iter_bucket(BUCKET_NAME))
+        self.assertEqual(len(keys), num_keys)
+
+        expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)]
+        self.assertEqual(sorted(keys), sorted(expected))
+
+
 @maybe_mock_s3
 class IterBucketSingleProcessTest(unittest.TestCase):
     def setUp(self):
-        self.old_flag = smart_open.s3._MULTIPROCESSING
+        self.old_flag_multi = smart_open.s3._MULTIPROCESSING
+        self.old_flag_concurrent = smart_open.s3._CONCURRENT_FUTURES
         smart_open.s3._MULTIPROCESSING = False
+        smart_open.s3._CONCURRENT_FUTURES = False
 
         ignore_resource_warnings()
 
     def tearDown(self):
-        smart_open.s3._MULTIPROCESSING = self.old_flag
+        smart_open.s3._MULTIPROCESSING = self.old_flag_multi
+        smart_open.s3._CONCURRENT_FUTURES = self.old_flag_concurrent
         cleanup_bucket()
 
     def test(self):