Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support installing smart_open without AWS dependencies #534

Merged
merged 18 commits into from
Sep 3, 2020
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ Installation
::

pip install smart_open // Install with no cloud dependencies
pip install smart_open[aws] // Install AWS deps
pip install smart_open[s3] // Install S3 deps
pip install smart_open[gcp] // Install GCP deps
pip install smart_open[all] // Installs all cloud dependencies

Expand All @@ -129,7 +129,7 @@ If you're upgrading from ``smart_open`` versions 1.8.0 and below, please check o

Version ``2.0`` will introduce a backwards incompatible installation method with regards to the cloud dependencies. A migration path to minimize breaking
was introduced in version ``x.x.x``. If you want to maintain backwards compatibility (installing all dependencies) install this package via ``smart_open[all]`` now
and once the change is made you should not have any issues. If all you care about is AWS dependencies for example you can install via ``smart_open[aws]`` and
and once the change is made you should not have any issues. If all you care about is AWS dependencies for example you can install via ``smart_open[s3]`` and
once the dependency change is made you will simply drop the unwanted dependencies. You can read more about the motivations `here <https://github.com/RaRe-Technologies/smart_open/issues/443>`_


Expand Down Expand Up @@ -311,16 +311,16 @@ Port your old ``boto`` settings to ``boto3`` in order to use them with ``smart_o
Iterating Over an S3 Bucket's Contents
--------------------------------------

Since going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3_iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing):
Since going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3.iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing):

.. code-block:: python

>>> from smart_open import s3_iter_bucket
>>> from smart_open import s3
>>> # get data corresponding to 2010 and later under "silo-open-data/annual/monthly_rain"
>>> # we use workers=1 for reproducibility; you should use as many workers as you have cores
>>> bucket = 'silo-open-data'
>>> prefix = 'annual/monthly_rain/'
>>> for key, content in s3_iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3):
>>> for key, content in s3.iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3):
... print(key, round(len(content) / 2**20))
annual/monthly_rain/2010.monthly_rain.nc 13
annual/monthly_rain/2011.monthly_rain.nc 13
Expand Down
7 changes: 7 additions & 0 deletions extending.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ This is the part that goes before the `://` in a URL, e.g. `s3://`."""
URI_EXAMPLES = ('xxx://foo/bar', 'zzz://baz/boz')
"""This will appear in the documentation of the the `parse_uri` function."""

MISSING_DEPS = False
"""Wrap transport-specific imports in a try/catch and set this to True if
any imports are not found. Seting MISSING_DEPS to True will cause the library
to suggest installing its dependencies with an example pip command.

If your transport has no external dependencies, you can omit this variable.
"""

def parse_uri(uri_as_str):
"""Parse the specified URI into a dict.
Expand Down
8 changes: 3 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def read(fname):
'parameterizedtestcase',
'pytest',
'pytest-rerunfailures',
'python-jose'
]

install_requires = [
Expand Down Expand Up @@ -82,14 +83,11 @@ def read(fname):
license='MIT',
platforms='any',

# Concatenating the lists together is temporary and will
# eventually simply be install_requires dropping the cloud
# dependencies from being installed without explicitly being declared.
install_requires=install_requires + aws_deps,
install_requires=install_requires,
tests_require=tests_require,
extras_require={
'test': tests_require,
'aws': aws_deps,
's3': aws_deps,
'gcp': gcp_deps,
'azure': azure_deps,
'all': all_deps,
Expand Down
38 changes: 36 additions & 2 deletions smart_open/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,45 @@
#
# Prevent regression of #474 and #475
#
logging.getLogger(__name__).addHandler(logging.NullHandler())
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

from smart_open import version # noqa: E402
from .smart_open_lib import open, parse_uri, smart_open, register_compressor # noqa: E402
from .s3 import iter_bucket as s3_iter_bucket # noqa: E402

_WARNING = """smart_open.s3_iter_bucket is deprecated and will stop functioning
in a future version. Please import iter_bucket from the smart_open.s3 module instead:

from smart_open.s3 import iter_bucket as s3_iter_bucket

"""
_WARNED = False


def s3_iter_bucket(
bucket_name,
prefix='',
accept_key=None,
key_limit=None,
workers=16,
retries=3,
**session_kwargs
):
global _WARNED
from .s3 import iter_bucket
if not _WARNED:
logger.warn(_WARNING)
_WARNED = True
return iter_bucket(
bucket_name=bucket_name,
prefix=prefix,
accept_key=accept_key,
key_limit=key_limit,
workers=workers,
retries=retries,
session_kwargs=session_kwargs
)


__all__ = [
'open',
Expand Down
7 changes: 5 additions & 2 deletions smart_open/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
import smart_open.bytebuffer
import smart_open.constants

import azure.storage.blob
import azure.core.exceptions
try:
import azure.storage.blob
import azure.core.exceptions
except ImportError:
MISSING_DEPS = True

logger = logging.getLogger(__name__)

Expand Down
9 changes: 6 additions & 3 deletions smart_open/gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
import io
import logging

import google.cloud.exceptions
import google.cloud.storage
import google.auth.transport.requests
try:
import google.cloud.exceptions
import google.cloud.storage
import google.auth.transport.requests
except ImportError:
MISSING_DEPS = True

import smart_open.bytebuffer
import smart_open.utils
Expand Down
13 changes: 9 additions & 4 deletions smart_open/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
import logging
import time

import boto3
import botocore.client
import botocore.exceptions
try:
import boto3
import botocore.client
import botocore.exceptions
except ImportError:
MISSING_DEPS = True

import smart_open.bytebuffer
import smart_open.concurrency
Expand Down Expand Up @@ -961,7 +964,9 @@ def _retry_if_failed(
partial,
attempts=_UPLOAD_ATTEMPTS,
sleep_seconds=_SLEEP_SECONDS,
exceptions=(botocore.exceptions.EndpointConnectionError, )):
exceptions=None):
if exceptions is None:
exceptions = (botocore.exceptions.EndpointConnectionError, )
for attempt in range(attempts):
try:
return partial()
Expand Down
4 changes: 2 additions & 2 deletions smart_open/smart_open_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
import warnings
import sys

import boto3

#
# This module defines a function called smart_open so we cannot use
# smart_open.submodule to reference to the submodules.
Expand Down Expand Up @@ -289,6 +287,8 @@ def smart_open(uri, mode="rb", **kw):
logger.error('profile_name and s3_session are mutually exclusive, ignoring the former')

if 'profile_name' in kw:
import boto3

transport_params['session'] = boto3.Session(profile_name=kw.pop('profile_name'))

if 's3_session' in kw:
Expand Down
Empty file.
14 changes: 14 additions & 0 deletions smart_open/tests/fixtures/good_transport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""A no-op transport that registers scheme 'foo'"""
import io

SCHEME = "foo"
open = io.open


def parse_uri(uri_as_string): # pragma: no cover
...


def open_uri(uri_as_string, mode, transport_params): # pragma: no cover
...
20 changes: 20 additions & 0 deletions smart_open/tests/fixtures/missing_deps_transport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
"""Transport that has missing deps"""
import io


try:
import this_module_does_not_exist_but_we_need_it # noqa
except ImportError:
MISSING_DEPS = True

SCHEME = "missing"
open = io.open


def parse_uri(uri_as_string): # pragma: no cover
...


def open_uri(uri_as_string, mode, transport_params): # pragma: no cover
...
13 changes: 13 additions & 0 deletions smart_open/tests/fixtures/no_schemes_transport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""A transport that is missing the required SCHEME/SCHEMAS attributes"""
import io

open = io.open


def parse_uri(uri_as_string): # pragma: no cover
...


def open_uri(uri_as_string, mode, transport_params): # pragma: no cover
...
31 changes: 31 additions & 0 deletions smart_open/tests/test_package.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import os
import unittest
import pytest

from smart_open import open

skip_tests = "SMART_OPEN_TEST_MISSING_DEPS" not in os.environ


class PackageTests(unittest.TestCase):

@pytest.mark.skipif(skip_tests, reason="requires missing dependencies")
def test_azure_raises_helpful_error_with_missing_deps(self):
with pytest.raises(ImportError, match=r"pip install smart_open\[azure\]"):
open("azure://foo/bar")

@pytest.mark.skipif(skip_tests, reason="requires missing dependencies")
def test_aws_raises_helpful_error_with_missing_deps(self):
match = r"pip install smart_open\[s3\]"
with pytest.raises(ImportError, match=match):
open("s3://foo/bar")
# With the DRY errors in transport, this no longer gets a nice error message
with pytest.raises(ImportError):
from smart_open import smart_open
smart_open('fake-name', profile_name="will produce an error importing s3")

@pytest.mark.skipif(skip_tests, reason="requires missing dependencies")
def test_gcs_raises_helpful_error_with_missing_deps(self):
with pytest.raises(ImportError, match=r"pip install smart_open\[gcs\]"):
open("gs://foo/bar")
12 changes: 12 additions & 0 deletions smart_open/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,18 @@ def test_iter_bucket(self):
results = list(smart_open.s3.iter_bucket(BUCKET_NAME))
self.assertEqual(len(results), 10)

def test_deprecated_top_level_s3_iter_bucket(self):
populate_bucket()
with self.assertLogs(smart_open.logger.name, level='WARN') as cm:
# invoking once will generate a warning
smart_open.s3_iter_bucket(BUCKET_NAME)
# invoking again will not (to reduce spam)
smart_open.s3_iter_bucket(BUCKET_NAME)
# verify only one output
assert len(cm.output) == 1
# verify the suggested new import is in the warning
assert "from smart_open.s3 import iter_bucket as s3_iter_bucket" in cm.output[0]

def test_accepts_boto3_bucket(self):
populate_bucket()
bucket = boto3.resource('s3').Bucket(BUCKET_NAME)
Expand Down
22 changes: 22 additions & 0 deletions smart_open/tests/test_transport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import pytest
import unittest

from smart_open.transport import register_transport, get_transport


class TransportTest(unittest.TestCase):

def test_registry_requires_declared_schemes(self):
with pytest.raises(ValueError):
register_transport('smart_open.tests.fixtures.no_schemes_transport')

def test_registry_errors_on_double_register_scheme(self):
register_transport('smart_open.tests.fixtures.good_transport')
with pytest.raises(AssertionError):
register_transport('smart_open.tests.fixtures.good_transport')

def test_registry_errors_get_transport_for_module_with_missing_deps(self):
register_transport('smart_open.tests.fixtures.missing_deps_transport')
with pytest.raises(ImportError):
get_transport("missing")
31 changes: 23 additions & 8 deletions smart_open/transport.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@
NO_SCHEME = ''

_REGISTRY = {NO_SCHEME: smart_open.local_file}
_ERRORS = {}
_MISSING_DEPS_ERROR = """You are trying to use the %(module)s functionality of smart_open
but you do not have the correct %(module)s dependencies installed. Try:

pip install smart_open[%(module)s]

"""


def register_transport(submodule):
Expand All @@ -35,12 +42,17 @@ def register_transport(submodule):
Once registered, you can get the submodule by calling :func:`get_transport`.

"""
global _REGISTRY
global _REGISTRY, _ERRORS
module_name = submodule
if isinstance(submodule, str):
try:
submodule = importlib.import_module(submodule)
except ImportError:
return
else:
module_name = submodule.__name__
# Save only the last module name piece
module_name = module_name.rsplit(".")[-1]

if hasattr(submodule, 'SCHEME'):
schemes = [submodule.SCHEME]
Expand All @@ -54,7 +66,10 @@ def register_transport(submodule):

for scheme in schemes:
assert scheme not in _REGISTRY
_REGISTRY[scheme] = submodule
if hasattr(submodule, "MISSING_DEPS") and getattr(submodule, "MISSING_DEPS") is True:
_ERRORS[scheme] = module_name
else:
_REGISTRY[scheme] = submodule


def get_transport(scheme):
Expand All @@ -63,19 +78,19 @@ def get_transport(scheme):
This submodule must have been previously registered via :func:`register_transport`.

"""
global _ERRORS, _MISSING_DEPS_ERROR, _REGISTRY, SUPPORTED_SCHEMES
expected = SUPPORTED_SCHEMES
readme_url = 'https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst'
message = (
"Unable to handle scheme %(scheme)r, expected one of %(expected)r. "
"Extra dependencies required by %(scheme)r may be missing. "
"See <%(readme_url)s> for details." % locals()
)
try:
submodule = _REGISTRY[scheme]
except KeyError as e:
raise NotImplementedError(message) from e
else:
return submodule
if scheme in _ERRORS:
raise ImportError(_MISSING_DEPS_ERROR % dict(module=_ERRORS[scheme]))
if scheme in _REGISTRY:
return _REGISTRY[scheme]
raise NotImplementedError(message)


register_transport(smart_open.local_file)
Expand Down
Loading