Draft for Issue 183 list collection #342

Closed
Changes from all commits (30 commits)
77f959c  Draft of `FsspecUrlOperations` (mih, Jan 13, 2023)
40dd829  Basic smoke testing for fsspec and support `github://` (mih, Jan 13, 2023)
74bbdce  More fsspec support, in particular credential provisioning for `s3://` (mih, Jan 17, 2023)
d8c655b  Support arbitrary FSSPEC customizations for S3 access (mih, Jan 18, 2023)
ce67f4c  Let `FsspecUrlOperations` take customizing arguments for `url_to_fs()` (mih, Jan 18, 2023)
f34df7d  Update `AnyUrlOperations`'s handler registry to include kwargs (mih, Jan 18, 2023)
72d350d  Turn off version-awareness in the default S3 handler (mih, Jan 18, 2023)
aa92043  Avoid necessity to account for (un)chained URLs in config (mih, Jan 18, 2023)
36d14e1  Fix undefined variable access (mih, Jan 18, 2023)
d22f6e1  Support non-AWS S3 endpoints for credential lookup (mih, Jan 18, 2023)
2366720  Streamline and complete dependencies (mih, Jan 18, 2023)
be271fe  Introduce `FsspecUrlOperations(block_size=...)` parameter (mih, Jan 18, 2023)
a74b823  Revert turning off version awareness for S3 (mih, Jan 18, 2023)
2d6b80c  Rough approximations of `upload` and `delete` for `FsspecUrlOperations` (mih, Jan 20, 2023)
6f3c678  Merge branch 'main' into fsspec (adswa, May 3, 2023)
5fe8122  Remove deprecated and unused intersphinx mapping def (mih, May 3, 2023)
a861d68  TST: Add a test for S3 version handling (adswa, May 3, 2023)
744bf8b  TMP: Document a note to not set version_aware fs_kwargs (adswa, May 3, 2023)
54da154  Merge remote-tracking branch 'origin/main' into fsspec (mih, May 4, 2023)
68b34b2  Fix typos (mih, May 4, 2023)
375f7b1  Merge remote-tracking branch 'origin/main' into fsspec (mih, May 4, 2023)
edc1a34  Protect keyring from modification in test (mih, May 4, 2023)
4a74ad2  Merge remote-tracking branch 'origin/main' into fsspec (mih, May 4, 2023)
b12824c  Add error handling for inaccessible URLs (adswa, May 4, 2023)
d7d3f93  TST: Add a test case for error handling in download (adswa, May 4, 2023)
8dd3ef8  Merge branch 'fsspec' of github.com:mih/datalad-next into fsspec (adswa, May 4, 2023)
9e9b153  TST: skip ZIP url download tests on windows while fsspec crashes with it (adswa, May 4, 2023)
c3e3673  Try to broaden condition to crippledFS and windows (adswa, May 4, 2023)
30d4a50  add a first version of list-collection (christian-monch, May 5, 2023)
4c1a9da  add collection type detection code (christian-monch, May 7, 2023)
5 changes: 5 additions & 0 deletions datalad_next/__init__.py
@@ -39,6 +39,11 @@
        (
            'datalad_next.commands.download', 'Download', 'download',
        ),
        (
            'datalad_next.commands.list_collection',
            'ListCollection',
            'list-collection',
        ),
    ]
)

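This registration exposes the new command in the Python API and on the command line. A minimal usage sketch, assuming the extension is installed — the chained `tar:///::http://...` URL mirrors the `_examples_` in the command module below, the `object` result key follows the `get_status_dict(object=...)` calls there, and the Python API name `list_collection` is an assumption derived from this registry entry:

import datalad.api as dl

# hypothetical invocation of the newly registered command
for res in dl.list_collection(
        'tar:///::http://example.com/archive.tar',
        collection_type='tar'):
    # each result is a status dict; the fsspec stat record sits under 'object'
    print(res['object']['name'], res['object']['type'])
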
197 changes: 197 additions & 0 deletions datalad_next/commands/list_collection.py
@@ -0,0 +1,197 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See LICENSE file distributed along with the datalad_next package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""List content of collection objects like datasets, tar files, etc."""
from __future__ import annotations

import json
import logging
from pathlib import Path
from urllib.parse import urlparse

import fsspec

from datalad.interface.base import eval_results
from datalad_next.commands import (
    EnsureCommandParameterization,
    ValidatedInterface,
    Parameter,
    build_doc,
    generic_result_renderer,
    get_status_dict,
)
from datalad_next.constraints import EnsureChoice
from datalad_next.datasets import datasetmethod
from datalad_next.uis import ui_switcher as ui
from datalad_next.url_operations.fsspec import FsspecUrlOperations


__docformat__ = 'restructuredtext'

lgr = logging.getLogger('datalad.local.list_collection')

action_name = 'list-collection'


# Collection types that are supported. Most are auto-detected. `annex` has to
# be specified explicitly, because it always exists in combination with `git`,
# and `git` takes precedence. Therefore `annex` is never auto-detected.
#
# DataLad datasets (`dataset`-type) also always exist in combination with
# `git` and take precedence over `git` and `annex`.
#
# A bare git repository is always auto-detected as `git-bare`, even if it
# contains a DataLad dataset or an annex.
known_types = [
    'tar',
    'zip',
    'dataset',
    'git',
    'git-bare',
    'directory',
    'annex',
    '7z',
]

# File type identification by suffix
known_suffixes = {
    'tar': [['.tar'], ['.tgz'], ['.tar', '.gz']],
    'zip': [['.zip']],
    '7z': [['.7z']],
}

# Directory type identification by child names; the order of the entries
# matters
identifying_children = {
    'git-bare': [
        'branches', 'config', 'description', 'HEAD',
        'hooks', 'info', 'objects', 'refs'
    ],
    'dataset': ['.datalad'],
    'git': ['.git'],
}


class ListCollectionParamValidator(EnsureCommandParameterization):

    def __init__(self):
        super().__init__(
            param_constraints=dict(
                collection_type=EnsureChoice(*known_types)))


@build_doc
class ListCollection(ValidatedInterface):
    """List content of collection objects

    """
    result_renderer = 'tailored'

    _params_ = dict(
        collection_type=Parameter(
            args=("-t", "--collection-type"),
            doc="""specify the type of the collection that should be listed.
            This argument overrides automatic type detection. Automatic type
            detection is based on the nature of the location, e.g. file or
            directory, and on the name of the collection, e.g. 'xyz.tar'
            (not implemented yet)."""),
        location=Parameter(
            doc="""the location of the object that should be listed, in the
            format defined by `fsspec`"""),
    )

    _examples_ = [
        dict(text='List a tar file at an HTTP location',
             code_py="list_collection('tar:///::http://example.com/archive.tar', collection_type='tar')",
             code_cmd="datalad list-collection -t tar 'tar:///::http://example.com/archive.tar'"),
    ]

    _validator_ = ListCollectionParamValidator()

    @staticmethod
    @datasetmethod(name='list_collection')
    @eval_results
    def __call__(location,
                 collection_type=None):

        fsspec_url_ops = FsspecUrlOperations()

        # Try to open the file system without providing credentials first
        filesystem, url_path, properties = fsspec_url_ops._get_fs(
            location, credential=None)
        for record in show_specfs_tree(filesystem, url_path):
            yield {
                **record,
                'url_path': url_path or location}

    @staticmethod
    def custom_result_renderer(res, **_):
        if res['action'] != action_name:
            generic_result_renderer(res)
            return
        ui.message(json.dumps(res))


def show_specfs_tree(filesystem, current_path=''):
    current_element = filesystem.stat(current_path)
    if current_element['type'] == 'file':
        yield get_status_dict(
            action=action_name,
            status='ok',
            object=current_element)
        return

    for element in filesystem.ls(current_path):
        if isinstance(element, str):
            element = filesystem.stat(element)
        yield get_status_dict(
            action=action_name,
            status='ok',
            object=element)
        if element['type'] == 'directory':
            yield from show_specfs_tree(
                filesystem, element['name'])


def detect_collection_type(url) -> str:

    open_file = fsspec.open(url)
    base_path = Path(urlparse(open_file.full_name).path)

    # We use an appended '/' to detect subdirectories because some file
    # systems signal a file at a directory name. For example, an HTTP
    # server may provide some content, e.g. an HTML page that lists the
    # directory. We could work around that by adding a trailing '/' (at
    # least on unix) to the path, but we can also let fsspec do that via
    # the `ls` file system method. We do the latter, because we need the
    # subdirectories anyway.

    if open_file.fs.isdir(open_file.full_name + '/'):

        # For the remaining checks, we need the relative child names.
        children = open_file.fs.ls(open_file.full_name)
        child_names = [Path(urlparse(child).path).name for child in children]

        for type_identifier, children_list in identifying_children.items():
            if all(identifying_child in child_names
                   for identifying_child in children_list):
                collection_type = type_identifier
                break
        else:
            collection_type = 'directory'

    else:

        for type_identifier, suffixes_list in known_suffixes.items():
            if base_path.suffixes in suffixes_list:
                collection_type = type_identifier
                break
        else:
            collection_type = 'file'

    return collection_type
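
The suffix rule in `detect_collection_type` compares the full suffix chain of the file name against `known_suffixes`. A standalone sketch of just that rule (an illustration restating the mapping locally, not part of the PR):

from pathlib import Path

known_suffixes = {
    'tar': [['.tar'], ['.tgz'], ['.tar', '.gz']],
    'zip': [['.zip']],
    '7z': [['.7z']],
}

def suffix_type(name: str) -> str:
    # Path.suffixes yields the whole chain, e.g. 'a.tar.gz' -> ['.tar', '.gz'],
    # which is why the compressed-tar case needs its own entry above
    suffixes = Path(name).suffixes
    for type_identifier, suffixes_list in known_suffixes.items():
        if suffixes in suffixes_list:
            return type_identifier
    return 'file'

assert suffix_type('archive.tar.gz') == 'tar'
assert suffix_type('data.zip') == 'zip'
assert suffix_type('notes.txt') == 'file'
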
46 changes: 46 additions & 0 deletions datalad_next/commands/tests/test_list_collection.py
@@ -0,0 +1,46 @@
from pathlib import Path
from shutil import rmtree

from ..list_collection import detect_collection_type


def test_local_file(tmp_path: Path):

    expected_types = [
        ('', 'file'),
        ('.x', 'file'),
        ('.tar', 'tar'),
        ('.tar.gz', 'tar'),
        ('.tgz', 'tar'),
        ('.zip', 'zip'),
        ('.7z', '7z')
    ]

    # Test plain file detection
    for suffix, expected_type in expected_types:
        test_file_path = tmp_path / ('test_file' + suffix)
        test_file_path.write_text('abc')
        assert detect_collection_type(str(test_file_path)) == expected_type


def test_local_dir_types(tmp_path: Path):

    bare_subdirs = [
        'branches', 'config', 'description', 'HEAD',
        'hooks', 'info', 'objects', 'refs',
    ]

    expected_types = [
        ([], 'directory'),
        (['.git'], 'git'),
        (['.git', '.datalad'], 'dataset'),
        (bare_subdirs, 'git-bare'),
    ]

    test_dir_path = tmp_path / 'test_dir'
    for subdir_names, expected_type in expected_types:
        test_dir_path.mkdir()
        for subdir_name in subdir_names:
            (test_dir_path / subdir_name).mkdir()
        assert detect_collection_type(str(test_dir_path)) == expected_type
        rmtree(str(test_dir_path))
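
The directory cases in this test walk the `identifying_children` precedence: an empty directory is plain `directory`, `.git` alone means `git`, `.git` plus `.datalad` means `dataset`, and the full set of bare-repository entries means `git-bare`. A standalone sketch of that first-match-wins logic (it assumes the dict ordering from the command module; not PR code):

identifying_children = {
    'git-bare': [
        'branches', 'config', 'description', 'HEAD',
        'hooks', 'info', 'objects', 'refs'
    ],
    'dataset': ['.datalad'],
    'git': ['.git'],
}

def directory_type(child_names):
    # the first rule whose identifying children are all present wins,
    # so `dataset` is recognized before plain `git`
    for type_identifier, children_list in identifying_children.items():
        if all(c in child_names for c in children_list):
            return type_identifier
    return 'directory'

assert directory_type(['.git', '.datalad', 'code']) == 'dataset'
assert directory_type(['.git', 'src']) == 'git'
assert directory_type(['README.md']) == 'directory'
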
40 changes: 40 additions & 0 deletions datalad_next/url_operations/any.py
@@ -19,6 +19,27 @@

__all__ = ['AnyUrlOperations']

_defaults = {
    's3': (
        'datalad_next.url_operations.fsspec.FsspecUrlOperations',
        # any S3 is interpreted as AWS-S3 by default.
        # attempt anonymous access first (unless a credential is given).
        # this can impair performance when, in fact, a credential is
        # known to be needed, but cannot, or need not, be declared
        # explicitly via the `credential` parameter. For such cases,
        # define a separate handler with a tailored URL match expression
        # and with `anon=False`.
        # version awareness is turned off by default. It requires the
        # requesting entity to have IAM permissions to perform a
        # GetObjectVersion, and it comes with a potential performance
        # penalty. Again, define a dedicated handler if this feature
        # is needed.
        # we need not provide all settings twice, i.e. on the top level
        # and in 's3', to match chained and unchained URLs. The S3
        # filesystem helper pulls them from 's3' if needed.
        {'fs_kwargs': {'s3': {'anon': True}}},
    ),
}
# define handlers for each supported URL pattern
# FORMAT OF HANDLER REGISTRY (dict)
# - key: regex match expression to be applied to a URL (to test whether a
@@ -41,7 +62,26 @@
    'http': ('datalad_next.url_operations.http.HttpUrlOperations',),
    'file': ('datalad_next.url_operations.file.FileUrlOperations',),
    'ssh': ('datalad_next.url_operations.ssh.SshUrlOperations',),
    # anything pointing to S3, directly or indirectly
    '(^|.*::)s3://': _defaults['s3'],
}
# add anything that we also want fsspec to provide
for regex in (
        # archive access
        'zip://',
        'tar://',
        # services
        # github projects (down to files in particular versions)
        '(^|.*::)github://',
        # we occupy ssh:// with our own implementation, but fsspec also
        # has a paramiko-based one; it is exposed as sftp://
        '(^|.*::)sftp://',
        # file-level caching
        'filecache::',
):
    _url_handlers[regex] = (
        'datalad_next.url_operations.fsspec.FsspecUrlOperations',
    )


class AnyUrlOperations(UrlOperations):
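The `(^|.*::)` prefix in the match expressions above is what lets a single handler entry catch a scheme both at the start of a URL and behind fsspec's `::` chaining separator. A minimal re-based sketch of the idea (the registry content and matching loop are simplified assumptions, not the actual `AnyUrlOperations` code):

import re

handlers = {
    '(^|.*::)s3://': 'FsspecUrlOperations',
    'http': 'HttpUrlOperations',
}

def match_handler(url):
    # re.match anchors at the start of the string, so '.*::' lets the
    # pattern also hit schemes that appear after a chaining separator
    for pattern, handler in handlers.items():
        if re.match(pattern, url):
            return handler
    return None

assert match_handler('s3://bucket/key') == 'FsspecUrlOperations'
# a ZIP member inside an S3 object, expressed as a chained URL
assert match_handler(
    'zip://member.txt::s3://bucket/archive.zip') == 'FsspecUrlOperations'

Following the comment in `_defaults`, a deployment that always needs credentialed access could register a tailored handler; a hypothetical entry (the bucket name is made up for illustration):

_url_handlers['(^|.*::)s3://my-private-bucket/'] = (
    'datalad_next.url_operations.fsspec.FsspecUrlOperations',
    {'fs_kwargs': {'s3': {'anon': False}}},
)
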
6 changes: 3 additions & 3 deletions datalad_next/url_operations/file.py
@@ -148,7 +148,7 @@ def upload(self,
        FileNotFoundError
          If the source file cannot be found.
        """
-        # get the size, or die if inaccessible
+        # get the size for progress reporting
        props = {}
        if from_path:
            expected_size = from_path.stat().st_size
@@ -175,11 +175,11 @@
                ))
            return props
        except FileNotFoundError as e:
-            raise UrlOperationsResourceUnknown(url) from e
+            raise UrlOperationsResourceUnknown(to_url) from e
        except Exception as e:
            # wrap this into the datalad-standard, but keep the
            # original exception linked
-            raise UrlOperationsRemoteError(from_url, message=str(e)) from e
+            raise UrlOperationsRemoteError(to_url, message=str(e)) from e
        finally:
            if src_fp and from_path is not None:
                src_fp.close()