Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix second upload file paths (issue 1492) #650

Merged
merged 4 commits
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 60 additions & 68 deletions viewer/download_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,39 +528,36 @@ def _extra_files_zip(ziparchive, target):

num_processed = 0
num_extra_dir = 0
for experiment_upload in target.experimentupload_set.order_by('commit_datetime'):
extra_files = (
Path(settings.MEDIA_ROOT)
.joinpath(settings.TARGET_LOADER_MEDIA_DIRECTORY)
.joinpath(experiment_upload.task_id)
)
# taking the latest upload for now

# taking the latest upload for now
# add unpacked zip directory
extra_files = [d for d in list(extra_files.glob("*")) if d.is_dir()][0]

# add upload_[d] dir
extra_files = next(extra_files.glob("upload_*"))
extra_files = extra_files.joinpath('extra_files')

logger.debug('extra_files path 2: %s', extra_files)
logger.info('Processing extra files (%s)...', extra_files)

if extra_files.is_dir():
num_extra_dir = num_extra_dir + 1
for dirpath, _, files in os.walk(extra_files):
for file in files:
filepath = os.path.join(dirpath, file)
logger.info('Adding extra file "%s"...', filepath)
ziparchive.write(
filepath,
os.path.join(
f'{_ZIP_FILEPATHS["extra_files"]}_{num_extra_dir}', file
),
)
num_processed += 1
else:
logger.info('Directory does not exist (%s)...', extra_files)
experiment_upload = target.experimentupload_set.order_by('commit_datetime').last()
extra_files = (
Path(settings.MEDIA_ROOT)
.joinpath(settings.TARGET_LOADER_MEDIA_DIRECTORY)
.joinpath(target.zip_archive.name)
.joinpath(experiment_upload.upload_data_dir)
)

extra_files = extra_files.joinpath('extra_files')

logger.debug('extra_files path 2: %s', extra_files)
logger.info('Processing extra files (%s)...', extra_files)

if extra_files.is_dir():
num_extra_dir = num_extra_dir + 1
for dirpath, _, files in os.walk(extra_files):
for file in files:
filepath = os.path.join(dirpath, file)
logger.info('Adding extra file "%s"...', filepath)
ziparchive.write(
filepath,
os.path.join(
f'{_ZIP_FILEPATHS["extra_files"]}_{num_extra_dir}', file
),
)
num_processed += 1
else:
logger.info('Directory does not exist (%s)...', extra_files)

if num_processed == 0:
logger.info('No extra files found')
Expand All @@ -571,44 +568,39 @@ def _extra_files_zip(ziparchive, target):
def _yaml_files_zip(ziparchive, target, transforms_requested: bool = False) -> None:
"""Add all yaml files (except transforms) from upload to ziparchive"""

for experiment_upload in target.experimentupload_set.order_by('commit_datetime'):
yaml_paths = (
Path(settings.MEDIA_ROOT)
.joinpath(settings.TARGET_LOADER_MEDIA_DIRECTORY)
.joinpath(experiment_upload.task_id)
experiment_upload = target.experimentupload_set.order_by('commit_datetime').last()
yaml_paths = (
Path(settings.MEDIA_ROOT)
.joinpath(settings.TARGET_LOADER_MEDIA_DIRECTORY)
.joinpath(target.zip_archive.name)
.joinpath(experiment_upload.upload_data_dir)
)

transforms = [
Path(f.name).name
for f in (
experiment_upload.conformer_site_transforms,
experiment_upload.neighbourhood_transforms,
experiment_upload.reference_structure_transforms,
)
]

transforms = [
Path(f.name).name
for f in (
experiment_upload.neighbourhood_transforms,
experiment_upload.neighbourhood_transforms,
experiment_upload.neighbourhood_transforms,
)
]
# taking the latest upload for now
# add unpacked zip directory
yaml_paths = [d for d in list(yaml_paths.glob("*")) if d.is_dir()][0]

# add upload_[d] dir
yaml_paths = next(yaml_paths.glob("upload_*"))

archive_path = Path('yaml_files').joinpath(yaml_paths.parts[-1])

yaml_files = [
f
for f in list(yaml_paths.glob("*.yaml"))
if f.is_file() and f.name not in transforms
]

logger.info('Processing yaml files (%s)...', yaml_files)

for file in yaml_files:
logger.info('Adding yaml file "%s"...', file)
if not transforms_requested and file.name == 'neighbourhoods.yaml':
# don't add this file if transforms are not requested
continue
ziparchive.write(file, str(Path(archive_path).joinpath(file.name)))
archive_path = Path('yaml_files').joinpath(yaml_paths.parts[-1])

yaml_files = [
f
for f in list(yaml_paths.glob("*.yaml"))
if f.is_file() and f.name not in transforms
]

logger.info('Processing yaml files (%s)...', yaml_files)

for file in yaml_files:
logger.info('Adding yaml file "%s"...', file)
if not transforms_requested and file.name == 'neighbourhoods.yaml':
# don't add this file if transforms are not requested
continue
ziparchive.write(file, str(Path(archive_path).joinpath(file.name)))


def _document_file_zip(ziparchive, download_path, original_search, host):
Expand Down
53 changes: 35 additions & 18 deletions viewer/target_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import logging
import math
import os
import shutil
import tarfile
import uuid
from collections.abc import Callable
from dataclasses import dataclass, field
from enum import Enum
Expand Down Expand Up @@ -44,7 +44,7 @@
XtalformQuatAssembly,
XtalformSite,
)
from viewer.utils import alphanumerator
from viewer.utils import alphanumerator, sanitize_directory_name

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -475,29 +475,23 @@ def __init__(
)

# work out where the data finally lands
# path = Path(settings.MEDIA_ROOT).joinpath(TARGET_LOADER_DATA)
path = Path(TARGET_LOADER_MEDIA_DIRECTORY)

# give each upload a unique directory. since I already have
# task_id, why not reuse it
# give each upload a unique directory
# update: resolving issue 1311 introduced a bug, where
# subsequent uploads overwrote file paths and files appeared
# to be missing. changing the directory structure so this
# wouldn't be an issue, the new structure is
# target_loader_data/target_title/upload_(n)/...
if task:
path = path.joinpath(str(task.request.id))
self.experiment_upload.task_id = task.request.id
else:
# unless of course I don't have task..
# TODO: i suspect this will never be used.
path_uuid = uuid.uuid4().hex
path = path.joinpath(path_uuid)
self.experiment_upload.task_id = path_uuid

# figure out absolute and relative paths to final
# location. relative path is added to db field, this will be
# used in url requests to retrieve the file. absolute path is
# for moving the file to the final location
self._final_path = path.joinpath(self.bundle_name)
self._abs_final_path = (
Path(settings.MEDIA_ROOT).joinpath(path).joinpath(self.bundle_name)
)
self._final_path = path
self._abs_final_path = Path(settings.MEDIA_ROOT).joinpath(path)
# but don't create now, this comes later

# to be used in logging messages, if no task, means invoked
Expand Down Expand Up @@ -1501,6 +1495,21 @@ def process_bundle(self):
display_name=self.target_name,
)

if target_created:
# mypy thinks target and target_name are None
target_dir = sanitize_directory_name(self.target_name, self.abs_final_path) # type: ignore [arg-type]
self.target.zip_archive = target_dir # type: ignore [attr-defined]
self.target.save() # type: ignore [attr-defined]
else:
# NB! using existing field zip_archive to point to the
# location of the archives, not the archives
# themselves. The field was unused, and because of the
# versioned uploads, there's no single archive anymore
target_dir = str(self.target.zip_archive) # type: ignore [attr-defined]

self._final_path = self._final_path.joinpath(target_dir)
self._abs_final_path = self._abs_final_path.joinpath(target_dir)

# TODO: original target loader's function get_create_projects
# seems to handle more cases. adopt or copy
visit = self.proposal_ref.split()[0]
Expand Down Expand Up @@ -2200,8 +2209,16 @@ def load_target(

def _move_and_save_target_experiment(target_loader):
# Move the uploaded file to its final location
target_loader.abs_final_path.mkdir(parents=True)
target_loader.raw_data.rename(target_loader.abs_final_path)
try:
target_loader.abs_final_path.mkdir(parents=True)
except FileExistsError:
# subsequent upload, directory already exists
pass

shutil.move(
str(target_loader.raw_data.joinpath(target_loader.version_dir)),
str(target_loader.abs_final_path),
)
Path(target_loader.bundle_path).rename(
target_loader.abs_final_path.parent.joinpath(target_loader.data_bundle)
)
Expand Down
36 changes: 36 additions & 0 deletions viewer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import logging
import os
import re
import shutil
import string
import tempfile
Expand Down Expand Up @@ -530,3 +531,38 @@ def email_task_completion(
send_mail(subject, message, email_from, recipient_list, fail_silently=True)
logger.info('- email_notify_task_completion')
return


def sanitize_directory_name(name: str, path: Path | None = None) -> str:
    """
    Sanitize a string so it only contains characters allowed in UNIX directory names.

    Characters outside ``[a-zA-Z0-9._-]`` are replaced with underscores and
    runs of consecutive underscores are collapsed to one. When *path* is
    given, a numeric suffix (``_2``, ``_3``, ...) is appended until the
    result does not clash with an existing subdirectory of *path*.

    Parameters:
        name: The input string to sanitize.
        path (optional): the parent directory where the directory would reside, to check if unique

    Returns:
        str: A sanitized string with only allowed characters.
    """
    # Substitute every disallowed character with '_', then squeeze
    # consecutive underscores down to a single one.
    candidate = re.sub(r'[^a-zA-Z0-9._-]', '_', name.strip())
    candidate = re.sub(r'_{2,}', '_', candidate)
    logger.debug('sanitized name: %s', candidate)

    if path:
        # Names of directories already present under the parent path.
        existing = [entry.name for entry in path.glob("*") if entry.is_dir()]
        logger.debug('target dirs: %s', existing)

        unique_name = candidate
        counter = 1
        while unique_name in existing:
            # First collision yields '<name>_2', then '<name>_3', ...
            counter += 1
            unique_name = f'{candidate}_{counter}'
            logger.debug('looping suffix: %s', unique_name)

        candidate = unique_name

    return candidate