Skip to content

Feat: describe enhancement or feature (Issue #41) #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ReBACH is run via the command line as outlined in the 'How to Run' section of th
- user - required: Your user email address on AP Trust
- token - required: Your user secret token on AP Trust
- items_per_page - Maximum number of objects to be returned per page by the API
- alt_identifier_starts_with - Prefix for alternate identifier in AP Trust
- alt_identifier_starts_with - Prefix for alternate identifier in AP Trust
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- preservation_storage_location - required: The file system location where the preservation folders/packages should be created
Expand All @@ -54,6 +54,7 @@ These parameters are only available on the command line.
|`--xfg` | The path to the configuration file to use.|
|`--ids` | A comma-separated list of article IDs to process. E.g., 12345,12356|
|`--continue-on-error`| If there is an error during the item processing stage for a given item, skip it and continue to the next item.|
|`--dry-run` | Runs all operations, excluding any that involve writing to any storage medium |

## Execution notes
- ReBACH will attempt to fetch all items in the institutional instance. Items that are not published (curation_status != 'approved') will be ignored.
Expand Down
3 changes: 3 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def get_args():
help='list of article and/or collection IDs to process. E.g., "2323,4353,5454"')
parser.add_argument('--continue-on-error', action='store_true',
help='If an item encounters an error during the processing stage, continue to the next item.')
parser.add_argument('--dry-run', action='store_true',
help='Fetch, match and verify items only. Do not download, delete, or upload to preservation any files.')
args = parser.parse_args()


Expand Down Expand Up @@ -72,6 +74,7 @@ def main():
config_obj = Config(env_file)

config_obj.add_setting(name='continue-on-error', value=args.continue_on_error)
config_obj.add_setting(name='dry-run', value=args.dry_run)

figshare_config = config_obj.figshare_config()
system_config = config_obj.system_config()
Expand Down
47 changes: 36 additions & 11 deletions figshare/Article.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,10 @@ def __check_file_hash(self, files, version_data, folder_path):
# delete directory if validation failed.
if (delete_folder is True):
self.logs.write_log_in_file("error", f"Validation failed, deleting {preservation_storage_location + folder_path}.", True)
self.delete_folder(preservation_storage_location + folder_path)
if self.system_config['dry-run'] == 'False':
self.delete_folder(preservation_storage_location + folder_path)
else:
self.logs.write_log_in_file("info", "*Dry Run* Folder not deleted.", True)
process_article = True

return process_article
Expand Down Expand Up @@ -1008,8 +1011,14 @@ def process_articles(self, articles):

if (version_data["matched"] is True):
self.logs.write_log_in_file("info", f"------- Processing article {article} version {version_data['version']}.", True)

# call pre process script function for each matched item.
value_pre_process = self.pre_process_script_function()
if self.system_config['dry-run'] == 'False':
value_pre_process = self.pre_process_script_function()
else:
value_pre_process = 0
self.logs.write_log_in_file("info", "*Dry Run* Skipping pre processing.", True)

if (value_pre_process == 0):
self.logs.write_log_in_file("info", "Pre-processing script finished successfully.", True)
# check main folder exists in preservation storage.
Expand All @@ -1026,24 +1035,40 @@ def process_articles(self, articles):
else:
self.logs.write_log_in_file("info", "Exists and is empty", True)
check_files = False
# delete folder if validation fails
self.delete_folder(check_dir)
# call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec.
value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5)
if (value_post_process != 0):
self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
+ "Post-processing script error found.", True)

if self.system_config['dry-run'] == 'False':
# delete folder if validation fails
self.delete_folder(check_dir)
# call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec.
value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5)
if (value_post_process != 0):
self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
+ "Post-processing script error found.", True)
else:
self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+ f"{self.system_config['post_process_script_command']} skipped.", True)

break
else:
self.logs.write_log_in_file("info", "Does not exist. Folder will be created", True)
value_post_process = 0
if self.system_config['dry-run'] == 'False':
self.logs.write_log_in_file("info", "Does not exist. Folder will be created", True)
else:
self.logs.write_log_in_file("info", "*Dry Run* Does not exist. Folder will not be created", True)

# end check main folder exists in preservation storage.
# check required files exist in curation UAL_RDM folder
self.logs.write_log_in_file("info", "Checking required files exist in associated curation "
+ f"folder {curation_storage_location}.", True)
copy_files = self.__can_copy_files(version_data)
if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):

if self.system_config['dry-run'] == 'False':
if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):
processed_count += 1
else:
processed_count += 1
self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+ f"{self.system_config['post_process_script_command']} skipped.", True)
else:
self.logs.write_log_in_file("error", "Pre-processing script failed. Running post-processing script.", True)
# call post process script function for each matched item.
Expand Down
18 changes: 13 additions & 5 deletions figshare/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,21 @@ def process_collections(self, collections):
version["license"] = json.loads('{"value": 2,"name": "CC0","url": "https://creativecommons.org/publicdomain/zero/1.0/"}')

self.logs.write_log_in_file("info", f"------- Processing collection {collection} version {version['version']}.", True)
self.__save_json_in_metadata(collection, version, folder_name)
collection_preservation_path = self.preservation_storage_location + os.path.basename(os.path.dirname(os.path.dirname(folder_name)))
value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path)
if (value_post_process != 0):
self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True)

if self.system_config['dry-run'] == 'False':
self.__save_json_in_metadata(collection, version, folder_name)
collection_preservation_path = self.preservation_storage_location + \
os.path.basename(os.path.dirname(os.path.dirname(folder_name)))
value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path)
if (value_post_process != 0):
self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True)
else:
processed_count += 1
else:
self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+ f"{self.system_config['post_process_script_command']} skipped.", True)
processed_count += 1

return processed_count, self.already_preserved_counts_dict

"""
Expand Down
Loading