Skip to content

Commit 27992d3

Browse files
author
Dominick Leppich
committed
task: delete metadata for intentionally missing vocabulary values
1 parent ccedf2b commit 27992d3

File tree

4 files changed

+60
-5
lines changed

4 files changed

+60
-5
lines changed

migration/lib/mets_context.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
RECORD_PATTERN = re.compile('^(\\d+).*$')
77

88
class Context:
9-
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic):
9+
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic, delete_missing_vocabulary_references):
1010
self.api = api
1111
self.dry = dry
1212
self.verbose = verbose
@@ -17,6 +17,8 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map
1717
self.manual_id_fix = manual_id_fix
1818
self.trust = trust
1919
self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic
20+
self.delete_missing_vocabulary_references = delete_missing_vocabulary_references
21+
self.removable_metadata_map = {}
2022
self.vocabulary_name_id_map = {}
2123
self.vocabulary_id_name_map = {}
2224
self.vocabulary_id_map = {}
@@ -133,6 +135,12 @@ def robust_find_record_id(self, parts):
133135
return vocabulary_id, record_id
134136
except:
135137
return None, None
138+
139+
def is_removable_metadata(self, vocabulary_id, value):
    """Tell whether `value` is registered as removable for `vocabulary_id`.

    The answer is True only when `self.removable_metadata_map` has an entry
    for `vocabulary_id` and `value` is contained in that entry; in every
    other case it is False.
    """
    removable = self.removable_metadata_map
    # Short-circuit: the per-vocabulary lookup only happens for known ids.
    return vocabulary_id in removable and value in removable[vocabulary_id]
136144

137145
def log_processed(self, file):
138146
with open('mets_migration.log', 'a') as f:

migration/lib/mets_manipulator.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,10 @@ def process_vocabulary_reference_by_value(self, node):
187187
inverse_search_field='Reverse relationship'
188188

189189
try:
190+
# First, try to find the value in the correct column
190191
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field)
191192
except:
193+
# If that fails, try to find the value in the other column (assuming the value was stored in the wrong column)
192194
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
193195
old_value = node.text
194196
record_data = self.ctx.api.lookup_record(new_record_id)
@@ -223,9 +225,27 @@ def process_vocabulary_reference_by_value(self, node):
223225

224226
self.changed = True
225227
except Exception as e:
226-
error = f'Unable to find record by value: {value}\n\t\t{e}'
227-
logging.error(error)
228-
self.ctx.log_issue(self.file_path, error)
228+
# If this fails as well and the value is not found, remove the metadata if configured
229+
if 'has no results' in e.__str__() and self.ctx.is_removable_metadata(vocabulary_id, node.text):
230+
logging.warn(f'Removing node due to intentionally missing vocabulary value: "{node.text}"')
231+
self.remove_metadata_node(node)
232+
else:
233+
error = f'Unable to find record by value: {value}\n\t\t{e}'
234+
logging.error(error)
235+
self.ctx.log_issue(self.file_path, error)
236+
237+
def remove_metadata_node(self, node):
    """Detach a metadata node from its parent and mark the file as changed.

    If the node's direct parent carries the attribute type="group", the
    whole group element is removed instead of only the node itself.

    Args:
        node: element offering `getparent()`; its parent must offer
            `attrib`, `getparent()` and `remove()`.

    Raises:
        Exception: if the element that should be removed has no parent.
    """
    parent = node.getparent()
    # Read the attribute with .get(): a parent without a "type" attribute is
    # simply not a group and must not abort the removal with a KeyError.
    if parent is not None and parent.attrib.get('type') == 'group':
        node = parent
        parent = node.getparent()

    if parent is None:
        # Dump the offending element first so the failure can be diagnosed.
        dump_node(node)
        raise Exception('Unable to remove node due to missing parent')

    parent.remove(node)
    self.changed = True
229249

230250
def process_manual_id_reference(self, node):
231251
try:

migration/lib/mets_migrator.py

+26
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def __init__(self, ctx):
1111

1212
def migrate(self):
1313
self.load_mapping_file()
14+
self.load_delete_missing_vocabulary_references_file()
1415
self.mets_files = self.scan_for_mets_files()
1516
logging.info(f'{len(self.mets_files)} mets file(s) found!')
1617
logging.info(f'Start processing ...')
@@ -52,6 +53,31 @@ def load_mapping_file(self):
5253
else:
5354
raise Exception(f'Mapping file contains duplicate entry for old record {record_id_old}')
5455

56+
def load_delete_missing_vocabulary_references_file(self):
    """Load the file listing intentionally removed vocabulary values.

    Reads the file named by `self.ctx.delete_missing_vocabulary_references`
    and does nothing when that setting is None. Blank lines are skipped.
    The first non-blank line must be the header
    `vocabulary_id<CSV_DELIMITER>value`. Each following line must have
    exactly two fields; its value is appended to
    `self.ctx.removable_metadata_map[vocabulary_id]`, with the vocabulary
    id converted to int.

    Raises:
        Exception: if the header does not match or a line does not have
            exactly two fields.
        ValueError: if a vocabulary id cannot be converted to int.
    """
    path = self.ctx.delete_missing_vocabulary_references
    if path is None:
        return

    expected_header = CSV_DELIMITER.join(['vocabulary_id', 'value'])
    header = None
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with a confusing "wrong number of fields" error.
            if not line:
                continue

            if header is None:
                header = line
                if header != expected_header:
                    raise Exception('Header mismatch in delete missing vocabulary references file!')
                continue

            parts = line.split(CSV_DELIMITER)
            if len(parts) != 2:
                raise Exception(f'Wrong number of fields in line: {line}')

            vocabulary_id = int(parts[0])
            value = parts[1]
            self.ctx.removable_metadata_map.setdefault(vocabulary_id, []).append(value)
80+
5581
def scan_for_mets_files(self):
5682
results = []
5783
for root, dirs, files in os.walk(self.ctx.metadata_directory):

migration/metadata-migrator.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def main():
1414
args.vocabulary_server_port,
1515
args.vocabulary_server_token
1616
)
17-
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic)
17+
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic, args.delete_missing_vocabulary_references)
1818

1919
try:
2020
migrator = MetsMigrator(ctx)
@@ -41,6 +41,7 @@ def parse_args():
4141
parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid')
4242
parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are: "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the XML elements value is used to find the newly migrated record by value. Defaults to "ID".')
4343
parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate relationship vocabulary correct column finding logic (reverse vs non-reverse, artist dictionary)')
44+
parser.add_argument('--delete-missing-vocabulary-references', type=str, required=False, default=None, help='vocabulary and value mapping file defining intentionally removed vocabulary values that should be removed in the Mets files as well.')
4445
parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!')
4546
parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)')
4647
parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output')

0 commit comments

Comments
 (0)