
Commit 67772bc

Add delete option to dump_to_s3
1 parent 62ecf7f commit 67772bc

1 file changed: +29 −8 lines

bcodmo_frictionless/bcodmo_pipeline_processors/dump_to_s3.py

@@ -95,6 +95,7 @@ def __init__(self, bucket_name, prefix, **options):
         self.submission_id = options.get("submission_id", None)
         self.submission_ids = options.get("submission_ids", [])
         self.cache_id = options.get("cache_id", None)
+        self.delete = options.get("delete", False)

         self.prefix = prefix
         self.bucket_name = bucket_name

@@ -115,6 +116,27 @@ def __init__(self, bucket_name, prefix, **options):
         else:
             logging.warn("Using base boto credentials for S3 Dumper")
             self.s3 = boto3.resource("s3")
+        if self.delete:
+            s3_client = boto3.client(
+                "s3",
+                aws_access_key_id=access_key,
+                aws_secret_access_key=secret_access_key,
+                endpoint_url=host,
+            )
+            res = s3_client.list_objects_v2(
+                Bucket=self.bucket_name,
+                Prefix=self.prefix,
+            )
+            if "Contents" in res:
+                contents = res["Contents"]
+                if len(contents) >= 10:
+                    raise Exception(
+                        "Throwing an error from the dump_to_s3 processor because the number of files to be deleted was 10 or more. This is a safety measure to ensure we don't accidentally delete more files than expected."
+                    )
+                s3_client.delete_objects(
+                    Bucket=self.bucket_name,
+                    Delete={"Objects": [{"Key": obj["Key"]} for obj in contents]},
+                )

     def process_datapackage(self, datapackage):
         datapackage = super(S3Dumper, self).process_datapackage(datapackage)

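For reference, the added block boils down to the sketch below: look up whatever already exists under the target prefix, refuse to continue once a small safety cap is reached, and otherwise delete those objects before new files are dumped. The helper name clear_prefix, its signature, and the fall-back to default boto3 credentials are illustrative assumptions, not part of the processor.

import boto3

# Illustrative sketch of what delete=True does; clear_prefix is a
# hypothetical helper, not something defined in dump_to_s3.py.
def clear_prefix(bucket_name, prefix, s3_client=None, safety_cap=10):
    # Fall back to default boto3 credentials if no client is supplied.
    s3_client = s3_client or boto3.client("s3")
    res = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    contents = res.get("Contents", [])
    if len(contents) >= safety_cap:
        raise Exception(
            f"Refusing to delete {len(contents)} objects under {prefix!r}; "
            "the cap is a safety measure against deleting more files than expected."
        )
    if contents:
        s3_client.delete_objects(
            Bucket=bucket_name,
            Delete={"Objects": [{"Key": obj["Key"]} for obj in contents]},
        )
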
@@ -155,10 +177,10 @@ def write_file_to_output(self, contents, path, content_type):
             contents = contents.replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING)

         start = time.time()
-        print(f"Starting save file {time.time()}")
+        # print(f"Starting save file {time.time()}")
         obj = self.s3.Object(self.bucket_name, obj_name)
         obj.put(Body=contents, ContentType=content_type)
-        print(f"Took {time.time() - start} to save the file ({path})")
+        # print(f"Took {time.time() - start} to save the file ({path})")

         return path, len(contents)

@@ -210,7 +232,6 @@ def rows_processor(self, resource, writer, stream):
         redis_conn = None
         progress_key = None
         resource_name = resource.res.descriptor["name"]
-        print("RUNNING ROWS PROCESSOR FOR RESOURCE", resource_name)
         if self.cache_id:
             redis_conn = get_redis_connection()
             redis_conn.sadd(

@@ -221,7 +242,7 @@ def rows_processor(self, resource, writer, stream):
             progress_key = get_redis_progress_key(resource_name, self.cache_id)

         row_number = None
-        print(f"Received at {time.time()}")
+        # print(f"Received at {time.time()}")
         start1 = time.time()

         try:

@@ -240,9 +261,9 @@ def rows_processor(self, resource, writer, stream):
             writer.finalize_file()
             if redis_conn is not None:
                 redis_conn.set(progress_key, REDIS_PROGRESS_SAVING_START_FLAG)
-            print(
-                f"Finished going through loop at {time.time()}, in {time.time() - start1}"
-            )
+            # print(
+            #     f"Finished going through loop at {time.time()}, in {time.time() - start1}"
+            # )

             # Get resource descriptor
             resource_descriptor = resource.res.descriptor

@@ -261,7 +282,7 @@ def rows_processor(self, resource, writer, stream):
             DumperBase.set_attr(
                 resource_descriptor, self.resource_hash, hasher.hexdigest()
             )
-            print(f"After hash, starting to write file at {time.time()}")
+            # print(f"After hash, starting to write file at {time.time()}")

             # Finalise
             stream.seek(0)

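A minimal usage sketch, assuming the dumper is constructed directly (in a pipeline spec the same values would be passed as processor parameters); the bucket name and prefix are placeholders:

# Hypothetical invocation; bucket and prefix values are placeholders.
dumper = S3Dumper(
    "example-bucket",
    "submissions/example/",
    delete=True,  # new option: clear existing objects under the prefix before dumping
)

The option defaults to False, so existing pipelines are unaffected unless they opt in.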