
Commit 256b69b

Add limit_rows_loader

1 parent a2aef41 · commit 256b69b

2 files changed: +100 −4 lines

bcodmo_frictionless/bcodmo_pipeline_processors/loaders/bcodmo_aws.py (+88 −4)
```diff
@@ -1,5 +1,15 @@
-from tabulator.loaders.aws import AWSLoader
+from tabulator import Loader
+from tabulator import exceptions
+from tabulator import helpers
 from tabulator import config
+from six.moves.urllib.parse import urlparse
+
+import time
+import os
+import io
+import boto3
+import base64
+import zlib
 
 from bcodmo_frictionless.bcodmo_pipeline_processors.helper import (
     get_redis_progress_key,
@@ -10,12 +20,13 @@
 )
 
 
-class BcodmoAWS(AWSLoader):
+class BcodmoAWS(Loader):
     options = [
         "s3_endpoint_url",
         "loader_cache_id",
         "loader_resource_name",
         "preloaded_chars",
+        "_limit_rows",
     ]
 
     def __init__(
@@ -25,11 +36,84 @@ def __init__(
         loader_cache_id=None,
         loader_resource_name=None,
         preloaded_chars=None,
+        _limit_rows=None,
     ):
-        super(BcodmoAWS, self).__init__(s3_endpoint_url=s3_endpoint_url)
+        self.__bytes_sample_size = bytes_sample_size
+        self.__s3_endpoint_url = (
+            s3_endpoint_url
+            or os.environ.get("S3_ENDPOINT_URL")
+            or config.S3_DEFAULT_ENDPOINT_URL
+        )
+        self.__s3_client = boto3.client("s3", endpoint_url=self.__s3_endpoint_url)
+        self.__stats = None
+        self.encoding = None
+
         self.loader_cache_id = loader_cache_id
         self.loader_resource_name = loader_resource_name
         self.preloaded_chars = preloaded_chars
+        self.limit_rows = _limit_rows
+
+    def _stream_load(self, source, mode="t", encoding=None):
+        ###
+        # This is the same as the previous load but allows streaming
+        ###
+
+        # Prepare bytes
+        try:
+            # print("Not using shared memory")
+            start = time.time()
+            parts = urlparse(source, allow_fragments=False)
+            response = self.__s3_client.get_object(
+                Bucket=parts.netloc, Key=parts.path[1:]
+            )
+            # https://github.com/frictionlessdata/tabulator-py/issues/271
+            bytes = io.BufferedRandom(io.BytesIO())
+            row_count = 0
+            if self.limit_rows:
+                # We limit the number of rows being streamed
+                while True:
+                    contents = response["Body"].read(amt=1024)
+                    if contents:
+                        row_count += contents.count(b"\n")
+                        bytes.write(contents)
+                        # To be extra safe, we multiply by two and add 100. This is meant to deal with
+                        # situations where there is a weird header/empty row situation at the beginning of the file
+                        # which will later be filtered by the parser
+                        if row_count >= self.limit_rows * 2 + 100:
+                            break
+                    else:
+                        break
+            else:
+                contents = response["Body"].read()
+                bytes.write(contents)
+            bytes.seek(0)
+            try:
+                print(
+                    f"Took {round(time.time() - start, 3)} to load in {os.path.basename(source)}"
+                )
+            except:
+                pass
+
+            if self.__stats:
+                bytes = helpers.BytesStatsWrapper(bytes, self.__stats)
+        except Exception as exception:
+            raise exceptions.LoadingError(str(exception))
+
+        # Return bytes
+        if mode == "b":
+            return bytes
+
+        # Detect encoding
+        if self.__bytes_sample_size:
+            sample = bytes.read(self.__bytes_sample_size)
+            bytes.seek(0)
+            encoding = helpers.detect_encoding(sample, encoding)
+        self.encoding = encoding
+
+        # Prepare chars
+        chars = io.TextIOWrapper(bytes, encoding)
+
+        return chars
 
     def load(self, source, mode="t", encoding=None):
         redis_conn = None
@@ -44,7 +128,7 @@ def load(self, source, mode="t", encoding=None):
                 ex=REDIS_EXPIRES,
             )
         if self.preloaded_chars is None:
-            chars = super(BcodmoAWS, self).load(source, mode=mode, encoding=encoding)
+            chars = self._stream_load(source, mode=mode, encoding=encoding)
         else:
             self.encoding = encoding
             chars = self.preloaded_chars
```
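The interesting part of `_stream_load` is the chunked read: instead of downloading the whole object before parsing (what the old `AWSLoader.load` did), it reads the S3 body 1 KB at a time, counts newlines as a cheap proxy for rows, and stops once `limit_rows * 2 + 100` rows have been buffered. For a large CSV with `_limit_rows=500`, only roughly the first 1,100 lines ever leave S3. A minimal standalone sketch of the same pattern, assuming boto3 credentials are configured (the function name and the bucket/key arguments are illustrative, not part of the commit):

```python
import io
import boto3

def stream_limited(bucket, key, limit_rows, chunk_size=1024):
    # Read an S3 object in chunks, stopping early once enough
    # newline-delimited rows have been buffered. Uses the same
    # safety margin as the commit: limit_rows * 2 + 100.
    body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"]
    buf = io.BufferedRandom(io.BytesIO())  # seekable wrapper, cf. tabulator-py #271
    row_count = 0
    while True:
        chunk = body.read(amt=chunk_size)
        if not chunk:
            break  # reached end of object before hitting the limit
        row_count += chunk.count(b"\n")
        buf.write(chunk)
        if row_count >= limit_rows * 2 + 100:
            break  # enough rows buffered; the rest is never downloaded
    buf.seek(0)
    return buf

# e.g. stream_limited("my-bucket", "data/survey.csv", limit_rows=500)
```

Buffering into `io.BufferedRandom(io.BytesIO())` keeps the result seekable, which tabulator needs for the encoding-detection sample read that follows in the loader.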

bcodmo_frictionless/bcodmo_pipeline_processors/standard_load_multiple.py (+12 −0)
```diff
@@ -34,12 +34,14 @@ def __init__(
         load_sources,
         names,
         sheets=None,
+        limit_rows_loader=None,
         **options,
     ):
         super(standard_load_multiple, self).__init__("", **options)
         self.load_sources = load_sources
         self.names = names
         self.sheets = sheets
+        self.limit_rows_loader = limit_rows_loader
 
     def _set_individual(self, i):
         load_source = self.load_sources[i]
@@ -176,6 +178,16 @@ def safe_process_datapackage(self, dp: Package):
         options = self.options
         if self.preloaded_chars is not None:
             options["preloaded_chars"] = self.preloaded_chars
+
+        # For all formats that don't require being fully loaded in (AKA everything except for xlsx and xls),
+        # we can limit the rows in the streaming itself
+        if (
+            self.limit_rows_loader
+            and self.limit_rows is not None
+            and self.options.get("format") != "xlsx"
+            and self.options.get("format") != "xls"
+        ):
+            options["_limit_rows"] = self.limit_rows
         stream: Stream = Stream(self.load_source, **options).open()
         """ Finish change to add preloaded data """
 
```
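The loader-level limit only works for formats that can be parsed from a partial stream, so the guard above skips `xlsx`/`xls`, whose parsers need the complete workbook; those formats still download fully and apply `limit_rows` only after parsing. A sketch of how the new option might be wired up (the source path is illustrative, and `limit_rows` is assumed to reach the base class through `**options`):

```python
# Hypothetical instantiation; not taken from the commit.
processor = standard_load_multiple(
    load_sources=["s3://my-bucket/survey.csv"],  # illustrative source
    names=["survey"],
    limit_rows_loader=True,  # opt in to limiting rows at the loader
    limit_rows=500,          # rows the pipeline actually keeps
    format="csv",            # csv streams; "xlsx"/"xls" would bypass the loader limit
)
```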
