
Commit 889c647

Fix tests, bump dataflows, add infer and cast strategy default, remove print statements
1 parent f2e3415 commit 889c647

File tree

9 files changed: +194 −86 lines changed


bcodmo_frictionless/bcodmo_pipeline_processors/__init__.py

+1 −1
@@ -21,4 +21,4 @@
 from .set_types import flow as set_types
 from .edit_cells import flow as edit_cells
 
-from .join import flow as join
+# from .join import flow as join

bcodmo_frictionless/bcodmo_pipeline_processors/dump_to_s3.py

+5 −33
@@ -14,7 +14,7 @@
 from billiard import Process, Queue, Pool
 
 from dataflows.processors.dumpers.dumper_base import DumperBase
-from dataflows.processors.dumpers.file_formats import CSVFormat, JSONFormat, FileFormat
+from dataflows.processors.dumpers.formats import CSVFormat, JSONFormat, FileFormat
 from bcodmo_frictionless.bcodmo_pipeline_processors.helper import (
     get_redis_progress_key,
     get_redis_progress_resource_key,
@@ -25,6 +25,7 @@
     REDIS_PROGRESS_SAVING_DONE_FLAG,
     REDIS_EXPIRES,
 )
+from bcodmo_frictionless.bcodmo_pipeline_processors.helper import get_missing_values
 
 WINDOWS_LINE_ENDING = b"\r\n"
 UNIX_LINE_ENDING = b"\n"
@@ -543,63 +544,33 @@ def rows_processor(self, resource, writer, stream):
         row_number = None
         upload_id = None
 
-        writer_timer_sum = 0
-        process_timer_sum = 0
-        process_timer_count = 0
-        process_timer = None
-        redis_timer_sum = 0
-
         try:
             row_number = 0
             part_number = 0
-            start = time.time()
             timer = time.time()
 
-            yield_start = time.time()
-            yield_total = 0
-            async_total = 0
-            loop_total = 0
             for row in resource:
-                loop_total += time.time() - yield_start
-
                 row_number += 1
 
-                writer_timer = time.time()
                 writer.write_row(row)
-                writer_timer_sum += time.time() - writer_timer
 
-                redis_timer = time.time()
                 if redis_conn is not None and time.time() - timer > 0.75:
                     redis_conn.set(progress_key, row_number, ex=REDIS_EXPIRES)
                     timer = time.time()
-                redis_timer_sum += time.time() - redis_timer
 
-                async_start = time.time()
-                if row_number % 25 == 0 and stream.tell() > calculate_partsize(
+                if row_number % 100 == 0 and stream.tell() > calculate_partsize(
                     part_number
                 ):
                     part_number, upload_id, writer, stream = self.async_write_part(
                         stream, resource, part_number, object_key, upload_id, False
                     )
-                async_total += time.time() - async_start
-
-                yield_total += time.time() - yield_start
-                if (row_number + 1) % 10000 == 0:
-                    # print(
-                    #     f"total {yield_total}. async {async_total}. redis {redis_timer_sum}. writer {writer_timer_sum}. loop {loop_total}"
-                    # )
-                    yield_total = 0
-                    async_total = 0
-                    redis_timer_sum = 0
-                    writer_timer_sum = 0
-                    loop_total = 0
+
                 if (
                     self.limit_yield is None
                     or self.limit_yield < 0
                     or row_number <= self.limit_yield
                 ):
                     yield row
-                yield_start = time.time()
             # Set row number values
             DumperBase.inc_attr(
                 self.datapackage.descriptor, self.datapackage_rowcount, row_number
@@ -613,6 +584,7 @@ def rows_processor(self, resource, writer, stream):
         row_number = None
         writer.finalize_file()
         # Upload final part
+        print("Starting last upload")
         part_number, _, _, stream = self.async_write_part(
             stream, resource, part_number, object_key, upload_id, True
         )
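
For orientation, the reshaped hot loop in rows_processor amounts to the pattern below. This is a minimal sketch, not the actual method: REDIS_EXPIRES and calculate_partsize are placeholders here, and async_write_part stands in for the dumper method of the same name referenced in the diff.

import time

REDIS_EXPIRES = 60 * 60  # placeholder; the real constant lives in helper.py

def calculate_partsize(part_number):
    # Placeholder: S3 multipart parts must be at least 5 MiB; the real module
    # computes the target size per part number.
    return 5 * 1024 * 1024

def write_rows(resource, writer, stream, redis_conn, progress_key,
               object_key, async_write_part, limit_yield=None):
    # Sketch of the reshaped loop; async_write_part is supplied by the caller.
    row_number = 0
    part_number = 0
    upload_id = None
    timer = time.time()
    for row in resource:
        row_number += 1
        writer.write_row(row)

        # Report progress to Redis at most once every 0.75 seconds.
        if redis_conn is not None and time.time() - timer > 0.75:
            redis_conn.set(progress_key, row_number, ex=REDIS_EXPIRES)
            timer = time.time()

        # Check the buffer only every 100 rows (previously 25), and flush a
        # multipart part once it passes the target part size.
        if row_number % 100 == 0 and stream.tell() > calculate_partsize(part_number):
            part_number, upload_id, writer, stream = async_write_part(
                stream, resource, part_number, object_key, upload_id, False
            )

        # Optionally cap how many rows continue downstream.
        if limit_yield is None or limit_yield < 0 or row_number <= limit_yield:
            yield row

Probing stream.tell() every 100th row instead of every 25th trades slightly coarser part boundaries for fewer size checks on the per-row path.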

bcodmo_frictionless/bcodmo_pipeline_processors/load.py

+4
@@ -354,6 +354,10 @@ def process_resource(rows, missing_data_values):
                 ex=REDIS_EXPIRES,
             )
 
+    # Default all infer and cast strategy to string to handle an update from dataflows that deprecates old pipelines
+    # https://bco-dmo-group.slack.com/archives/CSQ582V4Y/p1712063770616059
+    parameters["infer_strategy"] = "strings"
+    parameters["cast_strategy"] = "strings"
     params.extend(
         [
             count_resources(),
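
The effect of the new defaults, sketched as a plain dataflows call (the bcodmo load processor forwards these parameters to dataflows' load; the file path below is illustrative): with infer_strategy and cast_strategy both set to "strings", every field is inferred and cast as a string, leaving typing to a later set_types step, which is how pre-0.5 pipelines behaved.

from dataflows import Flow, load, printer

# Illustrative only: dataflows' load() accepts the same strategy strings the
# processor now defaults to ("strings" is also exposed as load.INFER_STRINGS
# and load.CAST_TO_STRINGS).
Flow(
    load(
        "data/test_multiline_header.csv",  # any local CSV works here
        name="res",
        format="csv",
        infer_strategy="strings",  # infer every field as a string
        cast_strategy="strings",   # cast every value to a string
    ),
    printer(),
).process()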

data/test_multiline_header.csv

+6
@@ -0,0 +1,6 @@
+1,2,3,4,5
+5,4,3,2,1
+abc,1,1.532,12/29/19,1
+abc,2,35.131,12/30/19,1
+def,1,53.1,12/31/19,1
+ghi,3,54262.5,01/01/20,1

setup.py

+1 −2
@@ -5,11 +5,10 @@
         "datapackage-pipelines==2.2.6",
         # "datapackage-pipelines @ git+https://github.com/frictionlessdata/datapackage-pipelines.git@d78d1391adf6470ca484303e512e038f7dc57483",
         "pyparsing==3.1.2",
-        "dataflows==0.3.1",
+        "dataflows==0.5.5",
         # "dataflows @ git+https://github.com/cschloer/dataflows.git@master",
         # "tabulator==1.53.5",
         "tabulator @ git+https://github.com/BCODMO/tabulator-py.git@main",
-        "tableschema==1.16.4",
         "goodtables==2.5.0",
         "python-dateutil==2.8.0",
         "xlrd==1.2.0",

tests/processors/test_dump_to_s3.py

+4
@@ -38,6 +38,8 @@ def test_dump_s3():
                 "from": "s3://testing_bucket/test.csv",
                 "name": "res",
                 "format": "csv",
+                "infer_strategy": "strings",
+                "cast_strategy": "strings",
             }
         ),
         dump_to_s3(
@@ -73,6 +75,8 @@ def test_dump_s3_hash():
                 "from": "s3://testing_bucket/test.csv",
                 "name": "res",
                 "format": "csv",
+                "infer_strategy": "strings",
+                "cast_strategy": "strings",
             }
         ),
         dump_to_s3(

tests/processors/test_join.py

+19 −7
@@ -1,6 +1,6 @@
 import pytest
 import os
-from dataflows import Flow
+from dataflows import Flow, join
 from decimal import Decimal
 
 from bcodmo_frictionless.bcodmo_pipeline_processors import *
@@ -25,6 +25,23 @@ def test_join():
     flows = [
         data2,
         data1,
+        join(
+            "res_1",
+            "{#}",
+            "res_2",
+            "{#}",
+            fields={"col2": {"name": "col2"}},
+            source_delete=True,
+            mode="half-outer",
+        ),
+    ]
+    rows, datapackage, _ = Flow(*flows).results()
+    print(rows)
+    assert rows == [
+        [{"col1": 1, "col2": 1}, {"col1": 2, "col2": 2}, {"col1": 3, "col2": 3}]
+    ]
+
+    """
     join({
         "source": {
             "name": "res_1",
@@ -38,9 +55,4 @@ def test_join():
             "fields": {"col2": {"name": "col2"}},
             "mode": "half-outer",
         }),
-    ]
-    rows, datapackage, _ = Flow(*flows).results()
-    print(rows)
-    assert rows == [
-        [{"col1": 1, "col2": 1}, {"col1": 2, "col2": 2}, {"col1": 3, "col2": 3}]
-    ]
+    """
