airbytehq · edgao · Jun 14, 2022 · May 9, 2022 · May 12, 2022 · May 13, 2022
diff --git a/...ion/integration_tests/resources/test_simple_streams/data_input/catalog_schema_change.json b/...ion/integration_tests/resources/test_simple_streams/data_input/catalog_schema_change.json
@@ -119,6 +119,38 @@
       "cursor_field": [],
       "destination_sync_mode": "append_dedup",
       "primary_key": [["id"]]
+    },
+    {
+      "stream": {
+        "name": "dedup_cdc_excluded",
+        "json_schema": {
+          "type": ["null", "object"],
+          "properties": {
+            "id": {
+              "type": "integer"
+            },
+            "name": {
+              "type": ["string", "null"]
+            },
+            "_ab_cdc_lsn": {
+              "type": ["null", "number"]
+            },
+            "_ab_cdc_updated_at": {
+              "type": ["null", "number"]
+            },
+            "_ab_cdc_deleted_at": {
+              "type": ["null", "number"]
+            }
+          }
+        },
+        "supported_sync_modes": ["full_refresh", "incremental"],
+        "source_defined_cursor": true,
+        "default_cursor_field": []
+      },
+      "sync_mode": "incremental",
+      "cursor_field": ["_ab_cdc_lsn"],
+      "destination_sync_mode": "append_dedup",
+      "primary_key": [["id"]]
     }
   ]
 }
diff --git a/...ion/integration_tests/resources/test_simple_streams/data_input/messages_schema_change.txt b/...ion/integration_tests/resources/test_simple_streams/data_input/messages_schema_change.txt
@@ -11,3 +11,5 @@
 {"type":"RECORD","record":{"stream":"renamed_dedup_cdc_excluded","data":{"id":8,"name":"vw","column`_'with\"_quotes":"ma\"z`d'a","_ab_cdc_updated_at":1623949314663,"_ab_cdc_lsn":26985264,"_ab_cdc_deleted_at":null},"emitted_at":1623960160}}
 {"type":"RECORD","record":{"stream":"renamed_dedup_cdc_excluded","data":{"id":9,"name":"opel","column`_'with\"_quotes":"ma\"z`d'a","_ab_cdc_updated_at":1623950868109,"_ab_cdc_lsn":28009440,"_ab_cdc_deleted_at":null},"emitted_at":1623961660}}
 {"type":"RECORD","record":{"stream":"renamed_dedup_cdc_excluded","data":{"id":9,"name":null,"column`_'with\"_quotes":"ma\"z`d'a","_ab_cdc_updated_at":1623950868371,"_ab_cdc_lsn":28010232,"_ab_cdc_deleted_at":1623950868371},"emitted_at":1623961660}}
+
+{"type":"RECORD","record":{"stream":"dedup_cdc_excluded","data":{"id":8,"name":"ford","column`_'with\"_quotes":"ma\"z`d'a","_ab_cdc_updated_at":1625000000000,"_ab_cdc_lsn":29020252,"_ab_cdc_deleted_at":1625000000000},"emitted_at":1625000000000}}
diff --git a/.../dbt_test_config/dbt_data_tests_tmp_schema_change/simple_streams_third_run_row_counts.sql b/.../dbt_test_config/dbt_data_tests_tmp_schema_change/simple_streams_third_run_row_counts.sql
@@ -18,13 +18,13 @@ union all
 
 union all
 
-    select distinct '_airbyte_raw_dedup_cdc_excluded' as label, count(*) as row_count, 3 as expected_count
+    select distinct '_airbyte_raw_dedup_cdc_excluded' as label, count(*) as row_count, 4 as expected_count
     from test_normalization._airbyte_raw_dedup_cdc_excluded
 union all
-    select distinct 'dedup_cdc_excluded_scd' as label, count(*) as row_count, 10 as expected_count
+    select distinct 'dedup_cdc_excluded_scd' as label, count(*) as row_count, 11 as expected_count
     from test_normalization.dedup_cdc_excluded_scd
 union all
-    select distinct 'dedup_cdc_excluded' as label, count(*) as row_count, 4 as expected_count
+    select distinct 'dedup_cdc_excluded' as label, count(*) as row_count, 3 as expected_count
     from test_normalization.dedup_cdc_excluded
 )
 select *

diff --git a/...integrations/bases/base-normalization/normalization/transform_catalog/stream_processor.py b/...integrations/bases/base-normalization/normalization/transform_catalog/stream_processor.py
@@ -849,7 +849,7 @@ def generate_scd_type_2_model(self, from_table: str, column_names: Dict[str, Tup
             jinja_variables["scd_columns_sql"] = scd_columns_sql
         sql = Template(
             """
--- depends on: {{ from_table }}
+-- depends_on: {{ from_table }}
 with
 {{ '{% if is_incremental() %}' }}
 new_data as (
@@ -1120,11 +1120,15 @@ def add_to_outputs(
 
                 final_table_name = self.tables_registry.get_file_name(schema, self.json_path, self.stream_name, "", truncate_name)
                 active_row_column_name = self.name_transformer.normalize_column_name("_airbyte_active_row")
+                clickhouse_nullable_join_setting = ""
                 if self.destination_type == DestinationType.CLICKHOUSE:
                     # Clickhouse has special delete syntax
                     delete_statement = "alter table {{ final_table_relation }} delete"
                     unique_key_reference = self.get_unique_key(in_jinja=False)
                     noop_delete_statement = "alter table {{ this }} delete where 1=0"
+                    # Without this, ClickHouse fills unmatched LEFT JOIN columns with the type's default value (e.g. empty string) instead of NULL,
+                    # so our COUNT would include those rows. We want to exclude them (as other DBs do by default), so we set join_use_nulls=1.
+                    clickhouse_nullable_join_setting = "SETTINGS join_use_nulls=1"
                 elif self.destination_type == DestinationType.BIGQUERY:
                     # Bigquery doesn't like the "delete from project.schema.table where project.schema.table.column in" syntax;
                     # it requires "delete from project.schema.table table_alias where table_alias.column in"
@@ -1154,17 +1158,24 @@ def add_to_outputs(
                     {{ '%}' }}
 
                     -- Delete records which are no longer active:
-                    -- The first subquery finds the most recent increment to the SCD table
-                    -- The second subquery finds, within that increment, the records which are still active
-                    -- We want to delete rows which are in that increment, but are not active
+                    -- This query is equivalent, but the left join version is more performant:
+                    -- delete from final_table where unique_key in (
+                    --     select unique_key from scd_table where 1 = 1 <incremental_clause(normalized_at, final_table)>
+                    -- ) and unique_key not in (
+                    --     select unique_key from scd_table where active_row = 1 <incremental_clause(normalized_at, final_table)>
+                    -- )
                     {{ delete_statement }} where {{ unique_key_reference }} in (
-                        select {{ unique_key }}
-                        from {{ '{{ this }}' }}
-                        where 1 = 1 {{ normalized_at_incremental_clause }}
-                    ) and {{ unique_key_reference }} not in (
-                        select {{ unique_key }}
+                        select distinct {{ unique_key }}
                         from {{ '{{ this }}' }}
-                        where {{ active_row_column_name }} = 1 {{ normalized_at_incremental_clause }}
+                        left join (
+                            select {{ unique_key }} as active_unique_key
+                            from {{ '{{ this }}' }}
+                            where {{ active_row_column_name }} = 1 {{ normalized_at_incremental_clause }}
+                        ) active_recent_scd_rows on {{ unique_key }} = active_unique_key
+                        where 1=1 {{ normalized_at_incremental_clause }}
+                        group by {{ unique_key }}
+                        having count(active_unique_key) = 0
+                        {{ clickhouse_nullable_join_setting }}
                     )
                     {{ '{% else %}' }}
                     -- We have to have a non-empty query, so just do a noop delete
@@ -1183,6 +1194,7 @@ def add_to_outputs(
                         self.get_normalized_at(in_jinja=True),
                     ),
                     unique_key_reference=unique_key_reference,
+                    clickhouse_nullable_join_setting=clickhouse_nullable_join_setting,
                 )
                 hooks.append(deletion_hook)