Skip to content

Commit

Permalink
🐛 Fix normalization issue with quoted & case sensitive columns (#9317)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristopheDuong authored Jan 6, 2022
1 parent e0bac4a commit c5d4a97
Show file tree
Hide file tree
Showing 43 changed files with 1,030 additions and 267 deletions.
1 change: 1 addition & 0 deletions airbyte-integrations/bases/base-normalization/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ integration_tests/normalization_test_output/**/*.yml
# Simple Streams
!integration_tests/normalization_test_output/**/dedup_exchange_rate*.sql
!integration_tests/normalization_test_output/**/exchange_rate.sql
!integration_tests/normalization_test_output/**/test_simple_streams/first_output/airbyte_views/**/multiple_column_names_conflicts_stg.sql
# Nested Streams
# Parent table
!integration_tests/normalization_test_output/**/nested_stream_with*_names_ab*.sql
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@


-- Staging view for the `multiple_column_names_conflicts` stream.
-- Pipeline: (ab1) pull each field out of the raw JSON blob -> (ab2) cast each
-- field to its SQL type -> (final select) prepend an MD5-based hash id computed
-- over all of the business columns.
CREATE OR REPLACE VIEW `dataline-integration-testing`._airbyte_test_normalization.`multiple_column_names_conflicts_stg`
OPTIONS()
AS
WITH __dbt__cte__multiple_column_names_conflicts_ab1 AS (
    -- SQL model to parse JSON blob stored in a single column and extract into
    -- separated field columns as described by the JSON Schema
    -- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_multiple_column_names_conflicts
    --
    -- Several JSON keys differ only by letter case, spacing or punctuation; each
    -- one is given its own output column name, disambiguated by a numeric suffix.
    SELECT
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['id']")      AS id,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['User Id']") AS User_Id,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['user_id']") AS user_id_1,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['User id']") AS User_id_2,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['user id']") AS user_id_3,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['User@Id']") AS User_Id_4,
        JSON_EXTRACT_SCALAR(_airbyte_data, "$['UserId']")  AS UserId,
        _airbyte_ab_id,
        _airbyte_emitted_at,
        CURRENT_TIMESTAMP() AS _airbyte_normalized_at
    FROM `dataline-integration-testing`.test_normalization._airbyte_raw_multiple_column_names_conflicts AS raw_records
    -- multiple_column_names_conflicts
    WHERE 1 = 1
),
__dbt__cte__multiple_column_names_conflicts_ab2 AS (
    -- SQL model to cast each column to its adequate SQL type converted from the
    -- JSON schema type
    -- depends_on: __dbt__cte__multiple_column_names_conflicts_ab1
    SELECT
        CAST(id AS INT64)          AS id,
        CAST(User_Id AS STRING)    AS User_Id,
        CAST(user_id_1 AS FLOAT64) AS user_id_1,
        CAST(User_id_2 AS FLOAT64) AS User_id_2,
        CAST(user_id_3 AS FLOAT64) AS user_id_3,
        CAST(User_Id_4 AS STRING)  AS User_Id_4,
        CAST(UserId AS FLOAT64)    AS UserId,
        _airbyte_ab_id,
        _airbyte_emitted_at,
        CURRENT_TIMESTAMP() AS _airbyte_normalized_at
    FROM __dbt__cte__multiple_column_names_conflicts_ab1
    -- multiple_column_names_conflicts
    WHERE 1 = 1
)
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__multiple_column_names_conflicts_ab2
SELECT
    -- Each column is rendered as a string (NULL becomes ''), the pieces are
    -- joined with '-', and the result is MD5-hashed and hex-encoded.
    TO_HEX(MD5(CAST(
        CONCAT(
            COALESCE(CAST(id AS STRING), ''), '-',
            COALESCE(CAST(User_Id AS STRING), ''), '-',
            COALESCE(CAST(user_id_1 AS STRING), ''), '-',
            COALESCE(CAST(User_id_2 AS STRING), ''), '-',
            COALESCE(CAST(user_id_3 AS STRING), ''), '-',
            COALESCE(CAST(User_Id_4 AS STRING), ''), '-',
            COALESCE(CAST(UserId AS STRING), '')
        ) AS STRING
    ))) AS _airbyte_multiple_column_names_conflicts_hashid,
    typed.*
FROM __dbt__cte__multiple_column_names_conflicts_ab2 AS typed
-- multiple_column_names_conflicts
WHERE 1 = 1
;

Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ sources:
- name: _airbyte_raw_dedup_cdc_excluded
- name: _airbyte_raw_dedup_exchange_rate
- name: _airbyte_raw_exchange_rate
- name: _airbyte_raw_multiple_column_names_conflicts
- name: _airbyte_raw_pos_dedup_cdcx
- name: _airbyte_raw_renamed_dedup_cdc_excluded

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@


-- Staging view for the `multiple_column_names_conflicts` stream.
-- Pipeline: (ab1) pull each field out of the raw JSON blob -> (ab2) cast each
-- field to its SQL type -> (final select) prepend a hash id computed over all
-- of the business columns.
create view _airbyte_test_normalization.multiple_column_names_conflicts_stg__dbt_tmp

as (

with __dbt__cte__multiple_column_names_conflicts_ab1 as (

-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: test_normalization._airbyte_raw_multiple_column_names_conflicts
-- Column aliases reuse the JSON key verbatim; keys containing spaces or '@'
-- are written as double-quoted identifiers so they stay distinct from the
-- similarly spelled unquoted ones (user_id, UserId).
select
JSONExtractRaw(_airbyte_data, 'id') as id,
JSONExtractRaw(_airbyte_data, 'User Id') as "User Id",
JSONExtractRaw(_airbyte_data, 'user_id') as user_id,
JSONExtractRaw(_airbyte_data, 'User id') as "User id",
JSONExtractRaw(_airbyte_data, 'user id') as "user id",
JSONExtractRaw(_airbyte_data, 'User@Id') as "User@Id",
JSONExtractRaw(_airbyte_data, 'UserId') as UserId,
_airbyte_ab_id,
_airbyte_emitted_at,
now() as _airbyte_normalized_at
from test_normalization._airbyte_raw_multiple_column_names_conflicts as table_alias
-- multiple_column_names_conflicts
where 1 = 1

), __dbt__cte__multiple_column_names_conflicts_ab2 as (

-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__multiple_column_names_conflicts_ab1
-- Numeric fields go through accurateCastOrNull with a type name passed as a
-- string literal; those literals span several lines, so the line breaks inside
-- the quotes are part of the literal and must not be edited or commented into.
-- String fields ("User Id", "User@Id") have surrounding double quotes trimmed
-- and the literal text 'null' mapped to NULL (presumably because the raw JSON
-- extraction keeps JSON quoting and renders JSON null as text -- confirm).
select
accurateCastOrNull(id, '
BIGINT
') as id,
nullif(accurateCastOrNull(trim(BOTH '"' from "User Id"), 'String'), 'null') as "User Id",
accurateCastOrNull(user_id, '
Float64
') as user_id,
accurateCastOrNull("User id", '
Float64
') as "User id",
accurateCastOrNull("user id", '
Float64
') as "user id",
nullif(accurateCastOrNull(trim(BOTH '"' from "User@Id"), 'String'), 'null') as "User@Id",
accurateCastOrNull(UserId, '
Float64
') as UserId,
_airbyte_ab_id,
_airbyte_emitted_at,
now() as _airbyte_normalized_at
from __dbt__cte__multiple_column_names_conflicts_ab1
-- multiple_column_names_conflicts
where 1 = 1

)-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__multiple_column_names_conflicts_ab2
-- The hash input is the string form of every business column joined with '~';
-- it is MD5-hashed, hex-encoded and wrapped in assumeNotNull.
-- NOTE(review): no NULL fallback is applied to the individual columns before
-- concatenation. If NULL propagates through `||` here, every row with at least
-- one NULL column would end up with the same hash value -- confirm against
-- ClickHouse semantics for concat / assumeNotNull.
-- NOTE(review): the alias `_airbyte_multiple_co__ames_conflicts_hashid` looks
-- like a shortened form of the stream name, presumably to satisfy an
-- identifier-length limit -- verify against the name generator.
select
assumeNotNull(hex(MD5(

toString(id) || '~' ||


toString("User Id") || '~' ||


toString(user_id) || '~' ||


toString("User id") || '~' ||


toString("user id") || '~' ||


toString("User@Id") || '~' ||


toString(UserId)

))) as _airbyte_multiple_co__ames_conflicts_hashid,
tmp.*
from __dbt__cte__multiple_column_names_conflicts_ab2 tmp
-- multiple_column_names_conflicts
where 1 = 1

)

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Loading

0 comments on commit c5d4a97

Please sign in to comment.