Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛Source Google search console: Fix incremental sync: keep all urls in state object #9194

Merged
merged 10 commits into from
Jan 5, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"sourceDefinitionId": "eb4c9e00-db83-4d63-a386-39cfa91012a8",
"name": "Google Search Console",
"dockerRepository": "airbyte/source-google-search-console",
"dockerImageTag": "0.1.9",
"dockerImageTag": "0.1.11",
"documentationUrl": "https://docs.airbyte.io/integrations/sources/google-search-console",
"icon": "googlesearchconsole.svg"
}
25 changes: 14 additions & 11 deletions airbyte-config/init/src/main/resources/seed/source_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2458,7 +2458,7 @@
- - "client_secret"
oauthFlowOutputParameters:
- - "refresh_token"
- dockerImage: "airbyte/source-google-search-console:0.1.10"
- dockerImage: "airbyte/source-google-search-console:0.1.11"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/sources/google-search-console"
connectionSpecification:
Expand All @@ -2475,6 +2475,7 @@
type: "array"
items:
type: "string"
title: "Site URLs"
description: "Website URLs property; do not include the domain-level property\
\ in the list"
examples:
Expand All @@ -2483,16 +2484,18 @@
order: 0
start_date:
type: "string"
description: "The date from which you'd like to replicate data in the format\
\ YYYY-MM-DD."
title: "Start Date"
description: "UTC date in the format 2017-01-25. Any data before this date\
\ will not be replicated."
examples:
- "2021-01-01"
pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
order: 1
end_date:
type: "string"
description: "The date from which you'd like to replicate data in the format\
\ YYYY-MM-DD. Must be greater or equal start_date field"
title: "End Date"
description: "UTC date in the format 2017-01-25. Any data after this date\
\ will not be replicated. Must be greater or equal to the Start Date field."
examples:
- "2021-12-12"
pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
Expand Down Expand Up @@ -2520,24 +2523,24 @@
client_id:
title: "Client ID"
type: "string"
description: "The Client ID of your developer application"
description: "The Client ID of your Google Search Console developer\
\ application."
airbyte_secret: true
client_secret:
title: "Client Secret"
type: "string"
description: "The client secret of your developer application"
description: "The Client Secret of your Google Search Console developer\
\ application."
airbyte_secret: true
access_token:
title: "Access Token"
type: "string"
description: "An access token generated using the above client ID\
\ and secret"
description: "Access Token for making authenticated requests."
airbyte_secret: true
refresh_token:
title: "Refresh Token"
type: "string"
description: "A refresh token generated using the above client ID\
\ and secret"
description: "The token for obtaining a new access token."
airbyte_secret: true
- type: "object"
title: "Service Account Key Authentication"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,23 +236,35 @@ def get_updated_state(
"""
With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
and `searchType`. This functionality is placed in `get_updated_state`.

{
"stream": {
"http://domain1.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"http://domain2.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"date": "2022-01-03",
}
}
"""

latest_benchmark = latest_record[self.cursor_field]

site_url = latest_record.get("site_url")
search_type = latest_record.get("search_type")

if current_stream_state.get(site_url, {}).get(search_type):
current_stream_state[site_url][search_type] = {
self.cursor_field: max(latest_benchmark, current_stream_state[site_url][search_type][self.cursor_field])
}

elif current_stream_state.get(site_url):
current_stream_state[site_url][search_type] = {self.cursor_field: latest_benchmark}

else:
current_stream_state = {site_url: {search_type: {self.cursor_field: latest_benchmark}}}
value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
if value:
latest_benchmark = max(latest_benchmark, value)
current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark

# we need to get the max date over all searchTypes but the current acceptance test YAML format doesn't
# support that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,19 @@ def test_state(current_stream_state, latest_record, expected):

value = stream.get_updated_state(current_stream_state, latest_record)
assert value == expected


def test_updated_state():
    """Updating state for a second site URL must keep the first site's entry.

    One "web" record per site is fed through `get_updated_state`, with the
    returned state passed into the next call. The final state is expected to
    hold a cursor for both sites plus a top-level `date` entry.
    """
    stream = SearchAnalyticsByDate(NoAuth(), ["https://domain1.com", "https://domain2.com"], "start_date", "end_date")

    accumulated = {}
    for site in ("https://domain1.com", "https://domain2.com"):
        incoming = {"site_url": site, "search_type": "web", "date": "2022-01-01"}
        # Thread the returned state through, as consecutive records would do.
        accumulated = stream.get_updated_state(accumulated, incoming)

    expected = {
        "https://domain1.com": {"web": {"date": "2022-01-01"}},
        "https://domain2.com": {"web": {"date": "2022-01-01"}},
        "date": "2022-01-01",
    }
    assert accumulated == expected
2 changes: 1 addition & 1 deletion docs/integrations/sources/google-search-console.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ You should now be ready to use the Google Workspace Admin Reports API connector

| Version | Date | Pull Request | Subject |
| :--- | :--- | :--- | :--- |
| `0.1.11` | 2022-01-05 | [9186](https://github.com/airbytehq/airbyte/pull/9186) | Update titles and descriptions |
| `0.1.11` | 2022-01-05 | [9186](https://github.com/airbytehq/airbyte/pull/9186) [9194](https://github.com/airbytehq/airbyte/pull/9194) | Fix incremental sync: keep all urls in state object |
| `0.1.10` | 2021-12-23 | [9073](https://github.com/airbytehq/airbyte/pull/9073) | Add slicing by date range |
| `0.1.9` | 2021-12-22 | [9047](https://github.com/airbytehq/airbyte/pull/9047) | Add 'order' to spec.json props |
| `0.1.8` | 2021-12-21 | [8248](https://github.com/airbytehq/airbyte/pull/8248) | Enable Sentry for performance and errors tracking |
Expand Down