Skip to content

Commit

Permalink
🐛Source Google search console: Fix incremental sync: keep all urls in…
Browse files Browse the repository at this point in the history
… state object (#9194)

* bugfix: keep all urls in state object

Signed-off-by: Sergey Chvalyuk <grubberr@gmail.com>
  • Loading branch information
grubberr authored Jan 5, 2022
1 parent 0161dd6 commit 8ace9ea
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"sourceDefinitionId": "eb4c9e00-db83-4d63-a386-39cfa91012a8",
"name": "Google Search Console",
"dockerRepository": "airbyte/source-google-search-console",
"dockerImageTag": "0.1.9",
"dockerImageTag": "0.1.11",
"documentationUrl": "https://docs.airbyte.io/integrations/sources/google-search-console",
"icon": "googlesearchconsole.svg"
}
25 changes: 14 additions & 11 deletions airbyte-config/init/src/main/resources/seed/source_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2458,7 +2458,7 @@
- - "client_secret"
oauthFlowOutputParameters:
- - "refresh_token"
- dockerImage: "airbyte/source-google-search-console:0.1.10"
- dockerImage: "airbyte/source-google-search-console:0.1.11"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/sources/google-search-console"
connectionSpecification:
Expand All @@ -2475,6 +2475,7 @@
type: "array"
items:
type: "string"
title: "Site URLs"
description: "Website URLs property; do not include the domain-level property\
\ in the list"
examples:
Expand All @@ -2483,16 +2484,18 @@
order: 0
start_date:
type: "string"
description: "The date from which you'd like to replicate data in the format\
\ YYYY-MM-DD."
title: "Start Date"
description: "UTC date in the format 2017-01-25. Any data before this date\
\ will not be replicated."
examples:
- "2021-01-01"
pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
order: 1
end_date:
type: "string"
description: "The date from which you'd like to replicate data in the format\
\ YYYY-MM-DD. Must be greater or equal start_date field"
title: "End Date"
description: "UTC date in the format 2017-01-25. Any data after this date\
\ will not be replicated. Must be greater or equal to the Start Date field."
examples:
- "2021-12-12"
pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
Expand Down Expand Up @@ -2520,24 +2523,24 @@
client_id:
title: "Client ID"
type: "string"
description: "The Client ID of your developer application"
description: "The Client ID of your Google Search Console developer\
\ application."
airbyte_secret: true
client_secret:
title: "Client Secret"
type: "string"
description: "The client secret of your developer application"
description: "The Client Secret of your Google Search Console developer\
\ application."
airbyte_secret: true
access_token:
title: "Access Token"
type: "string"
description: "An access token generated using the above client ID\
\ and secret"
description: "Access Token for making authenticated requests."
airbyte_secret: true
refresh_token:
title: "Refresh Token"
type: "string"
description: "A refresh token generated using the above client ID\
\ and secret"
description: "The token for obtaining a new access token."
airbyte_secret: true
- type: "object"
title: "Service Account Key Authentication"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,23 +236,35 @@ def get_updated_state(
"""
With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
and `searchType`. This functionality is placed in `get_updated_state`.
{
"stream": {
"http://domain1.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"http://domain2.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"date": "2022-01-03",
}
}
"""

latest_benchmark = latest_record[self.cursor_field]

site_url = latest_record.get("site_url")
search_type = latest_record.get("search_type")

if current_stream_state.get(site_url, {}).get(search_type):
current_stream_state[site_url][search_type] = {
self.cursor_field: max(latest_benchmark, current_stream_state[site_url][search_type][self.cursor_field])
}

elif current_stream_state.get(site_url):
current_stream_state[site_url][search_type] = {self.cursor_field: latest_benchmark}

else:
current_stream_state = {site_url: {search_type: {self.cursor_field: latest_benchmark}}}
value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
if value:
latest_benchmark = max(latest_benchmark, value)
current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark

# we need to get the max date over all searchTypes but the current acceptance test YAML format doesn't
# support that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,19 @@ def test_state(current_stream_state, latest_record, expected):

value = stream.get_updated_state(current_stream_state, latest_record)
assert value == expected


def test_updated_state():
    """Check that successive updates keep one state entry per site URL."""
    site_urls = ("https://domain1.com", "https://domain2.com")
    stream = SearchAnalyticsByDate(NoAuth(), list(site_urls), "start_date", "end_date")

    # Feed one "web" record per site, threading the state through each call.
    state = {}
    for site_url in site_urls:
        record = {"site_url": site_url, "search_type": "web", "date": "2022-01-01"}
        state = stream.get_updated_state(state, record)

    expected = {
        "https://domain1.com": {"web": {"date": "2022-01-01"}},
        "https://domain2.com": {"web": {"date": "2022-01-01"}},
        "date": "2022-01-01",
    }
    assert state == expected
2 changes: 1 addition & 1 deletion docs/integrations/sources/google-search-console.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ You should now be ready to use the Google Search Console connector

| Version | Date | Pull Request | Subject |
| :--- | :--- | :--- | :--- |
| `0.1.11` | 2022-01-05 | [9186](https://github.com/airbytehq/airbyte/pull/9186) | Update titles and descriptions |
| `0.1.11` | 2022-01-05 | [9186](https://github.com/airbytehq/airbyte/pull/9186), [9194](https://github.com/airbytehq/airbyte/pull/9194) | Update titles and descriptions; fix incremental sync: keep all URLs in state object |
| `0.1.10` | 2021-12-23 | [9073](https://github.com/airbytehq/airbyte/pull/9073) | Add slicing by date range |
| `0.1.9` | 2021-12-22 | [9047](https://github.com/airbytehq/airbyte/pull/9047) | Add 'order' to spec.json props |
| `0.1.8` | 2021-12-21 | [8248](https://github.com/airbytehq/airbyte/pull/8248) | Enable Sentry for performance and errors tracking |
Expand Down

0 comments on commit 8ace9ea

Please sign in to comment.