Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛Source Google search console: Fix incremental sync: keep all urls in state object #9194

Merged
merged 10 commits into from
Jan 5, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"sourceDefinitionId": "eb4c9e00-db83-4d63-a386-39cfa91012a8",
"name": "Google Search Console",
"dockerRepository": "airbyte/source-google-search-console",
"dockerImageTag": "0.1.9",
"dockerImageTag": "0.1.11",
"documentationUrl": "https://docs.airbyte.io/integrations/sources/google-search-console",
"icon": "googlesearchconsole.svg"
}
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@
- name: Google Search Console
sourceDefinitionId: eb4c9e00-db83-4d63-a386-39cfa91012a8
dockerRepository: airbyte/source-google-search-console
dockerImageTag: 0.1.10
dockerImageTag: 0.1.11
documentationUrl: https://docs.airbyte.io/integrations/sources/google-search-console
icon: googlesearchconsole.svg
sourceType: api
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENV SENTRY_DSN "https://d4b03de0c4574c78999b8d58e55243dc@o1009025.ingest.sentry.io/6102835"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.1.10
LABEL io.airbyte.version=0.1.11
LABEL io.airbyte.name=airbyte/source-google-search-console
Original file line number Diff line number Diff line change
Expand Up @@ -236,23 +236,35 @@ def get_updated_state(
"""
With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
and `searchType`. This functionality is placed in `get_updated_state`.

{
"stream": {
"http://domain1.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"http://domain2.com": {
"web": {"date": "2022-01-03"},
"news": {"date": "2022-01-03"},
"image": {"date": "2022-01-03"},
"video": {"date": "2022-01-03"}
},
"date": "2022-01-03",
}
}
"""

latest_benchmark = latest_record[self.cursor_field]

site_url = latest_record.get("site_url")
search_type = latest_record.get("search_type")

if current_stream_state.get(site_url, {}).get(search_type):
current_stream_state[site_url][search_type] = {
self.cursor_field: max(latest_benchmark, current_stream_state[site_url][search_type][self.cursor_field])
}

elif current_stream_state.get(site_url):
current_stream_state[site_url][search_type] = {self.cursor_field: latest_benchmark}

else:
current_stream_state = {site_url: {search_type: {self.cursor_field: latest_benchmark}}}
value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
if value:
latest_benchmark = max(latest_benchmark, value)
current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark

# we need to get the max date over all searchTypes but the current acceptance test YAML format doesn't
# support that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,19 @@ def test_state(current_stream_state, latest_record, expected):

value = stream.get_updated_state(current_stream_state, latest_record)
assert value == expected


def test_updated_state():
    """Check that state entries for every site url are kept across updates.

    Feeds one "web" record per site url through `get_updated_state`, starting
    from an empty state, and expects the resulting state to hold a nested
    cursor entry for each site url plus the top-level "date" cursor.
    """
    site_urls = ["https://domain1.com", "https://domain2.com"]
    stream = SearchAnalyticsByDate(NoAuth(), site_urls, "start_date", "end_date")

    state = {}
    for site_url in site_urls:
        record = {"site_url": site_url, "search_type": "web", "date": "2022-01-01"}
        state = stream.get_updated_state(state, record)

    # One nested cursor per site url, then the stream-wide cursor on top.
    expected = {site_url: {"web": {"date": "2022-01-01"}} for site_url in site_urls}
    expected["date"] = "2022-01-01"

    assert state == expected
1 change: 1 addition & 0 deletions docs/integrations/sources/google-search-console.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ You should now be ready to use the Google Workspace Admin Reports API connector

| Version | Date | Pull Request | Subject |
| :--- | :--- | :--- | :--- |
| `0.1.11` | 2021-12-31 | [9194](https://github.com/airbytehq/airbyte/pull/9194) | Fix incremental sync: keep all urls in state object |
| `0.1.10` | 2021-12-23 | [9073](https://github.com/airbytehq/airbyte/pull/9073) | Add slicing by date range |
| `0.1.9` | 2021-12-22 | [9047](https://github.com/airbytehq/airbyte/pull/9047) | Add 'order' to spec.json props |
| `0.1.8` | 2021-12-21 | [8248](https://github.com/airbytehq/airbyte/pull/8248) | Enable Sentry for performance and errors tracking |
Expand Down