🐛 Source Intercom: switching from scroll to standard endpoints (#8637)

antixar · sherifnada · web-flow · commit e43c53d285ab · 2021-12-10T21:07:03.000+02:00
* backoff for companies scroll

* remove an unused companies stream property

* fix tests

* bump version

* update source_specs

* update scroll logic

* update tests

* Update airbyte-integrations/connectors/source-intercom/source_intercom/source.py

Co-authored-by: Sherif A. Nada <snadalive@gmail.com>

* update change log

* update spec files

Co-authored-by: Sherif A. Nada <snadalive@gmail.com>
diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/d8313939-3782-41b0-be29-b3ca20d8dd3a.json b/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/d8313939-3782-41b0-be29-b3ca20d8dd3a.json
@@ -2,7 +2,7 @@
   "sourceDefinitionId": "d8313939-3782-41b0-be29-b3ca20d8dd3a",
   "name": "Intercom",
   "dockerRepository": "airbyte/source-intercom",
-  "dockerImageTag": "0.1.9",
+  "dockerImageTag": "0.1.10",
   "documentationUrl": "https://docs.airbyte.io/integrations/sources/intercom",
   "icon": "intercom.svg"
 }
diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml
@@ -302,7 +302,7 @@
 - name: Intercom
   sourceDefinitionId: d8313939-3782-41b0-be29-b3ca20d8dd3a
   dockerRepository: airbyte/source-intercom
-  dockerImageTag: 0.1.9
+  dockerImageTag: 0.1.10
   documentationUrl: https://docs.airbyte.io/integrations/sources/intercom
   icon: intercom.svg
   sourceType: api
diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml
@@ -2841,7 +2841,7 @@
         oauthFlowInitParameters: []
         oauthFlowOutputParameters:
         - - "access_token"
-- dockerImage: "airbyte/source-intercom:0.1.9"
+- dockerImage: "airbyte/source-intercom:0.1.10"
   spec:
     documentationUrl: "https://docs.airbyte.io/integrations/sources/intercom"
     connectionSpecification:
diff --git a/airbyte-integrations/connectors/source-intercom/integration_tests/integration_test.py b/airbyte-integrations/connectors/source-intercom/integration_tests/integration_test.py
@@ -3,16 +3,17 @@
 #
 
 import json
+import pytest
 import time
+from airbyte_cdk import AirbyteLogger
+from airbyte_cdk.models import SyncMode
 from copy import deepcopy
 from pathlib import Path
+from requests.exceptions import HTTPError
 from typing import Mapping
+from unittest.mock import patch
 
-import pytest
-from airbyte_cdk import AirbyteLogger
-from airbyte_cdk.models import SyncMode
-from requests.exceptions import HTTPError
-from source_intercom.source import Companies, SourceIntercom, VersionApiAuthenticator
+from source_intercom.source import Companies, ConversationParts, SourceIntercom, VersionApiAuthenticator
 
 LOGGER = AirbyteLogger()
 # from unittest.mock import Mock
@@ -27,18 +28,19 @@ def stream_attributes() -> Mapping[str, str]:
         return json.load(json_file)
 
 
+@pytest.mark.skip(reason="need to refresh this test, it is very slow")
 @pytest.mark.parametrize(
     "version,not_supported_streams,custom_companies_data_field",
     (
-        (1.0, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
-        (1.1, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
-        (1.2, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
-        (1.3, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
-        (1.4, ["company_segments"], "companies"),
-        (2.0, [], "data"),
-        (2.1, [], "data"),
-        (2.2, [], "data"),
-        (2.3, [], "data"),
+            (1.0, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
+            (1.1, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
+            (1.2, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
+            (1.3, ["company_segments", "company_attributes", "contact_attributes"], "companies"),
+            (1.4, ["company_segments"], "companies"),
+            (2.0, [], "data"),
+            (2.1, [], "data"),
+            (2.2, [], "data"),
+            (2.3, [], "data"),
     ),
 )
 def test_supported_versions(stream_attributes, version, not_supported_streams, custom_companies_data_field):
@@ -48,17 +50,17 @@ class CustomVersionApiAuthenticator(VersionApiAuthenticator):
     authenticator = CustomVersionApiAuthenticator(token=stream_attributes["access_token"])
     for stream in SourceIntercom().streams(deepcopy(stream_attributes)):
         stream._authenticator = authenticator
-
         if stream.name == "companies":
             stream.data_fields = [custom_companies_data_field]
         elif hasattr(stream, "parent_stream_class") and stream.parent_stream_class == Companies:
             stream.parent_stream_class.data_fields = [custom_companies_data_field]
 
-        slices = list(stream.stream_slices(sync_mode=SyncMode.full_refresh))
         if stream.name in not_supported_streams:
             LOGGER.info(f"version {version} shouldn't be supported the stream '{stream.name}'")
             with pytest.raises(HTTPError) as err:
-                next(stream.read_records(sync_mode=None, stream_slice=slices[0]), None)
+                for slice in stream.stream_slices(sync_mode=SyncMode.full_refresh):
+                    next(stream.read_records(sync_mode=None, stream_slice=slice), None)
+                    break
             # example of response errors:
             # {"type": "error.list", "request_id": "000hjqhpf95ef3b8f8v0",
             #  "errors": [{"code": "intercom_version_invalid", "message": "The requested version could not be found"}]}
@@ -67,12 +69,13 @@ class CustomVersionApiAuthenticator(VersionApiAuthenticator):
             LOGGER.info(f"version {version} doesn't support the stream '{stream.name}', error: {err_data}")
         else:
             LOGGER.info(f"version {version} should be supported the stream '{stream.name}'")
-            records = stream.read_records(sync_mode=None, stream_slice=slices[0])
-            if stream.name == "companies":
-                # need to read all records for scroll resetting
-                list(records)
-            else:
-                next(records, None)
+            for slice in stream.stream_slices(sync_mode=SyncMode.full_refresh):
+                records = stream.read_records(sync_mode=None, stream_slice=slice)
+                if stream.name == "companies":
+                    # need to read all records for scroll resetting
+                    list(records)
+                else:
+                    next(records, None)
 
 
 def test_companies_scroll(stream_attributes):
@@ -82,16 +85,54 @@ def test_companies_scroll(stream_attributes):
     stream3 = Companies(authenticator=authenticator)
 
     # read the first stream and stop
-    next(stream1.read_records(sync_mode=SyncMode.full_refresh))
+    for slice in stream1.stream_slices(sync_mode=SyncMode.full_refresh):
+        next(stream1.read_records(sync_mode=SyncMode.full_refresh, stream_slice=slice), None)
+        break
 
     start_time = time.time()
     # read all records
-    records = list(stream2.read_records(sync_mode=SyncMode.full_refresh))
+    records = []
+    for slice in stream2.stream_slices(sync_mode=SyncMode.full_refresh):
+        records += list(stream2.read_records(sync_mode=SyncMode, stream_slice=slice))
     assert len(records) == 3
     assert (time.time() - start_time) > 60.0
 
     start_time = time.time()
-    # read all records again
-    records = list(stream3.read_records(sync_mode=SyncMode.full_refresh))
+    # read all records
+    records = []
+    for slice in stream3.stream_slices(sync_mode=SyncMode.full_refresh):
+        records += list(stream3.read_records(sync_mode=SyncMode.full_refresh, stream_slice=slice))
     assert len(records) == 3
     assert (time.time() - start_time) < 5.0
+
+
+@patch("source_intercom.source.Companies.can_use_scroll", lambda *args: False)
+def test_switch_to_standard_endpoint(stream_attributes):
+    authenticator = VersionApiAuthenticator(token=stream_attributes["access_token"])
+    stream1 = Companies(authenticator=authenticator)
+    stream2 = Companies(authenticator=authenticator)
+    stream3 = ConversationParts(authenticator=authenticator)
+
+    # read the first stream and stop
+    for slice in stream1.stream_slices(sync_mode=SyncMode.full_refresh):
+        next(stream1.read_records(sync_mode=SyncMode.full_refresh, stream_slice=slice), None)
+        break
+
+    start_time = time.time()
+    # read all records
+    records = []
+    assert stream2._endpoint_type == Companies.EndpointType.scroll
+    for slice in stream2.stream_slices(sync_mode=SyncMode.full_refresh):
+        records += list(stream2.read_records(sync_mode=SyncMode, stream_slice=slice))
+    assert stream2._endpoint_type == Companies.EndpointType.standard
+    assert stream2._total_count == 3
+    assert len(records) == 3
+    assert (time.time() - start_time) < 5.0
+
+    start_time = time.time()
+    # read all children records
+    records = []
+    for slice in stream3.stream_slices(sync_mode=SyncMode.full_refresh):
+        records += list(stream3.read_records(sync_mode=SyncMode, stream_slice=slice))
+    assert len(records) == 12
+    assert (time.time() - start_time) < 5.0
diff --git a/airbyte-integrations/connectors/source-intercom/source_intercom/source.py b/airbyte-integrations/connectors/source-intercom/source_intercom/source.py
@@ -5,6 +5,7 @@
 import time
 from abc import ABC
 from datetime import datetime
+from enum import Enum
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple
 from urllib.parse import parse_qsl, urlparse
 
@@ -13,7 +14,8 @@
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.http import HttpStream
-from airbyte_cdk.sources.streams.http.auth import HttpAuthenticator, TokenAuthenticator
+from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator
+from requests.auth import AuthBase
 
 
 class IntercomStream(HttpStream, ABC):
@@ -27,14 +29,24 @@ class IntercomStream(HttpStream, ABC):
 
     def __init__(
         self,
-        authenticator: HttpAuthenticator,
+        authenticator: AuthBase,
         start_date: str = None,
         **kwargs,
     ):
         self.start_date = start_date
 
         super().__init__(authenticator=authenticator)
 
+    @property
+    def authenticator(self):
+        """
+        Works around a bug: when isinstance(authenticator, AuthBase), the
+        default logic returns incorrect authenticator values.
+        """
+        if self._session.auth:
+            return self._session.auth
+        return super().authenticator
+
     def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
         """
         Abstract method of HttpStream - should be overwritten.
@@ -95,7 +107,7 @@ def filter_by_state(self, stream_state: Mapping[str, Any] = None, record: Mappin
         during the slicing.
         """
 
-        if not stream_state or record[self.cursor_field] >= stream_state.get(self.cursor_field):
+        if not stream_state or record[self.cursor_field] > stream_state.get(self.cursor_field):
             yield record
 
     def parse_response(self, response: requests.Response, stream_state: Mapping[str, Any], **kwargs) -> Iterable[Mapping]:
@@ -124,10 +136,12 @@ class ChildStreamMixin:
     parent_stream_class: Optional[IntercomStream] = None
 
     def stream_slices(self, sync_mode, **kwargs) -> Iterable[Optional[Mapping[str, any]]]:
-        for item in self.parent_stream_class(authenticator=self.authenticator, start_date=self.start_date).read_records(
-            sync_mode=sync_mode
-        ):
-            yield {"id": item["id"]}
+        parent_stream = self.parent_stream_class(authenticator=self.authenticator, start_date=self.start_date)
+        for slice in parent_stream.stream_slices(sync_mode=sync_mode):
+            for item in self.parent_stream_class(
+                authenticator=self.authenticator, start_date=self.start_date, stream_slice=slice
+            ).read_records(sync_mode=sync_mode):
+                yield {"id": item["id"]}
 
 
 class Admins(IntercomStream):
@@ -144,24 +158,55 @@ def path(self, **kwargs) -> str:
 
 class Companies(IncrementalIntercomStream):
     """Return list of all companies.
-    API Docs: https://developers.intercom.com/intercom-api-reference/reference#iterating-over-all-companies
-    Endpoint: https://api.intercom.io/companies/scroll
+    The Intercom API provides 2 similar endpoints for loading companies:
+    1) "standard" - https://developers.intercom.com/intercom-api-reference/reference#list-companies.
+       But this endpoint does not work well for huge datasets and can have performance problems.
+    2) "scroll" - https://developers.intercom.com/intercom-api-reference/reference#iterating-over-all-companies
+       It has good performance, but at the same time only one script/client can use it across the client's entire account.
+
+    Because of these constraints, neither endpoint can be used exclusively. That's why this stream can
+    use both endpoints, according to the following logic:
+    1) By default the stream tries to load data from the "scroll" endpoint.
+    2) If the "scroll" is busy with another script, it waits about a minute for it
+       to become available (3 attempts with a delay of 20.5 seconds each).
+    3) Otherwise it switches to the "standard" endpoint.
     """
 
+    class EndpointType(Enum):
+        scroll = "companies/scroll"
+        standard = "companies"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._backoff_count = 0
+        self._endpoint_type = self.EndpointType.scroll
+        self._total_count = None  # caches the total_count value the first time a response provides it
+
     def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
         """For reset scroll needs to iterate pages untill the last.
         Another way need wait 1 min for the scroll to expire to get a new list for companies segments."""
-
         data = response.json()
-        scroll_param = data.get("scroll_param")
+        if self._total_count is None and data.get("total_count"):
+            self._total_count = data["total_count"]
+            self.logger.info(f"found {self._total_count} companies")
+        if self.can_use_scroll():
+
+            scroll_param = data.get("scroll_param")
+
+            # this stream always has only one data field
+            data_field = self.data_fields[0]
+            if scroll_param and data.get(data_field):
+                return {"scroll_param": scroll_param}
+        elif not data.get("errors"):
+            return super().next_page_token(response)
+        return None
 
-        # this stream always has only one data field
-        data_field = self.data_fields[0]
-        if scroll_param and data.get(data_field):
-            return {"scroll_param": scroll_param}
+    def can_use_scroll(self):
+        """Check backoff count"""
+        return self._backoff_count <= 3
 
     def path(self, **kwargs) -> str:
-        return "companies/scroll"
+        return self._endpoint_type.value
 
     @classmethod
     def check_exists_scroll(cls, response: requests.Response) -> bool:
@@ -174,8 +219,25 @@ def check_exists_scroll(cls, response: requests.Response) -> bool:
 
         return False
 
+    @property
+    def raise_on_http_errors(self) -> bool:
+        if not self.can_use_scroll() and self._endpoint_type == self.EndpointType.scroll:
+            return False
+        return True
+
+    def stream_slices(self, sync_mode, **kwargs) -> Iterable[Optional[Mapping[str, any]]]:
+        yield None
+        if not self.can_use_scroll():
+            self._endpoint_type = self.EndpointType.standard
+            yield None
+
     def should_retry(self, response: requests.Response) -> bool:
         if self.check_exists_scroll(response):
+            self._backoff_count += 1
+            if not self.can_use_scroll():
+                self.logger.error("Can't create a new scroll request within an minute. " "Let's try to use a standard non-scroll endpoint.")
+                return False
+
             return True
         return super().should_retry(response)
 
@@ -186,6 +248,13 @@ def backoff_time(self, response: requests.Response) -> Optional[float]:
             return 20.5
         return super().backoff_time(response)
 
+    def parse_response(self, response: requests.Response, stream_state: Mapping[str, Any], **kwargs) -> Iterable[Mapping]:
+        if not self.raise_on_http_errors:
+            data = response.json()
+            if data.get("errors"):
+                return
+        yield from super().parse_response(response, stream_state=stream_state, **kwargs)
+
 
 class CompanySegments(ChildStreamMixin, IncrementalIntercomStream):
     """Return list of all company segments.
diff --git a/docs/integrations/sources/intercom.md b/docs/integrations/sources/intercom.md
@@ -55,7 +55,7 @@ Please read [How to get your Access Token](https://developers.intercom.com/build
 
 | Version | Date | Pull Request | Subject |
 | :--- | :--- | :--- | :--- |
-| 0.1.10 | 2021-12-07 | [8579](https://github.com/airbytehq/airbyte/pull/8579) | Fix 'conversations' order and sorting |
+| 0.1.10 | 2021-12-10 | [8637](https://github.com/airbytehq/airbyte/pull/8637) | Fix 'conversations' order and sorting. Correction of the companies stream |
 | 0.1.9 | 2021-12-03 | [8395](https://github.com/airbytehq/airbyte/pull/8395) | Fix backoff of 'companies' stream |
 | 0.1.8 | 2021-11-09 | [7060](https://github.com/airbytehq/airbyte/pull/7060) | Added oauth support |
 | 0.1.7 | 2021-11-08 | [7499](https://github.com/airbytehq/airbyte/pull/7499) | Remove base-python dependencies |

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`"sourceDefinitionId": "d8313939-3782-41b0-be29-b3ca20d8dd3a",`
`3`	`3`	`"name": "Intercom",`
`4`	`4`	`"dockerRepository": "airbyte/source-intercom",`
`5`		`- "dockerImageTag": "0.1.9",`
	`5`	`+ "dockerImageTag": "0.1.10",`
`6`	`6`	`"documentationUrl": "https://docs.airbyte.io/integrations/sources/intercom",`
`7`	`7`	`"icon": "intercom.svg"`
`8`	`8`	`}`