🐛 Source Google Sheets: revert 'add row_id to rows and use as primary key' (#19574)

* revert PR 19215

* bump
girarda authored Nov 18, 2022
1 parent 5c7affc commit a1da5fc
Showing 7 changed files with 15 additions and 42 deletions.
@@ -610,7 +610,7 @@
 - name: Google Sheets
   sourceDefinitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
   dockerRepository: airbyte/source-google-sheets
-  dockerImageTag: 0.2.30
+  dockerImageTag: 0.2.31
   documentationUrl: https://docs.airbyte.com/integrations/sources/google-sheets
   icon: google-sheets.svg
   sourceType: file
@@ -5509,7 +5509,7 @@
       oauthFlowOutputParameters:
       - - "access_token"
       - - "refresh_token"
-- dockerImage: "airbyte/source-google-sheets:0.2.30"
+- dockerImage: "airbyte/source-google-sheets:0.2.31"
   spec:
     documentationUrl: "https://docs.airbyte.com/integrations/sources/google-sheets"
     connectionSpecification:
@@ -34,5 +34,5 @@ COPY google_sheets_source ./google_sheets_source
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.2.30
+LABEL io.airbyte.version=0.2.31
 LABEL io.airbyte.name=airbyte/source-google-sheets
@@ -169,15 +169,9 @@ def read(
            if len(row_values) == 0:
                break
 
-            row_id = row_cursor
            for row in row_values:
                if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
-                    yield AirbyteMessage(
-                        type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row_id, row, column_index_to_name)
-                    )
-                    row_id += 1
-
-            row_cursor += ROW_BATCH_SIZE + 1
+                    yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
 
    @staticmethod
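For context on what the revert leaves behind, here is a minimal, self-contained sketch of the simplified read loop: rows are fetched in batches, empty rows are skipped, and each remaining row is emitted as one record built purely from its named columns, with no synthetic row_id counter threaded through the loop. The in-memory SHEET_ROWS data and the fetch_batch and make_record helpers are illustrative stand-ins for the Sheets API client and the connector's Helpers methods, not the real implementation.

from typing import Dict, Iterator, List

ROW_BATCH_SIZE = 200  # the connector fetches rows in batches; the exact size here is illustrative

# Stand-in for the row data returned by the Google Sheets values API.
SHEET_ROWS: List[List[str]] = [["v1", "v2", "v3", "v4"], [], ["w1", "", "", "w4"]]


def fetch_batch(start: int) -> List[List[str]]:
    # Stand-in for one batched values request against the Sheets API.
    return SHEET_ROWS[start : start + ROW_BATCH_SIZE]


def make_record(row: List[str], column_index_to_name: Dict[int, str]) -> Dict[str, str]:
    # Simplified version of the post-revert record building: only the named
    # columns appear in the record, and there is no row_id field.
    return {name: row[index] for index, name in column_index_to_name.items() if index < len(row)}


def read(column_index_to_name: Dict[int, str]) -> Iterator[Dict[str, str]]:
    row_cursor = 0
    while row_cursor < len(SHEET_ROWS):
        batch = fetch_batch(row_cursor)
        row_cursor += ROW_BATCH_SIZE
        for row in batch:
            if any(cell.strip() for cell in row):  # skip empty rows, as Helpers.is_row_empty does
                yield make_record(row, column_index_to_name)


print(list(read({0: "c1", 3: "c4"})))
# [{'c1': 'v1', 'c4': 'v4'}, {'c1': 'w1', 'c4': 'w4'}]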
@@ -53,22 +53,14 @@ def headers_to_airbyte_stream(logger: AirbyteLogger, sheet_name: str, header_row
        if duplicate_fields:
            logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them :{duplicate_fields}")
 
-        props = {field: {"type": "string"} for field in fields}
-        props["row_id"] = {"type": "integer"}
        sheet_json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
-            "required": ["row_id"],
            # For simplicity, the type of every cell is a string
-            "properties": props,
+            "properties": {field: {"type": "string"} for field in fields},
        }
 
-        return AirbyteStream(
-            name=sheet_name,
-            json_schema=sheet_json_schema,
-            supported_sync_modes=[SyncMode.full_refresh],
-            source_defined_primary_key=[["row_id"]],
-        )
+        return AirbyteStream(name=sheet_name, json_schema=sheet_json_schema, supported_sync_modes=[SyncMode.full_refresh])
 
    @staticmethod
    def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
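The schema half of the revert can be seen in isolation with a short sketch (plain dicts instead of the Airbyte protocol models, so it runs on its own): every surviving header becomes a string property, and neither a row_id property nor a source-defined primary key is declared. The headers_to_json_schema name and the sample header values are illustrative, not part of the connector.

from typing import Dict, List


def headers_to_json_schema(header_values: List[str]) -> Dict:
    # Assumes duplicate headers were already filtered out, as the connector's
    # get_valid_headers_and_duplicates helper does before schema generation.
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        # For simplicity, the type of every cell is a string
        "properties": {field: {"type": "string"} for field in header_values},
    }


schema = headers_to_json_schema(["h1", "h2", "h3"])
print(sorted(schema["properties"]))          # ['h1', 'h2', 'h3']
print("row_id" in schema["properties"])      # False: no synthetic column after the revert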
@@ -129,10 +121,8 @@ def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog)
        return sheet_to_column_name
 
    @staticmethod
-    def row_data_to_record_message(
-        sheet_name: str, row_id: int, cell_values: List[str], column_index_to_name: Dict[int, str]
-    ) -> AirbyteRecordMessage:
-        data = {"row_id": row_id}
+    def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
+        data = {}
        for relevant_index in sorted(column_index_to_name.keys()):
            if relevant_index >= len(cell_values):
                break
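The record-building change amounts to dropping the row_id seed from the data dict. Below is a simplified, standalone sketch of the post-revert mapping loop, omitting the AirbyteRecordMessage wrapper and using the same sample values as the unit test further down; the row_data_to_dict name is a stand-in for illustration only.

from typing import Dict, List


def row_data_to_dict(cell_values: List[str], column_index_to_name: Dict[int, str]) -> Dict[str, str]:
    # Start from an empty dict (no row_id) and stop once a wanted column index
    # falls past the end of the row, since the indexes are processed in order.
    data: Dict[str, str] = {}
    for relevant_index in sorted(column_index_to_name):
        if relevant_index >= len(cell_values):
            break
        data[column_index_to_name[relevant_index]] = cell_values[relevant_index]
    return data


print(row_data_to_dict(["v1", "v2", "v3", "v4"], {0: "c1", 3: "c4"}))  # {'c1': 'v1', 'c4': 'v4'}
print(row_data_to_dict(["v1", "v2"], {0: "c1", 3: "c4"}))              # {'c1': 'v1'} - c4 is out of range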
@@ -27,18 +27,14 @@ def test_headers_to_airbyte_stream(self):
        sheet_name = "sheet1"
        header_values = ["h1", "h2", "h3"]
 
-        props = {header: {"type": "string"} for header in header_values}
-        props["row_id"] = {"type": "integer"}
        expected_stream = AirbyteStream(
            name=sheet_name,
            json_schema={
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
-                "required": ["row_id"],
                # For simplicity, the type of every cell is a string
-                "properties": props,
+                "properties": {header: {"type": "string"} for header in header_values},
            },
-            source_defined_primary_key=[["row_id"]],
            supported_sync_modes=[SyncMode.full_refresh],
        )
 
@@ -61,20 +57,15 @@ def test_duplicate_headers_to_ab_stream_ignores_duplicates(self):
        header_values = ["h1", "h1", "h3"]
 
        # h1 is ignored because it is duplicate
-        props = {
-            "h3": {"type": "string"},
-            "row_id": {"type": "integer"},
-        }
+        expected_stream_header_values = ["h3"]
        expected_stream = AirbyteStream(
            name=sheet_name,
            json_schema={
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
-                "required": ["row_id"],
                # For simplicity, the type of every cell is a string
-                "properties": props,
+                "properties": {header: {"type": "string"} for header in expected_stream_header_values},
            },
-            source_defined_primary_key=[["row_id"]],
            supported_sync_modes=[SyncMode.full_refresh],
        )
 
@@ -90,11 +81,9 @@ def test_headers_to_airbyte_stream_blank_values_terminate_row(self):
            json_schema={
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
-                "required": ["row_id"],
                # For simplicity, the type of every cell is a string
-                "properties": {"h1": {"type": "string"}, "row_id": {"type": "integer"}},
+                "properties": {"h1": {"type": "string"}},
            },
-            source_defined_primary_key=[["row_id"]],
            supported_sync_modes=[SyncMode.full_refresh],
        )
        actual_stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_values)
@@ -154,11 +143,10 @@ def test_row_data_to_record_message(self):
        sheet = "my_sheet"
        cell_values = ["v1", "v2", "v3", "v4"]
        column_index_to_name = {0: "c1", 3: "c4"}
-        row_id = 1
 
-        actual = Helpers.row_data_to_record_message(sheet, row_id, cell_values, column_index_to_name)
+        actual = Helpers.row_data_to_record_message(sheet, cell_values, column_index_to_name)
 
-        expected = AirbyteRecordMessage(stream=sheet, data={"row_id": row_id, "c1": "v1", "c4": "v4"}, emitted_at=1)
+        expected = AirbyteRecordMessage(stream=sheet, data={"c1": "v1", "c4": "v4"}, emitted_at=1)
        self.assertEqual(expected.stream, actual.stream)
        self.assertEqual(expected.data, actual.data)
 
1 change: 1 addition & 0 deletions docs/integrations/sources/google-sheets.md
@@ -76,6 +76,7 @@ The [Google API rate limit](https://developers.google.com/sheets/api/limits) is
 
 | Version | Date       | Pull Request                                              | Subject                                                                         |
 | ------- | ---------- | --------------------------------------------------------- | ------------------------------------------------------------------------------- |
+| 0.2.31  | 2022-10-09 | [](https://github.com/airbytehq/airbyte/pull/)            | Revert 'Add row_id to rows and use as primary key'                              |
 | 0.2.30  | 2022-10-09 | [](https://github.com/airbytehq/airbyte/pull/)            | Add row_id to rows and use as primary key                                       |
 | 0.2.21  | 2022-10-04 | [15591](https://github.com/airbytehq/airbyte/pull/15591)  | Clean instantiation of AirbyteStream                                             |
 | 0.2.20  | 2022-10-10 | [17766](https://github.com/airbytehq/airbyte/pull/17766)  | Fix null pointer exception when parsing the spreadsheet id.                      |
