From ca8d7ba9c7ec062bd74cff514758b557afa636b1 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Sat, 20 Aug 2022 01:18:13 -0700 Subject: [PATCH 1/5] replace file retrieval with pkgutil to fix getting schema files --- .../sources/declarative/schema/json_schema.py | 31 +++++++++++++++++-- .../declarative/schema/test_json_schema.py | 26 ++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_schema.py diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py index acdf92e6a35f5..b3e217fc48cd1 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py @@ -3,6 +3,7 @@ # import json +import pkgutil from dataclasses import InitVar, dataclass from typing import Any, Mapping, Union @@ -33,8 +34,34 @@ def __post_init__(self, options: Mapping[str, Any]): def get_json_schema(self) -> Mapping[str, Any]: json_schema_path = self._get_json_filepath() - with open(json_schema_path, "r") as f: - return json.loads(f.read()) + resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) + json_schema = pkgutil.get_data(resource, schema_path) + + if not json_schema: + raise FileNotFoundError("File not found: " + json_schema_path) + return json.loads(json_schema) def _get_json_filepath(self): return self.file_path.eval(self.config) + + @staticmethod + def extract_resource_and_schema_path(json_schema_path: str) -> (str, str): + """ + When the connector is running on a docker container, package_data is accessible from the resource (source_), so we extract + the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight + hack to identify the source name while we are in the airbyte_cdk module. 
+ :param json_schema_path: The path to the schema JSON file + :return: Tuple of the resource name and the path to the schema file + """ + split_path = json_schema_path.split("/") + + if split_path[0] == "" or split_path[0] == ".": + split_path = split_path[1:] + + if len(split_path) == 0: + return "", "" + + if len(split_path) == 1: + return "", split_path[0] + + return split_path[0], "/".join(split_path[1:]) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_schema.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_schema.py new file mode 100644 index 0000000000000..4dc237dc30861 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_schema.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.schema import JsonSchema + + +@pytest.mark.parametrize( + "test_name, input_path, expected_resource, expected_path", + [ + ("path_prefixed_with_dot", "./source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_prefixed_with_slash", "/source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_starting_with_source", "source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_starting_missing_source", "schemas/lists.json", "schemas", "lists.json"), + ("path_with_file_only", "lists.json", "", "lists.json"), + ("empty_path_does_not_crash", "", "", ""), + ("empty_path_with_slash_does_not_crash", "/", "", ""), + ], +) +def test_extract_resource_and_schema_path(test_name, input_path, expected_resource, expected_path): + json_schema = JsonSchema(input_path, {}, {}) + actual_resource, actual_path = json_schema.extract_resource_and_schema_path(input_path) + + assert actual_resource == expected_resource + assert actual_path == expected_path From 396f57f060f66ee7db3f9f8ec4b7589d56200636 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Sat, 20 
Aug 2022 01:24:03 -0700 Subject: [PATCH 2/5] slightly better error handling on missing files --- .../sources/declarative/schema/json_schema.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py index b3e217fc48cd1..2d197d1f481ab 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py @@ -35,11 +35,15 @@ def __post_init__(self, options: Mapping[str, Any]): def get_json_schema(self) -> Mapping[str, Any]: json_schema_path = self._get_json_filepath() resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) - json_schema = pkgutil.get_data(resource, schema_path) + raw_json_file = pkgutil.get_data(resource, schema_path) - if not json_schema: - raise FileNotFoundError("File not found: " + json_schema_path) - return json.loads(json_schema) + if not raw_json_file: + raise IOError(f"Cannot find file {json_schema_path}") + try: + raw_schema = json.loads(raw_json_file) + except ValueError as err: + raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err + return raw_schema def _get_json_filepath(self): return self.file_path.eval(self.config) From 092f5e01979b6750bdfa421d7dfbc0b3833c0914 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Mon, 22 Aug 2022 00:43:03 -0700 Subject: [PATCH 3/5] filter out schema gen warnings for some classes that cannot generate schemas --- .../airbyte_cdk/sources/declarative/parsers/factory.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py index 77487aa4f9f52..d31a7d5896b82 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py 
+++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py @@ -9,6 +9,7 @@ import importlib import inspect import typing +import warnings from dataclasses import fields from typing import Any, List, Literal, Mapping, Type, Union, get_args, get_origin, get_type_hints @@ -153,7 +154,12 @@ def build(self, class_or_class_name: Union[str, Type], config, instantiate: bool # concrete classes that implement the interface before generating the schema class_copy = copy.deepcopy(class_) DeclarativeComponentFactory._transform_interface_to_union(class_copy) - schema = class_copy.json_schema() + + # dataclasses_jsonschema can throw warnings when a declarative component has a field that cannot be turned into a schema. + # Some builtin field types like Any or DateTime get flagged, but are not as critical to schema generation and validation + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + schema = class_copy.json_schema() component_definition = { **updated_kwargs, From b5531606228c27133d7e8092043560f312b2ad27 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Mon, 22 Aug 2022 15:03:12 -0700 Subject: [PATCH 4/5] add comment for todo --- .../airbyte_cdk/sources/declarative/schema/json_schema.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py index 2d197d1f481ab..6aa6766a70abb 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py @@ -33,6 +33,8 @@ def __post_init__(self, options: Mapping[str, Any]): self.file_path = InterpolatedString.create(self.file_path, options=options) def get_json_schema(self) -> Mapping[str, Any]: + # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory + # this would require that we 
find a creative solution to store or retrieve source_name in here since the files are mounted there json_schema_path = self._get_json_filepath() resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) raw_json_file = pkgutil.get_data(resource, schema_path) From 06c4e59d83377f4423ca58692b3c82447a8136c7 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Mon, 22 Aug 2022 16:50:06 -0700 Subject: [PATCH 5/5] add changelog and setup before publish --- airbyte-cdk/python/CHANGELOG.md | 3 +++ airbyte-cdk/python/setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte-cdk/python/CHANGELOG.md b/airbyte-cdk/python/CHANGELOG.md index 9a5555d8645ad..bbb91804051e8 100644 --- a/airbyte-cdk/python/CHANGELOG.md +++ b/airbyte-cdk/python/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 0.1.79 +- Fix yaml schema parsing when running from docker container + ## 0.1.78 - Fix yaml config parsing when running from docker container diff --git a/airbyte-cdk/python/setup.py b/airbyte-cdk/python/setup.py index 8629dd9157f88..1e4c303622b3b 100644 --- a/airbyte-cdk/python/setup.py +++ b/airbyte-cdk/python/setup.py @@ -15,7 +15,7 @@ setup( name="airbyte-cdk", - version="0.1.78", + version="0.1.79", description="A framework for writing Airbyte Connectors.", long_description=README, long_description_content_type="text/markdown",