Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[low code connectors] replace file retrieval with pkgutil to fix getting schema files #15814

Merged
merged 5 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions airbyte-cdk/python/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## 0.1.79
- Fix yaml schema parsing when running from docker container

## 0.1.78
- Fix yaml config parsing when running from docker container

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import importlib
import inspect
import typing
import warnings
from dataclasses import fields
from typing import Any, List, Literal, Mapping, Type, Union, get_args, get_origin, get_type_hints

Expand Down Expand Up @@ -153,7 +154,12 @@ def build(self, class_or_class_name: Union[str, Type], config, instantiate: bool
# concrete classes that implement the interface before generating the schema
class_copy = copy.deepcopy(class_)
DeclarativeComponentFactory._transform_interface_to_union(class_copy)
schema = class_copy.json_schema()

# dataclasses_jsonschema can throw warnings when a declarative component has fields that cannot be turned into a schema.
# Some builtin field types like Any or DateTime get flagged, but they are not as critical to schema generation and validation
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
schema = class_copy.json_schema()

component_definition = {
**updated_kwargs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

import json
import pkgutil
from dataclasses import InitVar, dataclass
from typing import Any, Mapping, Tuple, Union

Expand Down Expand Up @@ -32,9 +33,41 @@ def __post_init__(self, options: Mapping[str, Any]):
self.file_path = InterpolatedString.create(self.file_path, options=options)

def get_json_schema(self) -> Mapping[str, Any]:
    """Load and parse the stream's JSON schema from the connector's packaged data files.

    The configured file path is interpolated against the connector config, split into a
    package resource and an in-package path, and read via ``pkgutil.get_data`` so that the
    schema is found both locally and when the connector runs from a docker image.

    :return: The parsed JSON schema as a mapping
    :raises IOError: if no data could be read for the resolved path
    :raises RuntimeError: if the file contents are not valid JSON
    """
    # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory
    # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there
    json_schema_path = self._get_json_filepath()
    resource, schema_path = self.extract_resource_and_schema_path(json_schema_path)

    raw_bytes = pkgutil.get_data(resource, schema_path)
    if not raw_bytes:
        raise IOError(f"Cannot find file {json_schema_path}")

    try:
        return json.loads(raw_bytes)
    except ValueError as err:
        raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err

def _get_json_filepath(self):
    """Resolve the schema file path by evaluating the interpolated `file_path` against the connector config."""
    return self.file_path.eval(self.config)

@staticmethod
def extract_resource_and_schema_path(json_schema_path: str) -> (str, str):
"""
When the connector is running on a docker container, package_data is accessible from the resource (source_<name>), so we extract
the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight
hack to identify the source name while we are in the airbyte_cdk module.
:param json_schema_path: The path to the schema JSON file
:return: Tuple of the resource name and the path to the schema file
"""
split_path = json_schema_path.split("/")

if split_path[0] == "" or split_path[0] == ".":
split_path = split_path[1:]

if len(split_path) == 0:
return "", ""

if len(split_path) == 1:
return "", split_path[0]

return split_path[0], "/".join(split_path[1:])
2 changes: 1 addition & 1 deletion airbyte-cdk/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name="airbyte-cdk",
version="0.1.78",
version="0.1.79",
description="A framework for writing Airbyte Connectors.",
long_description=README,
long_description_content_type="text/markdown",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import pytest
from airbyte_cdk.sources.declarative.schema import JsonSchema


@pytest.mark.parametrize(
    "test_name, input_path, expected_resource, expected_path",
    [
        ("path_prefixed_with_dot", "./source_example/schemas/lists.json", "source_example", "schemas/lists.json"),
        ("path_prefixed_with_slash", "/source_example/schemas/lists.json", "source_example", "schemas/lists.json"),
        ("path_starting_with_source", "source_example/schemas/lists.json", "source_example", "schemas/lists.json"),
        ("path_starting_missing_source", "schemas/lists.json", "schemas", "lists.json"),
        ("path_with_file_only", "lists.json", "", "lists.json"),
        ("empty_path_does_not_crash", "", "", ""),
        ("empty_path_with_slash_does_not_crash", "/", "", ""),
    ],
)
def test_extract_resource_and_schema_path(test_name, input_path, expected_resource, expected_path):
    # Verify that a schema file path splits into (package resource, in-package path):
    # the first path segment (after stripping a leading "." or "/") is the resource,
    # the remainder is the schema path. Degenerate inputs yield empty strings.
    json_schema = JsonSchema(input_path, {}, {})
    actual_resource, actual_path = json_schema.extract_resource_and_schema_path(input_path)

    assert actual_resource == expected_resource
    assert actual_path == expected_path