Skip to content

Commit 6be0272

Browse files
authored
feat: adds StorageDescriptor and tests (#2109)
* feat: adds StorageDescriptor and tests * updates attr names, corrects type hinting
1 parent 62960f2 commit 6be0272

File tree

2 files changed

+246
-0
lines changed

2 files changed

+246
-0
lines changed

google/cloud/bigquery/schema.py

+118
Original file line numberDiff line numberDiff line change
@@ -644,3 +644,121 @@ def from_api_repr(cls, api_repr: dict) -> SerDeInfo:
644644
config = cls("PLACEHOLDER")
645645
config._properties = api_repr
646646
return config
647+
648+
649+
class StorageDescriptor:
    """Describes how a table's data is stored and accessed by open source
    query engines.

    All values are kept in an API-format dictionary (``_properties``) and are
    exposed through the properties defined on this class.

    Args:
        input_format (Optional[str]): Fully qualified class name of the
            InputFormat (e.g.
            "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum
            length is 128 characters.
        location_uri (Optional[str]): Physical location of the table (e.g.
            'gs://spark-dataproc-data/pangea-data/case_sensitive/' or
            'gs://spark-dataproc-data/pangea-data/'). The maximum length is
            2056 bytes.
        output_format (Optional[str]): Fully qualified class name of the
            OutputFormat (e.g.
            "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"). The maximum
            length is 128 characters.
        serde_info (Union[SerDeInfo, dict, None]): Serializer and
            deserializer information.
    """

    def __init__(
        self,
        input_format: Optional[str] = None,
        location_uri: Optional[str] = None,
        output_format: Optional[str] = None,
        serde_info: Union[SerDeInfo, dict, None] = None,
    ):
        self._properties: Dict[str, Any] = {}

        # Each assignment goes through the matching property setter, so the
        # type check in the setter also applies to constructor arguments.
        self.input_format = input_format
        self.location_uri = location_uri
        self.output_format = output_format

        # typing.cast() is here purely for mypy: the setter accepts
        # Union[SerDeInfo, dict, None], while the getter is annotated as
        # returning Optional[SerDeInfo], and mypy cannot reconcile the two.
        self.serde_info = typing.cast(Optional[SerDeInfo], serde_info)

    @property
    def input_format(self) -> Optional[str]:
        """Optional. Fully qualified class name of the InputFormat (e.g.
        "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum
        length is 128 characters."""

        return self._properties.get("inputFormat")

    @input_format.setter
    def input_format(self, value: Optional[str]):
        self._properties["inputFormat"] = _helpers._isinstance_or_raise(
            value, str, none_allowed=True
        )

    @property
    def location_uri(self) -> Optional[str]:
        """Optional. Physical location of the table (e.g.
        'gs://spark-dataproc-data/pangea-data/case_sensitive/' or
        'gs://spark-dataproc-data/pangea-data/'). The maximum length is
        2056 bytes."""

        return self._properties.get("locationUri")

    @location_uri.setter
    def location_uri(self, value: Optional[str]):
        self._properties["locationUri"] = _helpers._isinstance_or_raise(
            value, str, none_allowed=True
        )

    @property
    def output_format(self) -> Optional[str]:
        """Optional. Fully qualified class name of the OutputFormat (e.g.
        "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"). The maximum
        length is 128 characters."""

        return self._properties.get("outputFormat")

    @output_format.setter
    def output_format(self, value: Optional[str]):
        self._properties["outputFormat"] = _helpers._isinstance_or_raise(
            value, str, none_allowed=True
        )

    @property
    def serde_info(self) -> Optional[SerDeInfo]:
        """Optional. Serializer and deserializer information."""

        # NOTE(review): the key is spelled "serDeInfo" throughout this class;
        # confirm the spelling against the BigQuery REST reference.
        resource = _helpers._get_sub_prop(self._properties, ["serDeInfo"])
        if resource is None:
            return None
        # The stored value is the API-format dict, so a SerDeInfo is rebuilt
        # from it on every access.
        return typing.cast(SerDeInfo, SerDeInfo.from_api_repr(resource))

    @serde_info.setter
    def serde_info(self, value: Union[SerDeInfo, dict, None]):
        checked = _helpers._isinstance_or_raise(
            value, (SerDeInfo, dict), none_allowed=True
        )

        # SerDeInfo objects are stored in their API (dict) form; a dict or
        # None is stored unchanged.
        self._properties["serDeInfo"] = (
            checked.to_api_repr() if isinstance(checked, SerDeInfo) else checked
        )

    def to_api_repr(self) -> dict:
        """Build an API representation of this object.

        Returns:
            Dict[str, Any]:
                A dictionary in the format used by the BigQuery API. This is
                the instance's own property dictionary, not a copy.
        """
        return self._properties

    @classmethod
    def from_api_repr(cls, resource: dict) -> StorageDescriptor:
        """Factory: constructs an instance of the class (cls)
        given its API representation.

        Args:
            resource (Dict[str, Any]):
                API representation of the object to be instantiated. The
                dictionary is adopted as-is (no copy is made).

        Returns:
            An instance of the class initialized with data from 'resource'.
        """
        instance = cls()
        instance._properties = resource
        return instance

tests/unit/test_schema.py

+128
Original file line numberDiff line numberDiff line change
@@ -1213,3 +1213,131 @@ def test_from_api_repr(self):
12131213
# We convert both to dict format because these classes do not have a
12141214
# __eq__() method to facilitate direct equality comparisons.
12151215
assert result.to_api_repr() == expected.to_api_repr()
1216+
1217+
1218+
class TestStorageDescriptor:
    """Tests for the StorageDescriptor class."""

    @staticmethod
    def _get_target_class():
        return schema.StorageDescriptor

    def _make_one(self, *args, **kwargs):
        return self._get_target_class()(*args, **kwargs)

    # API-format (camelCase keys) representation of a SerDeInfo, shared by
    # the fixtures and the parametrized cases below.
    serdeinfo_resource = {
        "serializationLibrary": "testpath.to.LazySimpleSerDe",
        "name": "serde_lib_name",
        "parameters": {"key": "value"},
    }

    # from_api_repr() is a classmethod, so it is called on the class itself
    # rather than on a throwaway placeholder instance.
    SERDEINFO = schema.SerDeInfo.from_api_repr(serdeinfo_resource)

    # API-format representation of a fully populated StorageDescriptor.
    STORAGEDESCRIPTOR = {
        "inputFormat": "testpath.to.OrcInputFormat",
        "locationUri": "gs://test/path/",
        "outputFormat": "testpath.to.OrcOutputFormat",
        "serDeInfo": SERDEINFO.to_api_repr(),
    }

    @pytest.mark.parametrize(
        "input_format,location_uri,output_format,serde_info",
        [
            (None, None, None, None),
            ("testpath.to.OrcInputFormat", None, None, None),
            (None, "gs://test/path/", None, None),
            (None, None, "testpath.to.OrcOutputFormat", None),
            (None, None, None, SERDEINFO),
            (
                "testpath.to.OrcInputFormat",
                "gs://test/path/",
                "testpath.to.OrcOutputFormat",
                SERDEINFO,  # uses SERDEINFO class format
            ),
            (
                "testpath.to.OrcInputFormat",
                "gs://test/path/",
                "testpath.to.OrcOutputFormat",
                serdeinfo_resource,  # uses api resource format (dict)
            ),
        ],
    )
    def test_ctor_valid_input(
        self, input_format, location_uri, output_format, serde_info
    ):
        """Each accepted argument is readable back through its property."""
        storage_descriptor = self._make_one(
            input_format=input_format,
            location_uri=location_uri,
            output_format=output_format,
            serde_info=serde_info,
        )
        assert storage_descriptor.input_format == input_format
        assert storage_descriptor.location_uri == location_uri
        assert storage_descriptor.output_format == output_format
        # The serde_info getter always returns a SerDeInfo (or None), so the
        # comparison is done on the dict form for both accepted input types.
        if isinstance(serde_info, schema.SerDeInfo):
            assert (
                storage_descriptor.serde_info.to_api_repr() == serde_info.to_api_repr()
            )
        elif isinstance(serde_info, dict):
            assert storage_descriptor.serde_info.to_api_repr() == serde_info
        else:
            assert storage_descriptor.serde_info is None

    @pytest.mark.parametrize(
        "input_format,location_uri,output_format,serde_info",
        [
            (123, None, None, None),
            (None, 123, None, None),
            (None, None, 123, None),
            (None, None, None, 123),
        ],
    )
    def test_ctor_invalid_input(
        self, input_format, location_uri, output_format, serde_info
    ):
        """A wrongly typed value for any single argument raises TypeError."""
        with pytest.raises(TypeError) as e:
            self._make_one(
                input_format=input_format,
                location_uri=location_uri,
                output_format=output_format,
                serde_info=serde_info,
            )

        # Looking for the first word from the string "Pass <variable> as..."
        assert "Pass " in str(e.value)

    def test_to_api_repr(self):
        """to_api_repr() returns the camelCase API dictionary."""
        storage_descriptor = self._make_one(
            input_format="input_format",
            location_uri="location_uri",
            output_format="output_format",
            serde_info=self.SERDEINFO,
        )
        expected_repr = {
            "inputFormat": "input_format",
            "locationUri": "location_uri",
            "outputFormat": "output_format",
            "serDeInfo": self.SERDEINFO.to_api_repr(),
        }
        assert storage_descriptor.to_api_repr() == expected_repr

    def test_from_api_repr(self):
        """GIVEN an api representation of a StorageDescriptor (i.e. STORAGEDESCRIPTOR)
        WHEN converted into a StorageDescriptor using from_api_repr() and
        displayed as a dict
        THEN it will have the same representation as a StorageDescriptor created
        directly (via the _make_one() func) and displayed as a dict.
        """

        # generate via STORAGEDESCRIPTOR
        resource = self.STORAGEDESCRIPTOR
        result = self._get_target_class().from_api_repr(resource)

        expected = self._make_one(
            input_format="testpath.to.OrcInputFormat",
            location_uri="gs://test/path/",
            output_format="testpath.to.OrcOutputFormat",
            serde_info=self.SERDEINFO,
        )
        # Compared in dict form; the result is not compared object-to-object.
        assert result.to_api_repr() == expected.to_api_repr()

0 commit comments

Comments
 (0)