
Commit dfff4a0

chronitis authored and kaxil committed
[AIRFLOW-2997] Support cluster fields in bigquery (#3838)
This adds a cluster_fields argument to the BigQuery hook, the GCS-to-BigQuery operator, and the BigQuery query operator. This field requests that BigQuery store the result of the query/load operation sorted according to the specified fields (the order of the fields given is significant).
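
For orientation, here is a sketch of the job configuration shape this produces. The project, dataset, table, and column names below are invented for illustration; the mapping of cluster_fields to a 'clustering' key alongside 'timePartitioning' is taken from the diffs that follow.

# Illustrative sketch only -- identifiers are hypothetical, not from this commit.
configuration = {
    'load': {
        'destinationTable': {
            'projectId': 'my-project',
            'datasetId': 'my_dataset',
            'tableId': 'my_table',
        },
        'timePartitioning': {'type': 'DAY'},
        # cluster_fields=['customer_id', 'event_date'] becomes:
        'clustering': {'fields': ['customer_id', 'event_date']},
    }
}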
1 parent 4ec0f0c commit dfff4a0

File tree

4 files changed: +125 −5 lines changed

airflow/contrib/hooks/bigquery_hook.py

+21 −4

@@ -496,7 +496,8 @@ def run_query(self,
                   schema_update_options=(),
                   priority='INTERACTIVE',
                   time_partitioning=None,
-                  api_resource_configs=None):
+                  api_resource_configs=None,
+                  cluster_fields=None):
         """
         Executes a BigQuery SQL query. Optionally persists results in a BigQuery
         table. See here:
@@ -563,8 +564,12 @@ def run_query(self,
         :param time_partitioning: configure optional time partitioning fields i.e.
             partition by field, type and expiration as per API specifications.
         :type time_partitioning: dict
-
+        :param cluster_fields: Request that the result of this query be stored sorted
+            by one or more columns. This is only available in combination with
+            time_partitioning. The order of columns given determines the sort order.
+        :type cluster_fields: list of str
         """
+
         if not api_resource_configs:
             api_resource_configs = self.api_resource_configs
         else:
@@ -629,6 +634,9 @@ def run_query(self,
                 'tableId': destination_table,
             }

+        if cluster_fields:
+            cluster_fields = {'fields': cluster_fields}
+
         query_param_list = [
             (sql, 'query', None, str),
             (priority, 'priority', 'INTERACTIVE', str),
@@ -639,7 +647,8 @@ def run_query(self,
             (maximum_bytes_billed, 'maximumBytesBilled', None, float),
             (time_partitioning, 'timePartitioning', {}, dict),
             (schema_update_options, 'schemaUpdateOptions', None, tuple),
-            (destination_dataset_table, 'destinationTable', None, dict)
+            (destination_dataset_table, 'destinationTable', None, dict),
+            (cluster_fields, 'clustering', None, dict),
         ]

         for param_tuple in query_param_list:
@@ -854,7 +863,8 @@ def run_load(self,
                  allow_jagged_rows=False,
                  schema_update_options=(),
                  src_fmt_configs=None,
-                 time_partitioning=None):
+                 time_partitioning=None,
+                 cluster_fields=None):
         """
         Executes a BigQuery load command to load data from Google Cloud Storage
         to BigQuery. See here:
@@ -916,6 +926,10 @@ def run_load(self,
         :param time_partitioning: configure optional time partitioning fields i.e.
             partition by field, type and expiration as per API specifications.
         :type time_partitioning: dict
+        :param cluster_fields: Request that the result of this load be stored sorted
+            by one or more columns. This is only available in combination with
+            time_partitioning. The order of columns given determines the sort order.
+        :type cluster_fields: list of str
         """

         # bigquery only allows certain source formats
@@ -979,6 +993,9 @@ def run_load(self,
                 'timePartitioning': time_partitioning
             })

+        if cluster_fields:
+            configuration['load'].update({'clustering': {'fields': cluster_fields}})
+
         if schema_fields:
             configuration['load']['schema'] = {'fields': schema_fields}

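A minimal usage sketch against the hook, mirroring the call shape exercised in the new tests at the bottom of this commit. The connection id is Airflow's default; the query, table, and field names are placeholders.

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default')
cursor = bq_hook.get_conn().cursor()
cursor.run_query(
    sql='select 1',                            # placeholder query
    destination_dataset_table='my_dataset.my_table',
    time_partitioning={'type': 'DAY'},         # clustering requires partitioning
    cluster_fields=['field1', 'field2'],       # order determines the sort order
)
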
airflow/contrib/operators/bigquery_operator.py

+7 −0

@@ -98,6 +98,10 @@ class BigQueryOperator(BaseOperator):
     :param time_partitioning: configure optional time partitioning fields i.e.
         partition by field, type and expiration as per API specifications.
     :type time_partitioning: dict
+    :param cluster_fields: Request that the result of this query be stored sorted
+        by one or more columns. This is only available in conjunction with
+        time_partitioning. The order of columns given determines the sort order.
+    :type cluster_fields: list of str
     """

     template_fields = ('bql', 'sql', 'destination_dataset_table', 'labels')
@@ -125,6 +129,7 @@ def __init__(self,
                  priority='INTERACTIVE',
                  time_partitioning=None,
                  api_resource_configs=None,
+                 cluster_fields=None,
                  *args,
                  **kwargs):
         super(BigQueryOperator, self).__init__(*args, **kwargs)
@@ -150,6 +155,7 @@ def __init__(self,
             self.time_partitioning = {}
         if api_resource_configs is None:
             self.api_resource_configs = {}
+        self.cluster_fields = cluster_fields

         # TODO remove `bql` in Airflow 2.0
         if self.bql:
@@ -190,6 +196,7 @@ def execute(self, context):
             priority=self.priority,
             time_partitioning=self.time_partitioning,
             api_resource_configs=self.api_resource_configs,
+            cluster_fields=self.cluster_fields,
         )

     def on_kill(self):
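
At the DAG level, the new argument might be used like this. This is a hedged sketch: the task id, query, and table names are invented, and a dag object is assumed to exist.

from airflow.contrib.operators.bigquery_operator import BigQueryOperator

clustered_query = BigQueryOperator(
    task_id='clustered_query',                 # hypothetical task id
    sql='select 1',                            # placeholder query
    destination_dataset_table='my_dataset.my_table',
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},         # required for clustering
    cluster_fields=['field1', 'field2'],
    dag=dag,                                   # assumed to be defined elsewhere
)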

airflow/contrib/operators/gcs_to_bq.py

+9 −1

@@ -114,6 +114,11 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
         Note that 'field' is not available in concurrency with
         dataset.table$partition.
     :type time_partitioning: dict
+    :param cluster_fields: Request that the result of this load be stored sorted
+        by one or more columns. This is only available in conjunction with
+        time_partitioning. The order of columns given determines the sort order.
+        Not applicable for external tables.
+    :type cluster_fields: list of str
     """
     template_fields = ('bucket', 'source_objects',
                        'schema_object', 'destination_project_dataset_table')
@@ -146,6 +151,7 @@ def __init__(self,
                  src_fmt_configs=None,
                  external_table=False,
                  time_partitioning=None,
+                 cluster_fields=None,
                  *args, **kwargs):

         super(GoogleCloudStorageToBigQueryOperator, self).__init__(*args, **kwargs)
@@ -183,6 +189,7 @@ def __init__(self,
         self.schema_update_options = schema_update_options
         self.src_fmt_configs = src_fmt_configs
         self.time_partitioning = time_partitioning
+        self.cluster_fields = cluster_fields

     def execute(self, context):
         bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
@@ -238,7 +245,8 @@ def execute(self, context):
             allow_jagged_rows=self.allow_jagged_rows,
             schema_update_options=self.schema_update_options,
             src_fmt_configs=self.src_fmt_configs,
-            time_partitioning=self.time_partitioning)
+            time_partitioning=self.time_partitioning,
+            cluster_fields=self.cluster_fields)

         if self.max_id_key:
             cursor.execute('SELECT MAX({}) FROM {}'.format(
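
The load path looks similar. Again a sketch under assumptions: the bucket, object paths, and schema object are hypothetical, and dag is assumed to exist.

from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

load_events = GoogleCloudStorageToBigQueryOperator(
    task_id='load_events',                     # hypothetical task id
    bucket='my-bucket',
    source_objects=['data/events/part-*.csv'],
    destination_project_dataset_table='my_dataset.my_table',
    schema_object='schemas/events.json',
    time_partitioning={'type': 'DAY'},
    cluster_fields=['field1', 'field2'],       # per the docstring, not applicable for external tables
    dag=dag,
)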

tests/contrib/hooks/test_bigquery_hook.py

+88 −0

@@ -448,6 +448,94 @@ def test_extra_time_partitioning_options(self):
         self.assertEqual(tp_out, expect)


+class TestClusteringInRunJob(unittest.TestCase):
+
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.LoggingMixin")
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.time")
+    @mock.patch.object(hook.BigQueryBaseCursor, 'run_with_configuration')
+    def test_run_load_default(self, mocked_rwc, mocked_time, mocked_logging):
+        project_id = 12345
+
+        def run_with_config(config):
+            self.assertIsNone(config['load'].get('clustering'))
+        mocked_rwc.side_effect = run_with_config
+
+        bq_hook = hook.BigQueryBaseCursor(mock.Mock(), project_id)
+        bq_hook.run_load(
+            destination_project_dataset_table='my_dataset.my_table',
+            schema_fields=[],
+            source_uris=[],
+        )
+
+        mocked_rwc.assert_called_once()
+
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.LoggingMixin")
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.time")
+    @mock.patch.object(hook.BigQueryBaseCursor, 'run_with_configuration')
+    def test_run_load_with_arg(self, mocked_rwc, mocked_time, mocked_logging):
+        project_id = 12345
+
+        def run_with_config(config):
+            self.assertEqual(
+                config['load']['clustering'],
+                {
+                    'fields': ['field1', 'field2']
+                }
+            )
+        mocked_rwc.side_effect = run_with_config
+
+        bq_hook = hook.BigQueryBaseCursor(mock.Mock(), project_id)
+        bq_hook.run_load(
+            destination_project_dataset_table='my_dataset.my_table',
+            schema_fields=[],
+            source_uris=[],
+            cluster_fields=['field1', 'field2'],
+            time_partitioning={'type': 'DAY'}
+        )
+
+        mocked_rwc.assert_called_once()
+
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.LoggingMixin")
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.time")
+    @mock.patch.object(hook.BigQueryBaseCursor, 'run_with_configuration')
+    def test_run_query_default(self, mocked_rwc, mocked_time, mocked_logging):
+        project_id = 12345
+
+        def run_with_config(config):
+            self.assertIsNone(config['query'].get('clustering'))
+        mocked_rwc.side_effect = run_with_config
+
+        bq_hook = hook.BigQueryBaseCursor(mock.Mock(), project_id)
+        bq_hook.run_query(sql='select 1')
+
+        mocked_rwc.assert_called_once()
+
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.LoggingMixin")
+    @mock.patch("airflow.contrib.hooks.bigquery_hook.time")
+    @mock.patch.object(hook.BigQueryBaseCursor, 'run_with_configuration')
+    def test_run_query_with_arg(self, mocked_rwc, mocked_time, mocked_logging):
+        project_id = 12345
+
+        def run_with_config(config):
+            self.assertEqual(
+                config['query']['clustering'],
+                {
+                    'fields': ['field1', 'field2']
+                }
+            )
+        mocked_rwc.side_effect = run_with_config
+
+        bq_hook = hook.BigQueryBaseCursor(mock.Mock(), project_id)
+        bq_hook.run_query(
+            sql='select 1',
+            destination_dataset_table='my_dataset.my_table',
+            cluster_fields=['field1', 'field2'],
+            time_partitioning={'type': 'DAY'}
+        )
+
+        mocked_rwc.assert_called_once()
+
+
 class TestBigQueryHookLegacySql(unittest.TestCase):
     """Ensure `use_legacy_sql` param in `BigQueryHook` propagates properly."""
