@@ -33,6 +33,43 @@ class DataFlowJavaOperator(BaseOperator):
    Start a Java Cloud DataFlow batch job. The parameters of the operation
    will be passed to the job.

+    .. seealso::
+        For more detail on job submission, have a look at the reference:
+        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
+
+    :param jar: The reference to a self-executing DataFlow jar (templated).
+    :type jar: str
+    :param job_name: The 'jobName' to use when executing the DataFlow job
+        (templated). This ends up being set in the pipeline options, so any entry
+        with key ``'jobName'`` in ``options`` will be overwritten.
+    :type job_name: str
+    :param dataflow_default_options: Map of default job options.
+    :type dataflow_default_options: dict
+    :param options: Map of job specific options.
+    :type options: dict
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
+        Platform.
+    :type gcp_conn_id: str
+    :param delegate_to: The account to impersonate, if any.
+        For this to work, the service account making the request must have
+        domain-wide delegation enabled.
+    :type delegate_to: str
+    :param poll_sleep: The time in seconds to sleep between polling Google
+        Cloud Platform for the dataflow job status while the job is in the
+        JOB_STATE_RUNNING state.
+    :type poll_sleep: int
+    :param job_class: The name of the dataflow job class to be executed; it
+        is often not the main class configured in the dataflow jar file.
+    :type job_class: str
+
+    ``jar``, ``options``, and ``job_name`` are templated, so you can use variables in them.
+
+    Note that ``dataflow_default_options`` and ``options`` will be merged to
+    specify the pipeline execution parameters, and ``dataflow_default_options``
+    is expected to hold high-level options, for instance, project and zone
+    information, which apply to all dataflow operators in the DAG.
+
    It's a good practice to define dataflow_* parameters in the default_args of the dag
    like the project, zone and staging location.

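For reference, a minimal sketch of how the merged options described above would be wired up in a DAG; the project, zone, bucket, and jar locations are placeholders, not values taken from this change:

# Hypothetical DAG snippet: dataflow_default_options supplies project-wide
# settings via default_args, while per-task options are merged on top.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator

default_args = {
    'start_date': datetime(2018, 1, 1),
    'dataflow_default_options': {
        'project': 'my-gcp-project',                  # applies to every dataflow task
        'zone': 'europe-west1-d',
        'stagingLocation': 'gs://my-bucket/staging',
    },
}

with DAG('example_java_dataflow', default_args=default_args,
         schedule_interval=None) as dag:
    wordcount = DataFlowJavaOperator(
        task_id='wordcount-java',
        jar='gs://my-bucket/binaries/wordcount.jar',  # templated field
        job_name='{{ task.task_id }}',                # the new default; rendered at run time
        options={
            'autoscalingAlgorithm': 'BASIC',          # job-specific, merged over the defaults
            'maxNumWorkers': '50',
        },
    )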
@@ -68,13 +105,14 @@ class DataFlowJavaOperator(BaseOperator):
    Both ``jar`` and ``options`` are templated so you can use variables in them.
    """
-    template_fields = ['options', 'jar']
+    template_fields = ['options', 'jar', 'job_name']
    ui_color = '#0273d4'

    @apply_defaults
    def __init__(
            self,
            jar,
+            job_name='{{task.task_id}}',
            dataflow_default_options=None,
            options=None,
            gcp_conn_id='google_cloud_default',
@@ -125,6 +163,7 @@ def __init__(
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.jar = jar
+        self.job_name = job_name
        self.dataflow_default_options = dataflow_default_options
        self.options = options
        self.poll_sleep = poll_sleep
@@ -141,14 +180,35 @@ def execute(self, context):
        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

-        hook.start_java_dataflow(self.task_id, dataflow_options,
+        hook.start_java_dataflow(self.job_name, dataflow_options,
                                 self.jar, self.job_class)


class DataflowTemplateOperator(BaseOperator):
    """
    Start a Templated Cloud DataFlow batch job. The parameters of the operation
    will be passed to the job.
+
+    :param template: The reference to the DataFlow template.
+    :type template: str
+    :param job_name: The 'jobName' to use when executing the DataFlow template
+        (templated).
+    :param dataflow_default_options: Map of default job environment options.
+    :type dataflow_default_options: dict
+    :param parameters: Map of job specific parameters for the template.
+    :type parameters: dict
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
+        Platform.
+    :type gcp_conn_id: str
+    :param delegate_to: The account to impersonate, if any.
+        For this to work, the service account making the request must have
+        domain-wide delegation enabled.
+    :type delegate_to: str
+    :param poll_sleep: The time in seconds to sleep between polling Google
+        Cloud Platform for the dataflow job status while the job is in the
+        JOB_STATE_RUNNING state.
+    :type poll_sleep: int
+
    It's a good practice to define dataflow_* parameters in the default_args of the dag
    like the project, zone and staging location.

@@ -183,16 +243,27 @@ class DataflowTemplateOperator(BaseOperator):
           gcp_conn_id='gcp-airflow-service-account',
           dag=my-dag)

-    ``template``, ``dataflow_default_options`` and ``parameters`` are templated so you can
-    use variables in them.
+    ``template``, ``dataflow_default_options``, ``parameters``, and ``job_name`` are
+    templated, so you can use variables in them.
+
+    Note that ``dataflow_default_options`` is expected to hold high-level options,
+    such as project information, which apply to all dataflow operators in the DAG.
+
+    .. seealso::
+        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters
+        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment
+        For more detail on job template execution, have a look at the reference:
+        https://cloud.google.com/dataflow/docs/templates/executing-templates
    """
-    template_fields = ['parameters', 'dataflow_default_options', 'template']
+    template_fields = ['parameters', 'dataflow_default_options', 'template', 'job_name']
    ui_color = '#0273d4'

    @apply_defaults
    def __init__(
            self,
            template,
+            job_name='{{task.task_id}}',
            dataflow_default_options=None,
            parameters=None,
            gcp_conn_id='google_cloud_default',
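Similarly, a hypothetical snippet of how ``DataflowTemplateOperator`` would be called once this change is in; the template path, project, and bucket locations are placeholders, and ``dag`` is assumed to be an existing DAG object:

# Hypothetical usage of DataflowTemplateOperator with the new job_name argument.
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator

launch_template = DataflowTemplateOperator(
    task_id='launch-wordcount-template',
    template='gs://dataflow-templates/latest/Word_Count',  # placeholder template path
    job_name='{{ task.task_id }}',          # templated; defaults to the task id
    dataflow_default_options={
        'project': 'my-gcp-project',        # environment-level options
        'zone': 'us-central1-f',
        'tempLocation': 'gs://my-bucket/tmp',
    },
    parameters={
        'inputFile': 'gs://my-bucket/input.txt',
        'output': 'gs://my-bucket/output',
    },
    gcp_conn_id='google_cloud_default',
    dag=dag,
)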
@@ -240,14 +311,15 @@ def __init__(
        self.dataflow_default_options = dataflow_default_options
        self.poll_sleep = poll_sleep
        self.template = template
+        self.job_name = job_name
        self.parameters = parameters

    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

-        hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
+        hook.start_template_dataflow(self.job_name, self.dataflow_default_options,
                                     self.parameters, self.template)

@@ -266,6 +338,10 @@ class DataFlowPythonOperator(BaseOperator):
    :param py_file: Reference to the python dataflow pipeline file.py, e.g.,
        /some/local/file/path/to/your/python/pipeline/file.
    :type py_file: string
+    :param job_name: The 'job_name' to use when executing the DataFlow job
+        (templated). This ends up being set in the pipeline options, so any entry
+        with key ``'jobName'`` or ``'job_name'`` in ``options`` will be overwritten.
+    :type job_name: str
    :param py_options: Additional python options.
    :type py_options: list of strings, e.g., ["-m", "-v"].
    :param dataflow_default_options: Map of default job options.
@@ -284,13 +360,13 @@ class DataFlowPythonOperator(BaseOperator):
        JOB_STATE_RUNNING state.
    :type poll_sleep: int
    """
-
-    template_fields = ['options', 'dataflow_default_options']
+    template_fields = ['options', 'dataflow_default_options', 'job_name']

    @apply_defaults
    def __init__(
            self,
            py_file,
+            job_name='{{task.task_id}}',
            py_options=None,
            dataflow_default_options=None,
            options=None,
@@ -303,6 +379,7 @@ def __init__(
        super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

        self.py_file = py_file
+        self.job_name = job_name
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
@@ -328,7 +405,7 @@ def execute(self, context):
        formatted_options = {camel_to_snake(key): dataflow_options[key]
                             for key in dataflow_options}
        hook.start_python_dataflow(
-            self.task_id, formatted_options,
+            self.job_name, formatted_options,
            self.py_file, self.py_options)

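And a hypothetical snippet for ``DataFlowPythonOperator``; the pipeline path, project, and bucket locations are placeholders, and ``dag`` is assumed to be an existing DAG object:

# Hypothetical usage of DataFlowPythonOperator with the new job_name argument.
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

run_pipeline = DataFlowPythonOperator(
    task_id='run-python-pipeline',
    py_file='/path/to/your/pipeline.py',  # placeholder local path
    job_name='{{ task.task_id }}',        # rendered, then passed to start_python_dataflow
    py_options=['-m'],
    dataflow_default_options={
        'project': 'my-gcp-project',
        'staging_location': 'gs://my-bucket/staging',
        'temp_location': 'gs://my-bucket/tmp',
    },
    options={
        'output': 'gs://my-bucket/output',
    },
    dag=dag,
)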