29
29
__all__ = ['CompExecution' , 'FuncExecution' , 'MechExecution' ]
30
30
31
31
32
- def _convert_ctype_to_python (x ):
33
- if isinstance (x , ctypes .Structure ):
34
- return [_convert_ctype_to_python (getattr (x , field_name )) for field_name , _ in x ._fields_ ]
35
- if isinstance (x , ctypes .Array ):
36
- return [_convert_ctype_to_python (el ) for el in x ]
37
- if isinstance (x , (ctypes .c_double , ctypes .c_float )):
38
- return x .value
39
- if isinstance (x , (float , int )):
40
- return x
41
-
42
- assert False , "Don't know how to convert: {}" .format (x )
43
-
44
-
45
32
def _tupleize (x ):
46
33
try :
47
34
return tuple (_tupleize (y ) for y in x )
@@ -557,7 +544,8 @@ def _bin_run_func(self):
557
544
if self .__bin_run_func is None :
558
545
self .__bin_run_func = pnlvm .LLVMBinaryFunction .from_obj (self ._composition ,
559
546
tags = self .__tags .union ({"run" }),
560
- ctype_ptr_args = (3 , 4 ))
547
+ ctype_ptr_args = (3 ,),
548
+ dynamic_size_args = (4 ,))
561
549
562
550
return self .__bin_run_func
563
551
@@ -572,53 +560,53 @@ def _prepare_run(self, inputs, runs, num_input_sets):
572
560
inputs = self ._get_run_input_struct (inputs , num_input_sets )
573
561
574
562
# Create output buffer
575
- outputs = (self ._bin_run_func .byref_arg_types [4 ] * runs )()
563
+ outputs = self ._bin_func .np_buffer_for_arg (4 , extra_dimensions = (runs ,))
564
+ assert ctypes .sizeof (self ._bin_run_func .byref_arg_types [4 ]) * runs == outputs .nbytes
576
565
577
566
if "stat" in self ._debug_env :
578
- print ("Output struct size:" , _pretty_size (ctypes .sizeof (outputs )),
579
- "for" , self ._composition .name )
567
+ print ("Output struct size:" , _pretty_size (outputs .nbytes ), "for" , self ._composition .name )
580
568
581
569
runs_count = np .asarray (runs , dtype = np .uint32 ).copy ()
582
570
input_count = np .asarray (num_input_sets , dtype = np .uint32 )
583
571
584
572
return inputs , outputs , runs_count , input_count
585
573
586
574
def run (self , inputs , runs , num_input_sets ):
587
- ct_inputs , ct_outputs , runs_count , input_count = self ._prepare_run (inputs , runs , num_input_sets )
575
+ ct_inputs , outputs , runs_count , input_count = self ._prepare_run (inputs , runs , num_input_sets )
588
576
589
577
self ._bin_run_func (self ._state_struct ,
590
578
self ._param_struct ,
591
579
self ._data_struct ,
592
580
ct_inputs ,
593
- ct_outputs ,
581
+ outputs ,
594
582
runs_count ,
595
583
input_count )
596
584
597
585
# Extract only #trials elements in case the run exited early
598
586
assert runs_count <= runs , "Composition ran more times than allowed!"
599
- return _convert_ctype_to_python ( ct_outputs ) [0 :runs_count ]
587
+ return self . _get_indexable ( outputs [0 :runs_count ])
600
588
601
589
def cuda_run (self , inputs , runs , num_input_sets ):
602
- ct_inputs , ct_outputs , runs_count , input_count = self ._prepare_run (inputs , runs , num_input_sets )
590
+ ct_inputs , outputs , runs_count , input_count = self ._prepare_run (inputs , runs , num_input_sets )
603
591
604
592
self ._bin_run_func .cuda_call (self ._cuda_state_struct ,
605
593
self ._cuda_param_struct ,
606
594
self ._cuda_data_struct ,
607
595
jit_engine .pycuda .driver .In (np .ctypeslib .as_array (ct_inputs )),
608
- jit_engine .pycuda .driver .Out (np . ctypeslib . as_array ( ct_outputs ) ),
596
+ jit_engine .pycuda .driver .Out (outputs ),
609
597
jit_engine .pycuda .driver .InOut (runs_count ),
610
598
jit_engine .pycuda .driver .In (input_count ))
611
599
612
600
# Extract only #trials elements in case the run exited early
613
601
assert runs_count <= runs , "Composition ran more times than allowed: {}" .format (runs )
614
- return _convert_ctype_to_python ( ct_outputs ) [0 :runs_count ]
602
+ return self . _get_indexable ( outputs [0 :runs_count ])
615
603
616
604
def _prepare_evaluate (self , inputs , num_input_sets , num_evaluations , all_results :bool ):
617
605
ocm = self ._composition .controller
618
606
619
607
eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective"
620
608
tags = {"evaluate" , "alloc_range" , eval_type }
621
- bin_func = pnlvm .LLVMBinaryFunction .from_obj (ocm , tags = frozenset (tags ), ctype_ptr_args = (4 , 5 ))
609
+ bin_func = pnlvm .LLVMBinaryFunction .from_obj (ocm , tags = frozenset (tags ), ctype_ptr_args = (5 ,), dynamic_size_args = ( 4 , ))
622
610
self .__bin_func = bin_func
623
611
624
612
# There are 8 arguments to evaluate_alloc_range:
@@ -635,42 +623,42 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results
635
623
# Construct input variable, the 5th parameter of the evaluate function
636
624
ct_inputs = self ._get_run_input_struct (inputs , num_input_sets , 5 )
637
625
638
- # Output ctype
639
- out_el_ty = bin_func . byref_arg_types [ 4 ]
626
+ # Output buffer
627
+ extra_dims = ( num_evaluations ,)
640
628
if all_results :
641
629
num_trials = ocm .parameters .num_trials_per_estimate .get (self ._execution_context )
642
- if num_trials is None :
643
- num_trials = num_input_sets
644
- out_el_ty *= num_trials
645
- out_ty = out_el_ty * num_evaluations
630
+ assert num_trials is not None
631
+ extra_dims = extra_dims + ( num_trials ,)
632
+
633
+ outputs = self . _bin_func . np_buffer_for_arg ( 4 , extra_dimensions = extra_dims )
646
634
647
635
num_inputs = np .asarray (num_input_sets , dtype = np .uint32 )
648
636
if "stat" in self ._debug_env :
649
637
print ("Evaluate result struct type size:" ,
650
- _pretty_size (ctypes .sizeof (out_ty )),
638
+ _pretty_size (ctypes .sizeof (outputs . nbytes )),
651
639
"( evaluations:" , num_evaluations , "element size:" , ctypes .sizeof (out_el_ty ), ")" ,
652
640
"for" , self ._obj .name )
653
641
654
- return comp_params , comp_state , comp_data , ct_inputs , out_ty () , num_inputs
642
+ return comp_params , comp_state , comp_data , ct_inputs , outputs , num_inputs
655
643
656
644
def cuda_evaluate (self , inputs , num_input_sets , num_evaluations , all_results :bool = False ):
657
- comp_params , comp_state , comp_data , ct_inputs , ct_results , num_inputs = \
645
+ comp_params , comp_state , comp_data , ct_inputs , results , num_inputs = \
658
646
self ._prepare_evaluate (inputs , num_input_sets , num_evaluations , all_results )
659
647
660
648
cuda_args = (jit_engine .pycuda .driver .In (comp_params ),
661
649
jit_engine .pycuda .driver .In (comp_state ),
662
- jit_engine .pycuda .driver .Out (np . ctypeslib . as_array ( ct_results )), # results
650
+ jit_engine .pycuda .driver .Out (results ), # results
663
651
jit_engine .pycuda .driver .In (np .ctypeslib .as_array (ct_inputs )), # inputs
664
652
jit_engine .pycuda .driver .In (comp_data ), # composition data
665
653
jit_engine .pycuda .driver .In (num_inputs ), # number of inputs
666
654
)
667
655
668
656
self .__bin_func .cuda_call (* cuda_args , threads = int (num_evaluations ))
669
657
670
- return ct_results
658
+ return results
671
659
672
660
def thread_evaluate (self , inputs , num_input_sets , num_evaluations , all_results :bool = False ):
673
- comp_params , comp_state , comp_data , ct_inputs , ct_results , num_inputs = \
661
+ comp_params , comp_state , comp_data , ct_inputs , outputs , num_inputs = \
674
662
self ._prepare_evaluate (inputs , num_input_sets , num_evaluations , all_results )
675
663
676
664
jobs = min (os .cpu_count (), num_evaluations )
@@ -679,19 +667,21 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b
679
667
parallel_start = time .time ()
680
668
with concurrent .futures .ThreadPoolExecutor (max_workers = jobs ) as ex :
681
669
682
- # Create input and result typed casts once, they are the same
683
- # for every submitted job.
684
- results_arg = ctypes .cast (ct_results , self .__bin_func .c_func .argtypes [4 ])
670
+ # Create input typed cast once, it is the same for every submitted job.
685
671
input_arg = ctypes .cast (ct_inputs , self .__bin_func .c_func .argtypes [5 ])
686
672
673
+ # numpy dynamic args expect only one extra dimension
674
+ output_arg = outputs .reshape (- 1 , * self .__bin_func .np_arg_dtypes [4 ].shape )
675
+ assert output_arg .base is outputs
676
+
687
677
# There are 8 arguments to evaluate_alloc_range:
688
678
# comp_param, comp_state, from, to, results, input, comp_data, input length
689
679
results = [ex .submit (self .__bin_func ,
690
680
comp_params ,
691
681
comp_state ,
692
682
int (i * evals_per_job ),
693
683
min ((i + 1 ) * evals_per_job , num_evaluations ),
694
- results_arg ,
684
+ output_arg ,
695
685
input_arg ,
696
686
comp_data ,
697
687
num_inputs )
@@ -707,4 +697,4 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b
707
697
exceptions = [r .exception () for r in results ]
708
698
assert all (e is None for e in exceptions ), "Not all jobs finished sucessfully: {}" .format (exceptions )
709
699
710
- return ct_results
700
+ return outputs
0 commit comments