Skip to content

Commit 2d0899e

Browse files
authored Aug 21, 2024
llvm: Use numpy array for dynamically sized output arguments (#3033)
Use numpy arrays for "run()" results. Use numpy arrays for "evaluate()" results. Simplify GridSearch compiled search function invocation.
2 parents 7047302 + 7267a1a commit 2d0899e

File tree

3 files changed

+46
-59
lines changed

3 files changed

+46
-59
lines changed
 

‎psyneulink/core/components/functions/nonstateful/optimizationfunctions.py

+6-12
Original file line numberDiff line numberDiff line change
@@ -2096,14 +2096,11 @@ def _function(self,
20962096
# if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}:
20972097
if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}:
20982098

2099-
ct_values = all_values
2100-
num_values = len(ct_values)
2101-
21022099
# Reduce array of values to min/max
21032100
# select_min params are:
2104-
# params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, count
2101+
# params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, start, stop
21052102
min_tags = frozenset({"select_min", "evaluate_type_objective"})
2106-
bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, ctype_ptr_args=(0, 1, 3, 5))
2103+
bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, ctype_ptr_args=(0, 1, 3), dynamic_size_args=(5,))
21072104

21082105
ct_param = bin_func.byref_arg_types[0](*self._get_param_initializer(context))
21092106
ct_state = bin_func.byref_arg_types[1](*self._get_state_initializer(context))
@@ -2114,15 +2111,12 @@ def _function(self,
21142111
bin_func(ct_param,
21152112
ct_state,
21162113
optimal_sample,
2117-
None, # samples. NULL, it's generated by the function.
2114+
None, # samples. NULL, it's generated by the function.
21182115
optimal_value,
2119-
ct_values,
2116+
all_values,
21202117
number_of_optimal_values,
2121-
bin_func.c_func.argtypes[7](0), # start
2122-
bin_func.c_func.argtypes[8](num_values)) # stop
2123-
2124-
# Convert outputs to Numpy/Python
2125-
all_values = np.ctypeslib.as_array(ct_values)
2118+
0, # start
2119+
len(all_values)) # stop
21262120

21272121
# Python version
21282122
else:

‎psyneulink/core/llvm/__init__.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def _llvm_build(target_generation=_binary_generation + 1):
123123

124124

125125
class LLVMBinaryFunction:
126-
def __init__(self, name: str, *, ctype_ptr_args=()):
126+
def __init__(self, name: str, *, ctype_ptr_args:tuple=(), dynamic_size_args:tuple=()):
127127
self.name = name
128128

129129
self.__c_func = None
@@ -154,7 +154,10 @@ def __init__(self, name: str, *, ctype_ptr_args=()):
154154

155155
for i, arg in enumerate(self.np_arg_dtypes):
156156
if i not in ctype_ptr_args and self.byref_arg_types[i] is not None:
157-
args[i] = np.ctypeslib.ndpointer(dtype=arg.base, shape=arg.shape)
157+
if i in dynamic_size_args:
158+
args[i] = np.ctypeslib.ndpointer(dtype=arg.base, ndim=len(arg.shape) + 1, flags='C_CONTIGUOUS')
159+
else:
160+
args[i] = np.ctypeslib.ndpointer(dtype=arg.base, shape=arg.shape, flags='C_CONTIGUOUS')
158161

159162
middle = time.perf_counter()
160163
self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args)
@@ -233,14 +236,14 @@ def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan):
233236

234237
@staticmethod
235238
@functools.lru_cache(maxsize=32)
236-
def from_obj(obj, *, tags:frozenset=frozenset(), ctype_ptr_args:tuple=()):
239+
def from_obj(obj, *, tags:frozenset=frozenset(), ctype_ptr_args:tuple=(), dynamic_size_args:tuple=()):
237240
name = LLVMBuilderContext.get_current().gen_llvm_function(obj, tags=tags).name
238-
return LLVMBinaryFunction.get(name, ctype_ptr_args=ctype_ptr_args)
241+
return LLVMBinaryFunction.get(name, ctype_ptr_args=ctype_ptr_args, dynamic_size_args=dynamic_size_args)
239242

240243
@staticmethod
241244
@functools.lru_cache(maxsize=32)
242-
def get(name: str, *, ctype_ptr_args:tuple=()):
243-
return LLVMBinaryFunction(name, ctype_ptr_args=ctype_ptr_args)
245+
def get(name: str, *, ctype_ptr_args:tuple=(), dynamic_size_args:tuple=()):
246+
return LLVMBinaryFunction(name, ctype_ptr_args=ctype_ptr_args, dynamic_size_args=dynamic_size_args)
244247

245248

246249
_cpu_engine = None

‎psyneulink/core/llvm/execution.py

+31-41
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,6 @@
2929
__all__ = ['CompExecution', 'FuncExecution', 'MechExecution']
3030

3131

32-
def _convert_ctype_to_python(x):
33-
if isinstance(x, ctypes.Structure):
34-
return [_convert_ctype_to_python(getattr(x, field_name)) for field_name, _ in x._fields_]
35-
if isinstance(x, ctypes.Array):
36-
return [_convert_ctype_to_python(el) for el in x]
37-
if isinstance(x, (ctypes.c_double, ctypes.c_float)):
38-
return x.value
39-
if isinstance(x, (float, int)):
40-
return x
41-
42-
assert False, "Don't know how to convert: {}".format(x)
43-
44-
4532
def _tupleize(x):
4633
try:
4734
return tuple(_tupleize(y) for y in x)
@@ -557,7 +544,8 @@ def _bin_run_func(self):
557544
if self.__bin_run_func is None:
558545
self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj(self._composition,
559546
tags=self.__tags.union({"run"}),
560-
ctype_ptr_args=(3, 4))
547+
ctype_ptr_args=(3,),
548+
dynamic_size_args=(4,))
561549

562550
return self.__bin_run_func
563551

@@ -572,53 +560,53 @@ def _prepare_run(self, inputs, runs, num_input_sets):
572560
inputs = self._get_run_input_struct(inputs, num_input_sets)
573561

574562
# Create output buffer
575-
outputs = (self._bin_run_func.byref_arg_types[4] * runs)()
563+
outputs = self._bin_func.np_buffer_for_arg(4, extra_dimensions=(runs,))
564+
assert ctypes.sizeof(self._bin_run_func.byref_arg_types[4]) * runs == outputs.nbytes
576565

577566
if "stat" in self._debug_env:
578-
print("Output struct size:", _pretty_size(ctypes.sizeof(outputs)),
579-
"for", self._composition.name)
567+
print("Output struct size:", _pretty_size(outputs.nbytes), "for", self._composition.name)
580568

581569
runs_count = np.asarray(runs, dtype=np.uint32).copy()
582570
input_count = np.asarray(num_input_sets, dtype=np.uint32)
583571

584572
return inputs, outputs, runs_count, input_count
585573

586574
def run(self, inputs, runs, num_input_sets):
587-
ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets)
575+
ct_inputs, outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets)
588576

589577
self._bin_run_func(self._state_struct,
590578
self._param_struct,
591579
self._data_struct,
592580
ct_inputs,
593-
ct_outputs,
581+
outputs,
594582
runs_count,
595583
input_count)
596584

597585
# Extract only #trials elements in case the run exited early
598586
assert runs_count <= runs, "Composition ran more times than allowed!"
599-
return _convert_ctype_to_python(ct_outputs)[0:runs_count]
587+
return self._get_indexable(outputs[0:runs_count])
600588

601589
def cuda_run(self, inputs, runs, num_input_sets):
602-
ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets)
590+
ct_inputs, outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets)
603591

604592
self._bin_run_func.cuda_call(self._cuda_state_struct,
605593
self._cuda_param_struct,
606594
self._cuda_data_struct,
607595
jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)),
608-
jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_outputs)),
596+
jit_engine.pycuda.driver.Out(outputs),
609597
jit_engine.pycuda.driver.InOut(runs_count),
610598
jit_engine.pycuda.driver.In(input_count))
611599

612600
# Extract only #trials elements in case the run exited early
613601
assert runs_count <= runs, "Composition ran more times than allowed: {}".format(runs)
614-
return _convert_ctype_to_python(ct_outputs)[0:runs_count]
602+
return self._get_indexable(outputs[0:runs_count])
615603

616604
def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool):
617605
ocm = self._composition.controller
618606

619607
eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective"
620608
tags = {"evaluate", "alloc_range", eval_type}
621-
bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), ctype_ptr_args=(4, 5))
609+
bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), ctype_ptr_args=(5,), dynamic_size_args=(4,))
622610
self.__bin_func = bin_func
623611

624612
# There are 8 arguments to evaluate_alloc_range:
@@ -635,42 +623,42 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results
635623
# Construct input variable, the 5th parameter of the evaluate function
636624
ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5)
637625

638-
# Output ctype
639-
out_el_ty = bin_func.byref_arg_types[4]
626+
# Output buffer
627+
extra_dims = (num_evaluations,)
640628
if all_results:
641629
num_trials = ocm.parameters.num_trials_per_estimate.get(self._execution_context)
642-
if num_trials is None:
643-
num_trials = num_input_sets
644-
out_el_ty *= num_trials
645-
out_ty = out_el_ty * num_evaluations
630+
assert num_trials is not None
631+
extra_dims = extra_dims + (num_trials,)
632+
633+
outputs = self._bin_func.np_buffer_for_arg(4, extra_dimensions=extra_dims)
646634

647635
num_inputs = np.asarray(num_input_sets, dtype=np.uint32)
648636
if "stat" in self._debug_env:
649637
print("Evaluate result struct type size:",
650-
_pretty_size(ctypes.sizeof(out_ty)),
638+
_pretty_size(ctypes.sizeof(outputs.nbytes)),
651639
"( evaluations:", num_evaluations, "element size:", ctypes.sizeof(out_el_ty), ")",
652640
"for", self._obj.name)
653641

654-
return comp_params, comp_state, comp_data, ct_inputs, out_ty(), num_inputs
642+
return comp_params, comp_state, comp_data, ct_inputs, outputs, num_inputs
655643

656644
def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False):
657-
comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \
645+
comp_params, comp_state, comp_data, ct_inputs, results, num_inputs = \
658646
self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results)
659647

660648
cuda_args = (jit_engine.pycuda.driver.In(comp_params),
661649
jit_engine.pycuda.driver.In(comp_state),
662-
jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_results)), # results
650+
jit_engine.pycuda.driver.Out(results), # results
663651
jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # inputs
664652
jit_engine.pycuda.driver.In(comp_data), # composition data
665653
jit_engine.pycuda.driver.In(num_inputs), # number of inputs
666654
)
667655

668656
self.__bin_func.cuda_call(*cuda_args, threads=int(num_evaluations))
669657

670-
return ct_results
658+
return results
671659

672660
def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False):
673-
comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \
661+
comp_params, comp_state, comp_data, ct_inputs, outputs, num_inputs = \
674662
self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results)
675663

676664
jobs = min(os.cpu_count(), num_evaluations)
@@ -679,19 +667,21 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b
679667
parallel_start = time.time()
680668
with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as ex:
681669

682-
# Create input and result typed casts once, they are the same
683-
# for every submitted job.
684-
results_arg = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4])
670+
# Create input typed cast once, it is the same for every submitted job.
685671
input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5])
686672

673+
# numpy dynamic args expect only one extra dimension
674+
output_arg = outputs.reshape(-1, *self.__bin_func.np_arg_dtypes[4].shape)
675+
assert output_arg.base is outputs
676+
687677
# There are 8 arguments to evaluate_alloc_range:
688678
# comp_param, comp_state, from, to, results, input, comp_data, input length
689679
results = [ex.submit(self.__bin_func,
690680
comp_params,
691681
comp_state,
692682
int(i * evals_per_job),
693683
min((i + 1) * evals_per_job, num_evaluations),
694-
results_arg,
684+
output_arg,
695685
input_arg,
696686
comp_data,
697687
num_inputs)
@@ -707,4 +697,4 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b
707697
exceptions = [r.exception() for r in results]
708698
assert all(e is None for e in exceptions), "Not all jobs finished sucessfully: {}".format(exceptions)
709699

710-
return ct_results
700+
return outputs

0 commit comments

Comments
 (0)
Please sign in to comment.